In [None]:
# Standard imports
import sys
from pathlib import Path

# Treat the parent of the current working directory as the project root and
# put it at the front of sys.path (once) so the `src` package can be imported.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Our modules
from src.fx_data_loader import (
    FXDataLoader, 
    FXDataMetadata, 
    FXDataIntegrityError,
    create_sample_fx_data
)

# Show every warning each time it is triggered (the 'always' action), so
# repeated data-quality warnings are not suppressed after the first one.
warnings.filterwarnings('always')

# The status glyph was mis-encoded (UTF-8 bytes decoded as a legacy codepage);
# restored to the intended check mark.
print("✅ Imports successful")
print(f"Project root: {project_root}")

## 1. Generate Sample Data (For Testing Only)

⚠️ **THIS IS SYNTHETIC DATA** - Replace with real data for actual analysis.

For real data, download from:
- **HistData.com** (free, bid prices, 1-min bars)
- **Dukascopy** (tick data, requires processing)
- **TrueFX** (free tick data, requires registration)

In [None]:
# Build synthetic CSVs so the rest of the pipeline can be exercised.
# DELETE THIS CELL when using real data

pairs = ["EURUSD", "GBPUSD", "USDJPY"]
data_dir = project_root / "data" / "raw" / "fx"

for pair in pairs:
    filepath = data_dir / f"{pair}.csv"

    # Never overwrite a file that is already on disk.
    if filepath.exists():
        print(f"{pair}.csv already exists")
        continue

    create_sample_fx_data(
        pair=pair,
        output_dir=str(data_dir),
        days=30,
        bar_interval_minutes=1,
    )

## 2. Load and Verify Data Integrity

The loader will:
1. Check for required columns
2. Enforce UTC timezone
3. Detect gaps, duplicates, extreme moves
4. Surface all data quality issues

**If integrity checks fail → STOP. Do not proceed.**

In [None]:
# Describe the dataset explicitly, then hand that description to the loader.
# CHANGE THIS when using real data from different source

metadata = FXDataMetadata(
    source="Synthetic (for testing only)",
    broker=None,               # data is not tied to a specific broker
    price_type="bid",          # prices are assumed to be bid quotes
    timezone="UTC",
    volume_type="tick",        # tick counts, NOT traded volume
    spread_available=False,    # no bid/ask spread columns
    bar_interval="1min",
    dst_handling="utc_native", # UTC has no DST shifts
)

loader = FXDataLoader(data_dir=str(data_dir), metadata=metadata)

# Echo the key metadata fields; each attribute is read just before it is
# printed, in the order listed.
print("Loader initialized with metadata:")
for label, attr in (
    ("Source", "source"),
    ("Price Type", "price_type"),
    ("Timezone", "timezone"),
    ("Volume", "volume_type"),
):
    print(f"  {label}: {getattr(metadata, attr)}")

In [None]:
# Load each pair and run integrity checks.
#
# Successfully loaded frames go into `fx_data`; pairs that raise are recorded
# in `load_failures` (pair -> exception) so the outcome can be checked
# programmatically and is summarized once at the end instead of being buried
# between per-pair banners.
fx_data = {}
load_failures = {}
banner = "=" * 60

for pair in pairs:
    print(f"\n{banner}")
    print(f"Loading {pair}...")
    print(banner)

    try:
        df = loader.load_csv(pair, verify=True, fail_on_warning=False)
    except FXDataIntegrityError as e:
        load_failures[pair] = e
        print(f"❌ INTEGRITY FAILURE: {e}")
    except FileNotFoundError as e:
        load_failures[pair] = e
        print(f"❌ FILE NOT FOUND: {e}")
    else:
        fx_data[pair] = df
        print(f"✅ Loaded {len(df):,} bars")

# Later cells only iterate over `fx_data`, so a failed pair would otherwise
# silently drop out of the analysis. Make that visible.
if load_failures:
    print(f"\n{banner}")
    print(
        f"⚠️ {len(load_failures)} of {len(pairs)} pair(s) failed to load: "
        f"{', '.join(load_failures)}"
    )
    print("Resolve these failures before proceeding.")
    print(banner)

In [None]:
# Emit the loader's integrity report for every pair that loaded,
# followed by a spacer line.
for pair in fx_data:
    loader.print_integrity_report(pair)
    print("\n")

## 3. Visual Inspection

Charts for:
- Price overview (spot obvious issues)
- Gap analysis
- Session distribution
- Return distribution (fat tails?)

In [None]:
def plot_fx_overview(df: pd.DataFrame, pair: str, expected_bar_minutes: int = 1):
    """Plot a 3x2 grid of diagnostic charts for one FX pair.

    Panels, left to right and top to bottom: close price, return histogram,
    bar-to-bar time gap histogram (gaps up to 60 minutes), bar count per
    session, mean volume per hour of day, and high-low range histogram in pips.

    Parameters
    ----------
    df : pd.DataFrame
        Bar data. The index must be datetime-like, since the function takes
        index differences as timedeltas and reads ``df.index.hour``. Columns
        'close', 'high' and 'low' are required. 'primary_session' and
        'volume' are optional; their panels show a placeholder when absent.
    pair : str
        Pair symbol, used in the figure title. If it contains 'JPY' the range
        is scaled by 100 to get pips, otherwise by 10000.
    expected_bar_minutes : int, default 1
        Expected spacing between bars, drawn as the reference line on the
        gap histogram.

    Returns
    -------
    matplotlib.figure.Figure
        The figure, returned after ``plt.show()`` has been called.
    """
    fig, axes = plt.subplots(3, 2, figsize=(14, 10))
    fig.suptitle(f"{pair} Data Overview - Visual Inspection", fontsize=14)

    # 1. Price chart
    ax = axes[0, 0]
    ax.plot(df.index, df['close'], linewidth=0.5, alpha=0.8)
    ax.set_title('Close Price')
    ax.set_xlabel('Time')
    ax.set_ylabel('Price')
    ax.grid(True, alpha=0.3)

    # 2. Returns distribution (simple bar-over-bar percentage change)
    ax = axes[0, 1]
    returns = df['close'].pct_change().dropna()
    ax.hist(returns, bins=100, density=True, alpha=0.7, edgecolor='black')
    ax.axvline(x=0, color='red', linestyle='--', alpha=0.5)
    ax.set_title(f'Returns Distribution (std={returns.std()*100:.3f}%)')
    ax.set_xlabel('Return')
    ax.set_ylabel('Density')

    # 3. Time gaps histogram
    ax = axes[1, 0]
    diffs = df.index.to_series().diff().dropna()
    diffs_minutes = diffs.dt.total_seconds() / 60
    # Only gaps up to 60 minutes are drawn so long closures (e.g. weekends)
    # do not flatten the rest of the histogram.
    diffs_filtered = diffs_minutes[diffs_minutes <= 60]
    ax.hist(diffs_filtered, bins=50, edgecolor='black', alpha=0.7)
    ax.axvline(x=expected_bar_minutes, color='red', linestyle='--',
               label=f'Expected ({expected_bar_minutes} min)')
    ax.set_title('Time Gap Distribution (≤60 min)')
    ax.set_xlabel('Gap (minutes)')
    ax.set_ylabel('Count')
    ax.legend()

    # 4. Session distribution (guarded the same way as the volume panel so a
    #    frame without session labels still renders the other charts)
    ax = axes[1, 1]
    if 'primary_session' in df.columns:
        session_counts = df['primary_session'].value_counts()
        colors = {'london': 'blue', 'new_york': 'green', 'tokyo': 'red',
                  'sydney': 'orange', 'off_hours': 'gray'}
        # Sessions not listed above fall back to gray.
        bar_colors = [colors.get(s, 'gray') for s in session_counts.index]
        session_counts.plot(kind='bar', ax=ax, color=bar_colors, edgecolor='black')
        ax.set_title('Bars by Session')
        ax.set_xlabel('Session')
        ax.set_ylabel('Bar Count')
        ax.tick_params(axis='x', rotation=45)
    else:
        ax.text(0.5, 0.5, 'No session data', ha='center', va='center', fontsize=12)
        ax.set_title('Sessions: N/A')

    # 5. Volume by hour (if available)
    ax = axes[2, 0]
    if 'volume' in df.columns:
        hourly_vol = df.groupby(df.index.hour)['volume'].mean()
        hourly_vol.plot(kind='bar', ax=ax, color='steelblue', edgecolor='black')
        ax.set_title('Average Tick Volume by Hour (UTC)')
        ax.set_xlabel('Hour (UTC)')
        ax.set_ylabel('Avg Tick Volume')
    else:
        ax.text(0.5, 0.5, 'No volume data', ha='center', va='center', fontsize=12)
        ax.set_title('Volume: N/A')

    # 6. Candle range distribution, converted from price units to pips
    ax = axes[2, 1]
    ranges = df['high'] - df['low']
    pip_factor = 100 if 'JPY' in pair else 10000
    ranges_pips = ranges * pip_factor
    median_pips = ranges_pips.median()
    ax.hist(ranges_pips, bins=50, edgecolor='black', alpha=0.7)
    ax.axvline(x=median_pips, color='red', linestyle='--',
               label=f'Median: {median_pips:.1f} pips')
    ax.set_title('Candle Range Distribution')
    ax.set_xlabel('Range (pips)')
    ax.set_ylabel('Count')
    ax.legend()

    plt.tight_layout()
    plt.show()

    return fig

In [None]:
# Render the six-panel overview once per loaded pair.
for pair, df in fx_data.items():
    plot_fx_overview(df=df, pair=pair)

## 4. Gap Analysis

Identify and categorize all gaps:
- **Weekend gaps**: Expected (Fri 21:00 - Sun 21:00 UTC)
- **Session gaps**: Lower liquidity periods
- **Suspicious gaps**: Possible missing data

In [None]:
def analyze_gaps(df: pd.DataFrame, pair: str, expected_bar_minutes: int = 1):
    """Categorize the time gaps between consecutive bars and print a summary.

    Each bar-to-bar difference in ``df.index`` falls into exactly one bucket,
    with ``E`` the expected bar spacing:

    * weekend:    gap > 24 hours
    * suspicious: 5*E < gap <= 24 hours
    * minor:      2*E <= gap <= 5*E
    * normal:     everything else (reported as a count only)

    The lower bound of the minor bucket is inclusive: on a regular bar grid a
    gap of exactly ``2*E`` means one bar is missing, so it must not be counted
    as normal.

    Parameters
    ----------
    df : pd.DataFrame
        Bar data with a datetime-like index (differences must be timedeltas).
    pair : str
        Pair symbol, used only in the printed heading.
    expected_bar_minutes : int, default 1
        Expected spacing between bars, in minutes.

    Returns
    -------
    dict
        Keys 'weekend', 'suspicious' and 'minor'. Each value is a DataFrame
        indexed by the timestamp of the bar that ends the gap, with columns
        'gap' (Timedelta) and 'gap_minutes' (float).
    """
    expected = pd.Timedelta(minutes=expected_bar_minutes)
    minor_floor = expected * 2
    suspicious_floor = expected * 5
    weekend_floor = pd.Timedelta(hours=24)

    diffs = df.index.to_series().diff()

    # The first bar has no predecessor, so its NaT row is dropped.
    gaps = pd.DataFrame({
        'gap': diffs,
        'gap_minutes': diffs.dt.total_seconds() / 60,
    }).dropna()

    gap = gaps['gap']
    weekend_gaps = gaps[gap > weekend_floor]
    suspicious_gaps = gaps[(gap > suspicious_floor) & (gap <= weekend_floor)]
    minor_gaps = gaps[(gap >= minor_floor) & (gap <= suspicious_floor)]
    normal_count = len(gaps) - len(weekend_gaps) - len(suspicious_gaps) - len(minor_gaps)

    # The label follows the real threshold, so it stays correct for any
    # expected_bar_minutes (prints "5min" for the default of 1).
    suspicious_floor_minutes = suspicious_floor.total_seconds() / 60

    print(f"\n📊 GAP ANALYSIS: {pair}")
    print("=" * 50)
    print(f"Total bars: {len(df):,}")
    print(f"Expected gap: {expected_bar_minutes} minute(s)")
    print("\nGap Categories:")
    print(f"  Weekend gaps (>24h): {len(weekend_gaps)}")
    print(f"  Suspicious gaps ({suspicious_floor_minutes:g}min-24h): {len(suspicious_gaps)}")
    print(f"  Minor gaps (2-5x): {len(minor_gaps)}")
    print(f"  Normal gaps: {normal_count}")

    if len(suspicious_gaps) > 0:
        print("\n⚠️ SUSPICIOUS GAPS (investigate these):")
        # Only the first ten are listed to keep the cell output short.
        for timestamp, minutes in suspicious_gaps['gap_minutes'].head(10).items():
            print(f"  {timestamp}: {minutes:.0f} minutes")

    return {
        'weekend': weekend_gaps,
        'suspicious': suspicious_gaps,
        'minor': minor_gaps,
    }

In [None]:
# Categorize time gaps for every loaded pair, keyed by pair symbol.
gap_analysis = {}
for pair, df in fx_data.items():
    pair_gaps = analyze_gaps(df, pair)
    gap_analysis[pair] = pair_gaps

## 5. Data Summary Export

Save integrity reports and summaries for documentation.

In [None]:
# Persist each pair's integrity report, then echo a short summary of it.
results_dir = project_root / "results" / "metrics"

for pair in fx_data:
    loader.save_integrity_report(pair, output_dir=str(results_dir))

    # Headline numbers from the loader's summary dict
    summary = loader.get_data_summary(fx_data[pair], pair)
    print(f"\n{pair} Summary:")
    print(f"  Total bars: {summary['total_bars']:,}")
    print(f"  Trading days: {summary['date_range']['trading_days']}")

    sessions = summary['sessions']
    print(f"  London bars: {sessions['london_bars']:,}")
    print(f"  NY bars: {sessions['new_york_bars']:,}")
    print(f"  Overlap bars: {sessions['overlap_bars']:,}")

---

## ❓ MANDATORY VERIFICATION QUESTIONS

**You MUST answer these before proceeding to any analysis.**

### 1. What is the data source and broker?
- **Source**: [Fill in - e.g., HistData.com, Dukascopy, OANDA export]
- **Broker**: [Fill in - if broker-specific data]

### 2. Is this bid, ask, or mid?
- **Answer**: [Fill in]
- **Implication**: [e.g., "Bid prices - buys need ask estimate"]

### 3. What timezone is the data in?
- **Answer**: [Should be UTC]
- **Verified by**: [How did you verify?]

### 4. How is DST handled?
- **Answer**: [UTC-native / Local-shifted / Unknown]

### 5. Is volume real or tick volume?
- **Answer**: [Tick / Real / None]
- **Implication**: [What does this mean for your analysis?]

### 6. Are spreads available?
- **Answer**: [Yes / No]
- **If No**: [How will you model spreads later?]

### 7. What FX-specific biases exist?
- **Rollover**: [Acknowledged / Not applicable]
- **Session gaps**: [Acknowledged / Not applicable]
- **Broker filtering**: [Acknowledged / Not applicable]

### 8. Data integrity verdict
- **PASS / FAIL**: [Based on integrity checks]
- **Proceed to Stage 3?**: [Yes / No]

---

⚠️ **If any answer is vague → REDO STAGE 2**

In [None]:
# Closing banner: print the metadata's execution-realism warning between rules.
rule = "=" * 60

print("\n" + rule)
print("EXECUTION REALISM WARNING")
print(rule)
print(loader.metadata.get_execution_realism_warning())
print("\n" + rule)
print("If you cannot explain exactly how this FX data lies to you,")
print("you are NOT ALLOWED to model it.")
print(rule)