# Forex Data Processor: Resample & Split (Train 2000-2023, Test 2024-2025)

This notebook processes EUR/USD data.
1.  **Load Data**: Uses the existing 1-minute data (high quality) and reports the last available timestamp, so you can judge whether more recent data still needs to be fetched.
2.  **Resample**: Creates 5m, 15m, 30m, 1h, 4h datasets.
3.  **Split**:
    *   **Train**: 2000 - 2023 (using available history; the 1-minute data starts in 2019, so in practice 2019 - 2023)
    *   **Test**: 2024 - 2025
4.  **Save**: Exports to `data/train` and `data/test`.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

# Project locations: the application root, the data folder beneath it, and
# the two output folders that receive the train / test CSV exports.
BASE_DIR = Path(r'c:\Users\Acer\Desktop\Forex-Signal-App')
DATA_DIR = BASE_DIR.joinpath('data')
TRAIN_DIR = DATA_DIR.joinpath('train')
TEST_DIR = DATA_DIR.joinpath('test')

# Create the output folders (and any missing parents); an already existing
# folder is not an error.
os.makedirs(TRAIN_DIR, exist_ok=True)
os.makedirs(TEST_DIR, exist_ok=True)

# Echo the resolved locations so the user can confirm them.
print(
    f"Data Directory: {DATA_DIR}",
    f"Train Directory: {TRAIN_DIR}",
    f"Test Directory: {TEST_DIR}",
    sep="\n",
)

In [None]:
# 1. Load Existing 1-Minute Data
# We use the high-quality 1m data we already have as the base.
# Note: Free APIs like Yahoo Finance do NOT provide 1m data back to 2000.
# We will use what we have and split it accordingly.

csv_path = DATA_DIR / 'EUR_USD_1min.csv'
if not csv_path.exists():
    # Fail fast: every later cell needs `df`. Merely printing an error would
    # leave `df` undefined (or stale from an earlier run of this notebook)
    # and the failure would surface later and far less clearly.
    raise FileNotFoundError(f"EUR_USD_1min.csv not found at {csv_path}")

print("Loading 1-minute data...")
df = pd.read_csv(csv_path)

# Standardize columns: lower-case names, and accept 'timestamp' as an alias
# for the 'time' column used by the rest of the notebook.
df.columns = df.columns.str.lower()
if 'timestamp' in df.columns:
    df = df.rename(columns={'timestamp': 'time'})

if 'time' not in df.columns:
    raise KeyError(
        f"Expected a 'time' or 'timestamp' column in {csv_path}, "
        f"found: {list(df.columns)}"
    )

# Parse datetime and use it as a sorted index (resampling and the
# date-based split below both operate on the index).
df['time'] = pd.to_datetime(df['time'])
df = df.set_index('time').sort_index()

print(f"Loaded {len(df):,} rows.")
print(f"Range: {df.index.min()} to {df.index.max()}")

# Check if we need to fetch more data (e.g. 2025)
last_date = df.index.max()
print(f"Last data point: {last_date}")

In [None]:
# 2. Resample Function
def resample_data(df: pd.DataFrame, timeframe: str) -> pd.DataFrame:
    """
    Resample 1-min data to higher timeframes.

    Args:
        df: Candle data indexed by timestamp with 'open', 'high', 'low' and
            'close' columns. Optional 'tick_volume' (preferred) or 'volume',
            and 'spread' columns are aggregated when present; any other
            column is not carried over.
        timeframe: Resampling rule passed to ``DataFrame.resample``
            (e.g. '5min', '1h').

    Returns:
        One row per non-empty bin: first open, max high, min low, last close,
        summed volume and mean spread.
    """
    price_cols = ['open', 'high', 'low', 'close']
    agg_dict = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last'
    }

    # Check for volume columns dynamically
    if 'tick_volume' in df.columns:
        agg_dict['tick_volume'] = 'sum'
    elif 'volume' in df.columns:
        agg_dict['volume'] = 'sum'

    # Handle spread if it exists
    if 'spread' in df.columns:
        agg_dict['spread'] = 'mean'

    resampled = df.resample(timeframe).agg(agg_dict)

    # Drop only bins without price data (market closed / gaps). A bare
    # dropna() would also discard valid candles whose auxiliary columns
    # (e.g. mean spread) happen to be NaN for that bin.
    return resampled.dropna(subset=price_cols)

# Build one dataset per timeframe: the raw 1-minute frame is used as-is and
# every higher timeframe is derived from it with resample_data().
timeframes = {
    '1min': df,
    **{
        rule: resample_data(df, rule)
        for rule in ('5min', '15min', '30min', '1h', '4h')
    },
}

# Report how many candles each timeframe ended up with.
for tf, data in timeframes.items():
    print(f"{tf}: {len(data):,} candles")

In [None]:
# 3. Split Train (2000-2023) and Test (2024-2025)
# Note: Our 1m data starts from 2019, so Train will be 2019-2023.
# This is the best possible with high-frequency data.

# First timestamp that belongs to the test set; everything strictly before
# it is training data. The test side has no upper bound, so it contains
# whatever data exists from this date onwards.
split_date = '2024-01-01'

for tf, data in timeframes.items():
    print(f"Processing {tf}...")

    # Train: Up to 2023-12-31
    train_df = data[data.index < split_date]

    # Test: 2024-01-01 onwards
    test_df = data[data.index >= split_date]

    print(f"  Train: {len(train_df):,} rows ({train_df.index.min()} to {train_df.index.max()})")
    print(f"  Test:  {len(test_df):,} rows ({test_df.index.min()} to {test_df.index.max()})")

    # An empty side means the source data does not span the split date; the
    # CSV is still written (header only), so make that visible.
    if train_df.empty or test_df.empty:
        print(f"  Warning: {tf} has an empty train or test split around {split_date}")

    # Save to CSV
    train_path = TRAIN_DIR / f'EUR_USD_{tf}_train.csv'
    test_path = TEST_DIR / f'EUR_USD_{tf}_test.csv'

    train_df.to_csv(train_path)
    test_df.to_csv(test_path)
    print(f"  Saved to {train_path} and {test_path}")

print("\n✅ Data processing complete!")