In [None]:
import pandas as pd
import talib
import numpy as np

# --- Configuration: every value a reader might want to tune -----------------
instrument = "EUR_USD_M15"
# instrument = "GBP_USD_M15"
DATA_DIR = "/projects/genomic-ml/da2343/ml_project_2/data/gen_oanda_data"
TARGET_YEAR = 2018           # only rows from this calendar year are kept
ATR_TIMEPERIOD = 1           # NOTE(review): a period of 1 presumably reduces ATR to the
                             # per-bar true range (no smoothing) — confirm this is intended
ATR_LOWER_QUANTILE = 0.4     # lower clip bound, as a quantile of the yearly ATR
ATR_UPPER_QUANTILE = 0.9     # upper clip bound, as a quantile of the yearly ATR
QUANTILE_DECIMALS = 6        # clip bounds are rounded to this many decimals

# --- Load the data ----------------------------------------------------------
# Parse the "time" column as datetimes and use it as the index; chaining
# set_index avoids mutating the freshly read frame in place.
df = pd.read_csv(
    f"{DATA_DIR}/{instrument}_raw_data.csv",
    parse_dates=["time"],
).set_index("time")

# --- Time-based features derived from the datetime index --------------------
df["year"] = df.index.year
df["month"] = df.index.month
df["day_of_week"] = df.index.dayofweek
df["hour"] = df.index.hour
df["minute"] = df.index.minute

# --- Average True Range from the high / low / close columns -----------------
df["atr"] = talib.ATR(df["high"], df["low"], df["close"], timeperiod=ATR_TIMEPERIOD)

# Drop every row that has a NaN in any column.
df = df.dropna()

# Keep only the target year. The explicit copy makes the result an independent
# frame, so the column assignment below can never be treated as a write
# through a slice of the unfiltered data.
df = df[df["year"] == TARGET_YEAR].copy()

# Fail early with a clear message instead of silently producing NaN clip bounds.
if df.empty:
    raise ValueError(
        f"No rows found for {instrument} in {TARGET_YEAR}; cannot compute ATR clip bounds."
    )

# Clip bounds. Despite their names, Q1 and Q3 are NOT quartiles: they are the
# ATR_LOWER_QUANTILE (0.4) and ATR_UPPER_QUANTILE (0.9) quantiles of the yearly
# ATR. The names are kept because later cells read them.
Q1 = round(df["atr"].quantile(ATR_LOWER_QUANTILE), QUANTILE_DECIMALS)
Q3 = round(df["atr"].quantile(ATR_UPPER_QUANTILE), QUANTILE_DECIMALS)

# 'atr_cleaned' is the ATR limited to the closed interval [Q1, Q3].
df["atr_cleaned"] = df["atr"].clip(lower=Q1, upper=Q3)

df

In [None]:
# Summary statistics for the raw and the clipped ATR series, followed by the
# two clip bounds computed in the earlier cell.
atr_raw = df['atr']
atr_clipped = df['atr_cleaned']

print(f"Original ATR range: {atr_raw.min()} to {atr_raw.max()}")
print(f"Cleaned ATR range: {atr_clipped.min()} to {atr_clipped.max()}")
print(f"Q1: {Q1}")
print(f"Q3: {Q3}")

# Optional: overlay both series on one set of axes so the effect of the
# clipping is visible at a glance.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df.index, atr_raw, label='Original ATR', alpha=0.5)
ax.plot(df.index, atr_clipped, label='Cleaned ATR', alpha=0.8)
ax.set_title('ATR - Original vs Cleaned (Clipped to Q1-Q3 range)')
ax.legend()
plt.show()

In [None]:
# Walk the sliding train/test windows over `df` and report, for each window,
# the covered date range, the shape, and the actual vs. expected row counts.
#
# `df` is the frame prepared in the first cell of this notebook.
# NOTE(review): OrderedSlidingWindowSplitter is not defined in the visible part
# of this notebook — it must be defined or imported before this cell runs.

# Window sizes are declared once so the splitter configuration and the
# "expected points" figures printed below can never drift apart.
TRAIN_WEEKS = 4
TEST_WEEKS = 2
# 5 * 24 * 4 rows per week — presumably 5 trading days x 24 hours x 4
# fifteen-minute bars per hour; TODO confirm against the data's actual calendar.
BARS_PER_WEEK = 5 * 24 * 4

window_splitter = OrderedSlidingWindowSplitter(
    train_weeks=TRAIN_WEEKS,
    test_weeks=TEST_WEEKS,
    step_size=1,
)

print(f"Total number of splits: {window_splitter.get_n_splits(df)}")

expected_train_points = TRAIN_WEEKS * BARS_PER_WEEK
expected_test_points = TEST_WEEKS * BARS_PER_WEEK

# Windows are numbered from 1 for display purposes.
for window, (train_indices, test_indices) in enumerate(window_splitter.split(df), 1):
    # The splitter yields positional indices, hence iloc.
    train_data = df.iloc[train_indices]
    test_data = df.iloc[test_indices]

    print(f"\nWindow {window}:")
    print(f"Train data: {train_data.index[0]} to {train_data.index[-1]}")
    print(f"Train data shape: {train_data.shape}")
    print(f"Test data: {test_data.index[0]} to {test_data.index[-1]}")
    print(f"Test data shape: {test_data.shape}")
    print(f"Expected train points: {expected_train_points}, Actual: {len(train_indices)}")
    print(f"Expected test points: {expected_test_points}, Actual: {len(test_indices)}")

print("\nAll windows processed.")
