In [1]:
import glob
from pathlib import Path

import numpy as np
import pandas as pd

# Load every call-option snapshot CSV found under ../data/raw.
# glob returns matches in arbitrary, filesystem-dependent order, so the paths
# are sorted to keep the row order of the combined frame reproducible.
files = sorted(glob.glob("../data/raw/*_calls_*.csv"))

# Fail early with an actionable message: pd.concat on an empty list would
# otherwise raise an opaque "No objects to concatenate" ValueError.
if not files:
    raise FileNotFoundError(
        "No call option CSVs matching '../data/raw/*_calls_*.csv' were found"
    )

df_list = [pd.read_csv(file) for file in files]
df = pd.concat(df_list, ignore_index=True)
print(f"✅ Combined {len(files)} files | {df.shape[0]} total rows")
df.head()

✅ Combined 1 files | 231 total rows


Unnamed: 0,contractSymbol,lastTradeDate,strike,lastPrice,bid,ask,change,percentChange,volume,openInterest,impliedVolatility,inTheMoney,contractSize,currency,expiration,retrieved,ticker
0,AAPL250620C00005000,2025-06-13 18:58:54+00:00,5.0,191.26,190.8,191.85,-2.75,-1.417453,10.0,826,14.640626,True,REGULAR,USD,2025-06-20,2025-06-15 14:32:54.038345,AAPL
1,AAPL250620C00010000,2025-06-09 17:59:26+00:00,10.0,192.64,185.4,187.55,0.0,0.0,40.0,51,8.125005,True,REGULAR,USD,2025-06-20,2025-06-15 14:32:54.038345,AAPL
2,AAPL250620C00015000,2025-06-13 13:47:55+00:00,15.0,182.12,180.35,182.65,-8.480011,-4.449114,2.0,5,7.437501,True,REGULAR,USD,2025-06-20,2025-06-15 14:32:54.038345,AAPL
3,AAPL250620C00020000,2025-06-03 13:50:53+00:00,20.0,183.0,175.4,177.1,0.0,0.0,1.0,6,9.203129,True,REGULAR,USD,2025-06-20,2025-06-15 14:32:54.038345,AAPL
4,AAPL250620C00025000,2025-06-13 18:58:54+00:00,25.0,171.33,170.35,172.15,-5.550003,-3.137722,4.0,4,8.363286,True,REGULAR,USD,2025-06-20,2025-06-15 14:32:54.038345,AAPL


In [2]:
# Feature engineering: strike-to-price ratio and time remaining until expiry.
# NOTE(review): in the sample rows shown above, lastPrice looks like the
# option's own last traded premium rather than the underlying's spot price, so
# this ratio is not the conventional strike/spot moneyness — confirm intent.
df = df.assign(
    moneyness=lambda frame: frame["strike"] / frame["lastPrice"],
    # Parse the date-like columns into pandas datetimes
    expiration=lambda frame: pd.to_datetime(frame["expiration"]),
    retrieved=lambda frame: pd.to_datetime(frame["retrieved"]),
    # Whole-day component of (expiration - retrieval timestamp)
    days_to_exp=lambda frame: (frame["expiration"] - frame["retrieved"]).dt.days,
)

In [3]:
# Keep rows with positive volume, positive implied volatility and at least one
# whole day left before expiration, then discard rows containing any NaN.
is_usable = (
    (df["volume"] > 0)
    & (df["impliedVolatility"] > 0)
    & (df["days_to_exp"] > 0)
)
df = df[is_usable].dropna()

print("✅ Filtered shape:", df.shape)

✅ Filtered dataset: (229, 19)


In [4]:
# Build the classification target from the NEXT snapshot of the SAME contract.
# A plain shift(-1) over the whole frame pairs each row with whatever row
# follows it after sorting — usually a different strike — so the "return"
# would compare unrelated contracts instead of measuring a price change.
RETURN_THRESHOLD = 0.02  # target is 1 when the forward return exceeds 2%

# Chronological order, so every contract's rows appear in snapshot order.
df = df.sort_values(by=["retrieved", "ticker"])

# groupby keeps the row order within each group, so shift(-1) yields the
# contract's lastPrice at its next retrieval (NaN for its final snapshot).
# NOTE(review): the horizon is "next snapshot", which is one day only if the
# raw files are collected daily — confirm before relying on the column name.
df["future_price"] = df.groupby("contractSymbol")["lastPrice"].shift(-1)
df["return_1d"] = (df["future_price"] - df["lastPrice"]) / df["lastPrice"]
df["target"] = np.where(df["return_1d"] > RETURN_THRESHOLD, 1, 0)

# Rows without a later snapshot have no usable label; drop them.
df = df.dropna(subset=["future_price", "return_1d"])

if df.empty:
    # Happens when every contract appears in a single snapshot only.
    print("⚠️ No labelled rows: each contract needs at least two snapshots in data/raw")

print("✅ Final shape after target creation:", df.shape)
df[["lastPrice", "future_price", "return_1d", "target"]].head()

✅ Target created | Sample:


Unnamed: 0,lastPrice,future_price,return_1d,target


In [5]:
# Persist the cleaned, labelled dataset for the downstream steps.
output_path = Path("../data/processed/combined_options_clean.csv")

# to_csv does not create missing parent directories, so make sure the target
# folder exists (a fresh checkout may only contain data/raw).
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
print("✅ Clean dataset saved to data/processed/combined_options_clean.csv")

✅ Cleaned dataset saved to data/processed/combined_options_clean.csv
