In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint, uniform
from xgboost import XGBRegressor

from src.data_handler import fetch_option_data

### Preprocessing Data

In [None]:
# Load the TSLA call-option data through the project helper and tag every row
# with its option type so the categorical branch of the preprocessor has a
# column to encode.
df = fetch_option_data(ticker="TSLA", opt_type="call")
df = df.assign(option_type="call").reset_index()

# Feature groups
num_features = ['strike', 'days_to_maturity', 'impliedVolatility']
cat_features = ['option_type']

# Preprocessor: standardise the numeric columns, one-hot encode the categorical ones.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(with_mean=True, with_std=True), num_features),
    ('cat', OneHotEncoder(), cat_features)
])

# Fail fast with a clear message if the fetched frame lacks a required column.
# Printing and continuing would only defer the failure to the split below
# (NameError on a fresh kernel) or, worse, silently reuse a stale X / y left
# over from an earlier run.
required_columns = num_features + cat_features + ['lastPrice']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Option data is missing required columns: {missing_columns}")

X = df[num_features + cat_features]
y = df['lastPrice']

# 80/20 shuffled hold-out split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=13)

### Linear Model
---

In [None]:
# Baseline model: shared preprocessing followed by ordinary least squares.
# Pipeline.fit returns the pipeline itself, so build-and-fit can be chained.
linreg_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
).fit(X_train, y_train)

# Echo the fitted pipeline as the cell output.
linreg_pipeline


In [None]:
# Score the fitted linear pipeline on the held-out test split.
y_pred = linreg_pipeline.predict(X_test)

linreg_mae = mean_absolute_error(y_test, y_pred)
linreg_rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = sqrt(MSE)
linreg_r2 = r2_score(y_test, y_pred)

print("MAE:", linreg_mae)
print("RMSE:", linreg_rmse)
print("R²:", linreg_r2)
# MAE as a percentage of the mean lastPrice over the full dataset.
# The inner subscript uses single quotes: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
print(f"Relative error from actual price: {(linreg_mae / df['lastPrice'].mean()) * 100:.2f}%")

### Non-Linear Models
---

In [None]:
# Tree-ensemble candidates. Both reuse the same preprocessing definition as the
# linear baseline so the models are compared on identical features.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Seeded so bootstrap sampling / feature subsampling are reproducible
    # across kernel restarts (same seed as the XGBoost model below).
    ('regressor', RandomForestRegressor(random_state=42))
])

xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(
        objective    = "reg:squarederror",
        tree_method  = "hist",   # for GPU: device="cuda" on XGBoost >= 2.0 ("gpu_hist" on older releases)
        random_state = 42,
        n_jobs       = -1        # use all CPU cores
    ))
])

#### Grid Search Random Forest

In [None]:
# Cross-validation splitter handed to the grid search below.
# NOTE(review): X_train comes out of a shuffled train_test_split, so its row
# order no longer reflects any time ordering; confirm that TimeSeriesSplit
# (expanding-window folds) is intended here rather than a plain K-fold.
tscv = TimeSeriesSplit(n_splits=5)

# Exhaustive grid over the forest hyper-parameters. The "regressor__" prefix
# routes each entry to the pipeline step named 'regressor'.
param_grid = dict(
    regressor__n_estimators=[200, 400, 800],         # more trees ⇒ lower variance
    regressor__max_depth=[None, 5, 10, 20],          # None = grow fully
    regressor__min_samples_split=[2, 5, 10],         # node-split granularity
    regressor__min_samples_leaf=[1, 2, 4],           # leaf size regularisation
    regressor__max_features=["sqrt", "log2", 0.8],   # feature subsampling
)

rf_grid = GridSearchCV(
    rf_pipeline,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=tscv,
    refit=True,      # refit the best configuration on all of X_train
    n_jobs=-1,
    verbose=0,
)
rf_grid.fit(X_train, y_train)

# The scorer is the negated MAE, so flip the sign back when reporting.
print("Best parameters:\n", rf_grid.best_params_)
print("CV-MAE (best):  ", -rf_grid.best_score_)

best_rf_pipeline = rf_grid.best_estimator_


In [None]:
# Score the tuned random-forest pipeline on the held-out test split.
y_pred_test = best_rf_pipeline.predict(X_test)

rf_mae  = mean_absolute_error(y_test, y_pred_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))  # RMSE = sqrt(MSE)
rf_r2   = r2_score(y_test, y_pred_test)

print("Random-Forest Test-Set Performance")
print(f"  MAE  : {rf_mae:.2f}")
print(f"  RMSE : {rf_rmse:.2f}")
print(f"  R²   : {rf_r2:.3f}")
# MAE as a percentage of the mean lastPrice over the full dataset.
# The inner subscript uses single quotes: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
print(f"Relative error from actual price: {(rf_mae / df['lastPrice'].mean()) * 100:.2f}%")

#### Randomized Search XGBoost

In [None]:
# Sampling distributions for the randomized search.
# scipy conventions: randint(low, high) draws integers from [low, high);
# uniform(loc, scale) draws from [loc, loc + scale] -- NOT [low, high].
param_dist = {
    # ensemble size / learning dynamics
    "regressor__n_estimators" : randint(300, 1200),        # 300 .. 1199
    "regressor__learning_rate" : uniform(0.01, 0.19),      # [0.01, 0.20]
    # tree complexity
    "regressor__max_depth" : randint(3, 9),                # 3 .. 8
    "regressor__min_child_weight" : uniform(0.5, 9.5),     # [0.5, 10.0]
    # regularisation
    # subsample / colsample_bytree are fractions and must stay within (0, 1];
    # uniform(0.5, 1.0) would sample up to 1.5 and make those candidates fail.
    "regressor__subsample" : uniform(0.5, 0.5),            # [0.5, 1.0]
    "regressor__colsample_bytree" : uniform(0.5, 0.5),     # [0.5, 1.0]
    "regressor__gamma" : uniform(0, 5),                    # [0, 5]
    "regressor__reg_alpha" : uniform(0, 1),                # [0, 1]
    "regressor__reg_lambda" : uniform(1, 4),               # [1, 5]
}

# NOTE(review): X_train comes out of a shuffled train_test_split, so its row
# order no longer reflects any time ordering; confirm that TimeSeriesSplit is
# intended here rather than a plain K-fold.
cv_ts = TimeSeriesSplit(n_splits=5)

xgb_search = RandomizedSearchCV(
    estimator = xgb_pipeline,
    param_distributions = param_dist,
    n_iter=64,                           # number of sampled configurations
    scoring="neg_mean_absolute_error",
    cv=cv_ts,
    verbose=0,
    n_jobs=-3,                           # all CPU cores but two
    refit=True,                          # refit the best configuration on all of X_train
    random_state = 42                    # reproducible parameter draws
)

xgb_search.fit(X_train, y_train)

# The scorer is the negated MAE, so flip the sign back when reporting.
print("Best hyper-parameters:\n", xgb_search.best_params_)
print("CV-MAE (best):", -xgb_search.best_score_)

best_xgb_pipeline = xgb_search.best_estimator_


In [None]:
# Score the tuned XGBoost pipeline on the held-out test split.
y_pred_test_xgb = best_xgb_pipeline.predict(X_test)


xgb_mae = mean_absolute_error(y_test, y_pred_test_xgb)
xgb_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test_xgb))  # RMSE = sqrt(MSE)
xgb_r2 = r2_score(y_test, y_pred_test_xgb)

print("XGBoost Test-Set Performance")
print(f"MAE: {xgb_mae:.2f}")
print(f"RMSE: {xgb_rmse:.2f}")
print(f"R²: {xgb_r2:.3f}")
# MAE as a percentage of the mean lastPrice over the full dataset.
# The inner subscript uses single quotes: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
print(f"Relative error from actual price: {(xgb_mae / df['lastPrice'].mean()) * 100:.2f}%")