In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from pygam import LinearGAM, s, te
from sklearn.metrics import mean_squared_error



  


In [None]:
# Candidate estimators, keyed by the display name used in the printed reports.
# The disabled entries are kept for reference: model_selector branches on the
# substrings "GAM", "Smoothing" and "Tensor" found in these names.
models = {
  "Linear Regression": LinearRegression(),
  # "Ridge": Ridge(alpha=0.1),
  # "Lasso": Lasso(alpha=0.1),
  "Polynomial Regression (Degree 2)": make_pipeline(
    PolynomialFeatures(degree=2), LinearRegression()
  ),
  "Polynomial Regression (Degree 3)": make_pipeline(
    PolynomialFeatures(degree=3), LinearRegression()
  ),
  # "GAM (Linear Splines)": LinearGAM(s(0, spline_order=1)),  # Linear splines
  # "GAM (Cubic Splines)": LinearGAM(s(0, spline_order=3)),  # Cubic splines
  # "GAM (Tensor Splines)": LinearGAM(te(0, 1)),  # Tensor splines for first two features
  # "Smoothing Splines": LinearGAM(s(0)).gridsearch,  # Automatic smoothing
}

# Load the CSV and derive calendar features in a single chain, so re-running
# the cell always rebuilds `data` from the file.
data = (
  pd.read_csv('financial_regression_cleaned.csv')
  .assign(date=lambda frame: pd.to_datetime(frame['date']))
  # Calendar parts extracted from the parsed timestamp.
  .assign(
    year=lambda frame: frame['date'].dt.year,
    month=lambda frame: frame['date'].dt.month,
    day=lambda frame: frame['date'].dt.day,
    day_of_week=lambda frame: frame['date'].dt.dayofweek,
    day_of_year=lambda frame: frame['date'].dt.dayofyear,
  )
  # Cyclical encoding of the month (period 12).
  .assign(
    month_sin=lambda frame: np.sin(2 * np.pi * frame['month'] / 12),
    month_cos=lambda frame: np.cos(2 * np.pi * frame['month'] / 12),
  )
  # The datetime column goes first; only then is the leading column of what
  # remains dropped by position.
  .drop(columns=['date'])
  .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
  # Keep only rows where both FX rates are present.
  .dropna(subset=['usd_chf', 'eur_usd'])
)

# Sanity check: both counts should print as 0 after the dropna above.
print(data[['usd_chf', 'eur_usd']].isna().sum())

print(data.columns.tolist())

data



usd_chf    0
eur_usd    0
dtype: int64
['sp500 open', 'sp500 high', 'sp500 low', 'sp500 close', 'sp500 volume', 'sp500 high-low', 'nasdaq open', 'nasdaq high', 'nasdaq low', 'nasdaq close', 'nasdaq volume', 'nasdaq high-low', 'usd_chf', 'eur_usd', 'silver open', 'silver high', 'silver low', 'silver close', 'silver volume', 'silver high-low', 'oil open', 'oil high', 'oil low', 'oil close', 'oil volume', 'oil high-low', 'platinum open', 'platinum high', 'platinum low', 'platinum close', 'platinum volume', 'platinum high-low', 'palladium open', 'palladium high', 'palladium low', 'palladium close', 'palladium volume', 'palladium high-low', 'gold open', 'gold high', 'gold low', 'gold close', 'gold volume', 'year', 'month', 'day', 'day_of_week', 'day_of_year', 'month_sin', 'month_cos']


Unnamed: 0,sp500 open,sp500 high,sp500 low,sp500 close,sp500 volume,sp500 high-low,nasdaq open,nasdaq high,nasdaq low,nasdaq close,...,gold low,gold close,gold volume,year,month,day,day_of_week,day_of_year,month_sin,month_cos
0,114.49,115.14,114.420,114.93,115646960.0,0.720,46.26,46.520,46.2200,46.39,...,110.79,112.03,18305238.0,2010,1,14,3,14,0.500000,0.866025
1,114.73,114.84,113.200,113.64,212252769.0,1.640,46.46,46.550,45.6500,45.85,...,110.38,110.86,18000724.0,2010,1,15,4,15,0.500000,0.866025
2,113.62,115.13,113.590,115.06,138671890.0,1.540,45.96,46.640,45.9500,46.59,...,110.83,111.52,10467927.0,2010,1,19,1,19,0.500000,0.866025
3,114.28,114.45,112.980,113.89,216330645.0,1.470,46.27,46.604,45.4300,45.92,...,108.46,108.94,17534231.0,2010,1,20,2,20,0.500000,0.866025
4,113.92,114.27,111.560,111.70,344747028.0,2.710,46.06,46.350,45.3000,45.49,...,106.61,107.37,25747831.0,2010,1,21,3,21,0.500000,0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3710,576.05,580.33,575.910,579.58,42267994.0,4.420,490.74,494.390,490.1700,493.36,...,244.47,245.47,5789546.0,2024,10,11,4,285,-0.866025,0.500000
3712,584.59,584.90,578.545,579.78,54203636.0,6.355,497.83,498.500,488.6800,490.85,...,244.53,245.92,5640831.0,2024,10,15,1,289,-0.866025,0.500000
3713,579.78,582.83,578.960,582.30,30725436.0,3.870,491.18,491.690,487.5700,490.91,...,246.36,247.15,5431939.0,2024,10,16,2,290,-0.866025,0.500000
3714,585.91,586.12,582.160,582.35,34393714.0,3.960,496.44,496.490,491.1901,491.25,...,247.62,248.63,5176170.0,2024,10,17,3,291,-0.866025,0.500000


In [None]:

def _build_gam(model_name, n_features):
  """Return an unfitted LinearGAM with one term per supplied column.

  pygam terms address columns of the array passed to fit/gridsearch by
  integer position, so terms are built from positions, never from
  DataFrame column labels.

  Args:
    model_name: Display name of the model; a tensor term over the first two
      columns is used when it contains "Tensor" and at least two columns
      are supplied.
    n_features: Number of columns in the array the GAM will be fitted on
      (must be >= 1).

  Returns:
    An unfitted ``LinearGAM``.
  """
  if "Tensor" in model_name and n_features >= 2:
    terms = te(0, 1)
    first_spline = 2
  else:
    terms = s(0)
    first_spline = 1
  # Give every remaining column its own spline term; with a bare s(0) a newly
  # added column would be ignored and all candidates would score identically.
  for position in range(first_spline, n_features):
    terms = terms + s(position)
  return LinearGAM(terms)


def model_selector(data, prediction, max_features=None):
  """Greedy forward feature selection for every estimator in ``models``.

  For each entry of the module-level ``models`` dict, features are added one
  at a time: at each step the candidate giving the lowest mean 5-fold CV MSE
  on the training split is appended. The feature subset with the lowest CV
  MSE along that path is then refitted on the training split and scored on
  the held-out 20% test split. Progress and a final summary are printed.

  Args:
    data: DataFrame holding the target column and all candidate predictors
      (every column other than ``prediction`` is treated as a predictor).
    prediction: Name of the target column in ``data``.
    max_features: Optional cap on the number of greedy steps. ``None``
      (default) searches until every predictor has been added, which costs
      on the order of p**2 cross-validations per model for p predictors.

  Returns:
    dict mapping model name to a dict with keys ``"model"`` (the fitted
    estimator), ``"selected_features"`` (list of column names) and
    ``"test_score"`` (test-set MSE).

  Raises:
    ValueError: If ``data`` has no predictor columns or ``max_features`` is
      smaller than 1.
  """
  # Local import so the function does not depend on an edit to the import cell.
  from sklearn.base import clone

  if max_features is not None and max_features < 1:
    raise ValueError("max_features must be a positive integer or None")

  X = data.drop(columns=[prediction])  # Predictors
  y = data[prediction]  # Target variable
  if X.shape[1] == 0:
    raise ValueError("data must contain at least one predictor column")

  # NOTE(review): the split is random, not chronological, and predictors
  # include every other column of `data` - confirm this is the intended
  # evaluation setup for these series.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  best_models = {}
  for model_name, model in models.items():
    print(f"\nEvaluating model: {model_name}")
    is_gam = "GAM" in model_name or "Smoothing" in model_name

    remaining_features = list(X.columns)
    selected_features = []

    # Maps each feature tuple visited on the greedy path to its CV MSE.
    predictors = {}
    while remaining_features and (
      max_features is None or len(selected_features) < max_features
    ):
      scores = {}
      for feature in remaining_features:
        features_to_test = selected_features + [feature]

        if is_gam:
          # NOTE(review): assumes pygam estimators can be cloned by
          # cross_val_score - confirm against the installed pygam version.
          X_candidate = X_train[features_to_test].values
          gam_model = _build_gam(model_name, len(features_to_test)).gridsearch(
            X_candidate, y_train.values
          )
          fold_scores = cross_val_score(
            gam_model,
            X_candidate,
            y_train.values,
            scoring='neg_mean_squared_error',
            cv=5
          )
        else:
          fold_scores = cross_val_score(
            model,
            X_train[features_to_test],
            y_train,
            scoring='neg_mean_squared_error',
            cv=5
          )

        # The scorer returns negated MSE; flip the sign so lower is better.
        scores[feature] = -fold_scores.mean()

      # Select the best feature for this iteration
      new_feature = min(scores, key=scores.get)
      selected_features.append(new_feature)
      remaining_features.remove(new_feature)

      predictors[tuple(selected_features)] = scores[new_feature]

    # Best subset seen anywhere along the greedy path.
    selected_features = list(min(predictors, key=predictors.get))

    # Train the final model with selected features and evaluate on the test set
    if is_gam:
      final_model = _build_gam(model_name, len(selected_features)).gridsearch(
        X_train[selected_features].values, y_train.values
      )
      y_pred = final_model.predict(X_test[selected_features].values)
    else:
      # Fit a fresh copy: the instance in `models` is shared across calls, so
      # fitting it in place would overwrite previously returned models.
      final_model = clone(model)
      final_model.fit(X_train[selected_features], y_train)
      y_pred = final_model.predict(X_test[selected_features])

    test_score = mean_squared_error(y_test, y_pred)
    print(f"  Test MSE for {model_name}: {test_score} with selected features: {selected_features}")

    best_models[model_name] = {
      "model": final_model,
      "selected_features": selected_features,
      "test_score": test_score
    }

  # Print summary of best-performing models
  print("\nSummary of Best Models:")
  for model_name, details in best_models.items():
    print(f"{model_name} - Test MSE: {details['test_score']}, Features: {details['selected_features']}")

  return best_models


  
  

In [90]:
# Columns to model, one instrument per row. The FX rates and the
# calendar-derived columns are not in this list, so they only ever act as
# predictors.
targets = [
  "sp500 open", "sp500 high", "sp500 low", "sp500 close", "sp500 volume", "sp500 high-low",
  "nasdaq open", "nasdaq high", "nasdaq low", "nasdaq close", "nasdaq volume", "nasdaq high-low",
  "silver open", "silver high", "silver low", "silver close", "silver volume", "silver high-low",
  "oil open", "oil high", "oil low", "oil close", "oil volume", "oil high-low",
  "platinum open", "platinum high", "platinum low", "platinum close", "platinum volume", "platinum high-low",
  "palladium open", "palladium high", "palladium low", "palladium close", "palladium volume", "palladium high-low",
  "gold open", "gold high", "gold low", "gold close", "gold volume",
]

# Run the full model/feature search once per target; each call prints its own
# per-model report and summary.
for target in targets:
  print(f"Best Model for predicting {target} is ")
  model_selector(data, target)
  print("\n")

Best Model for predicting sp500 open is 

Evaluating model: Linear Regression
  Test MSE for Linear Regression: 0.38753569238131563 with selected features: ['sp500 high', 'sp500 high-low', 'sp500 close', 'nasdaq volume', 'palladium high-low', 'oil close', 'oil open', 'oil high', 'oil low', 'platinum high-low', 'sp500 volume', 'nasdaq open', 'nasdaq high', 'nasdaq low', 'month_cos', 'day', 'oil high-low', 'nasdaq high-low', 'sp500 low', 'palladium volume', 'usd_chf', 'day_of_week', 'month', 'month_sin', 'silver high-low', 'platinum volume', 'gold high', 'platinum close', 'platinum open', 'platinum high']

Summary of Best Models:
Linear Regression - Test MSE: 0.38753569238131563, Features: ['sp500 high', 'sp500 high-low', 'sp500 close', 'nasdaq volume', 'palladium high-low', 'oil close', 'oil open', 'oil high', 'oil low', 'platinum high-low', 'sp500 volume', 'nasdaq open', 'nasdaq high', 'nasdaq low', 'month_cos', 'day', 'oil high-low', 'nasdaq high-low', 'sp500 low', 'palladium volume',

KeyboardInterrupt: 