# **S&P 500 Financial Forecasting with XgBoost**

In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import xgboost as xgb

## Pre-Processing

In [33]:
import pandas as pd
import numpy as np

df = pd.read_csv("/kaggle/input/sandp500/all_stocks_5yr.csv")

df.head(3)

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL


In [34]:
df.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'Name'], dtype='object')

In [35]:
df['date'] = pd.to_datetime(df['date'])

In [36]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 619029 entries, 0 to 619039
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   date    619029 non-null  datetime64[ns]
 1   open    619029 non-null  float64       
 2   high    619029 non-null  float64       
 3   low     619029 non-null  float64       
 4   close   619029 non-null  float64       
 5   volume  619029 non-null  int64         
 6   Name    619029 non-null  object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 37.8+ MB
None
        date   open   high    low  close    volume Name
0 2013-02-08  15.07  15.12  14.63  14.75   8407500  AAL
1 2013-02-11  14.89  15.01  14.26  14.46   8882000  AAL
2 2013-02-12  14.45  14.51  14.10  14.27   8126000  AAL
3 2013-02-13  14.30  14.94  14.25  14.66  10259500  AAL
4 2013-02-14  14.94  14.96  13.16  13.99  31879900  AAL


## Feature Engineering

### Goal:
* Create more features from the exsisitng features within this data so we have more relevant data the model can learn from

In [37]:
# Creation of Additional Features from existing features
df['price_change'] = df['close'] - df['open']
df['high_low_range'] = df['high'] - df['low']
df['volume_change'] = df['volume'].pct_change()

# Setting up Moving Averages (MA's)
df['ma5'] = df['close'].rolling(window=5).mean()
df['ma10'] = df['close'].rolling(window=10).mean()
df['ma20'] = df['close'].rolling(window=20).mean()

# Shifted features for superivsed learnning (e.g. predicting tomorrows close)
df['close_shifted'] = df['close'].shift(-1)

df.dropna(inplace=True)

In [38]:
# 1. Relative Strength Index (RSI)
def calculate_rsi(series, window):
    delta = series.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

df['rsi_14'] = calculate_rsi(df['close'], 14)

# 2. Exponential Moving Average (EMA)
df['ema_12'] = df['close'].ewm(span=12, adjust=False).mean()
df['ema_26'] = df['close'].ewm(span=26, adjust=False).mean()

# 3. Moving Average Convergence Divergence (MACD)
df['macd'] = df['ema_12'] - df['ema_26']
df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()

# 4. Bollinger Bands
df['bollinger_up'] = df['ma20'] + (df['close'].rolling(window=20).std() * 2)
df['bollinger_down'] = df['ma20'] - (df['close'].rolling(window=20).std() * 2)

# 5. Volume Weighted Average Price (VWAP)
df['vwap'] = (df['close'] * df['volume']).cumsum() / df['volume'].cumsum()

# 6. Lagged Features
df['close_lag1'] = df['close'].shift(1)
df['close_lag2'] = df['close'].shift(2)
df['close_lag5'] = df['close'].shift(5)

# 7. Temporal Features
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter

# 8. Interaction Features
df['price_volume_interaction'] = df['price_change'] * df['volume']
df['high_low_ratio'] = df['high'] / df['low']

### **Goal:**

* Split the data into training and testing sets for Inference

In [39]:
df.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'Name',
       'price_change', 'high_low_range', 'volume_change', 'ma5', 'ma10',
       'ma20', 'close_shifted', 'rsi_14', 'ema_12', 'ema_26', 'macd',
       'macd_signal', 'bollinger_up', 'bollinger_down', 'vwap', 'close_lag1',
       'close_lag2', 'close_lag5', 'day_of_week', 'month', 'quarter',
       'price_volume_interaction', 'high_low_ratio'],
      dtype='object')

In [40]:
from sklearn.model_selection import train_test_split

# Features we want the model to train on for modeling
X = df[['date', 'open', 'high', 'low', 'close', 'volume', 'Name',
       'price_change', 'high_low_range', 'volume_change', 'ma5', 'ma10',
       'ma20', 'rsi_14', 'ema_12', 'ema_26', 'macd', 'macd_signal', 
        'bollinger_up', 'bollinger_down', 'vwap', 'close_lag1', 
        'close_lag2', 'close_lag5', 'day_of_week', 'month', 
        'quarter', 'price_volume_interaction', 'high_low_ratio']]

y = df['close_shifted']

# Splitting the Data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print(X_train.shape, X_test.shape)

(495207, 29) (123802, 29)


## Feature Scaling

## Model Training with XGBoost

In [41]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

In [42]:
# Drop non-numeric columns that XGBoost cannot handle directly
X_train_numeric = X_train.drop(columns=['date', 'Name'])
X_test_numeric = X_test.drop(columns=['date', 'Name'])

In [43]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Convert the numeric data into DMatrix format
train_data = xgb.DMatrix(X_train_numeric, label=y_train)
test_data = xgb.DMatrix(X_test_numeric, label=y_test)

# Define the parameters for XGBoost
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'eta': 0.01,  # Lower learning rate
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 1,  # L2 regularization
    'alpha': 0.1,  # L1 regularization
    'seed': 42
}

# Train the model
num_round = 1000
xgb_model = xgb.train(params, train_data, num_boost_round=num_round, evals=[(test_data, 'eval')],
                      early_stopping_rounds=50)

print("Model training complete.")

[0]	eval-rmse:45.74333
[1]	eval-rmse:45.28811
[2]	eval-rmse:44.83729
[3]	eval-rmse:44.39032
[4]	eval-rmse:43.94843
[5]	eval-rmse:43.51100
[6]	eval-rmse:43.07823
[7]	eval-rmse:42.64877
[8]	eval-rmse:42.22401
[9]	eval-rmse:41.80311
[10]	eval-rmse:41.38724
[11]	eval-rmse:40.97563
[12]	eval-rmse:40.56780
[13]	eval-rmse:40.16460
[14]	eval-rmse:39.76477
[15]	eval-rmse:39.36920
[16]	eval-rmse:38.97723
[17]	eval-rmse:38.58835
[18]	eval-rmse:38.20421
[19]	eval-rmse:37.82362
[20]	eval-rmse:37.44709
[21]	eval-rmse:37.07480
[22]	eval-rmse:36.70601
[23]	eval-rmse:36.34153
[24]	eval-rmse:35.98007
[25]	eval-rmse:35.62223
[26]	eval-rmse:35.26798
[27]	eval-rmse:34.91663
[28]	eval-rmse:34.56953
[29]	eval-rmse:34.22610
[30]	eval-rmse:33.88581
[31]	eval-rmse:33.54966
[32]	eval-rmse:33.21674
[33]	eval-rmse:32.88713
[34]	eval-rmse:32.56037
[35]	eval-rmse:32.23718
[36]	eval-rmse:31.91659
[37]	eval-rmse:31.60003
[38]	eval-rmse:31.28567
[39]	eval-rmse:30.97532
[40]	eval-rmse:30.66787
[41]	eval-rmse:30.36410
[4

In [44]:
# Predict on the test set
y_pred = xgb_model.predict(test_data)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Additional Model Evaluation
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred)}")

Root Mean Squared Error (RMSE): 2.6670527669838164
Mean Squared Error (MSE): 7.113170461876031


## **Hyper-Param Tuning**

In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# More features
df['rsi_14'] = calculate_rsi(df['close'], 14)
df['ema_12'] = df['close'].ewm(span=12, adjust=False).mean()
df['ema_26'] = df['close'].ewm(span=26, adjust=False).mean()
df['macd'] = df['ema_12'] - df['ema_26']
df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
df['bollinger_up'] = df['ma20'] + (df['close'].rolling(window=20).std() * 2)
df['bollinger_down'] = df['ma20'] - (df['close'].rolling(window=20).std() * 2)
df['vwap'] = (df['close'] * df['volume']).cumsum() / df['volume'].cumsum()
df['close_lag1'] = df['close'].shift(1)
df['close_lag2'] = df['close'].shift(2)
df['close_lag5'] = df['close'].shift(5)
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter
df['price_volume_interaction'] = df['price_change'] * df['volume']
df['high_low_ratio'] = df['high'] / df['low']

df.dropna(inplace=True)

# Prepare data for training
X = df[['open', 'high', 'low', 'close', 'volume', 'price_change', 'high_low_range', 
        'volume_change', 'ma5', 'ma10', 'ma20', 'rsi_14', 'ema_12', 'ema_26', 
        'macd', 'macd_signal', 'bollinger_up', 'bollinger_down', 'vwap', 
        'close_lag1', 'close_lag2', 'close_lag5', 'day_of_week', 'month', 
        'quarter', 'price_volume_interaction', 'high_low_ratio']]
y = df['close_shifted']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define parameter grid for GridSearchCV
param_grid = {
    'max_depth': [3, 5],
    'eta': [0.01, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'lambda': [1],
    'alpha': [0, 0.1]
}

# Initializing XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', seed=42, n_estimators=100)

# Setting up GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, 
                           scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

# Fitting GridSearchCV
grid_search.fit(X_train_scaled, y_train)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best RMSE (negative MSE):", np.sqrt(-grid_search.best_score_))

# Training final model using best parameters
best_params = grid_search.best_params_
xgb_model_final = xgb.XGBRegressor(**best_params, objective='reg:squarederror', n_estimators=100, seed=42)
xgb_model_final.fit(X_train_scaled, y_train)

# Predicting and evaluating on test set
y_pred = xgb_model_final.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE: {rmse}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters: {'alpha': 0, 'colsample_bytree': 0.8, 'eta': 0.1, 'lambda': 1, 'max_depth': 3, 'subsample': 0.8}
Best RMSE (negative MSE): 29.987652278464825
Test RMSE: 2.985220725880869
