## Model Development

#### import data

In [236]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [237]:
df = pd.read_csv('data/df_fs.csv', sep=",", index_col=0)

In [238]:
df.head()

Unnamed: 0,date,Stock,Open,Low,Close,Volume,30_day_MA,60_day_MA,90_day_MA,SMA_30,...,EMA_90,RSI,EMA_12,EMA_26,MACD,Signal_Line,MACD_Histogram,Price_Change_Pct,Article Length,article_sentiment
0,2016-07-20,MSFT,56.84,55.53,55.91,89893300.0,51.95,52.31,50.52,51.95,...,50.52,71.72,52.74,52.18,0.56,0.04,0.52,5.31,1394.0,1.0
1,2016-07-25,MSFT,56.74,56.26,56.73,25610600.0,52.1,52.4,50.66,52.1,...,50.66,74.61,53.35,52.51,0.84,0.2,0.64,0.28,623.0,1.0
2,2016-08-01,MSFT,56.75,56.14,56.58,26003400.0,52.25,52.48,50.8,52.25,...,50.79,91.39,53.85,52.82,1.03,0.37,0.67,-0.18,269.0,1.0
3,2016-08-03,MSFT,57.11,56.49,56.97,22075600.0,52.41,52.55,50.95,52.41,...,50.93,91.8,54.33,53.12,1.2,0.53,0.67,0.69,904.0,1.0
4,2016-08-05,MSFT,58.21,57.45,57.96,29335200.0,52.61,52.66,51.12,52.61,...,51.08,92.48,54.89,53.48,1.4,0.71,0.7,0.99,679.0,-2.0


In [239]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4983 entries, 0 to 4982
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               4983 non-null   object 
 1   Stock              4983 non-null   object 
 2   Open               4983 non-null   float64
 3   Low                4983 non-null   float64
 4   Close              4983 non-null   float64
 5   Volume             4983 non-null   float64
 6   30_day_MA          4983 non-null   float64
 7   60_day_MA          4983 non-null   float64
 8   90_day_MA          4983 non-null   float64
 9   SMA_30             4983 non-null   float64
 10  SMA_60             4983 non-null   float64
 11  SMA_90             4983 non-null   float64
 12  EMA_30             4983 non-null   float64
 13  EMA_60             4983 non-null   float64
 14  EMA_90             4983 non-null   float64
 15  RSI                4983 non-null   float64
 16  EMA_12             4983 non-n

In [240]:
#df = df[['article_sentiment', 'Volume', 'RSI','MACD_Histogram','Close','30_day_MA', 'SMA_30', 'EMA_30', 'MACD', 'Article Length', 'Price_Change_Pct']]

In [241]:
#X = df.drop(["Price_Change_Pct"], axis=1)
X = df[['article_sentiment',  'Open', 'Close', '30_day_MA','SMA_60', 'MACD' ]]

#Backward Elimiation 
#X = df[['article_sentiment','Volume', 'RSI', '30_day_MA', 'SMA_60', 'EMA_90', 'MACD']]
y = df['Price_Change_Pct']  # Target variable

#### Multicollinearity Test

In [242]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Assuming `X` is your DataFrame of predictors including a constant
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)


             feature      VIF
0  article_sentiment     1.09
1               Open 15296.16
2              Close 18529.31
3          30_day_MA 10488.68
4             SMA_60 12160.32
5               MACD    12.35


### Linear Regression

In [243]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm


# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a linear regression model
lin_reg = LinearRegression()

# Fit the model to the training data
lin_reg.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred = lin_reg.predict(X_test_scaled)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output model coefficients and performance metrics
coefficients = dict(zip(X.columns, lin_reg.coef_))  # Get model coefficients
print("Evaluation Metrics:")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)

# Add a constant term to the features
X_train_with_const = sm.add_constant(X_train_scaled)

# Create and fit the model
model = sm.OLS(y_train, X_train_with_const)
results = model.fit()

# Print the summary
print("\nLinear Regression Summary:")
print(results.summary())


Evaluation Metrics:
Mean Absolute Error (MAE): 0.8296531592069291
Mean Squared Error (MSE): 1.81852587634137
R-squared (R2): 0.5959529632608336

Linear Regression Summary:
                            OLS Regression Results                            
Dep. Variable:       Price_Change_Pct   R-squared:                       0.648
Model:                            OLS   Adj. R-squared:                  0.647
Method:                 Least Squares   F-statistic:                     1219.
Date:                Thu, 02 May 2024   Prob (F-statistic):               0.00
Time:                        13:49:19   Log-Likelihood:                -6271.8
No. Observations:                3986   AIC:                         1.256e+04
Df Residuals:                    3979   BIC:                         1.260e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err      

### Lasso Regression 

In [244]:
from sklearn.linear_model import LassoCV

# Creating and fitting the model
lasso_cv = LassoCV(cv=5, random_state=42)  # 5-fold cross-validation
lasso_cv.fit(X_train_scaled, y_train)

# Optimal alpha
print("Optimal alpha:", lasso_cv.alpha_)

# Evaluate the model with optimal alpha
y_pred_cv = lasso_cv.predict(X_test_scaled)
print("Testing MSE:", mean_squared_error(y_test, y_pred_cv))
print("Testing R2:", r2_score(y_test, y_pred_cv))

# Check the coefficients
print("Coefficients:", lasso_cv.coef_)


Optimal alpha: 0.0006683107282641673
Testing MSE: 2.4680856789170202
Testing R2: 0.4516312811610501
Coefficients: [-1.94559274e-02 -1.60301036e-02  2.47072938e+01 -2.53940704e+01
  6.47372785e-01 -9.71651083e-01]


In [245]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Create a Lasso regressor object and fit it to the training data.
lasso = Lasso(alpha=0.0009651091796985053)  # Alpha is a hyperparameter; adjust it based on validation
lasso.fit(X_train_scaled, y_train)

# Predict on the training and test data.
y_train_pred = lasso.predict(X_train_scaled)
y_test_pred = lasso.predict(X_test_scaled)


# Calculate metrics
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print("Training MSE: {:.2f}, Testing MSE: {:.2f}".format(mse_train, mse_test))
print("Training R2: {:.2f}, Testing R2: {:.2f}".format(r2_train, r2_test))

print("Coefficients:", lasso.coef_)


Training MSE: 1.93, Testing MSE: 2.47
Training R2: 0.50, Testing R2: 0.45
Coefficients: [-1.93368682e-02 -2.69181969e-04  2.45824262e+01 -2.50234598e+01
  3.85361852e-01 -9.72357684e-01]


### XGBRegressor 

In [246]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor  # Import XGBRegressor

# Define features and the target variable
#X = df[['Open', 'Low', 'Close', '60_day_MA', 'RSI', 'MACD', 'MACD_Histogram', 'article_sentiment']]
#y = df['Price_Change_Pct']  # Target variable



# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Standardize the training set
X_test_scaled = scaler.transform(X_test)  # Standardize the testing set

# Create the XGBRegressor model
xgb_reg = XGBRegressor(objective='reg:squarederror', random_state=42)  # 'reg:squarederror' for regression tasks

# Fit the model to the training data
xgb_reg.fit(X_train_scaled, y_train)  # Ensure the model is trained

# Predict on the test data
y_pred = xgb_reg.predict(X_test_scaled)  # Predict on the test set

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R-squared

# Retrieve feature importance using the correct method
feature_importance = xgb_reg.get_booster().get_score(importance_type='weight')  # Get feature importance by weight

# Output model coefficients and performance metrics
print("Evaluation Metrics:")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)

print("\nFeature Importance:")
print(feature_importance)  # Display feature importance


Evaluation Metrics:
Mean Absolute Error (MAE): 0.5568163821620444
Mean Squared Error (MSE): 0.9353392647001318
R-squared (R2): 0.7921827436361777

Feature Importance:
{'f0': 312.0, 'f1': 1100.0, 'f2': 842.0, 'f3': 704.0, 'f4': 367.0, 'f5': 783.0}


### lazypredict

In [100]:
import lazypredict
from lazypredict.Supervised import LazyRegressor
import numpy as np


clf = LazyRegressor(verbose=0,predictions=True, ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████| 42/42 [00:19<00:00,  2.17it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 3986, number of used features: 10
[LightGBM] [Info] Start training from score 0.175068
                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
ExtraTreesRegressor                          0.83       0.83  0.88        1.36
XGBRegressor                                 0.82       0.82  0.90        0.12
RandomForestRegressor                        0.79       0.79  0.98        5.69
BaggingRegressor                             0.76       0.76  1.04        0.58
HistGradientBoostingRegressor                0.73       0.73  1.10        0.47
LGBMRegressor                                0.72       0.72  1.12        0.08
GradientBoostingRegressor  


