## Model Development

#### import data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data/df_fs.csv', sep=",", index_col=0)

In [3]:
df.head()

Unnamed: 0,date,Stock,Open,Low,Close,Volume,30_day_MA,60_day_MA,90_day_MA,SMA_30,...,EMA_90,RSI,EMA_12,EMA_26,MACD,Signal_Line,MACD_Histogram,Price_Change_Pct,Article Length,article_sentiment
0,2016-07-20,MSFT,56.84,55.529999,55.91,89893300.0,51.945666,52.309333,50.519333,51.945666,...,50.523063,71.721804,52.735685,52.177104,0.558581,0.040408,0.518174,5.311734,1394.0,1.0
1,2016-07-25,MSFT,56.740002,56.259998,56.73,25610600.0,52.099999,52.403333,50.659777,52.099999,...,50.65948,74.606512,53.350195,52.514356,0.83584,0.199494,0.636346,0.282835,623.0,1.0
2,2016-08-01,MSFT,56.75,56.139999,56.580002,26003400.0,52.249333,52.476499,50.799666,52.249333,...,50.789601,91.388238,53.847088,52.815515,1.031574,0.36591,0.665664,-0.176426,269.0,1.0
3,2016-08-03,MSFT,57.110001,56.490002,56.970001,22075600.0,52.411666,52.548499,50.945111,52.411666,...,50.925434,91.799325,54.327537,53.123254,1.204282,0.533585,0.670698,0.689288,904.0,1.0
4,2016-08-05,MSFT,58.209999,57.450001,57.959999,29335200.0,52.606999,52.658499,51.118444,52.606999,...,51.08004,92.480413,54.886377,53.481532,1.404845,0.707837,0.697009,0.993204,679.0,-2.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4983 entries, 0 to 4982
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               4983 non-null   object 
 1   Stock              4983 non-null   object 
 2   Open               4983 non-null   float64
 3   Low                4983 non-null   float64
 4   Close              4983 non-null   float64
 5   Volume             4983 non-null   float64
 6   30_day_MA          4983 non-null   float64
 7   60_day_MA          4983 non-null   float64
 8   90_day_MA          4983 non-null   float64
 9   SMA_30             4983 non-null   float64
 10  SMA_60             4983 non-null   float64
 11  SMA_90             4983 non-null   float64
 12  EMA_30             4983 non-null   float64
 13  EMA_60             4983 non-null   float64
 14  EMA_90             4983 non-null   float64
 15  RSI                4983 non-null   float64
 16  EMA_12             4983 non-n

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Define relevant features and the target variable

#additional_features = ["MACD_Histogram","RSI","MACD", 'EMA_12',"Article Length" ]
#additional_features = ['RSI', 'Volume', 'MACD_Histogram', 'Timestamp', 'Day', 'Signal_Line', 'Month', 'DayOfWeek', 'Close', 'Article Length', 'Open', 'Low']
#X = df[['article_sentiment'] + additional_features]  # Include 'article_sentiment' and other features

#X = df.drop(["Price_Change_Pct", "date","Price_Change_Category"], axis=1)  # Exclude target variable and date
X = df[['Open', 'Low', 'Close', '60_day_MA', 'RSI', 'MACD', 'MACD_Histogram', 'article_sentiment']]
#X= df[[  '30_day_MA', 'SMA_30', 'EMA_30','RSI', 'MACD', 'MACD_Histogram', 'Price_Change_Pct', 'Article Length']]
#X= df[['Close', 'Volume', 'SMA_30', 'RSI', 'MACD', 'MACD_Histogram', 'article_sentiment']]
y = df['Price_Change_Pct']  # Target variable

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a linear regression model
lin_reg = LinearRegression()

# Fit the model to the training data
lin_reg.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred = lin_reg.predict(X_test_scaled)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output model coefficients and performance metrics
coefficients = dict(zip(X.columns, lin_reg.coef_))  # Get model coefficients
print("Evaluation Metrics:")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)

# Add a constant term to the features
X_train_with_const = sm.add_constant(X_train_scaled)

# Create and fit the model
model = sm.OLS(y_train, X_train_with_const)
results = model.fit()

# Print the summary
print("\nLinear Regression Summary:")
print(results.summary())


Evaluation Metrics:
Mean Absolute Error (MAE): 0.7903134051632317
Mean Squared Error (MSE): 1.6012902408812697
R-squared (R2): 0.6442192078734159

Linear Regression Summary:
                            OLS Regression Results                            
Dep. Variable:       Price_Change_Pct   R-squared:                       0.689
Model:                            OLS   Adj. R-squared:                  0.688
Method:                 Least Squares   F-statistic:                     1102.
Date:                Tue, 30 Apr 2024   Prob (F-statistic):               0.00
Time:                        18:54:50   Log-Likelihood:                -6022.3
No. Observations:                3986   AIC:                         1.206e+04
Df Residuals:                    3977   BIC:                         1.212e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err    

### XGBRegressor 

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor  # Import XGBRegressor

# Define features and the target variable
X = df[['Open', 'Low', 'Close', '60_day_MA', 'RSI', 'MACD', 'MACD_Histogram', 'article_sentiment']]
y = df['Price_Change_Pct']  # Target variable

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Standardize the training set
X_test_scaled = scaler.transform(X_test)  # Standardize the testing set

# Create the XGBRegressor model
xgb_reg = XGBRegressor(objective='reg:squarederror', random_state=42)  # 'reg:squarederror' for regression tasks

# Fit the model to the training data
xgb_reg.fit(X_train_scaled, y_train)  # Ensure the model is trained

# Predict on the test data
y_pred = xgb_reg.predict(X_test_scaled)  # Predict on the test set

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R-squared

# Retrieve feature importance using the correct method
feature_importance = xgb_reg.get_booster().get_score(importance_type='weight')  # Get feature importance by weight

# Output model coefficients and performance metrics
print("Evaluation Metrics:")
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)

print("\nFeature Importance:")
print(feature_importance)  # Display feature importance


Evaluation Metrics:
Mean Absolute Error (MAE): 0.5094086234515847
Mean Squared Error (MSE): 0.7527561996655483
R-squared (R2): 0.8327497475736734

Feature Importance:
{'f0': 854.0, 'f1': 437.0, 'f2': 483.0, 'f3': 421.0, 'f4': 720.0, 'f5': 677.0, 'f6': 621.0, 'f7': 122.0}


In [18]:
# Retrieve feature importance from the XGBoost model
feature_importance = xgb_reg.get_booster().get_score(importance_type='weight')

# Display the feature importance summary
print("Feature Importance:")
for feature, importance in feature_importance.items():
    print(f"{feature}: {importance}")


Feature Importance:
f0: 854.0
f1: 437.0
f2: 483.0
f3: 421.0
f4: 720.0
f5: 677.0
f6: 621.0
f7: 122.0


### lazypredict

In [19]:
import lazypredict
from lazypredict.Supervised import LazyRegressor
import numpy as np


clf = LazyRegressor(verbose=0,predictions=True, ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

  0%|          | 0/42 [00:00<?, ?it/s]

 19%|█▉        | 8/42 [00:01<00:07,  4.67it/s]


KeyboardInterrupt: 