In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from joblib import dump
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression


In [28]:
# Load the processed data
def _load_split(csv_path):
    """Read one processed split from CSV and index it by its first column.

    Parameters
    ----------
    csv_path : str
        Path to the CSV file of a single split (train / test / validation).

    Returns
    -------
    pandas.DataFrame
        The split with its 'Date' column parsed to datetimes and the file's
        first column used as the index.
    """
    frame = pd.read_csv(csv_path)
    frame['Date'] = pd.to_datetime(frame['Date'])
    # NOTE(review): the index is chosen by position, not by name. This assumes
    # the first CSV column is the saved index (e.g. written with index=True);
    # confirm against the preprocessing step that produced these files.
    # A new frame is returned instead of mutating with inplace=True, so the
    # helper stays side-effect free and safe to re-run.
    return frame.set_index(frame.columns[0])


train_datarf = _load_split('../Data/output/train_data.csv')
test_datarf = _load_split('../Data/output/test_data.csv')
validation_datarf = _load_split('../Data/output/validation_data.csv')


In [29]:
# Split every data set into its feature matrix (all columns except the
# target) and its target vector (the 'log_return' column).
TARGET_COLUMN = 'log_return'

X_train = train_datarf.drop(columns=[TARGET_COLUMN])
X_test = test_datarf.drop(columns=[TARGET_COLUMN])
X_validation = validation_datarf.drop(columns=[TARGET_COLUMN])

y_train = train_datarf[TARGET_COLUMN]
y_test = test_datarf[TARGET_COLUMN]
y_validation = validation_datarf[TARGET_COLUMN]

In [33]:
# Build the linear regression and fit it on the training split only;
# fit() returns the fitted estimator, so construction and fitting are chained.
linear_model = LinearRegression().fit(X_train, y_train)

# Score all three splits with the same fitted model, in the order
# training -> testing -> validation.
y_train_pred_linear, y_test_pred_linear, y_validation_pred_linear = (
    linear_model.predict(split_features)
    for split_features in (X_train, X_test, X_validation)
)



In [34]:
# Evaluate the model: the same three metrics are computed for each split.
def _linear_split_metrics(y_true, y_pred):
    """Return (MSE, MAE, R2) for one split's true targets and predictions."""
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# training
linear_mse_training, linear_mae_training, linear_r2_training = _linear_split_metrics(
    y_train, y_train_pred_linear
)

# test
linear_mse_testing, linear_mae_testing, linear_r2_testing = _linear_split_metrics(
    y_test, y_test_pred_linear
)

# validation
linear_mse_validation, linear_mae_validation, linear_r2_validation = _linear_split_metrics(
    y_validation, y_validation_pred_linear
)


In [35]:
# Display the results: one section per split, each listing MSE, MAE and R2.
report_sections = [
    ("Training data", linear_mse_training, linear_mae_training, linear_r2_training),
    ("Testing data", linear_mse_testing, linear_mae_testing, linear_r2_testing),
    ("Validation data", linear_mse_validation, linear_mae_validation, linear_r2_validation),
]

for position, (heading, mse_value, mae_value, r2_value) in enumerate(report_sections):
    # A blank line separates consecutive sections; nothing is printed before
    # the first section or after the last one.
    if position:
        print()
    print(heading)
    print("MSE:", mse_value)
    print("MAE:", mae_value)
    print("R2:", r2_value)


Training data
MSE: 0.20763218708237594
MAE: 0.303131275761984
R2: 0.792367812917624

Testing data
MSE: 0.04919878647140979
MAE: 0.18179584994408504
R2: 0.9055725927799326

Validation data
MSE: 0.08355081658243925
MAE: 0.17618881126245053
R2: 0.897351904174426


In [36]:
# Saving the model: serialise the fitted estimator to disk with joblib.
# The dump() call is kept as the cell's last expression so its return value
# (the list of written file names) is still displayed as the cell output.
linear_model_path = '../Data/models_output/linear_model.joblib'
dump(linear_model, linear_model_path)


['../Data/models_output/linear_model.joblib']