### Linear regression as predictive model

In [None]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from b_feature_engineering import df_prelag, df_clolag
import pandas as pd

##### Predicting closing price using closing price lags as explanatory variables

In [2]:
# Preview the first two rows of the closing-price lag frame imported from
# b_feature_engineering, to check which columns are available as features.
df_clolag.head(2)

Unnamed: 0,date,closing_price,clolag_1,clolag_2,clolag_3,clolag_4,clolag_5
0,2023-01-30,109.76,109.85,108.83,108.05,108.61,108.66
1,2023-01-31,110.06,109.76,109.85,108.83,108.05,108.61


In [3]:
def linear_model(df, train_size = 365, target = '', index = 'date'):
    """Walk-forward (expanding-window) evaluation of a linear regression.

    For every row position ``i`` from ``train_size`` up to the last row of
    ``df``, a fresh ``LinearRegression`` is fitted on rows ``[0, i)`` and used
    to predict row ``i``. Each prediction therefore only uses rows that come
    before it in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``index`` column, the ``target`` column and the
        feature columns. Every column other than ``index`` and ``target`` is
        used as a feature.
    train_size : int, default 365
        Number of leading rows used to fit the first model.
    target : str
        Name of the column to predict. The default ``''`` is only valid if
        ``df`` really has a column with that name.
    index : str, default 'date'
        Name of the column that identifies each row (excluded from features).

    Returns
    -------
    pandas.DataFrame
        One row per predicted step with columns ``date`` (value taken from the
        ``index`` column), ``actual`` and ``predicted``. Empty when
        ``train_size >= len(df)``.

    Raises
    ------
    KeyError
        If ``index`` or ``target`` is not a column of ``df``.
    """
    missing = [col for col in (index, target) if col not in df.columns]
    if missing:
        raise KeyError(f"column(s) not found in df: {missing}")

    X = df.drop(columns=[index, target])
    Y = df[target]
    row_ids = df[index]

    preds = []
    actuals = []
    dates = []

    for i in range(train_size, len(df)):
        # Expanding window: everything before position i is training data.
        X_train = X.iloc[:i]
        y_train = Y.iloc[:i]
        X_test = X.iloc[i:i + 1]

        # simple model: linear regression, refitted from scratch at each step
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        preds.append(y_pred[0])
        actuals.append(Y.iloc[i])
        # Positional lookup, consistent with the .iloc slicing above; a label
        # lookup (df.loc[i, ...]) breaks when df does not carry a default
        # 0..n-1 index (e.g. after filtering or slicing).
        dates.append(row_ids.iloc[i])

    df_results = pd.DataFrame({
        "date": dates,
        "actual": actuals,
        "predicted": preds,
    })
    return df_results



In [None]:
# Walk-forward evaluation on a copy of the lag frame: closing_price is the
# target and the remaining non-date columns (the closing-price lags) are features.
df_eval1 = df_clolag.copy()
df_eval = linear_model(df=df_eval1, target="closing_price")
df_eval.head(n=2)

Unnamed: 0,date,actual,predicted
0,2024-07-10,128.54,128.869158
1,2024-07-11,129.92,128.442528


In [5]:
# ---- Walk-forward error metrics: compute both, then report ----
rmse = root_mean_squared_error(df_eval["actual"], df_eval["predicted"])
mae = mean_absolute_error(df_eval["actual"], df_eval["predicted"])

print(f"Walk-forward Root_Mean_Squared_Error (RMSE): {rmse:.2f}")
print(f"Walk-forward Mean Absolute Error (MAE): {mae:.2f}")

Walk-forward Root_Mean_Squared_Error (RMSE): 2.13
Walk-forward Mean Absolute Error (MAE): 1.60


##### Linear regression after removing the trend component

In [6]:
# Copy the imported price-difference lag frame and show its first row to
# inspect the available columns.
# NOTE(review): the modelling cell that follows calls linear_model on
# df_prelag directly, so df_eval2 is only used for this preview in the
# visible cells.
df_eval2 = df_prelag.copy()
df_eval2.head(1)

Unnamed: 0,date,closing_price,clolag_1,price_diff,prilag_1,prilag_2,prilag_3,prilag_4,prilag_5
0,2023-02-06,110.75,111.15,-0.4,1.42,-0.42,0.09,0.3,-0.09


In [7]:
# Predict price_diff from its own lags. The price-level columns are dropped
# first so that linear_model does not pick them up as features.
df_eval = linear_model(
    df=df_prelag.drop(columns=["closing_price", "clolag_1"]),
    target="price_diff",
)
df_eval.head(n=2)

Unnamed: 0,date,actual,predicted
0,2024-07-17,1.06,0.025177
1,2024-07-18,-0.84,0.069032


In [8]:
# ---- Walk-forward error metrics: compute both, then report ----
rmse = root_mean_squared_error(df_eval["actual"], df_eval["predicted"])
mae = mean_absolute_error(df_eval["actual"], df_eval["predicted"])

print(f"Walk-forward Root_Mean_Squared_Error (RMSE): {rmse:.2f}")
print(f"Walk-forward Mean Absolute Error (MAE): {mae:.2f}")

Walk-forward Root_Mean_Squared_Error (RMSE): 2.14
Walk-forward Mean Absolute Error (MAE): 1.61


The results from both regression models are nearly identical (RMSE 2.13 vs. 2.14, MAE 1.60 vs. 1.61). The prediction accuracy of both models is lower than that of the first baseline model.