### Linear regression as a predictive model

In [4]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from b_feature_engineering import df_prelag, df_clolag
import pandas as pd

##### Predicting closing price using closing price lags as explanatory variables

In [5]:
# Preview the first two rows of the lagged closing-price frame imported from b_feature_engineering
df_clolag.head(2)

Unnamed: 0,date,closing_price,clolag_1,clolag_2,clolag_3,clolag_4,clolag_5
0,2023-05-04,95.76,98.29,100.1,100.44,99.36,99.1
1,2023-05-05,96.13,95.76,98.29,100.1,100.44,99.36


In [6]:
# Walk-forward evaluation: a linear model is first fitted on the initial `train_size` rows,
# then the window moves forward one day at a time, refitting on all earlier rows.
# Each day's prediction is made with the model fitted in that iteration.
def linear_model(df, train_size = 365, target = '', index = 'date'):
    """Walk-forward (expanding-window) one-step-ahead evaluation of a linear regression.

    For every row position ``i`` from ``train_size`` to the end of ``df`` a fresh
    ``LinearRegression`` is fitted on rows ``[0, i)`` and used to predict row ``i``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``index`` column, the ``target`` column and the
        features. Every column other than ``index`` and ``target`` is used as a
        feature. Rows are consumed in their stored order; nothing is sorted here,
        so the caller is expected to pass them in chronological order.
    train_size : int, default 365
        Number of leading rows in the first training window. Expected to be at
        least 1 so that the first fit has data.
    target : str
        Name of the column to predict. The default ``''`` is only a placeholder
        and must be overridden by the caller.
    index : str, default 'date'
        Name of the column whose value labels each prediction.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``actual`` and ``predicted`` with one row per predicted
        position. Empty when ``train_size >= len(df)``.

    Raises
    ------
    KeyError
        If ``index`` or ``target`` is not a column of ``df``.
    """
    # Fail early with a readable message instead of the generic error from drop()
    missing = [col for col in (index, target) if col not in df.columns]
    if missing:
        raise KeyError(f"Column(s) not found in df: {missing}")

    X = df.drop(columns=[index, target])
    Y = df[target]
    labels = df[index]

    preds = []
    actuals = []
    dates = []

    for i in range(train_size, len(df)):
        # Expanding window: everything strictly before position i is training data
        model = LinearRegression()
        model.fit(X.iloc[:i], Y.iloc[:i])
        # Slice i:i+1 keeps the 2-D shape that predict() expects for one row
        y_pred = model.predict(X.iloc[i:i+1])

        preds.append(y_pred[0])
        actuals.append(Y.iloc[i])
        # `i` is a position, so the label must be read positionally as well;
        # label-based df.loc[i, ...] only works when the index is exactly 0..n-1
        dates.append(labels.iloc[i])

    return pd.DataFrame({
        "date": dates,
        "actual": actuals,
        "predicted": preds,
    })



In [7]:
# Evaluate on a copy so the frame imported from b_feature_engineering is left untouched
df_eval1 = df_clolag.copy()
df_eval = linear_model(df_eval1, target="closing_price")
df_eval.head(n=2)

Unnamed: 0,date,actual,predicted
0,2024-10-09,135.63,135.090895
1,2024-10-10,136.19,135.640492


In [8]:
# ---- Evaluate performance ----
# Both error measures are computed first, then reported together
rmse = root_mean_squared_error(df_eval["actual"], df_eval["predicted"])
mae = mean_absolute_error(df_eval["actual"], df_eval["predicted"])

print(
    f"Walk-forward Root_Mean_Squared_Error (RMSE): {rmse:.2f}",
    f"Walk-forward Mean Absolute Error (MAE): {mae:.2f}",
    sep="\n",
)

Walk-forward Root_Mean_Squared_Error (RMSE): 2.23
Walk-forward Mean Absolute Error (MAE): 1.66


In [None]:
# Save RMSE and MAE to validation table for closing price model
import os
import csv

# The previous os.path.dirname("../..") evaluates to "..", so the table lives in
# <working dir>/../docs/validations. The location is unchanged, only spelled out.
# NOTE(review): if two levels up was intended, the path needs adjusting - confirm.
val_dir = os.path.abspath(os.path.join("..", "docs/validations"))
os.makedirs(val_dir, exist_ok=True)
val_path = os.path.join(val_dir, "linear_model_results.csv")

header = ["model", "target", "rmse", "mae"]
row = ["linear_regression", "closing_price", rmse, mae]

# Upsert instead of blind append: re-running the notebook must not pile up
# duplicate rows. Keep every stored row except the header, blank lines and any
# earlier result for the same (model, target) pair.
kept_rows = []
if os.path.exists(val_path):
    with open(val_path, newline="") as f:
        kept_rows = [
            old for old in csv.reader(f)
            if old and old != header and old[:2] != row[:2]
        ]

# Rewrite the table: the header is always present (also when the file existed
# but was empty), followed by the untouched rows and the fresh result.
with open(val_path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(kept_rows)
    writer.writerow(row)
print(f"Validation results saved to {val_path}")

##### Linear regression after removing the trend component

In [9]:
# Independent (deep) copy of the differenced-price frame, previewed with its first row
df_eval2 = df_prelag.copy(deep=True)
df_eval2.head(n=1)

Unnamed: 0,date,closing_price,clolag_1,price_diff,prilag_1,prilag_2,prilag_3,prilag_4,prilag_5
0,2023-05-11,98.35,97.85,0.5,0.51,0.22,0.99,0.37,-2.53


In [10]:
# closing_price and clolag_1 are dropped so that only the remaining columns
# (everything except the date and the price_diff target) act as features
df_eval = linear_model(
    df_prelag.drop(columns=["closing_price", "clolag_1"]),
    target="price_diff",
)
df_eval.head(n=2)

Unnamed: 0,date,actual,predicted
0,2024-10-16,0.51,0.153511
1,2024-10-17,2.2,0.040056


In [11]:
# ---- Evaluate performance ----
# Both error measures are computed first, then reported together
rmse = root_mean_squared_error(df_eval["actual"], df_eval["predicted"])
mae = mean_absolute_error(df_eval["actual"], df_eval["predicted"])

print(
    f"Walk-forward Root_Mean_Squared_Error (RMSE): {rmse:.2f}",
    f"Walk-forward Mean Absolute Error (MAE): {mae:.2f}",
    sep="\n",
)

Walk-forward Root_Mean_Squared_Error (RMSE): 2.24
Walk-forward Mean Absolute Error (MAE): 1.67


In [None]:
# Save RMSE and MAE to validation table for price_diff model
# (relies on os, csv, val_path and header defined in the earlier save cell)
row = ["linear_regression", "price_diff", rmse, mae]

# Upsert instead of blind append: re-running the notebook must not pile up
# duplicate rows. Keep every stored row except the header, blank lines and any
# earlier result for the same (model, target) pair.
kept_rows = []
if os.path.exists(val_path):
    with open(val_path, newline="") as f:
        kept_rows = [
            old for old in csv.reader(f)
            if old and old != header and old[:2] != row[:2]
        ]

# Rewrite the table: the header is always present (also when the file existed
# but was empty), followed by the untouched rows and the fresh result.
with open(val_path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(kept_rows)
    writer.writerow(row)
print(f"Validation results saved to {val_path}")

The results from both regression models are practically the same (RMSE 2.23 vs. 2.24, MAE 1.66 vs. 1.67). The prediction accuracy of both models is lower than that of the first baseline model.