In [132]:
import pandas as pd
import statsmodels.api as sm
import os
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [133]:
# print("Current working directory:", os.getcwd())

df = pd.read_csv("data.csv", parse_dates=["Date"]) #opent the data file
df = df.dropna(subset=["Return"])  # drop missing returns if there is any left

usefull date data

In [134]:
################# setting up what dates give us
df["DayOfWeek"] = df["Date"].dt.dayofweek        # 0 = Monday, 6 = Sunday
df["IsWeekend"] = df["DayOfWeek"].isin([5, 6]).astype(int)

df["Month"] = df["Date"].dt.month                # 1 = January, 0 = Others
df["IsJanuary"] = (df["Month"] == 1).astype(int)

df["Day"] = df["Date"].dt.day
df["IsMonthStart"] = (df["Day"] <= 3).astype(int)  # e.g. first 3 days of month

df["DayOfYear"] = df["Date"].dt.dayofyear       # 1 = Jan 1, 365/366 = Dec 31

 OLS Regression

In [135]:
#Set up OLS regression
# X = df[["IsWeekend", "IsJanuary", "IsMonthStart", "Month", "Day", "DayOfYear"]]  #cant do this one cause a lot of multicollinearity appears
X = df[["IsWeekend", "IsJanuary", "IsMonthStart", "DayOfYear"]] 
X = sm.add_constant(X)
y = df["Return"]

model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 Return   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.1522
Date:                Sat, 01 Nov 2025   Prob (F-statistic):              0.928
Time:                        05:55:21   Log-Likelihood:                 10398.
No. Observations:                4839   AIC:                        -2.079e+04
Df Residuals:                    4835   BIC:                        -2.076e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.0004      0.001      0.410   

Durbin-Watson for OLS

In [136]:
from statsmodels.stats.stattools import durbin_watson

# Assuming 'ols_model' is your fitted OLS model
dw_stat = durbin_watson(model.resid)
print("Durbin-Watson statistic:", dw_stat)
print("About 2 is good")


Durbin-Watson statistic: 1.8243988991614501
About 2 is good


Variance Inflation Factor (VIF) checks for multicollinearity among the predictors in OLS.

In [137]:
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print("everything < 10 is good:")
print(vif_data)


everything < 10 is good:
        feature       VIF
0         const  5.912864
1     IsWeekend       NaN
2     IsJanuary  1.288424
3  IsMonthStart  1.003648
4     DayOfYear  1.288547


  return 1 - self.ssr/self.centered_tss


Logit (does same stuff as OLS but with binary output)

In [138]:
# 1 if return is positive, 0 if return is zero or negative
df["Return_Positive"] = (df["Return"] > 0).astype(int)

# Set up X and y
# X_logit = sm.add_constant(df[["DayOfYear"]])  # add intercept
X_logit = df[["DayOfYear", "IsMonthStart", "IsJanuary"]] #cant cant add more cause a lot of multicollinearity appears
X_logit = sm.add_constant(X_logit)  # add intercept
y = df["Return_Positive"]

# Fit Logit
logit_model = sm.Logit(y, X_logit).fit()
print(logit_model.summary())

Optimization terminated successfully.
         Current function value: 0.690707
         Iterations 3
                           Logit Regression Results                           
Dep. Variable:        Return_Positive   No. Observations:                 4839
Model:                          Logit   Df Residuals:                     4835
Method:                           MLE   Df Model:                            3
Date:                Sat, 01 Nov 2025   Pseudo R-squ.:               0.0005407
Time:                        05:55:21   Log-Likelihood:                -3342.3
converged:                       True   LL-Null:                       -3344.1
Covariance Type:            nonrobust   LLR p-value:                    0.3060
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -0.0421      0.070     -0.602      0.547      -0.179       0.095
DayOfYear       -0.0005

Variance Inflation Factor (VIF) checks for multicollinearity among the predictors in Logit.

In [None]:
vif_data = pd.DataFrame()
vif_data["feature"] = X_logit.columns
vif_data["VIF"] = [variance_inflation_factor(X_logit.values, i) for i in range(X_logit.shape[1])]
print("everything < 10 is good:")
print(vif_data)

everything < 10 is good:
        feature       VIF
0         const  5.912864
1     DayOfYear  1.288547
2  IsMonthStart  1.003648
3     IsJanuary  1.288424
