In [None]:
import pandas as pd
import statsmodels.api as sm
import os
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Uncomment to check which directory the relative path below is resolved against.
# print("Current working directory:", os.getcwd())

# Read the data file, parsing the "Date" column as datetimes, and discard
# any rows that still have a missing "Return".
df = pd.read_csv("data.csv", parse_dates=["Date"]).dropna(subset=["Return"])

Useful features derived from the date

In [None]:
# Calendar features derived from the "Date" column.
# Each entry is evaluated in order, so later features can build on earlier ones.
df = df.assign(
    # Day of the week: 0 = Monday, ..., 6 = Sunday
    DayOfWeek=lambda frame: frame["Date"].dt.dayofweek,
    # 1 on Saturday/Sunday, 0 otherwise
    IsWeekend=lambda frame: frame["DayOfWeek"].isin([5, 6]).astype(int),
    # Calendar month number: 1 = January, ..., 12 = December
    Month=lambda frame: frame["Date"].dt.month,
    # 1 = January, 0 = any other month
    IsJanuary=lambda frame: (frame["Month"] == 1).astype(int),
    # Day of the month
    Day=lambda frame: frame["Date"].dt.day,
    # 1 for the first 3 days of a month, 0 otherwise
    IsMonthStart=lambda frame: (frame["Day"] <= 3).astype(int),
    # Day of the year: 1 = Jan 1, 365/366 = Dec 31
    DayOfYear=lambda frame: frame["Date"].dt.dayofyear,
)

 OLS Regression

In [None]:
# OLS regression of the return on the calendar features.
# "Month" and "Day" are deliberately left out: using them together with the
# predictors below produced a lot of multicollinearity.
ols_features = ["IsWeekend", "IsJanuary", "IsMonthStart", "DayOfYear"]

X = sm.add_constant(df[ols_features])  # adds the intercept column
y = df["Return"]

model = sm.OLS(y, X).fit()
print(model.summary())

Durbin-Watson for OLS

In [None]:
from statsmodels.stats.stattools import durbin_watson

# `model` is the fitted OLS result from the regression cell; the statistic
# is computed on its residuals to check them for serial correlation.
ols_residuals = model.resid
dw_stat = durbin_watson(ols_residuals)

print("Durbin-Watson statistic:", dw_stat)
print("About 2 is good")


Variance Inflation Factor (VIF) checks for multicollinearity among the predictors in OLS.

In [None]:
# Variance inflation factors for the OLS predictors.
# The intercept column ("const", the default name used by sm.add_constant)
# stays in the design matrix so each auxiliary regression has an intercept,
# but it is left out of the report: a VIF for the constant is not a
# collinearity measure and should not be judged against the "< 10" rule.
vif_features = [col for col in X.columns if col != "const"]
vif_data = pd.DataFrame(
    {
        "feature": vif_features,
        "VIF": [
            variance_inflation_factor(X.values, X.columns.get_loc(col))
            for col in vif_features
        ],
    }
)
print("everything < 10 is good:")
print(vif_data)


Logit regression (same approach as the OLS model, but with a binary outcome)

In [None]:
# Binary target: 1 when the return is strictly positive, 0 when it is zero or negative.
df["Return_Positive"] = df["Return"].gt(0).astype(int)

# Predictors for the logit model; adding more of the calendar features
# produced a lot of multicollinearity.
logit_features = ["DayOfYear", "IsMonthStart", "IsJanuary"]
X_logit = sm.add_constant(df[logit_features])  # adds the intercept column

# NOTE: this rebinds `y` to the binary target.
y = df["Return_Positive"]

# Fit the logit model and show the results table.
logit_model = sm.Logit(y, X_logit).fit()
print(logit_model.summary())

Variance Inflation Factor (VIF) checks for multicollinearity among the predictors in Logit.

In [None]:
# Variance inflation factors for the Logit predictors.
# The intercept column ("const", the default name used by sm.add_constant)
# stays in the design matrix so each auxiliary regression has an intercept,
# but it is left out of the report: a VIF for the constant is not a
# collinearity measure and should not be judged against the "< 10" rule.
vif_features = [col for col in X_logit.columns if col != "const"]
vif_data = pd.DataFrame(
    {
        "feature": vif_features,
        "VIF": [
            variance_inflation_factor(X_logit.values, X_logit.columns.get_loc(col))
            for col in vif_features
        ],
    }
)
print("everything < 10 is good:")
print(vif_data)