In [3]:
import numpy as np
import pandas as pd
import os

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from scipy import stats
import itertools

# Part 1: Baseline Linear Model and Interpretation

In [4]:
# load day.csv 
df = pd.read_csv('day.csv')
df = df.drop(columns=['atemp', 'casual', 'registered'])
df['log_cnt'] = np.log(df['cnt'] + 1)

print(f"Dimensions: {df.shape}")
print(f"Columns: {list(df.columns)}\n")
df.info()
df.isna().sum()
df.describe()

categorical_cols = [ 'season', 'yr', 'mnth', 'weekday', 'weathersit', 'holiday', 'workingday' ]
numeric_cols = [ 'temp', 'hum', 'windspeed' ]

Dimensions: (731, 14)
Columns: ['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'hum', 'windspeed', 'cnt', 'log_cnt']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  hum         731 non-null    float64
 11  windspeed   731 non-null    float64
 12  cnt         731 non-null    int64  
 13  log_cnt     731 non-null    float64
dtypes: float64(4), int64(9), object(1)
memory usage: 80.1+ KB


In [5]:
model = smf.ols(formula="cnt ~ temp + hum + windspeed + C(weathersit) + C(season) + C(yr) + C(weekday) + workingday + holiday", data=df).fit()
print(model.summary())

coef_table = pd.DataFrame({
    "coef": model.params,
    "std_err": model.bse,
    "p_value": model.pvalues
}).sort_values("p_value")

coef_table_rounded = coef_table.copy()
coef_table_rounded["coef"] = coef_table_rounded["coef"].round(3)
coef_table_rounded["std_err"] = coef_table_rounded["std_err"].round(3)
coef_table_rounded["p_value"] = coef_table_rounded["p_value"].apply(lambda x: f"{x:.3g}")
coef_table_rounded


                            OLS Regression Results                            
Dep. Variable:                    cnt   R-squared:                       0.827
Model:                            OLS   Adj. R-squared:                  0.824
Method:                 Least Squares   F-statistic:                     213.9
Date:                Sat, 31 Jan 2026   Prob (F-statistic):          1.87e-259
Time:                        22:37:23   Log-Likelihood:                -5927.6
No. Observations:                 731   AIC:                         1.189e+04
Df Residuals:                     714   BIC:                         1.197e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept           1313.5531    233

Unnamed: 0,coef,std_err,p_value
C(yr)[T.1],2017.136,61.139,1.0100000000000001e-145
temp,5090.745,304.038,2.49e-53
C(season)[T.4],1543.868,95.72,3.91e-50
C(season)[T.2],1139.255,112.437,1.23e-22
C(weathersit)[T.3],-1976.186,206.31,1.58e-20
windspeed,-2779.818,424.021,1.06e-10
C(season)[T.3],842.925,148.503,2e-08
Intercept,1313.553,233.542,2.67e-08
C(weathersit)[T.2],-450.895,80.706,3.29e-08
workingday,335.863,70.468,2.28e-06


In [6]:
log_model = smf.ols(formula="log_cnt ~ temp + hum + windspeed + C(weathersit) + C(season) + C(yr) + C(weekday) + workingday + holiday", data=df).fit()
print(log_model.summary())

coef_table_log = pd.DataFrame({
    "coef": log_model.params,
    "std_err": log_model.bse,
    "p_value": log_model.pvalues
}).sort_values("p_value")

coef_table_log_rounded = coef_table_log.copy()
coef_table_log_rounded["coef"] = coef_table_log_rounded["coef"].round(3)
coef_table_log_rounded["std_err"] = coef_table_log_rounded["std_err"].round(3)
coef_table_log_rounded["p_value"] = coef_table_log_rounded["p_value"].apply(lambda x: f"{x:.3g}")
coef_table_log_rounded

                            OLS Regression Results                            
Dep. Variable:                log_cnt   R-squared:                       0.747
Model:                            OLS   Adj. R-squared:                  0.742
Method:                 Least Squares   F-statistic:                     132.0
Date:                Sat, 31 Jan 2026   Prob (F-statistic):          1.10e-200
Time:                        22:37:23   Log-Likelihood:                -139.04
No. Observations:                 731   AIC:                             312.1
Df Residuals:                     714   BIC:                             390.2
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept              7.4215      0

Unnamed: 0,coef,std_err,p_value
Intercept,7.422,0.085,0.0
C(yr)[T.1],0.465,0.022,3.48e-76
C(season)[T.4],0.489,0.035,8.82e-40
temp,1.509,0.111,8.01e-38
C(weathersit)[T.3],-0.968,0.075,2.4e-34
C(season)[T.2],0.353,0.041,4.29e-17
windspeed,-0.842,0.154,6.72e-08
C(season)[T.3],0.235,0.054,1.5e-05
workingday,0.1,0.026,0.000101
hum,-0.38,0.107,0.0004


# Part 2: Transformations and Diagnostics

In [7]:
# Fit baseline OLS model on: 
# 1. Residuals vs fitted values 
# 2. Residuals vs key predictors (temp, hum)
# 3. Q-Q plot of residuals 
# 4. Residual spread, leverage, influence, Cook's distance

def plot(input_model): 
    model_name = "model" if input_model == model else "log_model"

    fitted_vals = input_model.fittedvalues
    residuals = input_model.resid
    standardized_residuals = input_model.get_influence().resid_studentized_internal
    leverage = input_model.get_influence().hat_matrix_diag

    os.makedirs("./figures", exist_ok=True)

    # Residuals vs Fitted
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=fitted_vals, y=residuals)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Fitted Values')
    plt.ylabel('Residuals')
    plt.title('Residuals vs Fitted Values')
    plt.savefig(f"./figures/{model_name}_residuals_vs_fitted.png")
    plt.close()

    # Residuals vs Temp
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=df['temp'], y=residuals)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Temperature')
    plt.ylabel('Residuals')
    plt.title('Residuals vs Temperature')
    plt.savefig(f"./figures/{model_name}_residuals_vs_temp.png")
    plt.close()

    # Residuals vs Humidity
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=df['hum'], y=residuals)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Humidity')
    plt.ylabel('Residuals')
    plt.title('Residuals vs Humidity')
    plt.savefig(f"./figures/{model_name}_residuals_vs_humidity.png")
    plt.close()

    # Q-Q Plot
    plt.figure(figsize=(10, 6))
    sm.qqplot(standardized_residuals, line ='45', fit=True)
    plt.title('Q-Q Plot of Standardized Residuals')
    plt.savefig(f"./figures/{model_name}_qq_plot.png")
    plt.close()

    # Residuals vs Leverage
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=leverage, y=standardized_residuals)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Leverage')
    plt.ylabel('Standardized Residuals')
    plt.title('Standardized Residuals vs Leverage')
    plt.savefig(f"./figures/{model_name}_standardized_residuals_vs_leverage.png")
    plt.close()

    # Cook's Distance
    influence = model.get_influence()
    cooks_d = influence.cooks_distance[0]
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=np.arange(len(cooks_d)), y=cooks_d)
    plt.xlabel('Observation Index')
    plt.ylabel("Cook's Distance")
    plt.title("Cook's Distance for Each Observation")
    plt.savefig(f"./figures/{model_name}_cooks_distance.png")
    plt.close()

plot(model)
plot(log_model)

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

# Part 3: Collinearity

In [8]:
corr_matrix = df[numeric_cols].corr()
plt.Figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Numeric Predictors')
plt.savefig("./figures/correlation_matrix.png")
plt.close()

X = log_model.model.exog
X_names = log_model.model.exog_names
vif_data = pd.DataFrame({
    "Variable": X_names,
    "VIF": [variance_inflation_factor(X, i) for i in range(X.shape[1])]
})
vif_data[vif_data["Variable"] != "Intercept"].sort_values("VIF", ascending=False)

vif_data

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,Variable,VIF
0,Intercept,60.195265
1,C(weathersit)[T.2],1.60824
2,C(weathersit)[T.3],1.310736
3,C(season)[T.2],2.627952
4,C(season)[T.3],4.649735
5,C(season)[T.4],1.862739
6,C(yr)[T.1],1.031355
7,C(weekday)[T.1],inf
8,C(weekday)[T.2],inf
9,C(weekday)[T.3],inf
