In [7]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
from sklearn import metrics

np.random.seed(1)

In [8]:
def get_lin_reg_model(model_formula, df_in, print_MSE=False, return_MSE=False):
    """
    Function returns the summary for fitted linear model.

    Parameter "model_formula" should be a patsy formula describing the model.
    Parameter "df" is a dataframe.
    """

    # Split the data into training (80%) and validation set (20%)
    mask = np.random.rand(len(df_in)) < 0.8
    train = df_in[mask]
    valid = df_in[~mask]

    # Prepare the data (dmatrices is from patsy library)
    y_train, X_train = dmatrices(model_formula, data=train, return_type='dataframe')
    y_valid, X_valid = dmatrices(model_formula, data=valid, return_type='dataframe')

    # Train the model
    model = sm.OLS(y_train, X_train)
    fitted_model = model.fit()
    y_train_pred = fitted_model.predict(X_train)
    train_MSE = metrics.mean_squared_error(y_train, y_train_pred)
    y_valid_pred = fitted_model.predict(X_valid)
    test_MSE = metrics.mean_squared_error(y_valid, y_valid_pred)

    if print_MSE is True:
        # Show MSE for training set
        print(f'{train_MSE=}')

        # Show MSE for validation set
        print(f'{test_MSE=}\n')

    # Return fitted model
    if return_MSE is True:
        return fitted_model, train_MSE, test_MSE
    else:
        return fitted_model

In [9]:
PATH = "lab/data/"
df_raw = pd.read_csv(f'{PATH}Auto.csv')

In [10]:
df_raw.isnull().sum() # checking for nulls

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [14]:
# horsepower has some missing ('?') values
bad_rows = []
for index, row in df_raw.iterrows():
    if row['horsepower'] == "?":
        bad_rows.append(index)
df_raw = df_raw.drop(bad_rows)
print(f"dropped: {len(bad_rows)}")

dropped: 0


In [15]:
df_cont = df_raw.astype({"horsepower": 'int'})
df_cat = df_raw.astype({"horsepower": 'int', "year": 'category'})

In [16]:
year_continuous_model = get_lin_reg_model("mpg ~ year", df_cont, print_MSE=True)
year_categorical_model = get_lin_reg_model("mpg ~ year", df_cat, print_MSE=True)

train_MSE=36.872523775660405
test_MSE=55.92917098803293

train_MSE=35.05528733317721
test_MSE=35.11147796060024



In [17]:
year_continuous_model.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.364
Model:,OLS,Adj. R-squared:,0.362
Method:,Least Squares,F-statistic:,182.4
Date:,"Mon, 12 Dec 2022",Prob (F-statistic):,3.52e-33
Time:,22:06:38,Log-Likelihood:,-1034.5
No. Observations:,321,AIC:,2073.0
Df Residuals:,319,BIC:,2080.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-70.7609,6.967,-10.157,0.000,-84.467,-57.055
year,1.2373,0.092,13.504,0.000,1.057,1.418

0,1,2,3
Omnibus:,20.668,Durbin-Watson:,0.886
Prob(Omnibus):,0.0,Jarque-Bera (JB):,12.775
Skew:,0.345,Prob(JB):,0.00168
Kurtosis:,2.308,Cond. No.,1560.0


In [18]:
year_categorical_model.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.391
Model:,OLS,Adj. R-squared:,0.365
Method:,Least Squares,F-statistic:,15.13
Date:,"Mon, 12 Dec 2022",Prob (F-statistic):,1.7699999999999998e-24
Time:,22:06:40,Log-Likelihood:,-946.43
No. Observations:,296,AIC:,1919.0
Df Residuals:,283,BIC:,1967.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,17.3889,1.427,12.184,0.000,14.580,20.198
year[T.71],3.5159,1.945,1.808,0.072,-0.313,7.344
year[T.72],1.5635,1.945,0.804,0.422,-2.265,5.392
year[T.73],-0.3007,1.765,-0.170,0.865,-3.775,3.174
year[T.74],4.5202,1.924,2.349,0.020,0.732,8.308
year[T.75],2.6481,1.843,1.437,0.152,-0.979,6.275
year[T.76],4.1546,1.906,2.180,0.030,0.404,7.905
year[T.77],7.2500,2.018,3.592,0.000,3.277,11.223
year[T.78],7.0444,1.843,3.823,0.000,3.418,10.671

0,1,2,3
Omnibus:,16.69,Durbin-Watson:,0.941
Prob(Omnibus):,0.0,Jarque-Bera (JB):,12.118
Skew:,0.384,Prob(JB):,0.00234
Kurtosis:,2.372,Cond. No.,15.6


In [19]:
print(df_cat['year'])

0      70
1      70
2      70
3      70
4      70
       ..
392    82
393    82
394    82
395    82
396    82
Name: year, Length: 392, dtype: category
Categories (13, int64): [70, 71, 72, 73, ..., 79, 80, 81, 82]
