Overfitting

In [15]:
# Libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV

In [16]:
# prepare data csv file
file_path = 'boston.csv.xls'
df = pd.read_csv(file_path)
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [17]:
df.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

In [18]:
# training set
np.random.seed(11111)  # Random seed
nrow = df.shape[0]
train_sequence = sorted(np.random.choice(nrow, int(nrow * 0.8), replace=False))
test_sequence = sorted(set(list(range(0, nrow))) - set(train_sequence))

train = df.filter(items=train_sequence, axis=0)
test = df.filter(items=test_sequence, axis=0)

# amke X matrix to be in matrix form and Y to be in vector form
ind_var = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT']

# train x set
train_x = train.loc[:, ind_var]
train_y = train.MEDV

# train test set
test_x = test.loc[:, ind_var]
test_y = test.MEDV

test_tot = test.loc[:, ['MEDV','CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT']]

In [19]:
# OLS regression
# OLS Regression
ols_final = sm.OLS(train_y, sm.add_constant(train_x)).fit()
print(ols_final.summary2().tables[1])  # print only coefficients

# Compute test R^2 and test mean squared error
ols_pred = ols_final.predict(sm.add_constant(test_x))
ols_pred = pd.DataFrame(ols_pred, columns=["ols_p"])
ols_actual = test.MEDV

ols_rss = np.sum(np.power(ols_pred.ols_p - ols_actual, 2))
ols_tss = np.sum(np.power(ols_actual - np.mean(ols_actual), 2))
ols_rsq = 1 - (ols_rss / ols_tss)
print("\n OLS_R^2", ols_rsq)

ols_MSE = np.sqrt(ols_rss / test.shape[0])
print(" OLS_SME", ols_MSE)

             Coef.  Std.Err.         t         P>|t|     [0.025     0.975]
const    37.101250  5.419413  6.845991  2.961232e-11  26.446331  47.756169
CRIM     -0.126118  0.032020 -3.938772  9.701752e-05  -0.189071  -0.063165
ZN        0.055675  0.015123  3.681458  2.645678e-04   0.025942   0.085407
INDUS     0.012467  0.067457  0.184818  8.534677e-01  -0.120157   0.145091
CHAS      1.512590  0.897875  1.684633  9.285951e-02  -0.252692   3.277871
NOX     -16.827781  3.941698 -4.269170  2.466730e-05 -24.577417  -9.078144
RM        3.736707  0.443301  8.429284  6.800570e-16   2.865149   4.608265
AGE       0.006566  0.014094  0.465903  6.415452e-01  -0.021143   0.034275
DIS      -1.457818  0.216429 -6.735768  5.864060e-11  -1.883332  -1.032303
RAD       0.336162  0.069879  4.810593  2.153986e-06   0.198774   0.473549
TAX      -0.014090  0.004076 -3.457293  6.057182e-04  -0.022103  -0.006078
PTRATIO  -0.949442  0.141872 -6.692264  7.661573e-11  -1.228370  -0.670513
B         0.006998  0.002

Ridge Regression

In [20]:
# generate a sequence of lambdas to try
lambdas = [np.power(10, i) for i in np.arange(4, -4, -0.1)]
alphas = lambdas

# Use 10-fold Cross Validation to find optimal lambda
ridge_cv = RidgeCV(alphas=alphas, cv=10, scoring="neg_mean_squared_error")
ridge_cv.fit(train_x, train_y)

# Build final ridge regression model
ridge_final = Ridge(alpha=ridge_cv.alpha_, fit_intercept=True)
ridge_final.fit(train_x, train_y)

# Print coefficients
# print('Intercept:', ridge_final.intercept_)
print(
    "\n",
    pd.DataFrame(
        (ridge_final.coef_),
        index=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
        columns=["Coef."],
    ),
)

# R squared formula and mean squared error
ridge_pred = ridge_final.predict(test_x)
ridge_actual = test.MEDV
ridge_rss = np.sum(np.power(ridge_pred - ridge_actual, 2))
ridge_tss = np.sum(np.power(ridge_actual - np.mean(ridge_actual), 2))
ridge_rsq = 1 - ridge_rss / ridge_tss
print("\n Ridge_R^2", ridge_rsq)

ridge_MSE = np.sqrt(ridge_rss / test.shape[0])
print("Ridge_SME", ridge_MSE)


             Coef.
CRIM    -0.117408
ZN       0.063275
INDUS   -0.075161
CHAS     0.308877
NOX     -0.173820
RM       1.915684
AGE      0.012659
DIS     -1.094026
RAD      0.350916
TAX     -0.017656
PTRATIO -0.804222
B        0.007417
LSTAT   -0.665527

 Ridge_R^2 0.6657210397051739
Ridge_SME 5.845188291973453


Lasso analysis

In [21]:
# generate a sequence of lambdas to try
lambdas = [np.power(10, i) for i in np.arange(6, -6, -0.1)]

# Compile model
lasso_cv = LassoCV(cv=10, alphas=lambdas)
lasso_cv.fit(train_x, train_y)  # Fit Model

# Scale
# train_x_scale = scale(train_x) #In case you want to scale the variables.

# Build final LASSO regression model
lasso_final = Lasso(alpha=lasso_cv.alpha_, fit_intercept=True)
lasso_final.fit(train_x, train_y)

# Print results
# print('Intercept:', lasso_final.intercept_)
print(
    "\n",
    pd.DataFrame(
        (lasso_final.coef_),
        index=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
        columns=["Coef."],
    ),
)

# R squared formula and mean squared error
lasso_pred = lasso_final.predict(test_x)
lasso_actual = test.MEDV
lasso_rss = np.sum(np.power(lasso_pred - lasso_actual, 2))
lasso_tss = np.sum(np.power(lasso_actual - np.mean(lasso_actual), 2))
lasso_rsq = 1 - lasso_rss / lasso_tss
print("\n LASSO_R^2: ", lasso_rsq)

lasso_MSE = np.sqrt(lasso_rss / test.shape[0])
print("LASSO_SME: ", lasso_MSE)


             Coef.
CRIM    -0.102448
ZN       0.056886
INDUS   -0.024506
CHAS     0.000000
NOX     -0.000000
RM       2.527210
AGE      0.009518
DIS     -0.923285
RAD      0.306620
TAX     -0.016944
PTRATIO -0.726300
B        0.007589
LSTAT   -0.629595

 LASSO_R^2:  0.6683704868750528
LASSO_SME:  5.821978147733223


In [26]:
# to sum all the result in one output

OLS_df = pd.DataFrame(ols_final.summary2().tables[1]["Coef."]).rename(
    columns={"Coef.": "OLS"}
)

Ridge_df = pd.DataFrame(
    np.insert(ridge_final.coef_, 0, ridge_final.intercept_),
    index=['MEDV', 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
    columns=["Ridge"],
)

Lasso_df = pd.DataFrame(
    np.insert(lasso_final.coef_, 0, lasso_final.intercept_),
    index=['MEDV', 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
    columns=["Lasso"],
)

df_sum = OLS_df.merge(Ridge_df, left_index=True, right_index=True)
df_sum = df_sum.merge(Lasso_df, left_index=True, right_index=True)

df_sum = pd.concat([
        df_sum,
        pd.DataFrame(
            {
                "OLS": [ols_rsq, ols_MSE],
                "Ridge": [ridge_rsq, ridge_MSE],
                "Lasso": [lasso_rsq, lasso_MSE],
            },
            index=["R sq", "Mean Sq. Err"]
        )
    ])

df_sum

Unnamed: 0,OLS,Ridge,Lasso
CRIM,-0.126118,-0.117408,-0.102448
ZN,0.055675,0.063275,0.056886
INDUS,0.012467,-0.075161,-0.024506
CHAS,1.51259,0.308877,0.0
NOX,-16.827781,-0.17382,-0.0
RM,3.736707,1.915684,2.52721
AGE,0.006566,0.012659,0.009518
DIS,-1.457818,-1.094026,-0.923285
RAD,0.336162,0.350916,0.30662
TAX,-0.01409,-0.017656,-0.016944


The table include coeffcients, test R square and mean square error for the three regression models above: ridge, OLS, and lasso. We can see OLS has the best R square for the dataset, followed by ridge and lasso. Even though it didnt work perfectly weel to demonstrate the penality function for overfitting issues. But you can see when comapring the result of lasso and ridge regression, for example. Lasso regression made several independ variables redundant by setting the coeffcients several variabvles to be zero. By doing so, it improved the R square value and decreased the SME and achieved a better result. From that, we can also call it a successful usage of model can imcrease the predictive power of a regression.