In [1]:
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

  from pandas import Int64Index as NumericIndex


In [2]:
# Load the dataset through the `load` helper module (imported in this cell
# rather than in the import cell at the top of the notebook).
# NOTE(review): `load.boston()` presumably returns a pandas DataFrame that has
# a 'PRICE' column -- the next cell calls data.drop('PRICE', axis=1) and
# data['PRICE'] on it; confirm against the `load` module.
import load
data = load.boston()

In [3]:
# Split into train/test sets (test_size=0.2, fixed random_state so the split
# is repeatable). The target is log(PRICE), not the raw price.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='PRICE'),
    np.log(data['PRICE']),
    test_size=0.2,
    random_state=10,
)

# Full model: OLS of log(PRICE) on every remaining column plus a constant term.
x_incl_const = sm.add_constant(X_train)
model = sm.OLS(y_train, x_incl_const)
results = model.fit()

In [4]:
# Coefficient table for the full model: estimates next to their p-values,
# with the p-values rounded to three decimals.
org_coef = pd.DataFrame({
    'coef': results.params,
    'p-value': results.pvalues.round(3),
})
org_coef

Unnamed: 0,coef,p-value
const,4.059944,0.0
CRIM,-0.010672,0.0
ZN,0.001579,0.009
INDUS,0.00203,0.445
CHAS,0.080331,0.038
NOX,-0.704068,0.0
RM,0.073404,0.0
AGE,0.000763,0.209
DIS,-0.047633,0.0
RAD,0.014565,0.0


In [5]:
# Fit quality of the model currently bound to `results`: (R-squared, BIC).
(results.rsquared, results.bic)

(0.7930234826697584, -139.74997769478898)

In [6]:
# Reduced model #1: same regression with the INDUS column left out.
# The constant is added first and INDUS is dropped afterwards.
x_incl_const = sm.add_constant(X_train).drop(columns=["INDUS"])

model = sm.OLS(y_train, x_incl_const)
results = model.fit()

# Coefficients and 3-decimal p-values of this reduced fit, kept for the
# side-by-side comparison later in the notebook.
coef_minus_indus = pd.DataFrame({
    'coef': results.params,
    'p-value': results.pvalues.round(3),
})

(results.rsquared, results.bic)

(0.7927126289415162, -145.1450885559114)

In [7]:
# Reduced model #2: same regression with both INDUS and AGE left out.
# The constant is added first and the two columns are dropped afterwards.
x_incl_const = sm.add_constant(X_train).drop(columns=["INDUS", "AGE"])

model = sm.OLS(y_train, x_incl_const)
results = model.fit()

# Coefficients and 3-decimal p-values of this reduced fit, kept for the
# side-by-side comparison later in the notebook.
reduced_coef = pd.DataFrame({
    'coef': results.params,
    'p-value': results.pvalues.round(3),
})

# When comparing models, a smaller BIC is better.
(results.rsquared, results.bic)

(0.7918657661852815, -149.49934294224656)

In [8]:
# Side-by-side comparison of the three fits (full, without INDUS, without
# INDUS and AGE). Each frame contributes a 'coef' and a 'p-value' column, so a
# plain column-wise concat yields three indistinguishable pairs of duplicate
# labels. `keys` adds an outer column level naming the model each pair belongs
# to. A predictor that was dropped from a model shows NaN in that model's
# columns.
frames = [org_coef, coef_minus_indus, reduced_coef]
pd.concat(frames, axis=1, keys=['full', 'minus INDUS', 'minus INDUS & AGE'])

Unnamed: 0,coef,p-value,coef.1,p-value.1,coef.2,p-value.2
const,4.059944,0.0,4.056231,0.0,4.035922,0.0
CRIM,-0.010672,0.0,-0.010721,0.0,-0.010702,0.0
ZN,0.001579,0.009,0.001551,0.01,0.001461,0.014
INDUS,0.00203,0.445,,,,
CHAS,0.080331,0.038,0.082795,0.032,0.086449,0.025
NOX,-0.704068,0.0,-0.673365,0.0,-0.616448,0.0
RM,0.073404,0.0,0.071739,0.0,0.076133,0.0
AGE,0.000763,0.209,0.000766,0.207,,
DIS,-0.047633,0.0,-0.049394,0.0,-0.052692,0.0
RAD,0.014565,0.0,0.014014,0.0,0.013743,0.0
