In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
import plotly.express as px

In [6]:
url=("https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv")
df=pd.read_csv(url)
print(df.shape)
df.head()

(506, 14)


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [7]:
df.corr()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
crim,1.0,-0.200469,0.406583,-0.055892,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,-0.385064,0.455621,-0.388305
zn,-0.200469,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,0.17552,-0.412995,0.360445
indus,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,-0.356977,0.6038,-0.483725
chas,-0.055892,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.099176,-0.007368,-0.035587,-0.121515,0.048788,-0.053929,0.17526
nox,0.420972,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,-0.380051,0.590879,-0.427321
rm,-0.219247,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,0.128069,-0.613808,0.69536
age,0.352734,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,-0.273534,0.602339,-0.376955
dis,-0.37967,0.664408,-0.708027,-0.099176,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,0.291512,-0.496996,0.249929
rad,0.625505,-0.311948,0.595129,-0.007368,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,-0.444413,0.488676,-0.381626
tax,0.582764,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,-0.441808,0.543993,-0.468536


In [None]:
# CRIM - per capita crime rate by town
# ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
# INDUS - proportion of non-retail business acres per town.
# CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
# NOX - nitric oxides concentration (parts per 10 million)
# RM - average number of rooms per dwelling
# AGE - proportion of owner-occupied units built prior to 1940
# DIS - weighted distances to five Boston employment centres
# RAD - index of accessibility to radial highways
# TAX - full-value property-tax rate per $10,000
# PTRATIO - pupil-teacher ratio by town
# B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# LSTAT - % lower status of the population
# MEDV - Median value of owner-occupied homes in $1000's

In [27]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

#find design matrix for regression model using 'rating' as response variable 
y, X = dmatrices('crim ~ df.iloc[:,1:]', data=df, return_type='dataframe')

#create DataFrame to hold VIF values
vif_df = pd.DataFrame()
vif_df['variable'] = X.columns 

#calculate VIF for each predictor variable 
vif_df['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_df.index = df.iloc[:,:].columns
#view VIF for each predictor variable 
print(vif_df)

                   variable         VIF
crim              Intercept  638.781530
zn        df.iloc[:, 1:][0]    2.325094
indus     df.iloc[:, 1:][1]    3.987753
chas      df.iloc[:, 1:][2]    1.094326
nox       df.iloc[:, 1:][3]    4.551563
rm        df.iloc[:, 1:][4]    2.258113
age       df.iloc[:, 1:][5]    3.100801
dis       df.iloc[:, 1:][6]    4.289041
rad       df.iloc[:, 1:][7]    7.158834
tax       df.iloc[:, 1:][8]    9.195495
ptratio   df.iloc[:, 1:][9]    1.984489
b        df.iloc[:, 1:][10]    1.369741
lstat    df.iloc[:, 1:][11]    3.561476
medv     df.iloc[:, 1:][12]    3.772856


In [46]:
k=df.iloc[:,1:].columns
k
k=k.insert(0,"Intercept")
vif_df.index=k
vif_df[vif_df['VIF']<2]

Unnamed: 0,variable,VIF
chas,"df.iloc[:, 1:][2]",1.094326
ptratio,"df.iloc[:, 1:][9]",1.984489
b,"df.iloc[:, 1:][10]",1.369741


In [70]:
for i in vif_df.columns:
    print(i,vif_df[vif_df['VIF']<2])

variable                    variable       VIF
chas      df.iloc[:, 1:][2]  1.094326
ptratio   df.iloc[:, 1:][9]  1.984489
b        df.iloc[:, 1:][10]  1.369741
VIF                    variable       VIF
chas      df.iloc[:, 1:][2]  1.094326
ptratio   df.iloc[:, 1:][9]  1.984489
b        df.iloc[:, 1:][10]  1.369741
