In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Read the file
url = "house_data.csv"
df = pd.read_csv(url, header=0, names=['Rooms', 'Age', 'Distance', 'Accessibility', 'Tax', 'DisadvantagedPosition', 'Crime', 'NiticOxides', 'PupilTeacher', 'Residential', 'NonRetail', 'Price'])

In [3]:
df.head() # prints the first five rows

Unnamed: 0,Rooms,Age,Distance,Accessibility,Tax,DisadvantagedPosition,Crime,NiticOxides,PupilTeacher,Residential,NonRetail,Price
0,5.565,70.6,2.0635,24,666,17.16,8.79212,0.584,20.2,0.0,18.1,11.7
1,6.879,77.7,3.2721,8,307,9.93,0.62356,0.507,17.4,0.0,6.2,27.5
2,5.972,76.7,3.1025,4,304,9.97,0.3494,0.544,18.4,0.0,9.9,20.3
3,6.943,97.4,1.8773,5,403,4.59,1.22358,0.605,14.7,0.0,19.58,41.3
4,5.926,71.0,2.9084,24,666,18.13,15.5757,0.58,20.2,0.0,18.1,19.1


In [4]:
# Saved the independent variables in x and dependent variable in y.
features = ['Rooms', 'Age', 'Distance', 'Accessibility', 'Tax', 'DisadvantagedPosition', 'Crime', 'NiticOxides', 'PupilTeacher', 'Residential', 'NonRetail']
x = df[features].values
y = df['Price'].values

In [5]:
print(x.shape)
print(y.shape)

(399, 11)
(399,)


In [6]:
# Standardized the data (x and y) using the StandardScaler().fit_transform function
x_Stand = StandardScaler().fit_transform(x)
y_Stand = StandardScaler().fit_transform(y.reshape(-1, 1))

In [7]:
# Split the data into train and test datasets with test size = 20%
X_train, X_test, y_train, y_test = train_test_split(x_Stand, y_Stand, test_size=0.2)
y_train = y_train.reshape(-1) # used to flatten the array elements to a 1D array.
y_test = y_test.reshape(-1)
print (X_train.shape)
print(y_train.shape)
print (X_test.shape)
print( y_test.shape)

(319, 11)
(319,)
(80, 11)
(80,)


In [8]:
# Apply LR model on xTrain, yTrain, print the r_squared, the intercept, and the slopes.
model = LinearRegression() # this is the OLS model
model.fit(X_train, y_train)
r_sq = model.score(X_train, y_train)
print('R squared:', r_sq)

print('intercept:', model.intercept_)

print('slope:', model.coef_)

R squared: 0.7439613482940031
intercept: -0.0017199218271623453
slope: [ 0.29013389  0.01719818 -0.28756635  0.26218168 -0.27272063 -0.48204995
 -0.11519076 -0.17732149 -0.21232217  0.11084346  0.06342839]


 -R squared 0.7104 means that 71% of the variance in the dependent variable is explained by the independent variables in the model. this is a reasonably good fit because it is closer to 1.
 -Intercept -0.011 explains that when all independent variables equal zero, the dependent variable also tend towards zero.
 -Coefficients - Rooms with positive coefficent 0.2535 shows the most substantial influence on house price predictions in this model. Smaller variables are less likely to be statistically significant.

In [9]:
# Use the model to predict the output for the test data set (xTest),
# then find the error (MSE) and r^2
y_pred = model.predict(X_test)
print('R squared:', model.score(X_test, y_test))

y_pred= y_pred.reshape(-1)
e = y_test - y_pred

print("MSE = ",sum(e**2)/83)

R squared: 0.6921639660121743
MSE =  0.2920878470747855


The MSE tells us the average squared difference between the predicted and actual value of the dependent variable. The lower the better and this MSE shows a reasonably good fit, it indicates a relatively small errors on the average.

In [10]:
# summarize results
from mlxtend.evaluate import bias_variance_decomp

mse, bias, var = bias_variance_decomp(model, X_train, y_train, X_test, y_test, loss='mse')
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.320
Bias: 0.307
Variance: 0.013


Multicollinearity occurs when two or more independent variable in a dataframe have a high correlation with one another in a regression model.This can be examined by calculating the correlation matrix of coefficients.

High correlation of coefficients suggest multicollinearity which can affect the interpretablity and stability of the regression model.


In [11]:
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [12]:

# Calculate VIF for each variable
def calc_vif(X):
    vif = pd.DataFrame()
    vif["Variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    
    return(vif)
X = df.iloc[:,:-1]
calc_vif(X)


# Display the results


Unnamed: 0,Variables,VIF
0,Rooms,73.140393
1,Age,20.530213
2,Distance,14.100498
3,Accessibility,13.804679
4,Tax,55.873128
5,DisadvantagedPosition,10.773437
6,Crime,2.083883
7,NiticOxides,72.374336
8,PupilTeacher,76.705813
9,Residential,2.834652


.Multicollinearity is best assessed by calculating the Variance Inflation Factor (VIF) for each coefficient. 
.VIF measures the extent of multicollinearity in the model. 
.VIF less than 5 shows moderate correlation which is generally acceptable.
.VIF greater than 5 shows high correlation which indicates potential multicollinearity. 
.The high VIF's suggest that the variables are highly correlated with eachother in the model, potentially causing multicollinearity issues.
.Some of the effects of multicollinearity is that the coefficients become less reliable and might change dramatically with small changes in the data. Also, it becomes extremely difficult to interpret the individual effects of each variable due to the strong interdependence.