In [1]:
# Imports
from __future__ import division
from __future__ import print_function

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

%matplotlib inline


In [2]:
# Read the multivariate house-price table from disk and preview its first five rows
data = pd.read_csv(filepath_or_buffer='house_prices_multivariate.csv')
data.head(n=5)

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,YrSold,SalePrice
0,65.0,8450,7,5,2003,2003,196.0,706,0,150,...,548,0,61,0,0,0,0,0,2008,208500
1,80.0,9600,6,8,1976,1976,0.0,978,0,284,...,460,298,0,0,0,0,0,0,2007,181500
2,68.0,11250,7,5,2001,2002,162.0,486,0,434,...,608,0,42,0,0,0,0,0,2008,223500
3,60.0,9550,7,5,1915,1970,0.0,216,0,540,...,642,0,35,272,0,0,0,0,2006,140000
4,84.0,14260,8,5,2000,2000,350.0,655,0,490,...,836,192,84,0,0,0,0,0,2008,250000


In [3]:
# For multivariate regression, split the table into independent and dependent variables.
# Columns are selected by name rather than by position, so a CSV whose column
# order changes cannot silently leak SalePrice into the feature matrix.
X = data.drop(columns='SalePrice')  # Independent variables: every column except the target
y = data['SalePrice']  # Dependent variable

In [4]:
# Fit the multiple linear regression on all predictors at once.
# fit() returns the estimator itself, so the trailing expression displays it.
regressor = LinearRegression().fit(X, y)
regressor

LinearRegression()

In [5]:
# Report the fitted parameters: the y-intercept first, then the regression coefficients.
for label, value in (
    ("intercept:", regressor.intercept_),
    ("coefficients of predictors:", regressor.coef_),
):
    print(label, value)

intercept: 310649.2600888717
coefficients of predictors: [ 4.21581098e+01  4.41367617e-01  1.77089455e+04  5.84597164e+03
  3.59658315e+02  1.19385237e+02  2.59435150e+01  9.76748897e+00
  7.65860421e-01 -6.61329855e-01  9.87201953e+00  1.96567898e+01
  1.31846867e+01 -4.66155080e+00  2.81799257e+01  6.78157998e+03
  3.36169287e+02  1.40344800e+03 -2.93573021e+03 -8.64074712e+03
 -3.35073713e+04  6.10172168e+03  3.20869122e+03 -8.23684306e+01
  1.56189970e+04  9.59392447e+00  2.51559075e+01  5.60981357e-01
  1.07712460e+01  2.51081902e+01  5.36124522e+01 -4.13099007e+01
 -8.16461371e-02 -5.83097021e+02]


In [8]:
# Pick a single house (the row at position 154) to try a price prediction on
house = X.iloc[154, :]
house

LotFrontage        62.0
LotArea          8244.0
OverallQual         7.0
OverallCond         5.0
YearBuilt        2004.0
YearRemodAdd     2004.0
MasVnrArea          0.0
BsmtFinSF1          0.0
BsmtFinSF2          0.0
BsmtUnfSF         840.0
TotalBsmtSF       840.0
1stFlrSF          840.0
2ndFlrSF          880.0
LowQualFinSF        0.0
GrLivArea        1720.0
BsmtFullBath        0.0
BsmtHalfBath        0.0
FullBath            2.0
HalfBath            1.0
BedroomAbvGr        3.0
KitchenAbvGr        1.0
TotRmsAbvGrd        7.0
Fireplaces          1.0
GarageYrBlt      2004.0
GarageCars          2.0
GarageArea        440.0
WoodDeckSF        100.0
OpenPorchSF        48.0
EnclosedPorch       0.0
3SsnPorch           0.0
ScreenPorch         0.0
PoolArea            0.0
MiscVal             0.0
YrSold           2007.0
Name: 154, dtype: float64

In [10]:
# Predict from a one-row DataFrame rather than a bare ndarray: the model was
# fitted on a DataFrame, so keeping the column labels lets scikit-learn check
# them against the features seen during fit instead of warning that the input
# has no feature names.
pred_house = regressor.predict(house.to_frame().T)
print("Predicted Value:", pred_house[0])

Predicted Value: 203440.2108274703


In [11]:
# Look the target up by position, matching how the house was selected
# (X.iloc[154]); label-based y[154] only agrees when the index is 0..n-1.
print("Actual Value:", y.iloc[154])

Actual Value: 183500


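### The predicted value (about 203,440) is within roughly 11% of the actual value (183,500). Note that this house was part of the data the model was fitted on, so this is an in-sample comparison.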

In [12]:
# Run the fitted model over the full feature matrix to get a predicted price
# for every house, then peek at the first twenty predictions.
y_pred = regressor.predict(X)
y_pred[0:20]

array([223165.24462329, 193708.14702761, 216394.79759076, 197356.62505514,
       295125.75398644, 172516.96207706, 269477.13355181, 245198.81455231,
       168787.92247658,  87185.78920276, 112270.92560438, 340383.76995369,
       120881.24238997, 239378.44099012, 151390.71255143, 152303.81559092,
       167083.7976024 ,  82345.43071373, 150868.09223021, 122452.35658094])

In [13]:
# Put actual and predicted prices side by side, one row per house
prices = y.to_frame(name="Actual")
prices["Predicted"] = y_pred
prices.head(n=10)

Unnamed: 0,Actual,Predicted
0,208500,223165.244623
1,181500,193708.147028
2,223500,216394.797591
3,140000,197356.625055
4,250000,295125.753986
5,143000,172516.962077
6,307000,269477.133552
7,200000,245198.814552
8,129900,168787.922477
9,118000,87185.789203


In [14]:
# Measure the goodness of fit with the mean squared error.
# y_pred was produced from the same X the model was fitted on, so this is an
# in-sample figure. scikit-learn's signature is (y_true, y_pred), so the
# ground truth goes first.
mean_squared_error(y, y_pred)

1219044781.4947433