# Question 1: House Price Prediction


## Team Members:

#### Marcel Santos de Carvalho, id 79083
#### Loris Baudry, id 79794
#### Alex Palacios, id 73713

#### Responsible for this notebook: Loris Baudry

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 1. Loading and Preview of the Data Frame

## 1.1. Loading 

In [2]:
# Read the training CSV into a data frame (path is relative to the notebook's working directory)
df = pd.read_csv('housing/train.csv')

## 1.2. Preview of the Data Frame

In [3]:
# Preview the first 5 rows
df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Number of rows and columns in the raw data, as (rows, columns)
df.shape

(1460, 81)

In [5]:
# Per-column non-null counts and dtypes; used to spot the columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# 2. Clean and Prepare data for model

## 2.1. Drop Columns with lots of nulls 

In [6]:
# Remove the 'Id' column so it is not carried into the feature set.
# NOTE: in-place mutation -- re-running this cell without reloading df raises a KeyError.
df.drop(['Id'],axis=1,inplace=True)

In [7]:
# Drop columns that are mostly null (e.g. 'Alley' has only 91 non-null values out of 1460).
# NOTE(review): the null counts of the other four columns are not visible in the truncated
# df.info() output above -- confirm they are sparse enough to justify dropping.
# NOTE: in-place mutation -- re-running this cell without reloading df raises a KeyError.
df.drop(['PoolQC','Fence','MiscFeature', 'Alley', 'GarageYrBlt' ],axis=1,inplace=True)

## 2.2. Filling Missing Values

In [8]:
# Impute missing values with each column's most frequent value (its mode).
# All of these columns are categorical except 'MasVnrArea', which is numeric
# but is deliberately handled the same way here.
mode_filled_columns = [
    'BsmtCond',
    'BsmtQual',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'MasVnrType',
    'MasVnrArea',
    'BsmtExposure',
    'BsmtFinType2',
]
for column_name in mode_filled_columns:
    # mode() returns a Series (ties are possible); [0] takes its first entry
    most_frequent = df[column_name].mode()[0]
    df[column_name] = df[column_name].fillna(most_frequent)

In [9]:
# Impute the continuous 'LotFrontage' column with its mean over the non-null rows
lot_frontage_mean = df['LotFrontage'].mean()
df['LotFrontage'] = df['LotFrontage'].fillna(lot_frontage_mean)

## 2.3. Remove Remaining Nulls

In [10]:
# Drop every row that still contains at least one null after the imputation steps above
df.dropna(inplace=True)

In [11]:
# Shape after dropping columns, imputing, and removing the rows that still had nulls
df.shape

(1422, 75)

## 2.4. Handle Categorical Data

In [12]:
# Pick the text (object-dtype) columns and one-hot encode them
df_text = df.select_dtypes(include=object)
one_hot = pd.get_dummies(df_text)
# Index-aligned join that appends the dummy columns; the original text columns
# are still present in df_join and are removed in the next step
df_join = df.join(one_hot)
df_join.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,1,0,0,0,0,1,0
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,1,0,0,0,0,1,0
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,1,0,0,0,0,1,0
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,1,1,0,0,0,0,0
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,1,0,0,0,0,1,0


In [13]:
# Keep only the non-object columns: the original text columns are dropped now that
# their one-hot dummies are part of the frame
df = df_join.select_dtypes(exclude=['object'])
df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [14]:
# Shape after one-hot encoding (same rows, many more columns)
df.shape

(1422, 271)

# 3. Select Relevant Features to build the model

## 3.1. Analysis of Features correlation with SalePrice

In [15]:
# Remove redundant features: for every pair of features whose correlation is
# >= 0.8, the later column (in df's column order) is dropped.
#
# - Only positive correlations are tested (no abs()), so strongly *negatively*
#   correlated pairs are kept.
# - A column that has already been marked for removal can still cause later
#   columns to be removed.
# - The target 'SalePrice' is excluded from the comparison: it must never be
#   dropped, and a feature must never be dropped for predicting it too well.
CORR_THRESHOLD = 0.8
TARGET_COLUMN = 'SalePrice'

corr = df.corr()
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if TARGET_COLUMN in (corr.columns[i], corr.columns[j]):
            continue
        # NaN correlations (e.g. constant columns) compare False and are kept
        if corr.iloc[i,j] >= CORR_THRESHOLD:
            columns[j] = False
selected_columns = df.columns[columns]
df = df[selected_columns]

In [16]:
# Correlation of every remaining column with SalePrice, sorted from highest to lowest.
# Only features with a correlation above 0.40 are kept; the filter is on the signed
# value, so negatively correlated features are not selected.
# SalePrice itself appears first with a correlation of 1.0.
corr_matrix = df.corr()
corr_analysis = corr_matrix["SalePrice"].sort_values(ascending=False)
relevant_features = corr_analysis[corr_analysis>0.40]
relevant_features

SalePrice               1.000000
OverallQual             0.787985
GrLivArea               0.709303
GarageCars              0.643039
TotalBsmtSF             0.610085
FullBath                0.573755
BsmtQual_Ex             0.553768
YearBuilt               0.519014
KitchenQual_Ex          0.504655
YearRemodAdd            0.500512
Foundation_PConc        0.492300
MasVnrArea              0.470117
Fireplaces              0.461108
ExterQual_Ex            0.452715
ExterQual_Gd            0.443755
HeatingQC_Ex            0.428411
BsmtFinType1_GLQ        0.426911
GarageFinish_Fin        0.417049
Neighborhood_NridgHt    0.402007
Name: SalePrice, dtype: float64

In [None]:
# What does the correlation matrix of the selected features look like?
# .corr() is required here: passing the raw columns to the heatmap would colour
# every individual house value instead of the pairwise correlations.
# Each feature is listed once ('KitchenQual_Ex' used to appear twice).
corr = df[['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'BsmtQual_Ex', 'YearBuilt', 'KitchenQual_Ex', 'YearRemodAdd', 'Foundation_PConc', 'ExterQual_Gd', 'Fireplaces', 'MasVnrArea', 'ExterQual_Ex', 'HeatingQC_Ex', 'BsmtFinType1_GLQ', 'GarageFinish_Fin', 'Neighborhood_NridgHt']].corr()
plt.figure(figsize=(20,10))
sns.heatmap(corr, annot=True, cmap='coolwarm')

<AxesSubplot:>

In [None]:
# Candidate features: everything with a correlation above 0.40 with SalePrice.
# Each column appears exactly once -- a duplicated column ('KitchenQual_Ex' was listed
# twice) makes the OLS design matrix perfectly collinear and its coefficients meaningless.
X_corr = df[['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'BsmtQual_Ex', 'YearBuilt', 'KitchenQual_Ex', 'YearRemodAdd', 'Foundation_PConc', 'ExterQual_Gd', 'Fireplaces', 'MasVnrArea', 'ExterQual_Ex', 'HeatingQC_Ex', 'BsmtFinType1_GLQ', 'GarageFinish_Fin', 'Neighborhood_NridgHt']]

## 3.2. Analysis of Features P-values

In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

# Fit an ordinary least squares model of SalePrice on the correlation-selected
# features and print the summary table, which is where the p-values are read from.
X = X_corr
y = df['SalePrice']

# add_constant adds a constant column so the fitted model has an intercept
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

## 3.3. Result - Selected Features

In [None]:
# Selected Features = 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'BsmtQual_Ex', 'YearBuilt', 'KitchenQual_Ex', 'YearRemodAdd', 'ExterQual_Gd', 'Fireplaces', 'MasVnrArea', 'ExterQual_Ex', 'BsmtFinType1_GLQ'

# 4. Visualise the data and look for outliers to remove

## 4.1. Label

In [None]:
# Distribution of the column we want to predict: SalePrice (50-bin histogram).
# The trailing semicolons suppress the textual repr of the matplotlib return values.
plt.figure(figsize=(8, 6))
plt.hist(df.SalePrice,bins=50);
plt.xlabel('Sales Prices')
plt.ylabel('Frequency');

In [None]:
# Outlier removal on the target: keep only houses sold for less than 600,000
df = df.loc[df['SalePrice'] < 600000]

In [None]:
# Summary statistics of the target after removing the price outliers
df['SalePrice'].describe()

## 4.2. Features

In [None]:
# Summary statistics of the selected features (the list from section 3.3)
df[['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'BsmtQual_Ex', 'YearBuilt', 'KitchenQual_Ex', 'YearRemodAdd', 'ExterQual_Gd', 'Fireplaces', 'MasVnrArea', 'ExterQual_Ex', 'BsmtFinType1_GLQ']].describe()

In [None]:
# Histograms of the selected features, one subplot per column.
# The trailing semicolon hides the array-of-axes repr.
df[['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'BsmtQual_Ex', 'YearBuilt', 'KitchenQual_Ex', 'YearRemodAdd', 'ExterQual_Gd', 'Fireplaces', 'MasVnrArea', 'ExterQual_Ex', 'BsmtFinType1_GLQ']].hist(figsize=(22, 24));

In [None]:
def plotFeatureName(featureName):
    """Scatter-plot one feature of the global ``df`` against ``SalePrice``.

    Args:
        featureName: Name of the ``df`` column to put on the x-axis; it is
            also used as the plot title and x-axis label.

    The plot is drawn on a new 8x6 figure. ``df`` is looked up when the
    function is called, so the plot reflects the frame's current state
    (for example after outlier rows have been removed). Returns nothing.
    """
    plt.figure(figsize=(8, 6))
    feature_values = df[featureName]
    sale_prices = df['SalePrice']
    plt.scatter(feature_values, sale_prices)
    plt.xlabel(featureName)
    plt.ylabel('price')
    plt.title(featureName)

In [None]:
# Scatter of OverallQual against SalePrice
plotFeatureName('OverallQual')

In [None]:
# Scatter of GrLivArea against SalePrice; used to pick the outlier cut-off applied next
plotFeatureName('GrLivArea')

In [None]:
# Remove outliers: keep only rows with GrLivArea below 4000
df = df.loc[df['GrLivArea'] < 4000]

In [None]:
# Scatter of GarageCars against SalePrice
plotFeatureName('GarageCars')

In [None]:
# Scatter of TotalBsmtSF against SalePrice; used to pick the outlier cut-off applied next
plotFeatureName('TotalBsmtSF')

In [None]:
# Remove outliers: keep only rows with TotalBsmtSF below 3000
df = df.loc[df['TotalBsmtSF'] < 3000]

In [None]:
# Scatter of MasVnrArea against SalePrice; used to pick the outlier cut-off applied next
plotFeatureName('MasVnrArea')

In [None]:
# Remove outliers: keep only rows with MasVnrArea below 1400
df = df.loc[df['MasVnrArea'] < 1400]

In [None]:
# Scatter of YearRemodAdd against SalePrice
plotFeatureName('YearRemodAdd')

In [None]:
# Scatter of YearBuilt against SalePrice
plotFeatureName('YearBuilt')

In [None]:
# SalePrice per value of the BsmtQual_Ex dummy; jitter=False stacks the points in one vertical line per category
sns.catplot(x="BsmtQual_Ex", y="SalePrice", jitter=False, data=df)

In [None]:
# SalePrice per value of the KitchenQual_Ex dummy (points not jittered)
sns.catplot(x="KitchenQual_Ex", y="SalePrice", jitter=False, data=df)

In [None]:
# SalePrice per Fireplaces value (points not jittered)
sns.catplot(x="Fireplaces", y="SalePrice", jitter=False, data=df)

In [None]:
# SalePrice per value of the ExterQual_Gd dummy (points not jittered)
sns.catplot(x="ExterQual_Gd", y="SalePrice", jitter=False, data=df)

In [None]:
# SalePrice per value of the ExterQual_Ex dummy (points not jittered)
sns.catplot(x="ExterQual_Ex", y="SalePrice", jitter=False, data=df)

In [None]:
# SalePrice per value of the BsmtFinType1_GLQ dummy (points not jittered)
sns.catplot(x="BsmtFinType1_GLQ", y="SalePrice", jitter=False, data=df)

# 5. Linear Regression Model

In [None]:
# Feature matrix for the regression models, taken from the outlier-filtered df.
# NOTE(review): 'FullBath' is included here but is absent from the "Selected Features"
# list in section 3.3 and from the feature lists used in section 4.2 -- confirm whether
# it was meant to be dropped after the p-value analysis.
X = df[['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'BsmtQual_Ex', 'YearBuilt', 'KitchenQual_Ex', 'YearRemodAdd', 'ExterQual_Gd', 'Fireplaces', 'MasVnrArea', 'ExterQual_Ex', 'BsmtFinType1_GLQ']]

In [None]:
# Target vector, taken from the same outlier-filtered df as X so the rows stay aligned
y = df['SalePrice']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as a test set; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=999
)

# Shapes of the feature splits first, then of the target splits
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# Create an (unfitted) ordinary least squares linear regression model with default settings
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

In [None]:
# Fit the model on the training split only; the test split stays unseen for evaluation
lm.fit(X_train, y_train)

In [None]:
# Fitted coefficients labelled with their feature names (X_train has the same columns as X)
print(pd.Series(lm.coef_, index = X.columns))

In [None]:
# Fitted intercept of the linear model
lm.intercept_

In [None]:
# Predicted sale prices for the held-out test rows
y_pred = lm.predict(X_test)

In [None]:
# True vs. predicted prices on the test set; points on the dashed diagonal are perfect predictions.
plt.figure(figsize=(6,6))
plt.scatter(y_test, y_pred)
plt.xlabel("True Prices: $Y_i$")
# Raw string: "\h" is not a valid Python escape sequence
plt.ylabel(r"Predicted prices: $\hat{Y}_i$")
# The identity line must span the actual price range; a fixed [0, 50] segment is
# invisible when prices are in the hundreds of thousands.
price_range = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
plt.plot(price_range, price_range, '--k');

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# How well did it do on the training set
y_pred = lm.predict(X_train)

# np.sqrt(MSE) is the *root* mean squared error (same unit as SalePrice), so it is labelled as such
print("Root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_train, y_pred)))
print('R squared: %.2f' % r2_score(y_train, y_pred))

In [None]:
# See how well it does on the test set
y_pred = lm.predict(X_test)

# np.sqrt(MSE) is the *root* mean squared error (same unit as SalePrice), so it is labelled as such
print("Root mean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print('R squared: %.2f' % r2_score(y_test, y_pred))

# 6. Ridge & Lasso Regression

## 6.1. Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression (L2 penalty, alpha=0.5): fit once on the training split,
# keep the training predictions for the metrics cell, and show the coefficients.
ridge_reg = Ridge(alpha=0.5)
ridge_reg.fit(X_train, y_train)
pred_train_ridge_reg = ridge_reg.predict(X_train)
# The model is already fitted above; read coef_ directly instead of fitting a second time
print(pd.Series(ridge_reg.coef_, index = X.columns))

In [None]:
# How well did it do on the training set
# np.sqrt(MSE) is the *root* mean squared error, so it is labelled as such
print("Root mean squared error - Training set: %.2f" % np.sqrt(mean_squared_error(y_train, pred_train_ridge_reg)))
print('R squared - Training set: %.2f' % r2_score(y_train, pred_train_ridge_reg))

In [None]:
# How well it does on the test set
pred_test_ridge_reg = ridge_reg.predict(X_test)
# np.sqrt(MSE) is the *root* mean squared error, so it is labelled as such
print("Root mean squared error - Test set: %.2f" % np.sqrt(mean_squared_error(y_test, pred_test_ridge_reg)))
print('R squared - Test set: %.2f' % r2_score(y_test, pred_test_ridge_reg))

## 6.2. Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso regression (L1 penalty, alpha=0.01): fit once on the training split,
# keep the training predictions for the metrics cell, and show the coefficients.
lasso_reg = Lasso(alpha=0.01)
lasso_reg.fit(X_train, y_train)
pred_train_lasso = lasso_reg.predict(X_train)
# The model is already fitted above; read coef_ directly instead of fitting a second time
print(pd.Series(lasso_reg.coef_, index = X.columns))

In [None]:
# How well did it do on the training set
# np.sqrt(MSE) is the *root* mean squared error, so it is labelled as such
print("Root mean squared error - Training set: %.2f" % np.sqrt(mean_squared_error(y_train, pred_train_lasso)))
print('R squared - Training set: %.2f' % r2_score(y_train, pred_train_lasso))

In [None]:
# How well it does on the test set
pred_test_lasso = lasso_reg.predict(X_test)
# np.sqrt(MSE) is the *root* mean squared error, so it is labelled as such
print("Root mean squared error - Test set: %.2f" % np.sqrt(mean_squared_error(y_test, pred_test_lasso)))
print('R squared - Test set: %.2f' % r2_score(y_test, pred_test_lasso))