# ------ House Price Prediction ------

In [1]:
#Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pd.pandas.set_option('display.max_columns', None)

In [2]:
#Importing datasets
data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

I'll perform the following tasks
- Data Analysis
- Feature Engineering
- Feature Selection
- Model Building

***
## *** DATA ANALYSIS ***
***

In [3]:
#Missing values
features_nan = [ feature for feature in data.columns if data[feature].isnull().sum() > 1 ]
features_nan_test = [ feature for feature in test.columns if test[feature].isnull().sum() > 1 ]


#percentage of nan values
for feature in features_nan:
    print(feature, np.round(data[feature].isnull().mean(), 4), '% missing values')

LotFrontage 0.1774 % missing values
Alley 0.9377 % missing values
MasVnrType 0.0055 % missing values
MasVnrArea 0.0055 % missing values
BsmtQual 0.0253 % missing values
BsmtCond 0.0253 % missing values
BsmtExposure 0.026 % missing values
BsmtFinType1 0.0253 % missing values
BsmtFinType2 0.026 % missing values
FireplaceQu 0.4726 % missing values
GarageType 0.0555 % missing values
GarageYrBlt 0.0555 % missing values
GarageFinish 0.0555 % missing values
GarageQual 0.0555 % missing values
GarageCond 0.0555 % missing values
PoolQC 0.9952 % missing values
Fence 0.8075 % missing values
MiscFeature 0.963 % missing values


In [4]:
#List of Numerical features
numerical_features = [feature for feature in data.columns if data[feature].dtypes != 'O']
print('Total numerical features : ', len(numerical_features))
data[numerical_features].head()

Total numerical features :  38


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,0,150,856,856,854,0,1710,1,0,2,1,3,1,8,0,2003.0,2,548,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,0,284,1262,1262,0,0,1262,0,1,2,0,3,1,6,1,1976.0,2,460,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,0,434,920,920,866,0,1786,1,0,2,1,3,1,6,1,2001.0,2,608,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,0,540,756,961,756,0,1717,1,0,1,0,3,1,7,1,1998.0,3,642,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,0,490,1145,1145,1053,0,2198,1,0,2,1,4,1,9,1,2000.0,3,836,192,84,0,0,0,0,0,12,2008,250000


In [5]:
# list of Tmporal features
year_feature = [feature for feature in numerical_features if 'Yr' in feature or 'Year' in feature]
year_feature

['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']

In [6]:
#List of variables that contain Discrete values
discrete_features=[feature for feature in numerical_features if data[feature].nunique()<25 and feature not in year_feature+['Id']]
print("Total discrete variables : ", len(discrete_features))
data[discrete_features].head()

Total discrete variables :  17


Unnamed: 0,MSSubClass,OverallQual,OverallCond,LowQualFinSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,3SsnPorch,PoolArea,MiscVal,MoSold
0,60,7,5,0,1,0,2,1,3,1,8,0,2,0,0,0,2
1,20,6,8,0,0,1,2,0,3,1,6,1,2,0,0,0,5
2,60,7,5,0,1,0,2,1,3,1,6,1,2,0,0,0,9
3,70,7,5,0,1,0,1,0,3,1,7,1,3,0,0,0,2
4,60,8,5,0,1,0,2,1,4,1,9,1,3,0,0,0,12


In [7]:
#List of Continuous features
cont_features=[feature for feature in numerical_features if data[feature].nunique()>25 and feature not in year_feature+['Id']]
print("Total continuous variables : ", len(cont_features))
data[cont_features].head()

Total continuous variables :  16


Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,SalePrice
0,65.0,8450,196.0,706,0,150,856,856,854,1710,548,0,61,0,0,208500
1,80.0,9600,0.0,978,0,284,1262,1262,0,1262,460,298,0,0,0,181500
2,68.0,11250,162.0,486,0,434,920,920,866,1786,608,0,42,0,0,223500
3,60.0,9550,0.0,216,0,540,756,961,756,1717,642,0,35,272,0,140000
4,84.0,14260,350.0,655,0,490,1145,1145,1053,2198,836,192,84,0,0,250000


In [8]:
#List of Categorical features
catg_features = [feature for feature in data.columns if data[feature].dtypes == 'O']
print("Total categorical variables : ", len(cont_features))
data[catg_features].head()

Total categorical variables :  16


Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,Gable,CompShg,MetalSd,MetalSd,,TA,TA,CBlock,Gd,TA,Gd,ALQ,Unf,GasA,Ex,Y,SBrkr,TA,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Mn,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,Gable,CompShg,Wd Sdng,Wd Shng,,TA,TA,BrkTil,TA,Gd,No,ALQ,Unf,GasA,Gd,Y,SBrkr,Gd,Typ,Gd,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal


***
## *** FEATURE ENGINEERING ***
***

### **`Handling Missing Values`**

In [9]:
#Replacing NaN values in CATEGORICAL features with new label 'Missing'
catg_features_nan = [feature for feature in data.columns if data[feature].isnull().sum()>1 and data[feature].dtypes == 'O']
catg_features_nan_test = [feature for feature in test.columns if test[feature].isnull().sum()>1 and test[feature].dtypes == 'O']

def replace_nan(dataset, feature):
    dataset[feature] = dataset[feature].fillna('Missing')
    
replace_nan(data, catg_features_nan)
replace_nan(test, catg_features_nan_test)

In [10]:
#Replacing NaN values in NUMERICAL features with median
num_features_nan = [feature for feature in data.columns if data[feature].isnull().sum()>1 and data[feature].dtypes != 'O']
num_features_nan_test = [feature for feature in test.columns if test[feature].isnull().sum()>1 and test[feature].dtypes != 'O']

#data
for feature in num_features_nan:
    median = data[feature].median()
    data[feature+'_nan'] = np.where(data[feature].isnull(), 1, 0)
    data[feature].fillna(median, inplace = True)
    
#test
for feature in num_features_nan_test:
    median = test[feature].median()
    test[feature+'_nan'] = np.where(test[feature].isnull(), 1, 0)
    test[feature].fillna(median, inplace = True)

### **`Temporal features`**

In [11]:
#Converting the 'year' value to 'total number of years'

for feature in ['YearBuilt','YearRemodAdd','GarageYrBlt']:
    data[feature]=data['YrSold']-data[feature]
    test[feature]=test['YrSold']-test[feature]

### **`Log transformation of skewed continuous numerical features`**

In [12]:
for feature in ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']:
    data[feature] = np.log(data[feature])
    
for feature in ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']:
    test[feature] = np.log(test[feature])

### **`Handling Rare categories in categorical features`**

In [13]:
for feature in catg_features:
    temp=data.groupby(feature)['SalePrice'].count()/len(data)
    temp_df=temp[temp>0.01].index
    data[feature]=np.where(data[feature].isin(temp_df),data[feature],'rare_variables')
    test[feature]=np.where(test[feature].isin(temp_df),test[feature],'rare_variables')

### **`Encoding categorical features`**

In [14]:
for feature in catg_features:
    labels_ordered=data.groupby([feature])['SalePrice'].mean().sort_values().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    data[feature]=data[feature].map(labels_ordered)
    test[feature]=test[feature].map(labels_ordered)

***
## *** FEATURE SCALING ***
***

In [15]:
features_to_scale=[feature for feature in data.columns if feature not in ['Id','SalePrice']]

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(data[features_to_scale])
scaler.fit(test[features_to_scale])

MinMaxScaler()

In [16]:
data = pd.concat([data[['Id', 'SalePrice']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(data[features_to_scale]), columns=features_to_scale)],
                    axis=1)
test = pd.concat([test[['Id']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(test[features_to_scale]), columns=features_to_scale)],
                    axis=1)

***
## *** Model building and training ***
***

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

X = data.drop(['Id', 'SalePrice'], axis = 1)
y = data['SalePrice']

for feature in test.columns:
    test[feature].fillna(test[feature].mean(), inplace = True)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = RandomForestRegressor()
clf.fit(X_train, y_train)

RandomForestRegressor()

In [19]:
#Prediction
y_pred = clf.predict(X_test)

accuracy =  r2_score(y_pred, y_test)
accuracy

0.8273459126308822

In [20]:
#Submission
test_pred = clf.predict(test.drop('Id', axis = 1))
submission = pd.DataFrame({'Id': test.Id, 'SalePrice': test_pred})
submission.to_csv('submission.csv', index=False)