<a id = 'home' ></a>

# House Prices Analysis - Baseline

1. [**Libraries and Analyzing the Data**](#libraries) <a href = '#libraries'></a>
    - No null values
    - No variables with either Single Value or All Unique Values
2. [**Standardization and Dummies of Cat Vars**](#stddum) <a href = '#stddum'></a>
3. [**Model Building**](#model) <a href = '#model'></a>
    - [**A. Linear Regression**](#lg) <a href = '#lg'></a>
    - [**B. Random Forest**](#rf) <a href = '#rf'></a>

## 1. Libraries and Analyzing the Data <a id = 'libraries' ></a>

[Home](#home) <a href = '#home'></a>

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score

from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

In [3]:
house_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/kc_house_data/kc_house_data.csv')
house_data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
test_new = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/kc_house_data/kc_house_new_test_data.csv')
test_new.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,5,3.25,5210,35765,2.5,0,4,5,10,4940,270,1911,0,98136,47.5463,-122.397,2590,10250
1,5,2.0,2800,17788,1.0,0,0,4,8,1400,1400,1963,0,98033,47.6719,-122.163,1760,18282
2,4,2.25,2060,44431,2.0,0,0,3,7,2060,0,1988,0,98077,47.744,-122.046,2160,45657
3,5,2.0,2360,19899,1.0,0,0,4,7,2360,0,1968,0,98010,47.3299,-122.046,1860,19998
4,4,2.5,2370,6557,2.0,0,0,3,9,2370,0,1998,0,98042,47.423,-122.155,2370,7378


#### Checking Data Types

In [5]:
house_data.dtypes

price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [6]:
test_new.dtypes

bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

#### Checking NAs

In [7]:
house_data.isna().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [8]:
test_new.isna().sum()

bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

#### Checking column/s with Single Value or All Unique Values

[Home](#home) <a href = '#home'></a>

In [9]:
for i in house_data.columns:
    if house_data[i].nunique() == 1:
        print('With only 1 unique value: ', i)
    if house_data[i].nunique() == house_data.shape[0]:
        print('With all unique value: ', i)

In [10]:
for i in test_new.columns:
    if test_new[i].nunique() == 1:
        print('With only 1 unique value: ', i)
    if test_new[i].nunique() == test_new.shape[0]:
        print('With all unique value: ', i)

#### Checking column/s to change to Categorical

In [11]:
print('For Train')
d1 = house_data.nunique()
print(sorted(d1))
print("==============================")
print('For Test')
d2 = test_new.nunique()
print(sorted(d2))

For Train
[2, 5, 5, 6, 11, 13, 29, 70, 70, 116, 271, 621, 710, 729, 787, 3298, 4833, 7282, 8064]
For Test
[2, 5, 5, 6, 11, 11, 25, 50, 70, 116, 221, 533, 588, 660, 713, 2866, 3085, 3391]


In [13]:
col_train = house_data.columns
col_test = test_new.columns

In [16]:
l1 = []
for i in col_train:
    if house_data[i].nunique() <= 13:
        l1.append(i)

In [19]:
l2 = []
for i in col_test:
    if test_new[i].nunique() <= 13:
        l2.append(i)

In [20]:
# Checking the columns in train and test are same or not
df = pd.DataFrame(l1, columns = ['train'])
df['test'] = pd.DataFrame(l2)
df

Unnamed: 0,train,test
0,bedrooms,bedrooms
1,floors,floors
2,waterfront,waterfront
3,view,view
4,condition,condition
5,grade,grade


In [21]:
house_data[l1] = house_data[l1].apply(lambda x: x.astype('category'), axis=0)
test_new[l2] = test_new[l2].apply(lambda x: x.astype('category'), axis=0)
print('train dtypes:')
print(house_data[l1].dtypes)
print('======================================')
print('test dtypes:')
print(test_new[l1].dtypes)

train dtypes:
bedrooms      category
floors        category
waterfront    category
view          category
condition     category
grade         category
dtype: object
test dtypes:
bedrooms      category
floors        category
waterfront    category
view          category
condition     category
grade         category
dtype: object


## 2. Standardization and Dummies of Cat Vars <a id = 'stddum' ></a>

[Home](#home) <a href = '#home'></a>

In [24]:
l1

['bedrooms', 'floors', 'waterfront', 'view', 'condition', 'grade']

In [22]:
X_train = house_data.copy().drop('price', axis = 1)
y_train = house_data['price']

In [23]:
X_train_num = len(X_train)
combined_dataset = pd.concat(objs=[X_train, test_new], axis=0)

In [25]:
combined_dataset = pd.get_dummies(combined_dataset, columns=l1, drop_first=True)

In [26]:
import copy
X_train = copy.copy(combined_dataset[:X_train_num])
test = copy.copy(combined_dataset[X_train_num:])

In [27]:
print(X_train.shape)
print(test.shape)

(16613, 49)
(4999, 49)


In [31]:
X_train.columns

Index(['bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15',
       'sqft_lot15', 'bedrooms_1', 'bedrooms_2', 'bedrooms_3', 'bedrooms_4',
       'bedrooms_5', 'bedrooms_6', 'bedrooms_7', 'bedrooms_8', 'bedrooms_9',
       'bedrooms_10', 'bedrooms_11', 'bedrooms_33', 'floors_1.5', 'floors_2.0',
       'floors_2.5', 'floors_3.0', 'floors_3.5', 'waterfront_1', 'view_1',
       'view_2', 'view_3', 'view_4', 'condition_2', 'condition_3',
       'condition_4', 'condition_5', 'grade_3', 'grade_4', 'grade_5',
       'grade_6', 'grade_7', 'grade_8', 'grade_9', 'grade_10', 'grade_11',
       'grade_12', 'grade_13'],
      dtype='object')

In [34]:
num_cols = ['bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement',
            'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15',
            'sqft_lot15']

In [35]:
trainscaled = X_train.copy()

In [40]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(trainscaled[num_cols])

trainscaled[num_cols]=scaler.transform(trainscaled[num_cols])

In [41]:
trainscaled.head()

Unnamed: 0,bathrooms,sqft_living,sqft_lot,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,...,grade_4,grade_5,grade_6,grade_7,grade_8,grade_9,grade_10,grade_11,grade_12,grade_13
0,-1.399765,-0.96622,-0.240449,-0.708559,-0.673747,-0.440202,-0.217128,1.843706,-0.349494,-0.300172,...,0,0,0,1,0,0,0,0,0,0
1,0.247653,0.570691,-0.202502,0.521641,0.213785,-0.583387,4.593975,0.864954,1.152253,-0.747333,...,0,0,0,1,0,0,0,0,0,0
2,-1.399765,-1.419554,-0.136762,-1.218036,-0.673747,-1.227719,-0.217128,-0.926349,1.273223,-0.127078,...,0,0,1,0,0,0,0,0,0,0
3,1.236104,-0.103781,-0.255942,-0.870101,1.345388,-0.082239,-0.217128,1.068091,-0.280778,-1.281041,...,0,0,0,1,0,0,0,0,0,0
4,-0.08183,-0.413375,-0.182527,-0.087246,-0.673747,0.705279,-0.217128,-0.076865,0.40639,1.228829,...,0,0,0,0,1,0,0,0,0,0


In [42]:
scaler.fit(test[num_cols])

test[num_cols]=scaler.transform(test[num_cols])
test.head()

Unnamed: 0,bathrooms,sqft_living,sqft_lot,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,...,grade_4,grade_5,grade_6,grade_7,grade_8,grade_9,grade_10,grade_11,grade_12,grade_13
0,1.228241,3.175358,0.575607,3.399999,0.045592,-2.361221,-0.184746,1.146458,-0.102774,-1.273226,...,0,0,0,0,0,0,1,0,0,0
1,-0.371045,0.660861,0.120795,-0.579346,2.788997,-0.663804,-0.184746,-0.861617,0.829635,0.310995,...,0,0,0,0,1,0,0,0,0,0
2,-0.051187,-0.111225,0.794853,0.162566,-0.609912,0.152263,-0.184746,-0.003799,1.364879,1.103105,...,0,0,0,1,0,0,0,0,0,0
3,-0.371045,0.201783,0.174203,0.499799,-0.609912,-0.50059,-0.184746,-1.310022,-1.709249,1.103105,...,0,0,0,1,0,0,0,0,0,0
4,0.26867,0.212216,-0.163345,0.51104,-0.609912,0.478689,-0.184746,-0.686154,-1.018108,0.365156,...,0,0,0,0,0,1,0,0,0,0


## 3. Model Building <a id = 'model' ></a>

[Home](#home) <a href = '#home'></a>

In [43]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state = 45)

In [44]:
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

### A. Linear Regression <a id = 'lg' ></a>

[Home](#home) <a href = '#home'></a>

In [45]:
# Model initialization
LinReg = LinearRegression()
LinReg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [46]:
# Predict
y_train_predicted = LinReg.predict(X_train)

In [48]:
# model evaluation
mse = mean_squared_error(y_train, y_train_predicted)
r2 = r2_score(y_train, y_train_predicted)
rmse = math.sqrt(mse)
print('R-squared: ', r2)
print('MSE: ', mse)
print('RMSE: ', rmse)

R-squared:  0.7393823329420784
MSE:  34768159797.514
RMSE:  186462.22083176527


In [51]:
y_val_predicted = LinReg.predict(X_val)

In [52]:
# model evaluation
mse = mean_squared_error(y_val, y_val_predicted)
r2 = r2_score(y_val, y_val_predicted)
rmse = math.sqrt(mse)
print('R-squared: ', r2)
print('MSE: ', mse)
print('RMSE: ', rmse)

R-squared:  0.7050724393836697
MSE:  40249189031.27441
RMSE:  200622.00535154264


In [53]:
test['price'] = LinReg.predict(test)
test.head()

Unnamed: 0,bathrooms,sqft_living,sqft_lot,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,...,grade_5,grade_6,grade_7,grade_8,grade_9,grade_10,grade_11,grade_12,grade_13,price
0,1.228241,3.175358,0.575607,3.399999,0.045592,-2.361221,-0.184746,1.146458,-0.102774,-1.273226,...,0,0,0,0,0,1,0,0,0,10767720.0
1,-0.371045,0.660861,0.120795,-0.579346,2.788997,-0.663804,-0.184746,-0.861617,0.829635,0.310995,...,0,0,0,1,0,0,0,0,0,10239580.0
2,-0.051187,-0.111225,0.794853,0.162566,-0.609912,0.152263,-0.184746,-0.003799,1.364879,1.103105,...,0,0,1,0,0,0,0,0,0,10346690.0
3,-0.371045,0.201783,0.174203,0.499799,-0.609912,-0.50059,-0.184746,-1.310022,-1.709249,1.103105,...,0,0,1,0,0,0,0,0,0,8541667.0
4,0.26867,0.212216,-0.163345,0.51104,-0.609912,0.478689,-0.184746,-0.686154,-1.018108,0.365156,...,0,0,0,0,1,0,0,0,0,9261431.0


In [54]:
submission = test[['price']]
submission.head()

Unnamed: 0,price
0,10767720.0
1,10239580.0
2,10346690.0
3,8541667.0
4,9261431.0


In [55]:
submission.to_csv("submission_linreg.csv",index=False)

### B. Random Forest <a id = 'rf' ></a>

[Home](#home) <a href = '#home'></a>

In [57]:
from sklearn.ensemble import RandomForestRegressor
rf1 = RandomForestRegressor()
rf1.fit(X = X_train,y = y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [59]:
y_train_pred_rf = rf1.predict(X_train)

In [60]:
mse = mean_squared_error(y_train, y_train_pred_rf)
r2 = r2_score(y_train, y_train_pred_rf)
rmse = math.sqrt(mse)
print('R-squared: ', r2)
print('MSE: ', mse)
print('RMSE: ', rmse)

R-squared:  0.9685749995894111
MSE:  4192307636.8781724
RMSE:  64748.03191509509


In [61]:
y_val_pred_rf = rf1.predict(X_val)

In [62]:
mse = mean_squared_error(y_val, y_val_pred_rf)
r2 = r2_score(y_val, y_val_pred_rf)
rmse = math.sqrt(mse)
print('R-squared: ', r2)
print('MSE: ', mse)
print('RMSE: ', rmse)

R-squared:  0.8469421050296304
MSE:  20888031401.73616
RMSE:  144526.9227574439


In [64]:
test = test.drop('price', axis=1) # as 'price' it was added while predicting for Lin Reg
test['price'] = rf1.predict(test)
test.head()

Unnamed: 0,bathrooms,sqft_living,sqft_lot,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,...,grade_5,grade_6,grade_7,grade_8,grade_9,grade_10,grade_11,grade_12,grade_13,price
0,1.228241,3.175358,0.575607,3.399999,0.045592,-2.361221,-0.184746,1.146458,-0.102774,-1.273226,...,0,0,0,0,0,1,0,0,0,308650.0
1,-0.371045,0.660861,0.120795,-0.579346,2.788997,-0.663804,-0.184746,-0.861617,0.829635,0.310995,...,0,0,0,1,0,0,0,0,0,305300.0
2,-0.051187,-0.111225,0.794853,0.162566,-0.609912,0.152263,-0.184746,-0.003799,1.364879,1.103105,...,0,0,1,0,0,0,0,0,0,310200.0
3,-0.371045,0.201783,0.174203,0.499799,-0.609912,-0.50059,-0.184746,-1.310022,-1.709249,1.103105,...,0,0,1,0,0,0,0,0,0,305300.0
4,0.26867,0.212216,-0.163345,0.51104,-0.609912,0.478689,-0.184746,-0.686154,-1.018108,0.365156,...,0,0,0,0,1,0,0,0,0,310200.0


In [65]:
submission_rf = test[['price']]
submission_rf.head()

Unnamed: 0,price
0,308650.0
1,305300.0
2,310200.0
3,305300.0
4,310200.0


In [66]:
submission_rf.to_csv("submission_rf.csv",index=False)