In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the raw housing data; house_df itself is never modified below.
house_df = pd.read_csv('data/kc_house_data.csv')

# Columns left out of the working frame.
to_drop = [
    'date', 'view', 'sqft_above', 'sqft_basement', 'yr_renovated',
    'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'id',
]

# drop() returns a new frame, so the working copy is independent of house_df.
house_copy = house_df.drop(columns=to_drop)
house_copy

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,yr_built
0,221900.0,3,1.00,1180,5650,1.0,,Average,7 Average,1955
1,538000.0,3,2.25,2570,7242,2.0,NO,Average,7 Average,1951
2,180000.0,2,1.00,770,10000,1.0,NO,Average,6 Low Average,1933
3,604000.0,4,3.00,1960,5000,1.0,NO,Very Good,7 Average,1965
4,510000.0,3,2.00,1680,8080,1.0,NO,Average,8 Good,1987
...,...,...,...,...,...,...,...,...,...,...
21592,360000.0,3,2.50,1530,1131,3.0,NO,Average,8 Good,2009
21593,400000.0,4,2.50,2310,5813,2.0,NO,Average,8 Good,2014
21594,402101.0,2,0.75,1020,1350,2.0,NO,Average,7 Average,2009
21595,400000.0,3,2.50,1600,2388,2.0,,Average,8 Good,2004


## Checking for missing values in the data

In [3]:
# Count the NaN entries in each column to see which features need cleaning.
house_copy.isna().sum(axis="index")

price             0
bedrooms          0
bathrooms         0
sqft_living       0
sqft_lot          0
floors            0
waterfront     2376
condition         0
grade             0
yr_built          0
dtype: int64

Checking the data type of `waterfront` to confirm that it is categorical

In [4]:
# Print the dtype and non-null count of the only column with missing values.
house_copy.loc[:, 'waterfront'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 21597 entries, 0 to 21596
Series name: waterfront
Non-Null Count  Dtype 
--------------  ----- 
19221 non-null  object
dtypes: object(1)
memory usage: 168.9+ KB


In [5]:
# Frequency of each waterfront label; NaN entries are excluded from the tally.
house_copy['waterfront'].value_counts(dropna=True)

waterfront
NO     19075
YES      146
Name: count, dtype: int64

Dropping the rows where `waterfront` is missing, because the column is categorical

In [7]:
# Keep only the rows that have a waterfront label. The result is rebound to
# the same name rather than mutated in place; rows are the default axis.
house_copy = house_copy.dropna(subset=['waterfront'])
house_copy

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,yr_built
1,538000.0,3,2.25,2570,7242,2.0,NO,Average,7 Average,1951
2,180000.0,2,1.00,770,10000,1.0,NO,Average,6 Low Average,1933
3,604000.0,4,3.00,1960,5000,1.0,NO,Very Good,7 Average,1965
4,510000.0,3,2.00,1680,8080,1.0,NO,Average,8 Good,1987
5,1230000.0,4,4.50,5420,101930,1.0,NO,Average,11 Excellent,2001
...,...,...,...,...,...,...,...,...,...,...
21591,475000.0,3,2.50,1310,1294,2.0,NO,Average,8 Good,2008
21592,360000.0,3,2.50,1530,1131,3.0,NO,Average,8 Good,2009
21593,400000.0,4,2.50,2310,5813,2.0,NO,Average,8 Good,2014
21594,402101.0,2,0.75,1020,1350,2.0,NO,Average,7 Average,2009


Checking if all missing values have been removed

In [8]:
# Fail loudly if any NaN survived the cleaning step, instead of relying on a
# visual scan of the per-column counts displayed below.
assert not house_copy.isna().any().any(), "house_copy still contains missing values"

# Per-column NaN counts (all expected to be zero at this point).
house_copy.isna().sum()

price          0
bedrooms       0
bathrooms      0
sqft_living    0
sqft_lot       0
floors         0
waterfront     0
condition      0
grade          0
yr_built       0
dtype: int64

Creating dummy variables to one-hot encode the categorical columns

In [9]:
# Categorical features to expand into 0/1 indicator columns.
dummy_list = ['waterfront', 'condition', 'grade']

# drop_first=True omits the first level of every feature, so each feature
# yields k-1 indicator columns; dtype=int stores them as integers.
dummies = pd.get_dummies(house_copy[dummy_list], drop_first=True, dtype=int)

# Replace the original categorical columns with their indicator columns.
house_copy = pd.concat(
    [house_copy.drop(columns=dummy_list), dummies],
    axis=1,
)
house_copy

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,yr_built,waterfront_YES,condition_Fair,condition_Good,...,grade_11 Excellent,grade_12 Luxury,grade_13 Mansion,grade_3 Poor,grade_4 Low,grade_5 Fair,grade_6 Low Average,grade_7 Average,grade_8 Good,grade_9 Better
1,538000.0,3,2.25,2570,7242,2.0,1951,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,180000.0,2,1.00,770,10000,1.0,1933,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,604000.0,4,3.00,1960,5000,1.0,1965,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,510000.0,3,2.00,1680,8080,1.0,1987,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,1230000.0,4,4.50,5420,101930,1.0,2001,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21591,475000.0,3,2.50,1310,1294,2.0,2008,0,0,0,...,0,0,0,0,0,0,0,0,1,0
21592,360000.0,3,2.50,1530,1131,3.0,2009,0,0,0,...,0,0,0,0,0,0,0,0,1,0
21593,400000.0,4,2.50,2310,5813,2.0,2014,0,0,0,...,0,0,0,0,0,0,0,0,1,0
21594,402101.0,2,0.75,1020,1350,2.0,2009,0,0,0,...,0,0,0,0,0,0,0,1,0,0
