In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats

In [2]:
#step up one directory; presumably so the relative 'data/...' path used when loading the CSV resolves - TODO confirm
#NOTE(review): relies on IPython automagic, and re-running this cell moves up one more level each time
cd ..

/Users/alphonsowoodbury/DS/mod2project/flatiron_mod2_project_kch


In [26]:
#importing data
#the path is relative, so it is resolved against the current working directory
df = pd.read_csv(filepath_or_buffer='data/kc_house_data.csv')
#print column names, non-null counts and dtypes of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
id               21597 non-null int64
date             21597 non-null object
price            21597 non-null float64
bedrooms         21597 non-null int64
bathrooms        21597 non-null float64
sqft_living      21597 non-null int64
sqft_lot         21597 non-null int64
floors           21597 non-null float64
waterfront       19221 non-null float64
view             21534 non-null float64
condition        21597 non-null int64
grade            21597 non-null int64
sqft_above       21597 non-null int64
sqft_basement    21597 non-null object
yr_built         21597 non-null int64
yr_renovated     17755 non-null float64
zipcode          21597 non-null int64
lat              21597 non-null float64
long             21597 non-null float64
sqft_living15    21597 non-null int64
sqft_lot15       21597 non-null int64
dtypes: float64(8), int64(11), object(2)
memory usage: 3.5+ MB


In [27]:
#data cleaning:
#changing question marks to 0.0 (anywhere in the frame) and missing 'view' entries to 0
df = df.replace('?', 0.0)
df['view'] = df['view'].fillna(0)

#changing every column except 'date' to float.
#The cast is done with DataFrame.astype and the result is rebound to df.
#Writing the converted values back through df.loc[:, mask] = ... is avoided:
#pandas 2.0+ applies that kind of assignment in place, so the existing
#int64/object columns would keep their original dtypes and the cast would be lost.
df = df.astype({col: 'float' for col in df.columns if col != 'date'})

#changing all 0.0 in sqft_basement, waterfront and yr_renovated columns to NaN values
#(any '?' that was turned into 0.0 in these columns therefore also ends up as NaN)
df['sqft_basement'] = df['sqft_basement'].replace(0.0, np.nan)
df['waterfront'] = df['waterfront'].replace(0.0, np.nan)
df['yr_renovated'] = df['yr_renovated'].replace(0.0, np.nan)

#changing date column to datetime values
df['date'] = pd.to_datetime(df['date'])
#dropping the id column
df = df.drop(columns=['id'])

In [28]:
#Feature Engineering:
#has_waterfront / has_basement / has_renovation are int64 flags:
#1 where the source column holds a value, 0 where it is NaN
df = df.assign(
    has_waterfront=df['waterfront'].notna().astype('int64'),
    has_basement=df['sqft_basement'].notna().astype('int64'),
    has_renovation=df['yr_renovated'].notna().astype('int64'),
)

#eff_built: 2020 minus the renovation year when one is recorded,
#otherwise 2020 minus the build year; stored as int64
df['eff_built'] = (2020 - df['yr_renovated'].fillna(df['yr_built'])).astype('int64')

#correcting data types
##discrete vars as int
#NOTE(review): bathrooms and floors are float64 in the raw file; casting to int64
#discards any fractional part, so confirm that this truncation is intended.
df = df.astype(dict.fromkeys(
    ['bedrooms', 'bathrooms', 'floors', 'zipcode', 'condition', 'grade', 'view'],
    'int64',
))

#categoricals as obj (zipcode goes through int64 first, then becomes object)
df['zipcode'] = df['zipcode'].astype('object')

#drop pre-processed columns
df = df.drop(columns=['date','waterfront','sqft_above','sqft_basement','yr_built','yr_renovated'])


In [29]:
#write the cleaned frame to the current working directory without the index column
df.to_csv(path_or_buf='kc_cleaned.csv', index=False)
#show the cleaned frame as the cell output
df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,condition,grade,zipcode,lat,long,sqft_living15,sqft_lot15,has_waterfront,has_basement,has_renovation,eff_built
0,221900.0,3,1,1180.0,5650.0,1,0,3,7,98178,47.5112,-122.257,1340.0,5650.0,0,0,0,65
1,538000.0,3,2,2570.0,7242.0,2,0,3,7,98125,47.7210,-122.319,1690.0,7639.0,0,1,1,29
2,180000.0,2,1,770.0,10000.0,1,0,3,6,98028,47.7379,-122.233,2720.0,8062.0,0,0,0,87
3,604000.0,4,3,1960.0,5000.0,1,0,5,7,98136,47.5208,-122.393,1360.0,5000.0,0,1,0,55
4,510000.0,3,2,1680.0,8080.0,1,0,3,8,98074,47.6168,-122.045,1800.0,7503.0,0,0,0,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,360000.0,3,2,1530.0,1131.0,3,0,3,8,98103,47.6993,-122.346,1530.0,1509.0,0,0,0,11
21593,400000.0,4,2,2310.0,5813.0,2,0,3,8,98146,47.5107,-122.362,1830.0,7200.0,0,0,0,6
21594,402101.0,2,0,1020.0,1350.0,2,0,3,7,98144,47.5944,-122.299,1020.0,2007.0,0,0,0,11
21595,400000.0,3,2,1600.0,2388.0,2,0,3,8,98027,47.5345,-122.069,1410.0,1287.0,0,0,0,16


In [30]:
#print column names, non-null counts and dtypes of df after cleaning and feature engineering
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 18 columns):
price             21597 non-null float64
bedrooms          21597 non-null int64
bathrooms         21597 non-null int64
sqft_living       21597 non-null float64
sqft_lot          21597 non-null float64
floors            21597 non-null int64
view              21597 non-null int64
condition         21597 non-null int64
grade             21597 non-null int64
zipcode           21597 non-null object
lat               21597 non-null float64
long              21597 non-null float64
sqft_living15     21597 non-null float64
sqft_lot15        21597 non-null float64
has_waterfront    21597 non-null int64
has_basement      21597 non-null int64
has_renovation    21597 non-null int64
eff_built         21597 non-null int64
dtypes: float64(7), int64(10), object(1)
memory usage: 3.0+ MB
