In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the house sales data from the CSV file (path is relative to the working directory)
df = pd.read_csv('kc_house_data.csv')

# Column Names and descriptions for Kings County Data Set
* **id** - unique identifier for a house
* **date** - Date house was sold
* **price** - Price is the prediction target
* **bedrooms** - Number of Bedrooms/House
* **bathrooms** - Number of bathrooms/bedrooms
* **sqft_living** - square footage of the home
* **sqft_lot** - square footage of the lot
* **floors** - Total floors (levels) in house
* **waterfront** - House which has a view to a waterfront
* **view** - Has been viewed
* **condition** - How good the condition is ( Overall )
* **grade** - overall grade given to the housing unit, based on King County grading system
* **sqft_above** - square footage of house apart from basement
* **sqft_basement** - square footage of the basement
* **yr_built** - Built Year
* **yr_renovated** - Year when house was renovated
* **zipcode** - zip
* **lat** - Latitude coordinate
* **long** - Longitude coordinate
* **sqft_living15** - The square footage of interior housing living space for the nearest 15 neighbors
* **sqft_lot15** - The square footage of the land lots of the nearest 15 neighbors


# Initial assumptions
* **id** - drop
* **date** - house prices could be seasonal, keep for now
* **price** - this is the **target**
* **bedrooms** - check for outliers, possibly make categorical
* **bathrooms** - check for outliers, possibly make categorical
* **sqft_living** - numerical, check stats
* **sqft_lot** - numerical, check stats
* **floors** - check for outliers, possibly make categorical, does this matter?
* **waterfront** - probably high value, but few houses
* **view** - Has been viewed, does this matter
* **condition** - Categorical
* **grade** - Categorical
* **sqft_above** - closely related to sqft_living space
* **sqft_basement** - square footage of the basement
* **yr_built** - may bin these years, new houses are desirable, so are established neighborhoods
* **yr_renovated** - new column for recent or not
* **zipcode** - zip - location, probably important
* **lat** - location - probably important
* **long** - location - probably important
* **sqft_living15** - if this can identify housing comps it is very relevant, else drop
* **sqft_lot15** - if this can identify housing comps it is very relevant, else drop


In [3]:
df.shape
# (rows, columns): 21,597 rows and 21 columns

(21597, 21)

In [4]:
# Quick preview of the first rows. Some NaN values are already visible
# (e.g. in waterfront and yr_renovated).
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


In [5]:
# Check the last rows of the data too.
df.tail()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
21592,263000018,5/21/2014,360000.0,3,2.5,1530,1131,3.0,0.0,0.0,...,8,1530,0.0,2009,0.0,98103,47.6993,-122.346,1530,1509
21593,6600060120,2/23/2015,400000.0,4,2.5,2310,5813,2.0,0.0,0.0,...,8,2310,0.0,2014,0.0,98146,47.5107,-122.362,1830,7200
21594,1523300141,6/23/2014,402101.0,2,0.75,1020,1350,2.0,0.0,0.0,...,7,1020,0.0,2009,0.0,98144,47.5944,-122.299,1020,2007
21595,291310100,1/16/2015,400000.0,3,2.5,1600,2388,2.0,,0.0,...,8,1600,0.0,2004,0.0,98027,47.5345,-122.069,1410,1287
21596,1523300157,10/15/2014,325000.0,2,0.75,1020,1076,2.0,0.0,0.0,...,7,1020,0.0,2008,0.0,98144,47.5941,-122.299,1020,1357


In [6]:
df.info()
# Check the basic info of the data. Mainly numeric columns.
# date is stored as an object and needs to be converted if we keep it.
# Why is sqft_basement an object rather than a numeric column?

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
id               21597 non-null int64
date             21597 non-null object
price            21597 non-null float64
bedrooms         21597 non-null int64
bathrooms        21597 non-null float64
sqft_living      21597 non-null int64
sqft_lot         21597 non-null int64
floors           21597 non-null float64
waterfront       19221 non-null float64
view             21534 non-null float64
condition        21597 non-null int64
grade            21597 non-null int64
sqft_above       21597 non-null int64
sqft_basement    21597 non-null object
yr_built         21597 non-null int64
yr_renovated     17755 non-null float64
zipcode          21597 non-null int64
lat              21597 non-null float64
long             21597 non-null float64
sqft_living15    21597 non-null int64
sqft_lot15       21597 non-null int64
dtypes: float64(8), int64(11), object(2)
memory usage: 3.5+ MB


In [7]:
df.describe()
# Check the summary statistics. Things that jump out immediately:
# bedrooms max is 33;
# sqft_lot std > mean, and the max is very high;
# floors 1-3.5; waterfront 0-1; view 0-4; condition 1-5; grade 3-13; yr_built 1900-2015; yr_renovated - not sure
# zipcode, lat, long look as expected

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,19221.0,21534.0,21597.0,21597.0,21597.0,21597.0,17755.0,21597.0,21597.0,21597.0,21597.0,21597.0
mean,4580474000.0,540296.6,3.3732,2.115826,2080.32185,15099.41,1.494096,0.007596,0.233863,3.409825,7.657915,1788.596842,1970.999676,83.636778,98077.951845,47.560093,-122.213982,1986.620318,12758.283512
std,2876736000.0,367368.1,0.926299,0.768984,918.106125,41412.64,0.539683,0.086825,0.765686,0.650546,1.1732,827.759761,29.375234,399.946414,53.513072,0.138552,0.140724,685.230472,27274.44195
min,1000102.0,78000.0,1.0,0.5,370.0,520.0,1.0,0.0,0.0,1.0,3.0,370.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,322000.0,3.0,1.75,1430.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,1951.0,0.0,98033.0,47.4711,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,1975.0,0.0,98065.0,47.5718,-122.231,1840.0,7620.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10685.0,2.0,0.0,0.0,4.0,8.0,2210.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,2015.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0


In [8]:
# Count missing values per column. NaNs show up in waterfront, view and yr_renovated.
df.isna().sum()

id                  0
date                0
price               0
bedrooms            0
bathrooms           0
sqft_living         0
sqft_lot            0
floors              0
waterfront       2376
view               63
condition           0
grade               0
sqft_above          0
sqft_basement       0
yr_built            0
yr_renovated     3842
zipcode             0
lat                 0
long                0
sqft_living15       0
sqft_lot15          0
dtype: int64

In [9]:
# Number of distinct values in the bathrooms column
print(df['bathrooms'].nunique())

29


In [10]:
# Report the number of distinct (non-null) values in every column.
for col, n_unique in df.nunique().items():
    print(col, n_unique)

id 21420
date 372
price 3622
bedrooms 12
bathrooms 29
sqft_living 1034
sqft_lot 9776
floors 6
waterfront 2
view 5
condition 5
grade 11
sqft_above 942
sqft_basement 304
yr_built 116
yr_renovated 70
zipcode 70
lat 5033
long 751
sqft_living15 777
sqft_lot15 8682


In [11]:
# practice code (not relevant for project):
# collect the distinct-value count of every column in a plain list
uniq_col = [df[col].nunique() for col in df]
uniq_col

[21420,
 372,
 3622,
 12,
 29,
 1034,
 9776,
 6,
 2,
 5,
 5,
 11,
 942,
 304,
 116,
 70,
 70,
 5033,
 751,
 777,
 8682]

In [12]:
# Investigate the columns that do not have a high number of unique values.
# First build the list of their names (fewer than 100 distinct values);
# the value counts are printed in the next cell.
low_uniq_col = [df[col].name for col in df if df[col].nunique() < 100]
low_uniq_col

['bedrooms',
 'bathrooms',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'yr_renovated',
 'zipcode']

In [13]:
# Value counts for each low-cardinality column, followed by the total number
# of counted (non-null) rows in that column.
for col in df:
    if df[col].name not in low_uniq_col:
        continue
    counts = df[col].value_counts()
    print('\n', col, '\n', counts, counts.sum())



 bedrooms 
 3     9824
4     6882
2     2760
5     1601
6      272
1      196
7       38
8       13
9        6
10       3
11       1
33       1
Name: bedrooms, dtype: int64 21597

 bathrooms 
 2.50    5377
1.00    3851
1.75    3048
2.25    2047
2.00    1930
1.50    1445
2.75    1185
3.00     753
3.50     731
3.25     589
3.75     155
4.00     136
4.50     100
4.25      79
0.75      71
4.75      23
5.00      21
5.25      13
5.50      10
1.25       9
6.00       6
5.75       4
0.50       4
8.00       2
6.25       2
6.75       2
6.50       2
7.50       1
7.75       1
Name: bathrooms, dtype: int64 21597

 floors 
 1.0    10673
2.0     8235
1.5     1910
3.0      611
2.5      161
3.5        7
Name: floors, dtype: int64 21597

 waterfront 
 0.0    19075
1.0      146
Name: waterfront, dtype: int64 19221

 view 
 0.0    19422
2.0      957
3.0      508
1.0      330
4.0      317
Name: view, dtype: int64 21534

 condition 
 3    14020
4     5677
5     1701
2      170
1       29
Name: condition, dt

In [14]:
# test code in next 3 cells. remove later
# Flag rows whose waterfront value is missing: 1 = missing, 0 = present.
df['water_null'] = df['waterfront'].isna().astype(int)

In [15]:
# Number of rows flagged as having a missing waterfront value
df['water_null'].sum()

2376

In [16]:
# Summary statistics of the missing-waterfront flag; the mean is the share of flagged rows
df['water_null'].describe()

count    21597.000000
mean         0.110015
std          0.312916
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: water_null, dtype: float64

In [17]:
# Keep the sale price for waterfront houses; every other row
# (waterfront 0 or missing) gets 0.
df['water_1'] = df['price'].where(df['waterfront'] > 0, 0)

In [18]:
# Summary statistics of the waterfront-only price column (0 for all other rows)
df['water_1'].describe()

count    2.159700e+04
mean     1.160871e+04
std      1.691426e+05
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      7.060000e+06
Name: water_1, dtype: float64

**investigate sqft_basement** - why is this column an object?
First, I'll look at unique values. Then I noticed that we have total living space and above living space. We could potentially calculate the missing basement values using those features

In [19]:
df.sqft_basement.unique()
# There's the problem: the values are strings, and the 5th item is a '?' placeholder

array(['0.0', '400.0', '910.0', '1530.0', '?', '730.0', '1700.0', '300.0',
       '970.0', '760.0', '720.0', '700.0', '820.0', '780.0', '790.0',
       '330.0', '1620.0', '360.0', '588.0', '1510.0', '410.0', '990.0',
       '600.0', '560.0', '550.0', '1000.0', '1600.0', '500.0', '1040.0',
       '880.0', '1010.0', '240.0', '265.0', '290.0', '800.0', '540.0',
       '710.0', '840.0', '380.0', '770.0', '480.0', '570.0', '1490.0',
       '620.0', '1250.0', '1270.0', '120.0', '650.0', '180.0', '1130.0',
       '450.0', '1640.0', '1460.0', '1020.0', '1030.0', '750.0', '640.0',
       '1070.0', '490.0', '1310.0', '630.0', '2000.0', '390.0', '430.0',
       '850.0', '210.0', '1430.0', '1950.0', '440.0', '220.0', '1160.0',
       '860.0', '580.0', '2060.0', '1820.0', '1180.0', '200.0', '1150.0',
       '1200.0', '680.0', '530.0', '1450.0', '1170.0', '1080.0', '960.0',
       '280.0', '870.0', '1100.0', '460.0', '1400.0', '660.0', '1220.0',
       '900.0', '420.0', '1580.0', '1380.0', '475.0', 

In [20]:
# Another view, straight from the column: the first 8 entries
df['sqft_basement'].head(8)

0       0.0
1     400.0
2       0.0
3     910.0
4       0.0
5    1530.0
6         ?
7       0.0
Name: sqft_basement, dtype: object

In [21]:
# Test code (not required for the project). I'm interested in discovering an
# easier way to identify rogue data entries that force a numeric column into
# an object column. str.isnumeric() rejects anything containing a decimal
# point, so real numeric values such as '15.0' are harder to separate.
# May need to use str.replace.

s1 = pd.Series(
    ['one', 'x', '1', '15.0', '2.4'],
)
s1.str.isnumeric()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [22]:
# New feature: basement area derived as total living area minus above-ground area
df['basement2'] = df['sqft_living'].sub(df['sqft_above'])

In [23]:
# A second new feature: the sqft_basement values with the '?' placeholder
# replaced by 0. The original column is kept for now to use as a filter.
df['basement3'] = df['sqft_basement'].where(df['sqft_basement'] != '?', 0)

In [24]:
# Preview the frame again to see the newly added columns
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,water_null,water_1,basement2,basement3
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,0.0,98178,47.5112,-122.257,1340,5650,1,0.0,0,0.0
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,1991.0,98125,47.721,-122.319,1690,7639,0,0.0,400,400.0
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,,98028,47.7379,-122.233,2720,8062,0,0.0,0,0.0
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,0.0,98136,47.5208,-122.393,1360,5000,0,0.0,910,910.0
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,0.0,98074,47.6168,-122.045,1800,7503,0,0.0,0,0.0


In [25]:
# basement3 was created as an object column, so parse it as numbers and
# store it as integers. Then check the stats: mainly zeros.
df['basement3'] = pd.to_numeric(df['basement3']).astype(int)
df['basement3'].describe()

count    21597.000000
mean       285.716581
std        439.819830
min          0.000000
25%          0.000000
50%          0.000000
75%        550.000000
max       4820.000000
Name: basement3, dtype: float64

In [26]:
# Now the test: ignoring the rows where sqft_basement is '?', does the
# calculated feature match the original? Store the per-row difference
# (0 for the '?' rows).
df['test'] = (df['basement2'] - df['basement3']).where(df['sqft_basement'] != '?', 0)

In [27]:
# Count the rows where the calculated and original basement values differ.
# Counting non-zero rows is used instead of summing the differences, because
# positive and negative differences could cancel out in a plain sum.
# The count should be zero. It is!
(df['test'] != 0).sum()

0

In [28]:
# Can now drop the original, and placeholder columns
# NOTE(review): the drop below is disabled and names "Unnamed: 0", which is not
# one of the columns listed by df.info(). The columns meant here are presumably
# sqft_basement and the helper columns - confirm before enabling.
# df.drop("Unnamed: 0", axis=1, inplace=True)

**Run some visuals** 

Which graphs?

**Clean up Nan's** 

Waterfront - split database

Views - safe to replace

yr_renovated - safe to replace

drop 33 bedrooms

**Create split databases for Waterfront Nan and 0** 

First thought is to convert waterfront Nans to 0. Before we do that, let's check the 5 point stats

Will create split dataframe where waterfront = 0, waterfront = null, and waterfront = 1


In [29]:
# Rows where waterfront is missing, selected through the water_null flag column
df_water_null = df.loc[df["water_null"].eq(1)]
print(df_water_null.shape)
df_water_null.describe()

(2376, 26)


Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,zipcode,lat,long,sqft_living15,sqft_lot15,water_null,water_1,basement2,basement3,test
count,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,0.0,2370.0,2376.0,...,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0,2376.0
mean,4484801000.0,529430.1,3.361532,2.105745,2063.34133,15306.89,1.486532,,0.25654,3.40867,...,98079.694865,47.561465,-122.213497,1980.507576,12366.675084,1.0,0.0,297.146465,292.798822,0.0
std,2873451000.0,325079.4,0.908079,0.764266,881.380488,45954.95,0.535898,,0.783974,0.652826,...,53.949839,0.137431,0.14072,689.428551,24694.192,0.0,0.0,440.511772,438.564289,0.0
min,1000102.0,80000.0,1.0,0.75,430.0,600.0,1.0,,0.0,1.0,...,98001.0,47.1808,-122.503,780.0,788.0,1.0,0.0,0.0,0.0,0.0
25%,2112250000.0,322875.0,3.0,1.5,1430.0,5031.75,1.0,,0.0,3.0,...,98033.75,47.472925,-122.327,1470.0,5108.0,1.0,0.0,0.0,0.0,0.0
50%,3789200000.0,445000.0,3.0,2.25,1900.0,7607.0,1.5,,0.0,3.0,...,98070.0,47.5714,-122.228,1820.0,7624.5,1.0,0.0,0.0,0.0,0.0
75%,7227501000.0,649612.5,4.0,2.5,2520.0,10524.75,2.0,,0.0,4.0,...,98118.0,47.678,-122.12575,2370.0,10001.25,1.0,0.0,600.0,590.0,0.0
max,9839301000.0,3200000.0,8.0,6.5,7730.0,1074218.0,3.5,,4.0,5.0,...,98199.0,47.7774,-121.319,4940.0,311610.0,1.0,0.0,2160.0,2160.0,0.0


In [30]:
# Separate dataframes for houses with waterfront == 0 and waterfront == 1
df_water_no = df.loc[df["waterfront"].eq(0)]
df_water_yes = df.loc[df["waterfront"].eq(1)]

In [31]:
# Size and summary statistics of the waterfront == 0 subset
print(df_water_no.shape)
df_water_no.describe()

(19075, 26)


Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,zipcode,lat,long,sqft_living15,sqft_lot15,water_null,water_1,basement2,basement3,test
count,19075.0,19075.0,19075.0,19075.0,19075.0,19075.0,19075.0,19075.0,19019.0,19075.0,...,19075.0,19075.0,19075.0,19075.0,19075.0,19075.0,19075.0,19075.0,19075.0,19075.0
mean,4593352000.0,532642.0,3.374836,2.112543,2073.524404,14991.12,1.493997,0.0,0.203954,3.408965,...,98077.59114,47.560078,-122.213552,1982.052949,12729.486186,0.0,0.0,287.649279,281.445662,0.0
std,2877189000.0,344959.2,0.927144,0.764537,909.083746,40777.0,0.53998,0.0,0.698385,0.649431,...,53.373727,0.138857,0.14074,680.982492,27593.553043,0.0,0.0,437.333169,434.259744,0.0
min,1000102.0,78000.0,1.0,0.5,370.0,520.0,1.0,0.0,0.0,1.0,...,98001.0,47.1559,-122.519,399.0,651.0,0.0,0.0,0.0,0.0,0.0
25%,2124079000.0,320000.0,3.0,1.75,1420.0,5036.5,1.0,0.0,0.0,3.0,...,98032.0,47.4712,-122.328,1490.0,5100.0,0.0,0.0,0.0,0.0,0.0
50%,3905081000.0,450000.0,3.0,2.25,1910.0,7589.0,1.5,0.0,0.0,3.0,...,98065.0,47.5722,-122.23,1840.0,7600.0,0.0,0.0,0.0,0.0,0.0
75%,7334401000.0,638600.0,4.0,2.5,2540.0,10584.0,2.0,0.0,0.0,4.0,...,98117.0,47.6782,-122.125,2350.0,10031.5,0.0,0.0,550.0,530.0,0.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,0.0,4.0,5.0,...,98199.0,47.7776,-121.315,6210.0,871200.0,0.0,0.0,4130.0,4130.0,0.0


In [32]:
# Size and summary statistics of the waterfront == 1 subset
print(df_water_yes.shape)
df_water_yes.describe()

(146, 26)


Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,zipcode,lat,long,sqft_living15,sqft_lot15,water_null,water_1,basement2,basement3,test
count,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,145.0,146.0,...,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0
mean,4454969000.0,1717215.0,3.349315,2.708904,3244.753425,25870.883562,1.630137,1.0,3.786207,3.541096,...,98096.712329,47.539677,-122.278164,2682.828767,22893.69863,0.0,1717215.0,735.993151,728.458904,0.0
std,2858045000.0,1145385.0,1.099169,1.121509,1652.134956,44629.075899,0.547895,0.0,0.555105,0.743957,...,60.599756,0.113744,0.123772,808.410448,23264.085993,0.0,1145385.0,803.728076,808.010866,0.0
min,121029000.0,285000.0,1.0,0.75,440.0,1989.0,1.0,1.0,1.0,1.0,...,98004.0,47.3276,-122.514,750.0,3230.0,0.0,285000.0,0.0,0.0,0.0
25%,2016369000.0,827500.0,3.0,1.8125,2082.5,11692.25,1.0,1.0,4.0,3.0,...,98040.0,47.45255,-122.376,2075.0,11991.5,0.0,827500.0,0.0,0.0,0.0
50%,3692780000.0,1510000.0,3.0,2.5,2900.0,17730.5,2.0,1.0,4.0,3.0,...,98075.0,47.54815,-122.2735,2695.0,16260.5,0.0,1510000.0,560.0,535.0,0.0
75%,7390450000.0,2282500.0,4.0,3.25,4117.5,26692.5,2.0,1.0,4.0,4.0,...,98155.0,47.60755,-122.21,3140.0,24537.5,0.0,2282500.0,1265.0,1265.0,0.0
max,9808701000.0,7060000.0,6.0,6.75,10040.0,505166.0,3.0,1.0,4.0,5.0,...,98199.0,47.7729,-122.059,4913.0,192268.0,0.0,7060000.0,4820.0,4820.0,0.0


In [33]:
# An alternative way to get the summary statistics per waterfront value
# without creating new dataframes. Column positions 1 through 8 run from
# date to waterfront in the df.info() listing.
df.iloc[:, 1:9].groupby('waterfront').describe().T

Unnamed: 0,waterfront,0.0,1.0
price,count,19075.0,146.0
price,mean,532642.0,1717215.0
price,std,344959.2,1145385.0
price,min,78000.0,285000.0
price,25%,320000.0,827500.0
price,50%,450000.0,1510000.0
price,75%,638600.0,2282500.0
price,max,7700000.0,7060000.0
bedrooms,count,19075.0,146.0
bedrooms,mean,3.374836,3.349315


In [34]:
# Or look at just the mean of a couple of columns per waterfront value.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of keys is deprecated and raises an error in newer pandas versions.
df.groupby('waterfront')[['price', 'bedrooms']].mean()

Unnamed: 0_level_0,price,bedrooms
waterfront,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,532642.0,3.374836
1.0,1717215.0,3.349315


In [35]:
# The statistics for waterfront = 0 and waterfront = null look very similar,
# so it is OK to change the NaNs to zero.
df['waterfront'] = df['waterfront'].fillna(0)

In [36]:
# Confirm the fill: waterfront value counts after replacing NaNs with 0
print(df['waterfront'].value_counts())

0.0    21451
1.0      146
Name: waterfront, dtype: int64


In [37]:
# view has only 63 null values, so it is OK to change the NaNs to zero.
df['view'] = df['view'].fillna(0)

In [38]:
# Confirm the fill: view value counts after replacing NaNs with 0
df.view.value_counts()

0.0    19485
2.0      957
3.0      508
1.0      330
4.0      317
Name: view, dtype: int64

In [39]:
# yr_renovated holds the year of the renovation, and the vast majority of
# the column is zero. With no additional information to go on, it's safest
# to change the NaNs to zero.

df['yr_renovated'] = df['yr_renovated'].fillna(0)

In [40]:
# Re-check column types and non-null counts after filling the NaNs
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 26 columns):
id               21597 non-null int64
date             21597 non-null object
price            21597 non-null float64
bedrooms         21597 non-null int64
bathrooms        21597 non-null float64
sqft_living      21597 non-null int64
sqft_lot         21597 non-null int64
floors           21597 non-null float64
waterfront       21597 non-null float64
view             21597 non-null float64
condition        21597 non-null int64
grade            21597 non-null int64
sqft_above       21597 non-null int64
sqft_basement    21597 non-null object
yr_built         21597 non-null int64
yr_renovated     21597 non-null float64
zipcode          21597 non-null int64
lat              21597 non-null float64
long             21597 non-null float64
sqft_living15    21597 non-null int64
sqft_lot15       21597 non-null int64
water_null       21597 non-null int32
water_1          21597 non-null float

**Drop Columns**

Now that we've cleaned up the NaNs and '?'s, let's drop some columns.

In [41]:
# Remove the id and date columns, then preview the result.
df = df.drop(columns=['id', 'date'])
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,zipcode,lat,long,sqft_living15,sqft_lot15,water_null,water_1,basement2,basement3,test
0,221900.0,3,1.0,1180,5650,1.0,0.0,0.0,3,7,...,98178,47.5112,-122.257,1340,5650,1,0.0,0,0,0
1,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,3,7,...,98125,47.721,-122.319,1690,7639,0,0.0,400,400,0
2,180000.0,2,1.0,770,10000,1.0,0.0,0.0,3,6,...,98028,47.7379,-122.233,2720,8062,0,0.0,0,0,0
3,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,5,7,...,98136,47.5208,-122.393,1360,5000,0,0.0,910,910,0
4,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,3,8,...,98074,47.6168,-122.045,1800,7503,0,0.0,0,0,0
