In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

# DATA Understanding 

In [5]:
! ls ../data


column_names.md   kc_house_data.csv


In [294]:
df = pd.read_csv('../data/kc_house_data.csv')

In [295]:
df.shape

(21597, 21)

we have the information of 21597 houses located in King county washington 

In [284]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21420 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21420 non-null  int64  
 1   date           21420 non-null  object 
 2   price          21420 non-null  float64
 3   bedrooms       21420 non-null  int64  
 4   bathrooms      21420 non-null  float64
 5   sqft_living    21420 non-null  int64  
 6   sqft_lot       21420 non-null  int64  
 7   floors         21420 non-null  float64
 8   waterfront     21420 non-null  object 
 9   view           21357 non-null  object 
 10  condition      21420 non-null  object 
 11  grade          21420 non-null  object 
 12  sqft_above     21420 non-null  int64  
 13  sqft_basement  21420 non-null  object 
 14  yr_built       21420 non-null  int64  
 15  yr_renovated   17607 non-null  float64
 16  zipcode        21420 non-null  int64  
 17  lat            21420 non-null  float64
 18  long  

We can see above, dataset has null values in some columns

### ID column

duplicate values on 'id' column, some houses show up more than once on dataset

In [190]:
df['id'].value_counts()

795000620     3
1825069031    2
2019200220    2
7129304540    2
1781500435    2
             ..
7812801125    1
4364700875    1
3021059276    1
880000205     1
1777500160    1
Name: id, Length: 21420, dtype: int64

In [185]:
df.loc[df["id"]==2044500213]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
15787,2044500213,6/17/2014,310000.0,4,2.0,1870,6000,1.5,NO,NONE,...,7 Average,1870,0.0,1956,,98125,47.7155,-122.315,1520,7169
15788,2044500213,1/26/2015,449000.0,4,2.0,1870,6000,1.5,NO,NONE,...,7 Average,1870,0.0,1956,0.0,98125,47.7155,-122.315,1520,7169


House appering 2 or more times, has different price changing over time, will consider keeping latest price, or keeping duplicates since they can be trated as sale data points, not houses alone

### Bedrooms

In [221]:
df['bedrooms'].value_counts()

3     9732
4     6850
2     2736
5     1586
6      265
1      191
7       38
8       13
9        6
10       3
Name: bedrooms, dtype: int64

house with 33 bedrooms, looked for id number and found out it has only 3 bed rooms, value will be replaced, same for house with 11 rooms, zillow shows it has only 4 rooms.

In [217]:
df.loc[df["bedrooms"]==33]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
15856,2402100895,6/25/2014,640000.0,33,1.75,1620,6000,1.0,NO,NONE,...,7 Average,1040,580.0,1947,0.0,98103,47.6878,-122.331,1330,4700


### Waterfront column

above we can see on waterfront value, the 3 row has null value, for a NO response

In [238]:
# Whether the house is on a waterfront
#   * Includes Duwamish, Elliott Bay, Puget Sound, Lake Union, Ship Canal, 
#     Lake Washington, Lake Sammamish, other lake, and river/slough waterfronts

# column has 2353 null values, those might be taken as NO 

#categorical, has some null values
df['waterfront'].value_counts()

NO     18921
YES      146
Name: waterfront, dtype: int64

In [239]:
df['waterfront'].isnull().sum()

2353

In [243]:
df['waterfront'].mode()

0    NO
dtype: object

### Date

In [None]:
data set goes back to 2014 and most recent data points are from 2015

In [208]:
# Dates
df['date'].value_counts()

6/23/2014    142
6/26/2014    131
6/25/2014    131
7/8/2014     127
4/27/2015    126
            ... 
3/8/2015       1
1/17/2015      1
5/27/2015      1
5/15/2015      1
1/31/2015      1
Name: date, Length: 372, dtype: int64

### View

Categorical data

In [209]:
# Quality of view from house
# categorical data
df['view'].value_counts()

NONE         19422
AVERAGE        957
GOOD           508
FAIR           330
EXCELLENT      317
Name: view, dtype: int64

### Condition

In [22]:
# categorical data
df['condition'].value_counts()

Average      14020
Good          5677
Very Good     1701
Fair           170
Poor            29
Name: condition, dtype: int64

### Grade

In [23]:
# categorical data
df['grade'].value_counts()

7 Average        8974
8 Good           6065
9 Better         2615
6 Low Average    2038
10 Very Good     1134
11 Excellent      399
5 Fair            242
12 Luxury          89
4 Low              27
13 Mansion         13
3 Poor              1
Name: grade, dtype: int64

### sqft_basement

In [24]:
# numeric data needed to convert from str to int data type
df['sqft_basement'].value_counts()

0.0       12826
?           454
600.0       217
500.0       209
700.0       208
          ...  
1960.0        1
2390.0        1
1135.0        1
1798.0        1
2570.0        1
Name: sqft_basement, Length: 304, dtype: int64

/




# Data Preparation

### Cleaning ID column
will drop duplicate values from id column, keeping most recent record of the house

In [296]:
df.drop_duplicates(subset='id', keep='last', inplace=True)

In [297]:
df.loc[df["id"]==2044500213]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
15788,2044500213,1/26/2015,449000.0,4,2.0,1870,6000,1.5,NO,NONE,...,7 Average,1870,0.0,1956,0.0,98125,47.7155,-122.315,1520,7169


### cleaning bedroom column

will replace outlier/incorrect values from bedrooms column

In [298]:
df.loc[df["bedrooms"]==33, 'bedrooms'] = 3
df.loc[df["bedrooms"]==11, 'bedrooms'] = 4

### Cleaning Waterfront column

In [299]:
df['waterfront'].value_counts()

NO     18921
YES      146
Name: waterfront, dtype: int64

will change NaN values to NO since, since it is the most frequent value under that column

In [300]:
df['waterfront'].isnull().sum()

2353

In [301]:
df['waterfront'] = df['waterfront'].replace(np.nan, 'NO')


In [302]:
df['waterfront'].value_counts()[0]+df['waterfront'].value_counts()[1]

21420

### Cleaning view column

In [303]:
df['view'].value_counts()

NONE         19253
AVERAGE        956
GOOD           505
FAIR           329
EXCELLENT      314
Name: view, dtype: int64

In [304]:
df['view'].isna().sum()

63

In [305]:
df['view'] = df['view'].replace(np.nan, 'NONE')

In [308]:
df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.00,1180,5650,1.0,NO,NONE,...,7 Average,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,...,7 Average,2170,400.0,1951,1991.0,98125,47.7210,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.00,770,10000,1.0,NO,NONE,...,6 Low Average,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.00,1960,5000,1.0,NO,NONE,...,7 Average,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.00,1680,8080,1.0,NO,NONE,...,8 Good,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,263000018,5/21/2014,360000.0,3,2.50,1530,1131,3.0,NO,NONE,...,8 Good,1530,0.0,2009,0.0,98103,47.6993,-122.346,1530,1509
21593,6600060120,2/23/2015,400000.0,4,2.50,2310,5813,2.0,NO,NONE,...,8 Good,2310,0.0,2014,0.0,98146,47.5107,-122.362,1830,7200
21594,1523300141,6/23/2014,402101.0,2,0.75,1020,1350,2.0,NO,NONE,...,7 Average,1020,0.0,2009,0.0,98144,47.5944,-122.299,1020,2007
21595,291310100,1/16/2015,400000.0,3,2.50,1600,2388,2.0,NO,NONE,...,8 Good,1600,0.0,2004,0.0,98027,47.5345,-122.069,1410,1287


### Cleaning sqft_basement column

Will proceed and transform data on sqft_basement column from str to a int type

In [321]:
df['sqft_basement'].value_counts()

0.0       12716
?           452
600.0       216
700.0       206
500.0       206
          ...  
1960.0        1
2390.0        1
1135.0        1
1798.0        1
2570.0        1
Name: sqft_basement, Length: 304, dtype: int64

In [313]:
df['sqft_basement'].isna().sum()

0

no Nan values,  '?' is a frequent value on the column

In [323]:
df.head(10)


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,NO,NONE,...,7 Average,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,...,7 Average,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,NO,NONE,...,6 Low Average,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,NO,NONE,...,7 Average,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,NO,NONE,...,8 Good,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503
5,7237550310,5/12/2014,1230000.0,4,4.5,5420,101930,1.0,NO,NONE,...,11 Excellent,3890,1530.0,2001,0.0,98053,47.6561,-122.005,4760,101930
6,1321400060,6/27/2014,257500.0,3,2.25,1715,6819,2.0,NO,NONE,...,7 Average,1715,0.0,1995,0.0,98003,47.3097,-122.327,2238,6819
7,2008000270,1/15/2015,291850.0,3,1.5,1060,9711,1.0,NO,NONE,...,7 Average,1060,0.0,1963,0.0,98198,47.4095,-122.315,1650,9711
8,2414600126,4/15/2015,229500.0,3,1.0,1780,7470,1.0,NO,NONE,...,7 Average,1050,730.0,1960,0.0,98146,47.5123,-122.337,1780,8113
9,3793500160,3/12/2015,323000.0,3,2.5,1890,6560,2.0,NO,NONE,...,7 Average,1890,0.0,2003,0.0,98038,47.3684,-122.031,2390,7570


Something I noticed is how  sqft_living value is the total sqft living area in the house + basement, 
and if the basement column shows 0.0 or '?' sqft_living is same as sqft_above, 
so we should consider changing '?' to 0.0, and have a  full numeric column we can work with later on


In [322]:
# Changed all '?' to '0.0'
df.loc[df['sqft_basement'] == '?', 'sqft_basement'] = '0.0'

will change object type data to int

In [None]:
df['sqft_basement'] = df['sqft_basement'].astype(int)

#  Exploration before modeling

# ignore, scratch code below

<!-- will drop null values on all rows -->


In [169]:
df['id'].value_counts()

795000620     3
1825069031    2
2019200220    2
7129304540    2
1781500435    2
             ..
7812801125    1
4364700875    1
3021059276    1
880000205     1
1777500160    1
Name: id, Length: 21420, dtype: int64

found houses sharing id number, will drop duplicate houses.

In [174]:
df.drop_duplicates(subset= 'id', inplace=True)

## 'sqft_basement'

Will proceed and transform data on sqft_basement column from str 
to a int type and later verify if there could be a correlation b/t sqft_basement and price

454 rows contain '?' as a value under sqft_basement column, will change it to null 

In [126]:
df.loc[df['sqft_basement'] == '?', 'sqft_basement'] = None

In [127]:
df['sqft_basement'] = pd.to_numeric(df['sqft_basement'])

12826 houses in dataset dont have basement, im trying to take those rows off,

will run an experiment if houses with basement have a sgnificant different price than houses without basement

will perform a two sample t test 

sample 1: prices of houses with basement
sample 2: prices of houses without basement

a=.05
h0 = there is no significant difference in price b/t houses with basement and houses without it
h1 = there is a significant difference in price b/t houses with basement and houses without it


In [128]:
# for sample 2 im using > instead of != since there are null values of houses from which we dont have 
# information on its basement measurement, or existence 

sample_1 = df.loc[df['sqft_basement'] == 0.0]
sample_2 = df.loc[df['sqft_basement'] > 0]

In [129]:
sample_1['price'].mean()

489242.5007477035

In [130]:
sample_2['price'].mean()

622133.0138453931

In [131]:
stats.ttest_ind(sample_1['price'], sample_2['price'])

Ttest_indResult(statistic=-21.949553106321304, pvalue=3.5385895790914105e-105)

significantly different, reject null

checking correlation using sample 2, houses with basement 

In [132]:
sample_2.corr()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
id,1.0,-0.016422,0.008048,-0.00104,-0.018552,-0.128877,0.01429,-0.018753,-0.010305,0.009241,-0.004545,-0.027048,-0.040213,-0.002977,-0.006562,-0.122884
price,-0.016422,1.0,0.240382,0.52622,0.720111,0.065485,0.351925,0.724118,0.406756,-0.012259,0.16335,-0.049353,0.244171,0.015487,0.595574,0.050383
bedrooms,0.008048,0.240382,1.0,0.422378,0.475591,0.0439,0.073084,0.388634,0.42817,0.021832,0.051314,-0.130852,0.012622,0.115803,0.302028,0.035365
bathrooms,-0.00104,0.52622,0.422378,1.0,0.692903,0.120098,0.477753,0.691604,0.400565,0.391451,0.098669,-0.170318,0.032076,0.182116,0.500949,0.100498
sqft_living,-0.018552,0.720111,0.475591,0.692903,1.0,0.226542,0.335373,0.920151,0.716924,0.162184,0.098783,-0.194115,0.034116,0.229927,0.718499,0.208291
sqft_lot,-0.128877,0.065485,0.0439,0.120098,0.226542,1.0,-0.014339,0.207655,0.163834,0.089161,0.002539,-0.176208,-0.110738,0.329653,0.192046,0.791211
floors,0.01429,0.351925,0.073084,0.477753,0.335373,-0.014339,1.0,0.502799,-0.105329,0.285076,0.07953,0.090027,0.119561,-0.048568,0.185621,-0.020406
sqft_above,-0.018753,0.724118,0.388634,0.691604,0.920151,0.207655,0.502799,1.0,0.386698,0.210706,0.089848,-0.168424,0.047283,0.211465,0.680521,0.182318
sqft_basement,-0.010305,0.406756,0.42817,0.400565,0.716924,0.163834,-0.105329,0.386698,1.0,0.006828,0.072685,-0.15731,-0.003835,0.165022,0.480581,0.165959
yr_built,0.009241,-0.012259,0.021832,0.391451,0.162184,0.089161,0.285076,0.210706,0.006828,1.0,-0.232664,-0.286268,-0.144577,0.335604,0.181879,0.091062


In [116]:
sample_2

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,...,7 Average,2170,400.0,1951,1991.0,98125,47.7210,-122.319,1690,7639
3,2487200875,12/9/2014,604000.0,4,3.00,1960,5000,1.0,NO,NONE,...,7 Average,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
5,7237550310,5/12/2014,1230000.0,4,4.50,5420,101930,1.0,NO,NONE,...,11 Excellent,3890,1530.0,2001,0.0,98053,47.6561,-122.005,4760,101930
8,2414600126,4/15/2015,229500.0,3,1.00,1780,7470,1.0,NO,NONE,...,7 Average,1050,730.0,1960,0.0,98146,47.5123,-122.337,1780,8113
10,1736800520,4/3/2015,662500.0,3,2.50,3560,9796,1.0,,NONE,...,8 Good,1860,1700.0,1965,0.0,98007,47.6007,-122.145,2210,8925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21572,2767600688,11/13/2014,414500.0,2,1.50,1210,1278,2.0,NO,NONE,...,8 Good,1020,190.0,2007,0.0,98117,47.6756,-122.375,1210,1118
21574,7430200100,5/14/2014,1220000.0,4,3.50,4910,9444,1.5,NO,NONE,...,11 Excellent,3110,1800.0,2007,0.0,98074,47.6502,-122.066,4560,11063
21579,1972201967,10/31/2014,520000.0,2,2.25,1530,981,3.0,NO,NONE,...,8 Good,1480,50.0,2006,,98103,47.6533,-122.346,1530,1282
21590,7936000429,3/26/2015,1010000.0,4,3.50,3510,7200,2.0,NO,NONE,...,9 Better,2600,910.0,2009,0.0,98136,47.5537,-122.398,2050,6200


In [149]:
# x = sample_2.drop(['id', 'price', 'grade', 'condition', 'view', 'date', 'waterfront', 'zipcode', 'yr_renovated', 
#                   'yr_built'], axis=1)

In [153]:
x2 = sm.add_constant(sample_2['sqft_basement'])

In [154]:
model = sm.OLS(endog=sample_2['price'], exog=x2).fit()

In [155]:
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.165
Model:,OLS,Adj. R-squared:,0.165
Method:,Least Squares,F-statistic:,1202.0
Date:,"Mon, 24 Oct 2022",Prob (F-statistic):,1.6e-240
Time:,13:00:00,Log-Likelihood:,-87055.0
No. Observations:,6067,AIC:,174100.0
Df Residuals:,6065,BIC:,174100.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.873e+05,1.1e+04,26.088,0.000,2.66e+05,3.09e+05
sqft_basement,451.1749,13.011,34.676,0.000,425.668,476.682

0,1,2,3
Omnibus:,5317.542,Durbin-Watson:,1.928
Prob(Omnibus):,0.0,Jarque-Bera (JB):,296145.931
Skew:,3.95,Prob(JB):,0.0
Kurtosis:,36.303,Cond. No.,1760.0
