In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from matplotlib import pyplot as plt
import seaborn as sns



plt.style.use('ggplot')
#increase font size of all elements
# sns.set_theme(font_scale=1.5)


## Loading a dataset
[Dataset: House Sales in King County, USA](https://www.kaggle.com/datasets/harlfoxem/housesalesprediction)

In [2]:
house_data = pd.read_csv(
    'data/kc_house_data.csv', 
    dtype=dict(
        # waterfront="int",
        zipcode="str",
    ),
    parse_dates=[1])
house_data

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,2014-10-13,221900.0,3,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,2015-02-25,180000.0,2,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,2014-12-09,604000.0,4,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,2015-02-18,510000.0,3,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,2014-05-21,360000.0,3,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,2015-02-23,400000.0,4,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,2014-06-23,402101.0,2,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,2015-01-16,400000.0,3,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [3]:
house_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             21613 non-null  int64         
 1   date           21613 non-null  datetime64[ns]
 2   price          21613 non-null  float64       
 3   bedrooms       21613 non-null  int64         
 4   bathrooms      21613 non-null  float64       
 5   sqft_living    21613 non-null  int64         
 6   sqft_lot       21613 non-null  int64         
 7   floors         21613 non-null  float64       
 8   waterfront     21613 non-null  int64         
 9   view           21613 non-null  int64         
 10  condition      21613 non-null  int64         
 11  grade          21613 non-null  int64         
 12  sqft_above     21613 non-null  int64         
 13  sqft_basement  21613 non-null  int64         
 14  yr_built       21613 non-null  int64         
 15  yr_renovated   2161

## Marking NA as NA/NaN

In [4]:
# zero seems to be a placeholder for missing data in some columns
house_data.apply(lambda _: _.isin([0]).sum(), axis=0)

id                   0
date                 0
price                0
bedrooms            13
bathrooms           10
sqft_living          0
sqft_lot             0
floors               0
waterfront       21450
view             19489
condition            0
grade                0
sqft_above           0
sqft_basement    13126
yr_built             0
yr_renovated     20699
zipcode              0
lat                  0
long                 0
sqft_living15        0
sqft_lot15           0
dtype: int64

In [5]:
# in the columns, where it makes sense, replacing 0 with NaN
for columns in ['bedrooms', 'bathrooms']:
    house_data.loc[:,columns] = (
        house_data.loc[:,columns]
        .replace(0, np.NaN))

# Selecting and creating features

In [6]:
data = house_data.filter(items=['lat', 'long', 'view', 'condition', 'waterfront', 'yr_built', 'floors'])

data.loc[:,'price_per_sqft'] = house_data.price/house_data.sqft_living
data.loc[:,'living_lot_ratio'] = house_data.sqft_living/house_data.sqft_lot
data.loc[:,'basement_ratio'] = house_data.sqft_basement/house_data.sqft_living
data.loc[:,'recently_renovated'] = (
    (house_data.yr_renovated > 2010).astype('int'))
data.loc[:, 'bath_bed_ratio'] = house_data.bathrooms/house_data.bedrooms

data = data.dropna(axis=1, how='any')

# Predict price/sqft with linear model

In [8]:
import statsmodels.api as sm

results = sm.OLS(data.price_per_sqft, data.drop(columns=['price_per_sqft'])).fit()
results.summary()

0,1,2,3
Dep. Variable:,price_per_sqft,R-squared (uncentered):,0.905
Model:,OLS,Adj. R-squared (uncentered):,0.905
Method:,Least Squares,F-statistic:,20600.0
Date:,"Wed, 12 Apr 2023",Prob (F-statistic):,0.0
Time:,17:01:31,Log-Likelihood:,-127470.0
No. Observations:,21613,AIC:,255000.0
Df Residuals:,21603,BIC:,255000.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
lat,299.9614,4.266,70.313,0.000,291.600,308.323
long,94.4973,1.714,55.134,0.000,91.138,97.857
view,27.8307,0.875,31.790,0.000,26.115,29.547
condition,8.1816,1.004,8.148,0.000,6.213,10.150
waterfront,168.8831,7.585,22.266,0.000,154.016,183.750
yr_built,-1.2594,0.024,-53.242,0.000,-1.306,-1.213
floors,-14.0329,1.650,-8.506,0.000,-17.266,-10.799
living_lot_ratio,97.3660,2.874,33.880,0.000,91.733,102.999
basement_ratio,-139.1768,4.104,-33.912,0.000,-147.221,-131.132

0,1,2,3
Omnibus:,5066.819,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15659.479
Skew:,1.204,Prob(JB):,0.0
Kurtosis:,6.405,Cond. No.,25000.0
