In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

In [None]:
cd ../data

In [None]:
df = pd.read_csv("kc_house_data.csv")
df.head()

Let's see what information we have:

In [None]:
df.info()

In [None]:
# Keep only the columns the rest of this notebook works with; the result is
# rebound to `df` rather than mutating the frame in place.
df = df.drop(
    columns=[
        "sqft_above",
        "sqft_basement",
        "yr_renovated",
        "zipcode",
        "lat",
        "long",
        "sqft_living15",
        "sqft_lot15",
    ]
)

Now let's look for potential mistakes in other columns; we'll define some functions for repeated use and see what we're working with.

In [None]:
def hist(df, column):
    """Draw a histogram of ``df[column]`` with matplotlib's default binning.

    Returns whatever ``plt.hist`` returns for that column.
    """
    series = df[column]
    return plt.hist(x=series)
def hist_rooms(df, column):
    """Draw a histogram of a room-style column using 50 evenly spaced bin edges.

    The edges normally run from 1 to the column maximum. When the column
    contains values below 1 (for example fractional bathroom counts) the lower
    edge is moved down to the column minimum, because values that fall outside
    the supplied bin edges are not counted and would silently vanish from the
    plot.

    Returns whatever ``plt.hist`` returns.
    """
    values = df[column]
    # Unchanged (1) for columns whose minimum is >= 1.
    lower_edge = min(1, values.min())
    return plt.hist(x=values, bins=np.linspace(lower_edge, values.max()))
def extreme(df, column):
    """Return every row of ``df`` whose ``column`` value equals the column maximum."""
    peak = df[column].max()
    is_peak = df[column] == peak
    return df.loc[is_peak]

In [None]:
hist_rooms(df, "bedrooms");

Yikes, does a house really have thirty-three bedrooms?

In [None]:
extreme(df, "bedrooms")

In [None]:
# No way - a search for the house's ID reveals it only has three bedrooms, so clean that up.
# Assign the result back to the column: calling replace(inplace=True) on df["bedrooms"]
# is chained in-place mutation, which pandas warns about and which leaves df unchanged
# when Copy-on-Write is active.
df["bedrooms"] = df["bedrooms"].replace(to_replace=33, value=3)

In [None]:
df["bedrooms"].value_counts()

There are still some suspicious numbers, albeit less egregious. Let's take a look at the 11.

In [None]:
df.loc[df["bedrooms"] == 11]

Looking into this listing, it appears the house actually has four bedrooms. Let's adjust that.

In [None]:
# Assign the result back to the column: calling replace(inplace=True) on df["bedrooms"]
# is chained in-place mutation, which pandas warns about and which leaves df unchanged
# when Copy-on-Write is active.
df["bedrooms"] = df["bedrooms"].replace(to_replace=11, value=4)

Much more reasonable - let's repeat the process with the other columns

In [None]:
hist_rooms(df, "bathrooms");

In [None]:
extreme(df, "bathrooms")

This is lots of bathrooms, but both houses are very large, so they're not unreasonable.

In [None]:
hist(df, "sqft_living");

Suspicious. Let's look into the biggest house.

In [None]:
extreme(df, "sqft_living")

Understandably, the largest house also is tied for most bathrooms - no problems here, it seems.

In [None]:
hist(df, "sqft_lot");

In [None]:
extreme(df, "sqft_lot")

A search reveals this is a farm, which is unusual, but valid.

In [None]:
hist_rooms(df, "floors");

In [None]:
#Set the conditions in order of quality so we can clearly see the shape
conditions = ["Poor", 'Fair', 'Average', 'Good', 'Very Good']
# Count once, then reorder to match `conditions`; a level that never occurs
# is reported as 0 instead of raising KeyError.
con_count = df["condition"].value_counts().reindex(conditions, fill_value=0).tolist()

In [None]:
plt.bar(x=conditions, height=con_count);

In [None]:
#Repeating the process with grades
grades = ['3 Poor', '4 Low', '5 Fair', '6 Low Average', '7 Average', '8 Good', '9 Better', '10 Very Good',
          '11 Excellent', '12 Luxury', '13 Mansion']
# Count once, then reorder to match `grades`; a grade that never occurs
# is reported as 0 instead of raising KeyError.
grade_count = df["grade"].value_counts().reindex(grades, fill_value=0).tolist()

In [None]:
#For legibility, we'll do this one manually
plt.bar(x=grades, height=grade_count)
plt.xticks(rotation = 60);

In [None]:
hist(df, "yr_built");

Everything else seems reasonable. With the data cleaned, let's take a closer look.

## Living Area

In [None]:
def simple_model(x):
    """Fit an OLS regression of the notebook-level ``df["price"]`` on ``x``.

    An intercept column is added to ``x`` with ``sm.add_constant`` before
    fitting. Returns the fitted model's summary.
    """
    design = sm.add_constant(x)
    fitted = sm.OLS(endog=df["price"], exog=design).fit()
    return fitted.summary()

In [None]:
def simple_ylog_model(x):
    """Fit an OLS regression of the natural log of ``df["price"]`` on ``x``.

    An intercept column is added to ``x`` with ``sm.add_constant`` before
    fitting. Returns the fitted model's summary.
    """
    log_price = np.log(df["price"])
    design = sm.add_constant(x)
    fitted = sm.OLS(endog=log_price, exog=design).fit()
    return fitted.summary()

In [None]:
def scat(x):
    """Scatter ``x`` against the notebook-level ``df["price"]`` on a new figure.

    Nothing is returned; the plot is drawn on freshly created axes.
    """
    _, axes = plt.subplots()
    axes.scatter(x=x, y=df["price"])

In [None]:
simple_model(df["sqft_living"])

In [None]:
y = df["price"]
y.hist();

In [None]:
y_log = np.log(y)
y_log.hist();

In [None]:
x = df["sqft_living"]
x.hist();

In [None]:
x_log = np.log(x)
x_log.hist();

In [None]:
x_c = sm.add_constant(x)
x_log_c = sm.add_constant(x_log)
sm.OLS(endog=y_log, exog=x_log_c).fit().summary()

In [None]:
sm.OLS(endog=y, exog=x_log_c).fit().summary()

In [None]:
sm.OLS(endog=y_log, exog=x_c).fit().summary()

While using the log of either or both values weakens our model's accuracy, the reduction in the Omnibus and Jarque-Bera (JB) values is worth achieving. Using the log of the sale price had the greatest impact on these values while keeping the highest R-squared, so we'll be sure to take the log of y in our model.

## Lot Square Footage

In [None]:
simple_model(df["sqft_lot"])

In [None]:
simple_ylog_model(df["sqft_lot"])

In [None]:
df["sqft_lot"].hist();

In [None]:
lot_log = np.log(df["sqft_lot"])
lot_log.hist();

In [None]:
lot_log_c = sm.add_constant(lot_log)
simple_model(lot_log_c)

In [None]:
simple_ylog_model(lot_log_c)

## Bedrooms

In [None]:
simple_model(df["bedrooms"])

In [None]:
scat(df["bedrooms"])

In [None]:
simple_ylog_model(df["bedrooms"])

## Bathrooms

In [None]:
simple_model(df["bathrooms"])

In [None]:
scat(df["bathrooms"])

In [None]:
simple_ylog_model(df["bathrooms"])

## View

In [None]:
df["view"].value_counts()

In [None]:
df["view"].isna().sum()

In [None]:
# dtype=int keeps the indicators as numeric 0/1 on every pandas version.
# Newer pandas returns boolean dummies by default, which statsmodels can
# reject once they are combined with float columns in a design matrix.
view_dummies = pd.get_dummies(df["view"], dtype=int)
view_dummies

Because we know we have null values in this column, we'll let those serve as our "dropped column" for now.

In [None]:
simple_model(view_dummies)

What if we set the nulls as not having a view? As is, it appears that no view is weaker than an undocumented view, but let's take a look. By simply dropping the "NONE" column, the two will be viewed as equal by the model.

In [None]:
view_dummies2 = view_dummies.drop("NONE", axis=1)

In [None]:
simple_model(view_dummies2)

## Put these numbers together

In [None]:
def add_views(df, dummies=None):
    """Attach the five view indicator columns to ``df`` in place.

    Parameters
    ----------
    df : DataFrame that receives the new columns (modified in place).
    dummies : optional DataFrame with the columns "NONE", "FAIR", "AVERAGE",
        "GOOD" and "EXCELLENT". When omitted, the notebook-level
        ``view_dummies`` frame is used, as before.

    Returns None. Assignment relies on pandas index alignment between ``df``
    and the dummy frame.
    """
    if dummies is None:
        # Backward-compatible default: read the notebook global at call time.
        dummies = view_dummies
    column_names = {
        "NONE": "No_view",
        "FAIR": "Fair_view",
        "AVERAGE": "Average_view",
        "GOOD": "Good_view",
        "EXCELLENT": "Excellent_view",
    }
    for source, target in column_names.items():
        df[target] = dummies[source]

In [None]:
drew_data = df.drop(labels=["id","date","price","floors","waterfront","condition","grade","yr_built","view"], axis=1)
add_views(drew_data)
drew_data

In [None]:
x, y = sm.add_constant(drew_data), df["price"]

In [None]:
sm.OLS(y, x).fit().summary()

Let's see if using the log of y will improve our model.

In [None]:
y_log = np.log(y)
sm.OLS(y_log, x).fit().summary()

While the bathrooms showed a stronger correlation with the log of the price, the other factors prevented this from being the best overall choice. Now, let's address the multicollinearity.

Let's scale this data before diving into the multicollinearity.

In [None]:
x_nums = drew_data.drop(labels=["No_view", "Fair_view", "Average_view", "Good_view", "Excellent_view"], axis=1)

In [None]:
x_scaled = StandardScaler().fit_transform(x_nums.values)

In [None]:
x_scaled_df = pd.DataFrame(x_scaled, index=x_nums.index, columns=x_nums.columns)

In [None]:
x_c = sm.add_constant(x_scaled_df)

In [None]:
sm.OLS(y, x_c).fit().summary()

Now that our condition number is minimal, if it reappears with the view columns, we'll know they're to blame. Let's add them in now!

In [None]:
add_views(x_c)

In [None]:
sm.OLS(y, x_c).fit().summary()

We have some high p-values in the view columns; let's try dropping the column for no view and see if it improves the model.

In [None]:
x_c2 = x_c.drop("No_view", axis=1)

In [None]:
sm.OLS(y, x_c2).fit().summary()

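We see no change to adj. R-squared, and the high p-values are gone: with every view level in the model alongside the constant, the dummies were nearly collinear with the intercept, which inflated their standard errors. Let's go ahead and continue leaving out this column.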

In [None]:
sm.OLS(y_log, x_c2).fit().summary()

## Checking Assumptions

In [None]:
model = sm.OLS(y_log, x_c2).fit()

In [None]:
model.summary().tables[2]

In [None]:
model_preds = model.predict(x_c2)
resids = y_log - model_preds

In [None]:
# Scatter the residuals of the log-price model against the sale price.
# NOTE(review): `resids` was computed as y_log - model_preds (log units) while
# `y` is the untransformed price, so the two axes are on different scales.
fig, ax = plt.subplots()

ax.scatter(y, resids);

In [None]:
# Fit the final log-price model and compute its residuals.
model2 = sm.OLS(endog=y_log, exog=x_c2).fit()
model2_preds = model2.predict(x_c2)
model2_resid = y_log - model2_preds

# Plot residuals against the fitted values rather than the observed response:
# residuals are correlated with the observed y by construction, so that plot
# shows a trend even when the model assumptions hold.
fig, ax = plt.subplots()
ax.scatter(model2_preds, model2_resid)
ax.set(xlabel="Fitted log(price)", ylabel="Residual", title="Residuals vs. fitted values");

In [None]:
sm.qqplot(model2_resid, line='r');

## What do we do with the dates?

In [None]:
df["id"].value_counts().head(177)

In [None]:
df.loc[df["id"] == 795000620]

In [None]:
df["date"] = pd.to_datetime(df["date"])

In [None]:
df.loc[df["date"] == df["date"].max()]

## Just Essentials!

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

In [2]:
cd ../data

C:\Users\drewh\Documents\Flatiron Documents\Phase 2\Housing-Sales-Analysis\data


In [3]:
df = pd.read_csv("kc_house_data.csv")

In [4]:
# Keep only the columns the rest of this notebook works with; the result is
# rebound to `df` rather than mutating the frame in place.
df = df.drop(
    columns=[
        "sqft_above",
        "sqft_basement",
        "yr_renovated",
        "zipcode",
        "lat",
        "long",
        "sqft_living15",
        "sqft_lot15",
    ]
)

In [5]:
def add_views(df, dummies=None):
    """Attach the five view indicator columns to ``df`` in place.

    Parameters
    ----------
    df : DataFrame that receives the new columns (modified in place).
    dummies : optional DataFrame with the columns "NONE", "FAIR", "AVERAGE",
        "GOOD" and "EXCELLENT". When omitted, the notebook-level
        ``view_dummies`` frame is used, as before.

    Returns None. Assignment relies on pandas index alignment between ``df``
    and the dummy frame.
    """
    if dummies is None:
        # Backward-compatible default: read the notebook global at call time.
        dummies = view_dummies
    column_names = {
        "NONE": "No_view",
        "FAIR": "Fair_view",
        "AVERAGE": "Average_view",
        "GOOD": "Good_view",
        "EXCELLENT": "Excellent_view",
    }
    for source, target in column_names.items():
        df[target] = dummies[source]

In [6]:
# dtype=int keeps the indicators as numeric 0/1 on every pandas version.
# Newer pandas returns boolean dummies by default, which statsmodels can
# reject once they are combined with float columns in a design matrix.
view_dummies = pd.get_dummies(df["view"], dtype=int)
view_dummies

Unnamed: 0,AVERAGE,EXCELLENT,FAIR,GOOD,NONE
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1
...,...,...,...,...,...
21592,0,0,0,0,1
21593,0,0,0,0,1
21594,0,0,0,0,1
21595,0,0,0,0,1


In [7]:
x_nums = df.drop(labels=["id","date","price","floors","waterfront","condition","grade","yr_built","view"], axis=1)

In [8]:
x_scaled = StandardScaler().fit_transform(x_nums.values)
x_scaled_df = pd.DataFrame(x_scaled, index=x_nums.index, columns=x_nums.columns)
x_c = sm.add_constant(x_scaled_df)

In [9]:
add_views(x_c)

In [10]:
y = df["price"]
y_log = np.log(y)

In [11]:
sm.OLS(y, x_c).fit().summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.554
Model:,OLS,Adj. R-squared:,0.554
Method:,Least Squares,F-statistic:,2979.0
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,0.0
Time:,10:49:07,Log-Likelihood:,-298670.0
No. Observations:,21597,AIC:,597400.0
Df Residuals:,21587,BIC:,597400.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.777e+05,3.09e+04,18.683,0.000,5.17e+05,6.38e+05
bedrooms,-4.575e+04,2085.865,-21.934,0.000,-4.98e+04,-4.17e+04
bathrooms,8097.9766,2582.599,3.136,0.002,3035.892,1.32e+04
sqft_living,2.595e+05,2834.621,91.563,0.000,2.54e+05,2.65e+05
sqft_lot,-1.568e+04,1706.540,-9.190,0.000,-1.9e+04,-1.23e+04
No_view,-5.88e+04,3.1e+04,-1.899,0.058,-1.2e+05,1904.171
Fair_view,1.015e+05,3.37e+04,3.008,0.003,3.54e+04,1.68e+05
Average_view,6.15e+04,3.19e+04,1.926,0.054,-1082.339,1.24e+05
Good_view,1.472e+05,3.28e+04,4.486,0.000,8.29e+04,2.12e+05

0,1,2,3
Omnibus:,13656.316,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,504972.039
Skew:,2.491,Prob(JB):,0.0
Kurtosis:,26.159,Cond. No.,69.0


This model came back with a couple of high p-values - I tried dropping the "No_view" column, which got rid of them without reducing R-squared.

In [12]:
x_c2 = x_c.drop("No_view", axis=1)

In [13]:
sm.OLS(y, x_c2).fit().summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.554
Model:,OLS,Adj. R-squared:,0.554
Method:,Least Squares,F-statistic:,3351.0
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,0.0
Time:,10:49:08,Log-Likelihood:,-298670.0
No. Observations:,21597,AIC:,597400.0
Df Residuals:,21588,BIC:,597400.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.191e+05,1765.845,293.955,0.000,5.16e+05,5.23e+05
bedrooms,-4.577e+04,2085.975,-21.940,0.000,-4.99e+04,-4.17e+04
bathrooms,8061.4267,2582.683,3.121,0.002,2999.177,1.31e+04
sqft_living,2.596e+05,2834.420,91.600,0.000,2.54e+05,2.65e+05
sqft_lot,-1.568e+04,1706.640,-9.186,0.000,-1.9e+04,-1.23e+04
Fair_view,1.601e+05,1.37e+04,11.705,0.000,1.33e+05,1.87e+05
Average_view,1.201e+05,8230.572,14.589,0.000,1.04e+05,1.36e+05
Good_view,2.058e+05,1.12e+04,18.327,0.000,1.84e+05,2.28e+05
Excellent_view,5.866e+05,1.42e+04,41.368,0.000,5.59e+05,6.14e+05

0,1,2,3
Omnibus:,13680.003,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,508827.083
Skew:,2.495,Prob(JB):,0.0
Kurtosis:,26.249,Cond. No.,12.9


My final model using the log of y:

In [14]:
sm.OLS(y_log, x_c2).fit().summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.516
Model:,OLS,Adj. R-squared:,0.516
Method:,Least Squares,F-statistic:,2877.0
Date:,"Wed, 26 Oct 2022",Prob (F-statistic):,0.0
Time:,10:49:10,Log-Likelihood:,-8956.2
No. Observations:,21597,AIC:,17930.0
Df Residuals:,21588,BIC:,18000.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,13.0220,0.003,4939.362,0.000,13.017,13.027
bedrooms,-0.0418,0.003,-13.420,0.000,-0.048,-0.036
bathrooms,0.0438,0.004,11.362,0.000,0.036,0.051
sqft_living,0.3355,0.004,79.272,0.000,0.327,0.344
sqft_lot,-0.0134,0.003,-5.258,0.000,-0.018,-0.008
Fair_view,0.2529,0.020,12.387,0.000,0.213,0.293
Average_view,0.1990,0.012,16.196,0.000,0.175,0.223
Good_view,0.2582,0.017,15.406,0.000,0.225,0.291
Excellent_view,0.5099,0.021,24.085,0.000,0.468,0.551

0,1,2,3
Omnibus:,13.482,Durbin-Watson:,1.973
Prob(Omnibus):,0.001,Jarque-Bera (JB):,14.437
Skew:,-0.031,Prob(JB):,0.000733
Kurtosis:,3.111,Cond. No.,12.9
