In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy 
import sklearn

In [2]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [3]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from math import sqrt

In [4]:
# read data
kc_data = pd.read_csv('dsc-phase-2-project/data/kc_house_data.csv')
kc_columns = ['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 
              'condition', 'grade', 'sqft_above', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15',
              'sqft_lot15']

In [5]:
# model cannot handle dates? 
# cannot find a singe question mark by searching but getting errors so swapping for 0
# honestly 0 fills all pretty well, it's the mode of most of them and means 'none' for things like basements, which aren't 
# common here 
kc_data = kc_data.drop('date', axis=1).copy() 
kc_data = kc_data.replace({ '?' : '0'}).copy()
kc_data = kc_data.fillna('0').copy()

In [6]:
# seperate your x and y, here I am looking at price 
kcy = kc_data['price'].to_frame()
kcx = kc_data.drop('price', axis=1)

In [7]:
# initialize regression
reg = linear_model.LinearRegression()

In [8]:
# split data into 80/20 training/testing
x_train, x_test, y_train, y_test = train_test_split(kcx, kcy, test_size=0.2, random_state=42)

In [9]:
# train the model 
reg.fit(x_train, y_train)

LinearRegression()

In [10]:
# print coefficients for each feat/column 
print(reg.coef_)

[[-1.71183099e-06 -3.87102866e+04  4.24379327e+04  9.95958833e+01
   9.12212431e-02  9.31481087e+03  6.02923502e+05  5.28609637e+04
   2.84234672e+04  9.87735440e+04  7.97598606e+01  5.37437848e+01
  -2.69685673e+03  2.19609922e+01 -5.63773262e+02  5.87783507e+05
  -2.08042061e+05  2.48641756e+01 -4.64959942e-01]]


In [11]:
# print test data predictions 
y_pred = reg.predict(x_test)
y_pred

array([[ 78156.17737196],
       [388055.85400898],
       [197676.12032759],
       ...,
       [504802.81793527],
       [319667.772386  ],
       [210518.39535254]])

In [12]:
#print vaalues
y_test

Unnamed: 0,price
3686,132500.0
10247,415000.0
4037,494000.0
3437,355000.0
19291,606000.0
...,...
2522,175000.0
11119,394950.0
11973,437000.0
17601,293000.0


In [13]:
# check accuracy with mean sq 
print(np.mean((y_pred - y_test)**2))

price    3.988591e+10
dtype: float64


In [14]:
# check accuracy with mean sq 
print(mean_squared_error(y_test, y_pred))
# very innaccurate? 

39885914010.15816


In [15]:
def calc_slope(X, Y):
    slope = ((X*Y).mean() - X.mean()*Y.mean()) / ((X**2).mean() - (X.mean())**2)
    return slope

calc_slope(y_test, y_pred)

price    0.707044
dtype: float64

In [22]:
#reassign to save space?
kc_data = pd.read_csv('dsc-phase-2-project/data/kc_house_data.csv')

# same drops
kc_data = kc_data.drop('date', axis=1).copy() 
kc_data = kc_data.replace({ '?' : '0'}).copy()
kc_data = kc_data.fillna('0').copy()


kcy = kc_data['price'].to_frame()
kcx = kc_data.drop('price', axis=1)


In [24]:
#Trying KNN
# maybe try 60% train , 20% adj, 20% test?
cv = KFold(n_splits=10, random_state=0)
classifier_pipeline = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=10))
k_pred = cross_val_predict(classifier_pipeline, kcx, kcy, cv=5)



In [None]:
print('RMSE:  ' + str(round(sqrt(mean_squared_error(kcy, k_pred)), 2)))
print('R Squared: ' + str(round(r2_score(kcy, k_pred), 2)))
print('Slope: ' + str(calc_slope(kcy, k_pred)))

In [None]:
kc_data.var()

In [None]:
fig_dims = (12, 8)
fig, ax = plt.subplots(figsize = fig_dims)
sns.heatmap(kc_data.corr(), ax=ax)
plt.show()

In [25]:
#reassign to save space?
kc_data = pd.read_csv('dsc-phase-2-project/data/kc_house_data.csv')

# same drops
kc_data = kc_data.drop('date', axis=1).copy() 
kc_data = kc_data.replace({ '?' : '0'}).copy()
kc_data = kc_data.fillna('0').copy()


kcy = kc_data['price'].to_frame()
kcx = kc_data.drop('price', axis=1)


In [26]:
abs(kc_data.corr()['price'])

id               0.016772
price            1.000000
bedrooms         0.308787
bathrooms        0.525906
sqft_living      0.701917
sqft_lot         0.089876
floors           0.256804
condition        0.036056
grade            0.667951
sqft_above       0.605368
yr_built         0.053953
zipcode          0.053402
lat              0.306692
long             0.022036
sqft_living15    0.585241
sqft_lot15       0.082845
Name: price, dtype: float64

In [27]:
abs(kc_data.corr())['price'][abs(kc_data.corr()['price'])>0.7].drop('price').index.tolist()

['sqft_living']

In [28]:
# rank features by correlation 
# 0.7 returning empty array, .65 best correlation we're getting 

vals = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.65]

for val in vals:
    features = abs(kc_data.corr())['price'][abs(kc_data.corr()['price'])>val].drop('price').index.tolist()
    
    x = kc_data.drop(columns='price')
    x = x[features]
    
    y_pred = cross_val_predict(classifier_pipeline, x, kcy, cv=cv)
    
    print(features)
    print('RMSE:  ' + str(round(sqrt(mean_squared_error(kcy, y_pred)), 2)))
    print('R Squared: ' + str(round(r2_score(kcy, y_pred), 2)))

['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'grade', 'sqft_above', 'lat', 'sqft_living15']
RMSE:  195765.75
R Squared: 0.72
['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'grade', 'sqft_above', 'lat', 'sqft_living15']
RMSE:  195765.75
R Squared: 0.72
['bedrooms', 'bathrooms', 'sqft_living', 'grade', 'sqft_above', 'lat', 'sqft_living15']
RMSE:  197788.25
R Squared: 0.71
['bathrooms', 'sqft_living', 'grade', 'sqft_above', 'sqft_living15']
RMSE:  233598.39
R Squared: 0.6
['bathrooms', 'sqft_living', 'grade', 'sqft_above', 'sqft_living15']
RMSE:  233598.39
R Squared: 0.6
['sqft_living', 'grade', 'sqft_above']
RMSE:  239910.06
R Squared: 0.57
['sqft_living', 'grade']
RMSE:  244892.09
R Squared: 0.56


In [29]:
# feature selection using wrapper 
kc_data['sqft_living'] = kc_data['sqft_living'].astype('category')
dummies = pd.get_dummies(kc_data['sqft_living'])
kc_dum = kc_data.drop(columns='sqft_living').merge(dummies, left_index=True, right_index=True )

In [35]:
sfs1 = SFS(classifier_pipeline,
          k_features = 16,
          forward = True,
          scoring = 'neg_mean_squared_error',
          cv=cv)

In [37]:
kcx = kc_data.drop('price', axis=1)

In [38]:
sfs1.fit(kcx, kcy)


STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

SequentialFeatureSelector(cv=KFold(n_splits=10, random_state=0, shuffle=False),
                          estimator=Pipeline(steps=[('standardscaler',
                                                     StandardScaler()),
                                                    ('kneighborsregressor',
                                                     KNeighborsRegressor(n_neighbors=10))]),
                          k_features=16, scoring='neg_mean_squared_error')

In [None]:
sfs1.subsets_

In [31]:
kcy = kc_data['price'].to_frame()
kcx = kc_data.drop('price', axis=1)
y_pred = cross_val_predict(classifier_pipeline, kcx, kcy, cv=cv)
print('RMSE:  ' + str(round(sqrt(mean_squared_error(kcy, y_pred)), 2)))
print('R Squared: ' + str(round(r2_score(kcy, y_pred), 2)))

KeyboardInterrupt: 

Note: you may need to restart the kernel to use updated packages.


ERROR: unknown command "update"

