In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, accuracy_score, mean_squared_error
%matplotlib inline

In [2]:
# Read the training CSV and keep the regression target as a plain array.
df = pd.read_csv("train.csv")
y = df.loc[:, 'SalePrice'].values

In [3]:
# Work on a copy that holds only the string-typed (object) columns.
obj_df = df.select_dtypes(include=['object']).copy()

# Columns whose missing entries are replaced by 0. Keys that are not present
# in obj_df are simply ignored by fillna.
zero_fill_columns = [
    "Alley", "MasVnrArea", "MasVnrType",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "PoolQC", "Fence", "MiscFeature",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "FireplaceQu",
]
fill_values = {name: 0 for name in zero_fill_columns}
# "Electrical" is the one column filled with a category label instead of 0.
fill_values["Electrical"] = "SBrkr"

obj_df = obj_df.fillna(fill_values)


Prepping the data for K-Nearest Neighbors

In [4]:
# Encode every string (object) column as integer category codes.
# pandas assigns the code -1 to missing values.
for name in list(df.columns):
    if df[name].dtype == "object":
        df[name] = df[name].astype('category').cat.codes

# Fill the remaining numeric gaps: -1 is used as a sentinel for LotFrontage
# and GarageYrBlt, 0 for MasVnrArea.
df = df.fillna({'LotFrontage': -1, "MasVnrArea": 0, "GarageYrBlt": -1})

# The target and the row id are not features.
df = df.drop(labels=["SalePrice", "Id"], axis=1)

# Shift every column that contains the -1 sentinel up by one so that no
# feature is negative. The test has to look at the column's *values*:
# `-1 in df[column]` checks the index labels and is never true here, so the
# shift would silently be skipped. A constant shift of a whole column leaves
# the distances between rows unchanged.
for column in list(df):
    if (df[column] == -1).any():
        df[column] = df[column] + 1

In [5]:
# Sanity check: list the columns that still hold at least one missing value.
has_missing = df.isnull().any(axis=0)
df.columns[has_missing]

Index([], dtype='object')

Running K-Nearest Neighbors:

In [6]:
# Feature matrix and target used for the K-Nearest Neighbors search.
X_train, y_train = df.values, y

# Hyper-parameters to try: neighbor count, neighbor weighting and the
# Minkowski power parameter.
knn_param_grid = {
    "n_neighbors": [1, 3, 5, 7, 9, 11],
    "weights": ['uniform', 'distance'],
    'p': [1, 2, 3],
}

reg = KNeighborsRegressor()
gridsearch = GridSearchCV(reg, knn_param_grid, scoring='neg_mean_squared_error')
gridsearch.fit(X_train, y_train)
print("Best Params: {}".format(gridsearch.best_params_))

# Predictions on the very rows the search was fitted on.
y_pred_train = gridsearch.predict(X_train)


Best Params: {'n_neighbors': 7, 'p': 1, 'weights': 'distance'}


In [7]:
# Score the fitted search on the training rows. The search was built with
# scoring='neg_mean_squared_error', so the number shown is the negative MSE
# on the training data, not an R^2.
# Use y_train (the target the search was fitted on) rather than the global
# `y`, which a later cell rebinds to a different target.
gridsearch.score(X_train, y_train)

-0.0

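The score is -0.0, but this is the negative mean squared error on the training data (the grid search was built with `scoring='neg_mean_squared_error'`), not an R^2. With `weights='distance'` every training point is its own nearest neighbor, so a training error of 0 only shows that the model memorizes the training set; it says nothing about how well it generalizes. I'm going to choose a different model.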

# Linear Regression

In [8]:
# Start again from the raw CSV files for the linear model.
df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Keep the target aside, then strip the non-feature columns from both frames.
Y = df.loc[:, 'SalePrice'].values
df = df.drop(["SalePrice", "Id"], axis="columns")
test = test.drop(["Id"], axis="columns")

# Copies holding only the non-object (numeric) columns.
num_df = df.select_dtypes(exclude=['object']).copy()
num_test = test.select_dtypes(exclude=['object']).copy()

In [9]:
# Copies holding only the string-typed (object) columns of each frame.
obj_df = df.select_dtypes(include=['object']).copy()
obj_test = test.select_dtypes(include=['object']).copy()

# Names of the columns to one-hot encode.
object_names = list(obj_df.columns)
object_t_names = list(obj_test.columns)

# Train and test are encoded independently, so the resulting dummy columns
# are not guaranteed to match between the two frames.
dummified = pd.get_dummies(obj_df, columns=object_names)
test_dumm = pd.get_dummies(obj_test, columns=object_t_names)



There are many missing values for "LotFrontage." I looked up what lot frontage meant and it's hard to understand a scenario where a house has 0 lot frontage. It's the length of property that is adjacent to a street.

So I want to regress "LotFrontage" on "LotArea" over the non-missing values to see whether "LotArea" predicts "LotFrontage" well enough to fill in the rows where it is missing.

In [10]:
from sklearn import linear_model

# Check how well LotArea alone explains LotFrontage, using only the rows
# where LotFrontage is present.
reg_frontage = linear_model.LinearRegression()

# dropna returns a new frame, so df itself is left untouched. `axis` and
# `how` are passed by keyword: pandas 2.0 no longer accepts them positionally.
temp = df.dropna(axis=0, how='any', subset=['LotFrontage'])

X = temp['LotArea'].values
y = temp['LotFrontage'].values

# scikit-learn expects a 2-D feature matrix, hence the reshape to one column.
reg_frontage.fit(X.reshape(-1, 1), y.reshape(-1, 1))

# R^2 of the fit, measured on the same rows it was fitted on.
reg_frontage.score(X.reshape(-1, 1), y.reshape(-1, 1))



0.18155696502214747

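Regressing lot frontage on lot area didn't predict well enough (R^2 of about 0.18), so instead of using that model to guess the missing lot frontage, I'll just fill each missing value with the value from the previous row (a forward fill), which amounts to borrowing an arbitrary value from the dataset.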

In [11]:
# Rebuild each feature matrix: numeric columns first, one-hot columns after.
train_parts = [num_df, dummified]
test_parts = [num_test, test_dumm]
df = pd.concat(train_parts, axis="columns")
test = pd.concat(test_parts, axis="columns")

In [12]:
# Fill each missing value with the last valid value above it in the same
# column (forward fill). DataFrame.ffill() replaces fillna(method='ffill'),
# whose `method` argument is deprecated in recent pandas releases.
# Note: a column whose very first row is missing keeps that NaN.
df = df.ffill()
test = test.ffill()

In [13]:
# Ordinary least squares on the full training matrix (fit returns the
# estimator itself, so it can be chained).
linear = linear_model.LinearRegression().fit(df.values, Y)

# R^2 measured on the same rows the model was fitted on.
linear.score(df.values, Y)



0.93325245783166055

I got an R^2 of 0.933 (measured on the training data).
I'm going to use linear regression.

In [14]:
# Training columns that the independently encoded test frame lacks
# (kept for inspection).
missing_cols = set(df.columns) - set(test.columns)

# Give the test frame exactly the training columns, in training order:
# columns missing from test are created and filled with 0, and columns that
# exist only in test are dropped. A single reindex does this in one step
# instead of inserting the columns one at a time, which fragments the frame.
# fill_value only applies to newly created columns; existing values are kept.
test = test.reindex(columns=df.columns, fill_value=0)

linear.predict(test.values)

array([ 112635.42725312,  159149.58647423,  186576.56147083, ...,
        175875.71209857,  116070.88545564,  223328.18465103])