In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score
from data_util import *

In [2]:
set_a = pd.read_csv('AmesHousingSetA.csv')
set_b = pd.read_csv('AmesHousingSetB.csv')

data_a_x = set_a[list(set_a)[:-1]]
data_a_y = set_a[list(set_a)[-1]]
#data_a_x.head()



In [3]:
print('Initial shape of the table: ' + str(data_a_x.shape) + 
      '\n')
print('Initial number of columns of each type: \n'+
      str(data_a_x.dtypes.value_counts()) +
      '\n')

Initial shape of the table: (2344, 80)

Initial number of columns of each type: 
object     43
int64      26
float64    11
dtype: int64



## I. Data Preprocessing
#### 1) What specific data transforms did you perform prior to exploration and analysis, and why did you choose these?

In [4]:
# Fill NaN values with the most common variable from that column.
# I can't use impute impute because sklearn.impute is not in the
# latest stable version of scikit

print('Filling NaNs...\n')
data_a_x = data_a_x.apply(lambda x : x.fillna(x.value_counts().index[0]))
data_a_x.head()

Filling NaNs...



Unnamed: 0,PID,MS.SubClass,MS.Zoning,Lot.Frontage,Lot.Area,Street,Alley,Lot.Shape,Land.Contour,Utilities,...,Screen.Porch,Pool.Area,Pool.QC,Fence,Misc.Feature,Misc.Val,Mo.Sold,Yr.Sold,Sale.Type,Sale.Condition
0,526350040,20,RH,80.0,11622,Pave,Grvl,Reg,Lvl,AllPub,...,120,0,Ex,MnPrv,Shed,0,6,2010,WD,Normal
1,526351010,20,RL,81.0,14267,Pave,Grvl,IR1,Lvl,AllPub,...,0,0,Ex,MnPrv,Gar2,12500,6,2010,WD,Normal
2,526353030,20,RL,93.0,11160,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,4,2010,WD,Normal
3,527105010,60,RL,74.0,13830,Pave,Grvl,IR1,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,3,2010,WD,Normal
4,527105030,60,RL,78.0,9978,Pave,Grvl,IR1,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,6,2010,WD,Normal


In [5]:
# PID is a catagorical label that the program may treat as a number
# this could cause the model to correlate a pattern that doesn't exist
print('Deleting PID...\n')
del data_a_x['PID']

Deleting PID...



In [6]:
# One hot encode categorical features
# because we can't have those here
print('Transforming categorical data...\n')
data_a_x = pd.get_dummies(data_a_x, columns=cat_features(data_a_x))

Transforming categorical data...



In [7]:
print('New table shape: ' + str(data_a_x.shape) +
     '\n')
print('New column types: \n' +
     str(data_a_x.dtypes.value_counts()) +
     '\n')

New table shape: (2344, 298)

New column types: 
uint8      262
int64       25
float64     11
dtype: int64



## II. Exploratory Analysis Questions
Perform an exploratory analysis on your data by visualizing and/or applying other means of data exploration.

#### 1) What (if any) insights jump out at you?

I tried to print the KDE model and it almost crashed my computer.

#### 2) Do you have any hypotheses about relationship of certain variables to the price?

Houses with pools probably tend to cost more

## III. Model Building
First construct a baseline model (containing all predictors) to predict the price. Then build the best model you can devise. In this part use ONLY dataset A and DO NOT TOUCH dataset B. You will want to split this into training and test sets and apply error metrics/compare models only on the test data.

In [9]:
# Split into train test sets
print('Spliting base model training and testing data...\n')
a_x_train, a_x_test, a_y_train, a_y_test = train_test_split(data_a_x, data_a_y, test_size=.2, random_state=4)

Spliting base model training and testing data...



In [10]:
# Build a base model
print('Building base model...\n')
base_model = linear_model.LinearRegression()
base_model.fit(a_x_train, a_y_train)
preds = base_model.predict(a_x_test)
print('Regression error report: ')
print_regression_error_report(a_y_test, preds)

Building base model...

Regression error report: 
MSE, MAE, R^2, EVS: [668254290.023182, 11380.980722750304, 0.9014204575625123, 0.902199088676741]


#### 1) What approach did you use to arrive at the best model? Why did you select this approach?

In [11]:
# I'm using recursive feature elimination with cross validation
# because I don't want to have to go through and find the optimal
# number of features myself

print('Selecting features with RFECV...\n')
selector_f = RFECV(estimator = linear_model.LinearRegression(), 
                   scoring = make_scorer(r2_score), 
                   cv=5)
selector_f.fit(a_x_train, a_y_train)

# Get cols
axt_train, axt_test = selector_f.transform(a_x_train), selector_f.transform(a_x_test)

print('Building model...\n')
# Create models
model = linear_model.LinearRegression()
model.fit(axt_train, a_y_train)

# Makes predictions
print('Testing model...\n')
preds = model.predict(axt_test)
print('Regression error report: ')
print_regression_error_report(preds, a_y_test)

Selecting features with RFECV...

Building model...

Testing model...

Regression error report: 
MSE, MAE, R^2, EVS: [668254290.023182, 11380.980722750304, 0.8864990710646194, 0.8873955588406606]


#### 2) Which error metric(s) are you using to compare performance? What is the value(s) of the error metric(s) for the baseline model and your best model?

I'm primarily using R^2 but it's lower for RFECV (.8865) than it is for the baseline model(.9014). The high MSE's imply that this is likely due to outliers in the data but I can't find them.

## IV. Predicting and Validating
Run your baseline and best models on dataset B. DO NOT do any further training. Remember to apply all transforms you used in building the model to this set (use the transform function on the preprocessors you created in part I).

In [12]:
data_b_x = set_b[list(set_b)[:-1]]
data_b_y = set_b[list(set_b)[-1]]

print(data_b_x.shape)

(586, 80)


In [13]:
print('Deleting PID...\n')
del data_b_x['PID']

print('Filling NaNs...\n')
data_b_x = data_b_x.apply(lambda x : x.fillna(x.value_counts().index[0]))

print('Transforming categorical data...\n')
data_b_x = pd.get_dummies(data_b_x, columns=cat_features(data_b_x))
print(data_b_x.shape)

Deleting PID...

Filling NaNs...

Transforming categorical data...

(586, 269)


In [14]:
# for col in columns of data a x
#  if col not in dolumns of data b x
#   add a col of name col with modes from data a


for col in data_a_x.columns:
    if col not in data_b_x.columns:
        data_b_x[col] = [data_a_x[col].value_counts().index[0]] * data_b_x.shape[0]

# for col in cols of data b x
#  if col not in cols of data a x
#   remove them

for col in data_b_x.columns:
    if col not in data_a_x.columns:
        del data_b_x[col]

print(data_b_x.shape)


(586, 298)


In [16]:
# use the linear model
preds = base_model.predict(data_b_x)
print('Linear regression error report: ')
print_regression_error_report(data_b_y, preds)
# wow that was really bad

Linear regression error report: 
MSE, MAE, R^2, EVS: [89342538566.50015, 260867.23625385552, -14.904138110159526, -2.0120210115361163]


In [17]:
# Use the better model
print('Testing model...\n')
preds = model.predict(data_b_x)
print('Regression error report: ')
print_regression_error_report(preds, data_b_y)

Testing model...

Regression error report: 
MSE, MAE, R^2, EVS: [89342538566.50015, 260867.23625385552, -3.1694089815978037, 0.2103723338369563]


#### 1) What are the respective error metric values for each model on this set? How did your best model do on this data as compared to the baseline?

The base model got an R^2 of -14 and the best model got an R^2 of -3. This may be due to the columns that were not in B containing the most frequent values from a, skewing the predictions

#### 2) Is your best model a good model? Why or why not

No, it couldn't make predictions with any remote amount of accuracy.