In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# sklearn stuff
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.datasets import make_regression

# model functionality and validation
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# regressor models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor



In [4]:
# data pipeline

# read in file
raw_data = pd.read_csv("AmesHousing.csv", "~")

# see what has been loaded
raw_data.keys()

# select most likely key features
data = raw_data[["SalePrice", "1st Flr SF", "2nd Flr SF", "MS Zoning", "Lot Frontage", "Lot Area", "Land Contour", "Bldg Type", "House Style", "Overall Qual", "Overall Cond", "Year Built", "Year Remod/Add", "Bsmt Qual", "Bsmt Cond", "Heating", "Central Air", "Full Bath", "Half Bath", "Bedroom AbvGr", "Fireplaces", "Garage Type", "Garage Cars", "Pool Area", "Pool QC", "Yr Sold"]]

# ensure we've loaded what we want
data.keys()

# convert to a simplified dataframe for initial work
simple_data = data[["SalePrice", "1st Flr SF", "2nd Flr SF", "Year Built", "Full Bath", "Half Bath", "Bedroom AbvGr", "Garage Cars", "Yr Sold"]]

# fill in nan for 2nd floor square feet with zeros, which makes sense
simple_data.loc[:, "2nd Flr SF"].fillna(0, inplace=True)

# calculate simple square footage
square_feet = simple_data.loc[:,"1st Flr SF"] + simple_data.loc[:,"2nd Flr SF"]

# add this to the data frame then clean up the data frame some
simple_data = pd.concat((simple_data, square_feet), axis=1)
simple_data.drop(0, axis=1, inplace=True)
simple_data.drop("1st Flr SF", axis=1, inplace=True)
simple_data.drop("2nd Flr SF", axis=1, inplace=True)

# fix nans
simple_data["Garage Cars"].fillna(0, inplace=True)

# get Y labels
Y = simple_data.loc[:,"SalePrice"]

simple_data.pop("SalePrice")
print simple_data.describe()

        Year Built    Full Bath    Half Bath  Bedroom AbvGr  Garage Cars  \
count  2930.000000  2930.000000  2930.000000    2930.000000  2930.000000   
mean   1971.356314     1.566553     0.379522       2.854266     1.766212   
std      30.245361     0.552941     0.502629       0.827731     0.761137   
min    1872.000000     0.000000     0.000000       0.000000     0.000000   
25%    1954.000000     1.000000     0.000000       2.000000     1.000000   
50%    1973.000000     2.000000     0.000000       3.000000     2.000000   
75%    2001.000000     2.000000     1.000000       3.000000     2.000000   
max    2010.000000     4.000000     2.000000       8.000000     5.000000   

           Yr Sold  
count  2930.000000  
mean   2007.790444  
std       1.316613  
min    2006.000000  
25%    2007.000000  
50%    2008.000000  
75%    2009.000000  
max    2010.000000  


In [69]:
class RandomForestTransformer(RandomForestRegressor, TransformerMixin):
    '''
    Implement a transform method for Random Forests so we can pipeline it
    '''
    def transform(self, X, *_):
        return self.predict(X)
    

class GradientBoostingTransformer(GradientBoostingRegressor, TransformerMixin):
    '''
    Implement a transform method for Gradient Boosting Regressor so we can pipeline it
    '''
    def transform(self, X, *_):
        return self.predict(X)
    
class SVRTransformer(SVR, TransformerMixin):
    '''
    Implement a transform method for Support Vector Regressor so we can pipeline it
    '''
    def transform(self, X, *_):
        return self.predict(X)
    
class DecisionTreeTransformer(DecisionTreeRegressor, TransformerMixin):
    '''
    Implement a transform method for a Decision Tree regressor so we can pipeline it
    '''
    def transform(self, X, *_):
        return self.dense(self.predict(X))
    
    def dense(self, X, *_):
        return X.todense()
    
class LinearTransformer(LinearRegression, TransformerMixin):
    '''
    Implement a transform method for Linear regressor so we can pipeline it
    '''
    def transform(self, X, *_):
        return self.predict(X)



In [70]:
def build_model():
    '''
    Chain together 5 models for a first layer ensemble
    Do a feature union of the predictions from the first layer
    Then train another model on the results of the first layer
    '''
    pred_union = FeatureUnion(
        transformer_list=[
            ("linear", LinearTransformer()),
            ("decision_tree", DecisionTreeTransformer())
#             ("support_vector", SVRTransformer())
#             ("gradient_boosting", GradientBoostingTransformer()),
#             ("random_forests", RandomForestTransformer())
        ],
        n_jobs=1
    )
    
    model = Pipeline(steps=[
        ('pred_union', pred_union),
        ('linear', LinearRegression())
    ])
    
    return model

In [71]:
# split the data for train and test
X_train, X_eval, y_train, y_eval = train_test_split(simple_data.values, Y, random_state=1337, test_size=0.25)

y_train = np.array(y_train).reshape(-1, 1)
y_eval = np.array(y_eval).reshape(-1, 1)

print "y_train", y_train.shape
print "x_train", X_train.shape
print "y_test", y_eval.shape
print "x_test", X_eval.shape

# build the ensemble
model = build_model()

model.fit(X_train, y_train)
score = model.score(X_eval, y_eval)

print "Score:", score

y_train (2197, 1)
x_train (2197, 6)
y_test (733, 1)
x_test (733, 6)


AttributeError: 'numpy.ndarray' object has no attribute 'todense'