In [48]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import Binarizer, Imputer, OneHotEncoder, StandardScaler

In [35]:
'''Classes for Feature Extractor and Categorical Extractor. The genius of Richard making his own classes from Sklearn BaseClasses'''
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Transformer that pulls a single column out of a tabular input.

    Parameters
    ----------
    column : label
        Name of the column to select. The input passed to ``transform``
        must support ``X[[column]]`` selection (e.g. a pandas DataFrame).
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the configured column as a 2-D array (one column wide).

        ``y`` is accepted for API compatibility and ignored.
        """
        # Selecting with a one-element list keeps the result two-dimensional,
        # which is the layout downstream scikit-learn steps expect.
        selected = X[[self.column]]
        return selected.values

class CategoricalExtractor(BaseEstimator, TransformerMixin):
    """Encode one categorical column as positive integer codes.

    ``fit`` assigns each distinct value of ``column`` a code starting at 1,
    following the order of ``Series.value_counts()`` (by default: most
    frequent value first, missing values excluded). ``transform`` replaces
    every value with its code; values that were not seen during ``fit``
    are encoded as 0.

    Parameters
    ----------
    column : label
        Name of the column to encode.

    Attributes
    ----------
    values : dict or None
        Mapping ``{category: code}`` learned by ``fit``. ``None`` until
        ``fit`` has been called.
    """

    def __init__(self, column):
        self.column = column
        self.values = None

    def _create_values(self, indices):
        """Map every entry of ``indices`` to its 1-based position."""
        return {ind: code for code, ind in enumerate(indices, start=1)}

    def _apply_values(self, row_val):
        """Return the learned code for ``row_val``, or 0 when it is unknown."""
        return self.values.get(row_val, 0)

    def fit(self, X, y=None):
        """Learn the category-to-code mapping from ``X[self.column]``."""
        self.values = self._create_values(X[self.column].value_counts().index)
        return self

    def transform(self, X, y=None):
        """Return the encoded column as an array with a single column.

        Raises
        ------
        NotFittedError
            If called before ``fit``. ``NotFittedError`` derives from both
            ``ValueError`` and ``AttributeError``, so code that handled the
            previous ``AttributeError`` keeps working.
        """
        if self.values is None:
            # Without this guard the failure surfaces deep inside
            # ``Series.apply`` as "'NoneType' object has no attribute 'get'".
            raise NotFittedError(
                "This CategoricalExtractor instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        col = X[self.column].apply(self._apply_values)
        # reshape(-1, 1) turns the 1-D codes into the 2-D layout expected
        # by downstream scikit-learn steps.
        return col.values.reshape(-1, 1)

In [36]:
# Load the training set from the repository-relative datasets folder and
# preview the first rows.
TRAIN_CSV = 'datasets/train.csv'
df = pd.read_csv(TRAIN_CSV)
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,Normal,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,,,,0,4,2009,WD,Normal,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,,,,0,1,2010,WD,Abnorml,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,Normal,138500


In [37]:
# Remove the spaces from the column names (e.g. 'MS SubClass' -> 'MSSubClass').
df.columns = df.columns.map(lambda name: name.replace(' ', ''))

In [38]:
# Sanity check: the first few column names should no longer contain spaces.
df.columns[:5]

Index(['Id', 'PID', 'MSSubClass', 'MSZoning', 'LotFrontage'], dtype='object')

In [39]:
# Correlation of every numeric/boolean feature with the target.
# The numeric columns are selected explicitly because DataFrame.corr() no
# longer drops non-numeric columns on its own in pandas >= 2.0 (it raises).
numeric_df = df.select_dtypes(include=[np.number, 'bool'])

# Drop the target by label rather than by position, so this does not rely on
# 'SalePrice' being the last column of the frame.
traindata_corr = numeric_df.corr()['SalePrice'].drop('SalePrice')

# Keep the features with a strictly positive correlation, in column order.
positively_corelated = traindata_corr[traindata_corr > 0]
positively_corelated_columns = list(positively_corelated.index)
positively_corelated_columns

['LotFrontage',
 'LotArea',
 'OverallQual',
 'YearBuilt',
 'YearRemod/Add',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'BsmtFullBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MoSold']

In [40]:
# Rank the positively correlated features from strongest to weakest.
positively_corelated.sort_values(ascending=False)

OverallQual      0.800207
GrLivArea        0.697038
GarageArea       0.650270
GarageCars       0.648220
TotalBsmtSF      0.628925
1stFlrSF         0.618486
YearBuilt        0.571849
YearRemod/Add    0.550370
FullBath         0.537969
GarageYrBlt      0.533922
MasVnrArea       0.512230
TotRmsAbvGrd     0.504014
Fireplaces       0.471093
BsmtFinSF1       0.423519
LotFrontage      0.341842
OpenPorchSF      0.333476
WoodDeckSF       0.326490
LotArea          0.296566
BsmtFullBath     0.283662
HalfBath         0.283001
2ndFlrSF         0.248452
BsmtUnfSF        0.190210
BedroomAbvGr     0.137067
ScreenPorch      0.134581
3SsnPorch        0.048732
MoSold           0.032735
PoolArea         0.023106
BsmtFinSF2       0.016255
Name: SalePrice, dtype: float64

For this analysis I will use the two features most strongly correlated with `SalePrice`: *OverallQual* and *GrLivArea*.

In [41]:
# Feature matrix: the two columns chosen from the correlation ranking.
feature_columns = ['OverallQual', 'GrLivArea']
X = df[feature_columns]

In [42]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,OverallQual,GrLivArea
0,6,1479
1,7,2122
2,5,1057
3,5,1444
4,6,1445


In [43]:
# Target: kept as a one-column DataFrame (list selection), not a Series.
target_columns = ['SalePrice']
y = df[target_columns]

In [44]:
# Preview the first five rows of the target.
y.head(n=5)

Unnamed: 0,SalePrice
0,130500
1,220000
2,109000
3,174000
4,138500


In [45]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that a
# Restart & Run All reproduces the same split and therefore the same scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [52]:
# Build the per-feature preprocessing pipelines.
# OverallQual: extract the column from the frame, then standardize it.
quality_pipeline = Pipeline(steps=[
    ('featureextractor', FeatureExtractor('OverallQual')),
    ('standardscaler', StandardScaler()),
])

In [54]:
# GrLivArea: extract the column from the frame, then standardize it.
livarea_pipeline = Pipeline(steps=[
    ('featureextractor', FeatureExtractor('GrLivArea')),
    ('standardscaler', StandardScaler()),
])

In [59]:
# Combine both per-feature pipelines; their outputs are stacked side by side.
feature_un = FeatureUnion(transformer_list=[
    ('pipeline-1', quality_pipeline),
    ('pipeline-2', livarea_pipeline),
])

In [64]:
# Fit the feature union on its own to inspect it before it goes into the
# model pipeline.
feature_un.fit(X=X_train)



FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('featureextractor', FeatureExtractor(column='GrLivArea')), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('pipeline-2', Pipeline(steps=[('featureextractor', FeatureExtractor(column='GrLivArea')), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True))]))],
       transformer_weights=None)

In [66]:
# Preview the first five transformed rows (one column per union branch).
train_features = feature_un.transform(X_train)
train_features[:5]



array([[-1.03478811, -1.03478811],
       [ 2.36837038,  2.36837038],
       [ 0.00377245,  0.00377245],
       [ 0.61516678,  0.61516678],
       [-0.13186801, -0.13186801]])

In [67]:
# Full model: feature union followed by an ordinary least-squares regression.
model_steps = [
    ('Features', feature_un),
    ('Linear Reg', LinearRegression()),
]
model = Pipeline(steps=model_steps)

In [68]:
# Fit the preprocessing steps and the regression on the training split.
model.fit(X=X_train, y=y_train)



Pipeline(steps=[('Features', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('featureextractor', FeatureExtractor(column='GrLivArea')), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('pipeline-2', Pipeline(steps=[('featureextractor', FeatureE...one)), ('Linear Reg', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [71]:
# Score on the training split (the regressor's default score, R^2).
model.score(X=X_train, y=y_train)



0.48203131864834459

In [72]:
# Score on the held-out test split (the regressor's default score, R^2).
model.score(X=X_test, y=y_test)



0.50046518860331823

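The model's performance is poor (R² ≈ 0.5). However, the main goal of this lab was to practice building pipelines. Note that the recorded outputs above show both branches of the feature union extracting `GrLivArea` (the two transformed columns are identical), so these scores come from a stale kernel state in which only `GrLivArea` was used; restart the kernel and run all cells to score the intended two-feature model.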