In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

# get rid of float conversion warning during standard scaler
warnings.filterwarnings("ignore")

**Pipeline()** is to **make_pipeline()** as
**FeatureUnion()** is to **make_union()**

**Pipelines** stick things on top of each other (one transformer after the prior)

**FeatureUnions** stick things next to each other (like french fries side by side: each transformer's output columns are placed NEXT to the previous transformer's output)

Bonus: [FunctionTransformer()](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html#sklearn.preprocessing.FunctionTransformer) - makes a function into a fit/transform class object
    
1. **Pipeline():**
    * takes a list of tuples
    * ('name_of_pipeline_object', Class_that_transforms())
2. **make_pipeline():**
    * takes a comma-separated list of steps (must be fit/transformable classes)
    * returns a pipeline, with each step named the lower-case version of itself, i.e. StandardScaler() becomes 'standardscaler'
3. **FeatureUnion():**
    * takes a list of `('name', transformer)` tuples, where each transformer is a pipeline OR any fit/transformable class

In [2]:
# Read in the housing data set as a DataFrame.
# The path is relative, so it resolves against the directory the notebook runs from.
# Later cells read the 'Central Air', 'Lot Area', 'Year Built', 'Overall Qual'
# and 'Sale Condition' columns from this frame.
df = pd.read_csv('./datasets/train.csv')

## Define transformers

Transformers have .fit() and .transform() methods and can be stacked in a pipeline (one after another, up/down) or combined in a union (side by side)

In [3]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Pull one named column out of a DataFrame (Richard method).

    The transformer is stateless: ``fit`` learns nothing. ``transform``
    selects the column with a one-element list, so the result is a 2-D
    numpy array with a single column rather than a flat 1-D array.

    Example: FeatureExtractor('colHeader').transform(df)
    """
    def __init__(self, column):
        # label of the column to extract
        self.column = column

    def fit(self, X, y=None):
        # nothing to learn from the data
        return self

    def transform(self, X, y=None):
        # double brackets keep the selection as a DataFrame -> 2-D .values
        single_column_frame = X[[self.column]]
        return single_column_frame.values
    
class FeaturesExtractor(BaseEstimator, TransformerMixin):
    """
    Accepts a list of columns and returns the columns as a 2-D numpy array
    Greg method
    FeaturesExtractor(['colHeader1', 'colHeader2']).transform(df)

    Parameters
    ----------
    columns : list of str (a single column label is also accepted)
        Labels of the columns to pull out of the DataFrame given to transform().

    Returns (from transform)
    ------------------------
    numpy array of shape (n_rows, n_columns) -- always 2-D, including when
    only one column is requested.

    NOTE: a single-column request used to come back as a flat 1-D array, which
    made the pipe complain about dissimilar array sizes during .fit() when this
    transformer sat in a union next to 2-D outputs, see
    https://github.com/scikit-learn/scikit-learn/issues/2034
    """
    def __init__(self, columns):
        # stored untouched; normalised lazily in transform()
        self.columns = columns

    def fit(self, X, y=None):
        # stateless: nothing to learn
        return self

    def transform(self, X, y=None):
        # A bare string would otherwise be split into characters by list().
        if isinstance(self.columns, str):
            selected = [self.columns]
        else:
            selected = list(self.columns)
        # Indexing with a list always yields a DataFrame, so .values is 2-D
        # no matter how many columns were requested.
        return X[selected].values

class CategoricalExtractor(BaseEstimator, TransformerMixin):
    """
    Integer-encodes a categorical (string) column. Not used in this notebook.

    ``fit`` numbers the categories 1, 2, 3, ... in the order returned by
    ``value_counts()`` on the column. ``transform`` looks every row up in that
    mapping and emits 0 for any value that was not seen during ``fit``.
    The output is one column of integer codes with shape (n_rows, 1); it is
    not a one-hot matrix.
    """
    def __init__(self, column):
        self.column = column
        # category -> integer code mapping, filled in by fit()
        self.values = None

    def _create_values(self, indices):
        # 1-based codes, leaving 0 free for unseen categories
        return {label: code for code, label in enumerate(indices, start=1)}

    def _apply_values(self, row_val):
        # unseen categories fall back to 0
        return self.values.get(row_val, 0)

    def fit(self, X, y=None):
        observed_categories = X[self.column].value_counts().index
        self.values = self._create_values(observed_categories)
        return self

    def transform(self, X, y=None):
        codes = X[self.column].apply(self._apply_values)
        # reshape to a column vector
        return codes.values.reshape(-1, 1)
    
class YesNoEncoder(BaseEstimator, TransformerMixin):
    """
    Takes a list of columns and returns them Y/N -> 1/0 encoded
    Example: YesNoEncoder(['Column Name']).transform(df)

    Parameters
    ----------
    columns : list of str (a single column label is also accepted)
        Labels of the Y/N columns to encode.

    Returns (from transform)
    ------------------------
    int64 numpy array of shape (n_rows, n_columns): 1 where the cell equals
    'Y', 0 for anything else (including 'N' and missing values).
    """
    def __init__(self, columns):
        # stored untouched; normalised lazily in transform()
        self.columns = columns

    def fit(self, X, y=None):
        # stateless: nothing to learn
        return self

    def transform(self, X, y=None):
        # A bare string would otherwise be split into characters by list().
        if isinstance(self.columns, str):
            selected = [self.columns]
        else:
            selected = list(self.columns)
        # Element-wise comparison works for any number of columns. The previous
        # multi-column branch used DataFrame.apply with a scalar lambda, which
        # receives a whole column at a time and raises on `if <Series>`.
        is_yes = X[selected] == 'Y'
        return is_yes.astype('int64').values
        
class MakeFloat(BaseEstimator, TransformerMixin):
    """
    Converts one named column to a numeric dtype, downcast to float.

    ``transform`` passes the column through ``pd.to_numeric`` with
    ``downcast='float'`` and returns the result as-is: it is 1-D and is not
    reshaped into a column vector.
    """
    def __init__(self, column):
        # label of the column to convert
        self.column = column

    def fit(self, X, y=None):
        # stateless: nothing to learn
        return self

    def transform(self, X, y=None):
        raw_column = X[self.column]
        return pd.to_numeric(raw_column, downcast='float')

## Create an 'X' matrix using make_union(), then again using FeatureUnion()

### make_union method

In [7]:
# Encode Central Air (Y/N) as 1/0, then tack each remaining feature on to its RIGHT.
# make_union() hands back another transformer object (a FeatureUnion) and labels
# every member automatically from its class name, i.e. 'yesnoencoder'.
combine_features_union = make_union(
    YesNoEncoder(['Central Air']),
    *(FeatureExtractor(feature) for feature in ('Lot Area', 'Year Built', 'Overall Qual'))
)

# display the (name, transformer) pairs held by the union, in left-to-right order
combine_features_union.transformer_list

[('yesnoencoder', YesNoEncoder(columns=['Central Air'])),
 ('featureextractor-1', FeatureExtractor(column='Lot Area')),
 ('featureextractor-2', FeatureExtractor(column='Year Built')),
 ('featureextractor-3', FeatureExtractor(column='Overall Qual'))]

### FeatureUnion() method

In [8]:
# FeatureUnion() takes an explicit list of (name, transformer) tuples, which lets us
# choose the name of each member (and, optionally, weight the transformers)
combine_features_union = FeatureUnion(
    [('yne', YesNoEncoder(['Central Air']))]
    + [
        (alias, FeatureExtractor(feature))
        for alias, feature in (('la', 'Lot Area'), ('yb', 'Year Built'), ('oq', 'Overall Qual'))
    ]
)

# display the (name, transformer) pairs held by the union, in left-to-right order
combine_features_union.transformer_list

[('yne', YesNoEncoder(columns=['Central Air'])),
 ('la', FeatureExtractor(column='Lot Area')),
 ('yb', FeatureExtractor(column='Year Built')),
 ('oq', FeatureExtractor(column='Overall Qual'))]

## Make the pipeline

### No grid search, make_pipeline() method 

In [9]:
# make_pipeline() names every step after its lower-cased class name:
# union of features -> standard scaling -> random forest classifier
pipe = make_pipeline(
    combine_features_union,
    StandardScaler(),
    RandomForestClassifier()
)

# print out the steps
# A plain loop is used because print() is called only for its side effect; wrapping
# it in a list comprehension also makes the cell echo a useless [None, None, None].
for step_number, (step_name, step_estimator) in enumerate(pipe.steps, start=1):
    print('Step {} NAME: {}\n Step {} TRANSFORMER: {}\n'
          .format(step_number, step_name, step_number, step_estimator))

Step 1 NAME: featureunion
 Step 1 TRANSFORMER: FeatureUnion(n_jobs=1,
       transformer_list=[('yne', YesNoEncoder(columns=['Central Air'])), ('la', FeatureExtractor(column='Lot Area')), ('yb', FeatureExtractor(column='Year Built')), ('oq', FeatureExtractor(column='Overall Qual'))],
       transformer_weights=None)

Step 2 NAME: standardscaler
 Step 2 TRANSFORMER: StandardScaler(copy=True, with_mean=True, with_std=True)

Step 3 NAME: randomforestclassifier
 Step 3 TRANSFORMER: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)



[None, None, None]

### No grid search, Pipeline() method 

In [10]:
# Same three steps as the make_pipeline() version, but Pipeline() takes
# (name, estimator) tuples, so the steps carry the names we chose:
# 'cfu', 'ss' and 'clf'
pipe = Pipeline([
    ('cfu', combine_features_union),
    ('ss', StandardScaler()),
    ('clf', RandomForestClassifier())
])

# print out the steps
# A plain loop is used because print() is called only for its side effect; wrapping
# it in a list comprehension also makes the cell echo a useless [None, None, None].
for step_number, (step_name, step_estimator) in enumerate(pipe.steps, start=1):
    print('Step {} NAME: {}\n Step {} TRANSFORMER: {}\n'
          .format(step_number, step_name, step_number, step_estimator))

Step 1 NAME: cfu
 Step 1 TRANSFORMER: FeatureUnion(n_jobs=1,
       transformer_list=[('yne', YesNoEncoder(columns=['Central Air'])), ('la', FeatureExtractor(column='Lot Area')), ('yb', FeatureExtractor(column='Year Built')), ('oq', FeatureExtractor(column='Overall Qual'))],
       transformer_weights=None)

Step 2 NAME: ss
 Step 2 TRANSFORMER: StandardScaler(copy=True, with_mean=True, with_std=True)

Step 3 NAME: clf
 Step 3 TRANSFORMER: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)



[None, None, None]

## Split and fit, no grid search

In [15]:
# the target y is the 'Sale Condition' column
y = df['Sale Condition']

# X keeps every other column (only the target is dropped from the df)
X = df.drop('Sale Condition', axis=1)

# train test split time: hold out 33% of the rows, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# do 5 fold CV of the whole pipeline on the training rows and report the mean score
cv_scores = cross_val_score(pipe, X_train, y=y_train, cv=5)
print(cv_scores.mean())

0.828956893287


## Let's add in grid searching

In [21]:
# inner grid searches: each one tunes a single family of classifier
knn_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid={'n_neighbors': [3, 6, 9]})
forest_search = GridSearchCV(
    RandomForestClassifier(),
    param_grid={'n_estimators': np.arange(20, 120, 20),
                'max_depth': np.arange(5, 30, 5)})

# the pipeline to search over; its 'clf_gs' step starts out as a placeholder search
# with an empty grid and is swapped for each candidate listed in `grid` below
pipeGS = Pipeline([
    ('cfu', combine_features_union),
    ('ss', StandardScaler()),
    ('clf_gs', GridSearchCV(RandomForestClassifier(), param_grid={}))
])

# outer grid: keys are the names of the pipeline steps, values are the candidate
# objects to try in that step
grid = {
    # only one candidate, so the scaler step is always a StandardScaler
    'ss': [StandardScaler()],
    # choose between the tuned KNN and the tuned random forest
    'clf_gs': [knn_search, forest_search]
}

# create grid search object with above grid searching parameters and fit
gs = GridSearchCV(pipeGS, param_grid=grid)
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('cfu', FeatureUnion(n_jobs=1,
       transformer_list=[('yne', YesNoEncoder(columns=['Central Air'])), ('la', FeatureExtractor(column='Lot Area')), ('yb', FeatureExtractor(column='Year Built')), ('oq', FeatureExtractor(column='Overall Qual'))],
       transformer_weights=None)), ('ss', Stand...     pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'ss': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'clf_gs': [GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weight...     pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
  

In [36]:
# the winning pipeline's 'clf_gs' step is itself a fitted grid search; its
# best_estimator_ is the tuned classifier that was finally selected
winning_search = gs.best_estimator_.named_steps['clf_gs']

print('Training score: {}\nTesting score: {}\nParams of best model: {}'.format(
    gs.score(X_train, y_train),
    gs.score(X_test, y_test),
    winning_search.best_estimator_,
))

Training score: 0.8668122270742358
Testing score: 0.8375184638109305
Params of best model: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


In [None]:
# GS for PCA
# GridSearchCV, PCA and LinearRegression all come from the import cell at the top of
# the notebook. The old `sklearn.grid_search` module no longer exists in current
# scikit-learn; `sklearn.model_selection.GridSearchCV` is its replacement.
#
# NOTE(review): X as built in the split cell is the raw df minus the target, so it
# still holds the 'Central Air' Y/N strings that StandardScaler cannot scale, and y is
# the 'Sale Condition' classification target. Presumably this cell is a template --
# point it at numeric features and a continuous target before running. TODO confirm.

#create the structure of the pipeline so it can be easily gridsearched
modeling_pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('pca', PCA()),
    ('model', LinearRegression())
])

#create parameters: '<step name>__<parameter>' addresses a parameter of a pipeline step
params = {
    'pca__n_components':[1,2,3]
}

# 5-fold search over the number of principal components, using all available cores
gs = GridSearchCV(modeling_pipe, params, verbose=1, cv=5, n_jobs=-1)
gs.fit(X, y)
gs.best_score_