# Playground for Pipeline Slides

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

Great documentation can be found on scikit-learn's website:

https://scikit-learn.org/stable/modules/compose.html

In [1]:
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.23.1.


In [3]:
import os
os.getcwd()

'C:\\Users\\st50\\Documents\\869_course\\pipeline'

In [4]:
df = pd.read_csv('../data/GermanCredit.csv')
df['Class'] = df['Class'].map({'Good': 1, 'Bad': 0})
df.info()
df.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 62 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   Duration                                1000 non-null   int64
 1   Amount                                  1000 non-null   int64
 2   InstallmentRatePercentage               1000 non-null   int64
 3   ResidenceDuration                       1000 non-null   int64
 4   Age                                     1000 non-null   int64
 5   NumberExistingCredits                   1000 non-null   int64
 6   NumberPeopleMaintenance                 1000 non-null   int64
 7   Telephone                               1000 non-null   int64
 8   ForeignWorker                           1000 non-null   int64
 9   Class                                   1000 non-null   int64
 10  CheckingAccountStatus.lt.0              1000 non-null   int64
 11  CheckingAccountSta

Unnamed: 0,Duration,Amount,InstallmentRatePercentage,ResidenceDuration,Age,NumberExistingCredits,NumberPeopleMaintenance,Telephone,ForeignWorker,Class,...,OtherInstallmentPlans.Bank,OtherInstallmentPlans.Stores,OtherInstallmentPlans.None,Housing.Rent,Housing.Own,Housing.ForFree,Job.UnemployedUnskilled,Job.UnskilledResident,Job.SkilledEmployee,Job.Management.SelfEmp.HighlyQualified
0,6,1169,4,4,67,2,1,0,1,1,...,0,0,1,0,1,0,0,0,1,0
1,48,5951,2,2,22,1,1,1,1,0,...,0,0,1,0,1,0,0,0,1,0
2,12,2096,2,3,49,1,2,1,1,1,...,0,0,1,0,1,0,0,1,0,0
3,42,7882,2,4,45,1,2,1,1,1,...,0,0,1,0,0,1,0,0,1,0
4,24,4870,3,4,53,2,2,1,1,0,...,0,0,1,0,0,1,0,0,1,0


In [5]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df.describe().T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Duration,1000.0,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
Amount,1000.0,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
InstallmentRatePercentage,1000.0,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
ResidenceDuration,1000.0,2.845,1.103718,1.0,2.0,3.0,4.0,4.0
Age,1000.0,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
NumberExistingCredits,1000.0,1.407,0.577654,1.0,1.0,1.0,2.0,4.0
NumberPeopleMaintenance,1000.0,1.155,0.362086,1.0,1.0,1.0,1.0,2.0
Telephone,1000.0,0.596,0.490943,0.0,0.0,1.0,1.0,1.0
ForeignWorker,1000.0,0.963,0.188856,0.0,1.0,1.0,1.0,1.0
Class,1000.0,0.7,0.458487,0.0,0.0,1.0,1.0,1.0


In [6]:
from sklearn.model_selection import train_test_split

X = df.drop(['Class'], axis=1)
y = df['Class']

feature_names = list(X.columns)
target_feature = 'Class'
numeric_features = ['Duration', 'Amount', 'InstallmentRatePercentage', 'ResidenceDuration', 'Age', 'NumberExistingCredits','NumberPeopleMaintenance']
bool_features = set(feature_names) - set(numeric_features) - set(target_feature)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape
y_train.shape
X_test.shape
y_test.shape

(800, 61)

(800,)

(200, 61)

(200,)

# A Very Simple Pipeline

In [7]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

pipe1 = make_pipeline(StandardScaler(), 
                      PCA(n_components=10), 
                      RFE(SVR(kernel="linear")), 
                      DecisionTreeClassifier(random_state=223))

param_grid = {
    'standardscaler__with_mean': [True, False],
    'pca__n_components': [5, 10, 20],
    'rfe__n_features_to_select': [None, 10, 20],
    'decisiontreeclassifier__max_depth': [None, 3, 10],
    'decisiontreeclassifier__criterion': ['gini', 'entropy'], 
    'decisiontreeclassifier__class_weight':[None, 'balanced'],
}

search = GridSearchCV(pipe1, param_grid, 
                      cv=3, n_jobs=5, scoring='f1_micro', return_train_score=True, verbose=2)

In [8]:
search = search.fit(X_train, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:    3.2s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:   25.1s
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed:   57.6s
[Parallel(n_jobs=5)]: Done 638 tasks      | elapsed:  1.9min
[Parallel(n_jobs=5)]: Done 648 out of 648 | elapsed:  1.9min finished


In [9]:
# Predict testing data by just calling the pipe!
search.predict(X_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1], dtype=int64)

In [10]:
search.score(X_test, y_test)

0.67

# A Pipeline That's A Little More Complicated...

Add a little feature selection, calass imbalance handling, and hyperparameter tuning...

In [11]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV

scaler = StandardScaler()
dt = DecisionTreeClassifier(random_state=223)
rfe = RFE(estimator=dt, n_features_to_select=10)


def is_old(feature):
    feature = np.array(feature)
    return np.array([int(sample) > 60 for sample in feature]).reshape(-1, 1)

Column_trans = ColumnTransformer(
     [('scale', scaler, numeric_features),
      ('is_old', FunctionTransformer(is_old, validate=False), ['Age']),
      ('amount_log', FunctionTransformer(np.log10, validate=False), ['Amount']),
      ],
     remainder='passthrough')

# Since we are adding two new features, we better add them to our list of feature names
feature_names = feature_names + ['is_old', 'amount_log']

pipe2 = Pipeline([('features', Column_trans), ('rfe', rfe), ('dt', dt)])

param_grid = {
    'features__scale__with_mean': [True, False],
    'features__scale__with_std': [True, False],
    'rfe__n_features_to_select': [None, 5, 10, 20, len(feature_names)],
    'dt__max_depth': [None, 3, 10],
    'dt__criterion': ('gini', 'entropy'), 
    'dt__max_features':[None, 'auto'], 
    'dt__max_leaf_nodes':[None, 10],
    'dt__class_weight':[None, 'balanced'],
}

search = GridSearchCV(pipe2, param_grid, cv=3, n_jobs=3, scoring='f1_micro', return_train_score=True, verbose=2)

In [12]:
search.fit(X_train, y_train)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 3 folds for each of 960 candidates, totalling 2880 fits


[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    5.3s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:   17.8s
[Parallel(n_jobs=3)]: Done 359 tasks      | elapsed:   38.1s
[Parallel(n_jobs=3)]: Done 642 tasks      | elapsed:  1.2min
[Parallel(n_jobs=3)]: Done 1007 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done 1452 tasks      | elapsed:  3.0min
[Parallel(n_jobs=3)]: Done 1979 tasks      | elapsed:  4.1min
[Parallel(n_jobs=3)]: Done 2586 tasks      | elapsed:  5.4min
[Parallel(n_jobs=3)]: Done 2880 out of 2880 | elapsed:  6.0min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('features',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scale',
                                                                         StandardScaler(),
                                                                         ['Duration',
                                                                          'Amount',
                                                                          'InstallmentRatePercentage',
                                                                          'ResidenceDuration',
                                                                          'Age',
                                                                          'NumberExistingCredits',
                                                                          'NumberPeopleMaintenance']),
                      

In [13]:
search.score(X_test, y_test)

0.695

In [14]:
search.best_params_

{'dt__class_weight': None,
 'dt__criterion': 'entropy',
 'dt__max_depth': 10,
 'dt__max_features': None,
 'dt__max_leaf_nodes': None,
 'features__scale__with_mean': False,
 'features__scale__with_std': True,
 'rfe__n_features_to_select': 20}

In [15]:
# What did the features look like after preprocessings
feature_processing_obj = search.best_estimator_.named_steps['features']

features_train = feature_processing_obj.transform(X_train)
features_train.shape
features_train[0:10, 0:10]

(800, 63)

array([[5.08481926, 2.36020145, 2.67391255, 3.6212652 , 5.52415797,
        3.45741743, 2.85033418, 1.        , 3.83480205, 0.        ],
       [1.77968674, 0.80065933, 1.78260837, 0.9053163 , 2.89360656,
        1.72870872, 2.85033418, 0.        , 3.36530075, 1.        ],
       [0.50848193, 0.4267421 , 1.78260837, 3.6212652 , 4.38425236,
        1.72870872, 2.85033418, 0.        , 3.09201847, 1.        ],
       [1.77968674, 1.72733877, 0.89130418, 3.6212652 , 2.54286637,
        3.45741743, 2.85033418, 0.        , 3.6992305 , 0.        ],
       [1.01696385, 0.30590089, 3.56521674, 1.8106326 , 1.84138599,
        1.72870872, 2.85033418, 0.        , 2.94743372, 1.        ],
       [2.0339277 , 0.49786578, 3.56521674, 3.6212652 , 2.01675608,
        3.45741743, 2.85033418, 0.        , 3.15896526, 1.        ],
       [0.50848193, 1.02818606, 0.89130418, 1.8106326 , 2.80592151,
        1.72870872, 2.85033418, 0.        , 3.47392469, 0.        ],
       [2.0339277 , 0.81446975, 0.8913041

In [16]:
# Which features were selected by RFE?
rfe_obj = search.best_estimator_.named_steps['rfe']

for i in range(len(feature_names)):
    if rfe_obj.support_[i]:
        print('Feature {} ({}), Selected {}, Rank: {}'.format(i, feature_names[i], rfe_obj.support_[i], rfe_obj.ranking_[i]))

Feature 0 (Duration), Selected True, Rank: 1
Feature 1 (Amount), Selected True, Rank: 1
Feature 2 (InstallmentRatePercentage), Selected True, Rank: 1
Feature 4 (Age), Selected True, Rank: 1
Feature 5 (NumberExistingCredits), Selected True, Rank: 1
Feature 8 (ForeignWorker), Selected True, Rank: 1
Feature 9 (CheckingAccountStatus.lt.0), Selected True, Rank: 1
Feature 14 (CreditHistory.ThisBank.AllPaid), Selected True, Rank: 1
Feature 16 (CreditHistory.Delay), Selected True, Rank: 1
Feature 17 (CreditHistory.Critical), Selected True, Rank: 1
Feature 20 (Purpose.Furniture.Equipment), Selected True, Rank: 1
Feature 21 (Purpose.Radio.Television), Selected True, Rank: 1
Feature 22 (Purpose.DomesticAppliance), Selected True, Rank: 1
Feature 23 (Purpose.Repairs), Selected True, Rank: 1
Feature 26 (Purpose.Retraining), Selected True, Rank: 1
Feature 31 (SavingsAccountBonds.500.to.1000), Selected True, Rank: 1
Feature 36 (EmploymentDuration.4.to.7), Selected True, Rank: 1
Feature 51 (OtherInstal

In [17]:
# Print out the results of hyperparmater tuning

def cv_results_to_df(cv_results):
    results = pd.DataFrame(list(cv_results['params']))
    results['mean_fit_time'] = cv_results['mean_fit_time']
    results['mean_score_time'] = cv_results['mean_score_time']
    results['mean_train_score'] = cv_results['mean_train_score']
    results['std_train_score'] = cv_results['std_train_score']
    results['mean_test_score'] = cv_results['mean_test_score']
    results['std_test_score'] = cv_results['std_test_score']
    results['rank_test_score'] = cv_results['rank_test_score']

    results = results.sort_values(['mean_test_score'], ascending=False)
    return results

results = cv_results_to_df(search.cv_results_)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(results)


Unnamed: 0,dt__class_weight,dt__criterion,dt__max_depth,dt__max_features,dt__max_leaf_nodes,features__scale__with_mean,features__scale__with_std,rfe__n_features_to_select,mean_fit_time,mean_score_time,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
413,,entropy,10.0,,,False,True,20.0,0.434998,0.018668,0.930659,0.039278,0.72501,0.009541,1
403,,entropy,10.0,,,True,True,20.0,0.505333,0.031668,0.930659,0.039278,0.723757,0.008599,2
408,,entropy,10.0,,,True,False,20.0,0.438001,0.019335,0.930659,0.039278,0.723757,0.008599,2
418,,entropy,10.0,,,False,False,20.0,0.358331,0.021004,0.930659,0.039278,0.723757,0.008599,2
62,,gini,,auto,10.0,True,True,10.0,0.373,0.015999,0.749395,0.025837,0.713774,0.028016,5
67,,gini,,auto,10.0,True,False,10.0,0.341666,0.013334,0.749395,0.025837,0.713774,0.028016,5
72,,gini,,auto,10.0,False,True,10.0,0.352332,0.011,0.749395,0.025837,0.713774,0.028016,5
77,,gini,,auto,10.0,False,False,10.0,0.424336,0.012333,0.749395,0.025837,0.713774,0.028016,5
237,,gini,10.0,auto,10.0,False,False,10.0,0.364991,0.012334,0.749395,0.025837,0.713774,0.028016,5
232,,gini,10.0,auto,10.0,False,True,10.0,0.365087,0.012336,0.749395,0.025837,0.713774,0.028016,5
