In [2]:
# Let's make the code cells wider, we've got a big screen for a reason!
from IPython.core.display import display, HTML, Markdown
display(HTML("<style>.container { width:85% !important; }</style>"))

### Import and dataset

In [3]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_classification, load_iris

Let's make a fake numeric dataset for the purpose of our examples.

We'll do this by using scikit-learn's helpful `make_classification` function.

<mark>You can 'unpack' arguments into a function via a dictionary using the `**` notation (see below).</mark>

In [4]:
make_classification_dict = {'n_samples': 100000, 'n_features': 50}

sample_data = make_classification(**make_classification_dict)

Let's look at our X:

In [5]:
X = pd.DataFrame(sample_data[0])
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-2.388069,0.904675,0.765426,0.573775,-0.049434,-1.229598,-1.206298,-0.341459,-0.272875,1.638739,...,-1.045696,2.289716,-0.963053,0.849536,-1.394746,-0.342471,0.013916,0.193423,0.222609,0.287836
1,-0.647733,-1.048923,0.650805,-2.164396,-1.373018,-0.05549,0.795744,-1.886932,-1.322406,-0.554796,...,0.877677,-0.60036,-1.736006,-0.848874,0.101609,-0.240057,-1.730964,0.005355,0.016654,-1.231224
2,1.196932,-0.566312,1.600226,1.361517,-1.072411,0.059933,1.015081,-1.730238,1.292043,0.521457,...,-0.629627,-0.380323,-1.13714,-0.395493,-0.235472,-0.119447,1.351482,-0.305511,-1.601222,-0.259824
3,0.876862,1.278279,-0.629812,-0.903798,-0.071253,0.63175,1.383593,1.029372,0.591521,0.969519,...,0.365821,-1.380773,-1.178331,1.226804,0.168191,-0.967657,1.785998,0.21196,0.007668,0.60214
4,-0.78019,-0.253192,-0.228805,0.440978,-0.437337,1.571321,-0.653941,-1.219301,0.190205,2.064969,...,-0.194003,-1.717657,0.644678,1.350398,0.405357,-1.283523,-0.196359,-0.688351,0.018954,0.863356


Let's look at our y: <br>
<mark>You can continue code onto the next line with `\` (see below)</mark>

In [6]:
y = pd.Series(sample_data[1])\
.to_frame()
y.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


#### Pipelines
<img style="float: center;width:250px;height:150px;" src="images/simple_pipelines.gif">

Prior to pipelines, the best way of organising the steps in your model would be to define separate variables for all steps.


Disadvantages
* Polluting the namespace
* Potential data leakage

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() #we instantiate the scaler object
X_train_scaled = scaler.fit_transform(X_train) #we fit and transform X into the scaler object using .fit_transform()

Now let's say we want to apply a logistic regression.

In [9]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs')
logreg.fit(X_train_scaled, y_train.values.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
logreg.score(X_test, y_test)

0.87248

Now, imagine we've got lots of steps, <br>
let's say we're applying a `VarianceThreshold()` or a `SelectKBest` <br>
We would have to do the following.

In [11]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest

k_best = SelectKBest(k = 'all')
X_train_k_best = k_best.fit_transform(X_train, y_train.values.ravel())

scaler = StandardScaler() #we instantiate the scaler object
X_train_scaled = scaler.fit_transform(X_train_k_best) #we fit and transform X into the scaler object using .fit_transform()

var_thres = VarianceThreshold(threshold = 0.0)
X_train_var_thresh = var_thres.fit_transform(X_train_scaled)

logreg = LogisticRegression(solver='lbfgs')
logreg.fit(X_train_var_thresh, y_train.values.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Now we have to apply the same steps to the test data. <br>
But in a way which avoids data leakage. <br>
We have to use the _information learned from the train data_ and apply it to our test data.

In [12]:
X_test_k_best = X_test.iloc[:, k_best.get_support(indices=True)]

X_test_scaled = scaler.transform(X_test_k_best)

X_test_var_thresh = var_thres.transform(X_test_scaled)

In [13]:
logreg.score(X_test_var_thresh, y_test)

0.87268

Let's plot the anova p_values.

In [14]:
from matplotlib import pyplot as plt
pd.DataFrame(k_best.pvalues_).sort_values(0, ascending = False).plot(kind = 'bar', title= 'anova p_values', figsize = (16, 9))

<matplotlib.axes._subplots.AxesSubplot at 0x7ff9630bacc0>

The above isn't ideal, and doesn't scale very well with large processes.

What if we want cross-validation with some parameter tuning?

<div class="alert alert-block alert-warning">
<b>Warning:</b> The code cell below takes a while to run, so feel free to skip.
</div>

In [15]:
import numpy as np
from sklearn.model_selection import GridSearchCV

logreg_cv = GridSearchCV(cv = 5, estimator=LogisticRegression(solver = 'lbfgs'),
                         param_grid={'C': np.logspace(-4, 4, num = 9)})
logreg_cv.fit(X_train_scaled, y_train.values.ravel())

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03,
       1.e+04])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [16]:
logreg_cv.best_estimator_.score(X_test_scaled, y_test)
logreg_cv_y_pred = logreg_cv.predict(X_test_scaled)
logreg_cv_y_scores = logreg_cv.predict_proba(X_test_scaled)

<div class="alert alert-block alert-danger">
<b>Oh no:</b> We have just commited <b>data leakage.<b>
</div>
    
But... how? <br>
We are using values scaled (`StandardScalar`) to the *whole population* in a cross-validation (`GridSearchCV`) with data that is less than the *whole population*. <br>

A pipeline could have prevented this, let's convert the example above into one such form.

<div class="alert alert-block alert-warning">
<b>Warning:</b> The code cell below takes a while to run, so feel free to skip.
</div>

In [None]:
log_reg_pipe = make_pipeline(
    SelectKBest(k='all')
    ,VarianceThreshold()
    ,StandardScaler()
    ,LogisticRegression(solver = 'lbfgs'))
# Notice the parameter grid syntax has changed to the form: 'estimatorname__parameter'
pg = {'logisticregression__C': np.logspace(-4, 4, num = 9)}

log_reg = GridSearchCV(cv = 5, estimator = log_reg_pipe, param_grid= pg)
log_reg.fit(X_train, y_train.values.ravel())

Now values are scaled **within each k-fold**.

In [None]:
log_reg.get_params()

In [None]:
log_reg.best_params_

We can score the pipeline as normal.

In [None]:
log_reg.score(X_test_scaled, y_test)

Let's apply this pipeline on a new dataset! <br>
Let's use the common [iris](https://en.wikipedia.org/wiki/Iris_flower_data_set) dataset.

In [None]:
load_iris()['target'].shape

In [None]:
iris_X = load_iris()['data']
iris_y = load_iris()['target']
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(iris_X, iris_y, stratify = iris_y)

First we clone the pipeline so we can still use the steps of the previously trained data without interfering with logreg. <br>
Think of this as the difference between `df2 = df1` and `df2 = df1.copy()`.

In [None]:
from sklearn.pipeline import clone

log_reg_iris = clone(log_reg)
# Setting the multi_class parameter to 'auto' due to this problem having three classes
log_reg_iris.estimator.named_steps['logisticregression'].multi_class = 'auto'
# Increasing max_iter so the SGD algorithm has more time to converge
log_reg_iris.estimator.named_steps['logisticregression'].max_iter = 1000
# Setting values to avoid future warnings
log_reg_iris.error_score = 'raise'
log_reg_iris.iid = False

In [None]:
# Then we fit as normal
log_reg_iris.fit(iris_X_train, iris_y_train)

In [None]:
log_reg_iris.score(iris_X_test, iris_y_test)

In [None]:
iris_log_reg_y_pred = log_reg_iris.predict(iris_X_test)
iris_log_reg_y_scores = log_reg_iris.predict_proba(iris_X_test)

It's as simple as that. <br>

[Scikit-Learn Pipeline Guide](https://scikit-learn.org/stable/modules/compose.html)

#### Pipelines as objects

Pipelines are objects, let's say we want to change the final step to SVM rather than Logistic, this is one way.

In [None]:
from sklearn.svm import SVC
svm_pipe = clone(log_reg_pipe)

We can access the steps in a pipelines two ways: <br>
`.named_steps` gives a dictionary. <br>
`.steps` gives a series of tuples.

In [None]:
svm_pipe.named_steps

In [None]:
svm_pipe.steps[-1]

Now let's redefine the last step in the pipeline.

In [None]:
svm_pipe.steps[-1] = ('svc', SVC(random_state=0, verbose=True))

We can see now that the pipeline has changed.

In [None]:
svm_pipe.named_steps

In [None]:
svm_pg = {"svc__kernel": ['rbf']
         ,"svc__C": [0.1]
         ,"svc__gamma": [0.001]}

svm_iris = GridSearchCV(cv = 3, estimator = svm_pipe, param_grid = svm_pg, iid=False)
svm_iris.fit(iris_X_train, iris_y_train.ravel())

In [None]:
svm_iris.score(iris_X_test, iris_y_test)
iris_svm_y_pred = svm_iris.predict(iris_X_test)

### Complex pipelines
What happens when we want to apply different pipelines to different columns of our data? <br>

<img style="float: left;" src="images/complex_pipelines.gif">

In [None]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
# Categorical Columns
fill_to_zero = [0]
# Define steps in pipeline
cat_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value=0))

Now we'll make a new pipeline.

In [None]:
preprocessing = make_column_transformer((cat_pipe, fill_to_zero), remainder="passthrough")

The below is a lot to take in, let's take it step by step. <br>
1. First we use C_base and C_list to create a list of potential `C` that we will grid search over. <br>
`C` is the inverse of the regularisation strength. <br>
1. Then the pipeline begins, we run the data through the "preprocessing" pipeline defined above. <br>
1. Then we impute with the median. <br>
1. Then we run the data through a Random Forest to see what the feature importances are. <br>
1. Only the top 20 features progress to the next stage. This can be useful when dealing with hundreds of features. <br>
1. Next all values are scaled such that the mean is 0 and the variance is 1 (unit variance). <br>
1. Finally the data is run through `LogisticRegressionCV`, think of this as `LogisticRegression` combined with `GridSearchCV`, this approach can be quicker due to a backend phenonmenon called *warm starting* (using the previous solution as an initialization for the following fit).

In [None]:
C_base = 4
C_list = np.logspace(-C_base, +C_base, num=(2 * C_base) + 1)
lr_pipe = make_pipeline(
    preprocessing,
    SimpleImputer(strategy="median"),
    SelectFromModel(RandomForestClassifier(random_state=0, n_estimators=20)),
    StandardScaler(),  # Adding in a standard scaling step, relative to the dtc section.
    LogisticRegressionCV(random_state=0, solver="saga", cv=5, penalty="l1", class_weight="balanced", scoring="accuracy", max_iter=10000, n_jobs=-1, multi_class='ovr')
)

In [None]:
lr_pipe.fit(iris_X_train, iris_y_train.ravel())

In [None]:
iris_y_train

In [None]:
iris_X_train.shape

In [None]:
lr_pipe.score(iris_X_test, iris_y_test)

**A quick note on feature names**
So we've applied a lot of steps here.
But interpretability is important, so it would be good to know what features went into the final step.

A simple `pipeline.get_feature_names()` is not yet implemented in scikit-learn. <br>
But the pipeline above is not too long, so we can do this manually.

Below is an example that works for a pipeline with many features, I'll talk through how each step works, but first here's the example pipeline.

**Preprocessing pipeline**

```
# Categorical Columns
categoric_variables = ["CITY"]
# Define steps in pipeline
cat_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="0"), OneHotEncoder(handle_unknown="ignore"))


# Column transformer to deal with the categoric variable
preprocessing = make_column_transformer((cat_pipe, categoric_variables), remainder="passthrough")

```

**Random Forest Pipeline**

```
# Random Forest
`rf_pipe = make_pipeline(preprocessing
                        ,SimpleImputer(strategy="median")
                        ,VarianceThreshold()
                        ,RandomForestClassifier(random_state=0))
rf_parameters = {"randomforestclassifier__class_weight": ["balanced"]
                ,"randomforestclassifier__n_estimators": [20, 50, 70, 100]}
                
rf_gs = GridSearchCV(rf_pipe
                    ,param_grid = rf_parameters
                    ,scoring=scoring_list
                    ,refit="roc_auc"
                    ,return_train_score=False
                    ,cv=5)
```


```
feature_names = list(pipe.named_steps.columntransformer.named_transformers_.pipeline.named_steps.onehotencoder.get_feature_name(categoric_variables)) +
[column for column in X.columns if column not in categoric_variables]

feature_names = [list(feature_names)[i] for i in pipe.named_steps.variancethreshold.get_support(indices=True)]
```

### Useful Packages
Below are some useful packages for machine learning that help abstract boilerplate code. <br>
1. eli5 - basic ML interpretability. 
1. scikitplot - matplotlib plots directly from model artefacts.

In [None]:
import eli5
eli5.explain_weights_sklearn(log_reg_iris.best_estimator_.named_steps.logisticregression)

In [None]:
from scikitplot.metrics import plot_confusion_matrix, plot_roc, plot_cumulative_gain
plot_confusion_matrix(iris_y_test, iris_log_reg_y_pred)
plot_roc(iris_y_test, iris_log_reg_y_scores, title = 'my fake roc curve ¯\_(-_-)_/¯', plot_micro = False, plot_macro = False)
plot_cumulative_gain(y_test, logreg_cv_y_scores)

In [None]:
test = plt.axes()

plot_roc(iris_y_test, iris_log_reg_y_scores, ax = test, plot_macro = False, plot_micro = False)
plot_roc(y_test, logreg_cv_y_scores, ax = test, plot_macro = False, plot_micro = False)