# How to alter variables in a pipeline

Q: What if you want to take the log of some variables in a pipeline, or do something to a variable that `sklearn` doesn't have a transformer for?

A: `FunctionTransformer`!

In [1]:
# so these are probably already in your imports
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline


## A minimal example

In [2]:
# a small, familiar dataset is all we need for a minimal example
import seaborn as sns
titanic = sns.load_dataset("titanic")
titanic.head(5)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# then define a function (and FunctionTransformer will allow you to make a transformer out of it!)
# so any changes you want to make to the series (like addition, subtraction, log, etc...)
# define those changes as a function
def log_func(input_series):
    """Return the element-wise natural log of ``input_series``.

    This is a thin wrapper around ``np.log`` so the operation can be handed
    to ``FunctionTransformer``. Whatever ``np.log`` returns for the given
    input is passed straight back to the caller.
    """
    logged_values = np.log(input_series)
    return logged_values

# wrap log_func so scikit-learn can treat it like any other transformer
log_transformer = FunctionTransformer(log_func)

# and put that transformer in a (one-step) pipeline
mini_pipe = Pipeline(steps=[("log_func", log_transformer)])

# here - fit it, then apply it to the same column; it all chains nicely!
# (fit returns the pipeline itself, so we can transform right away)
mini_pipe.fit(titanic['pclass']).transform(titanic['pclass'])[:10]  # just print 10...

# boom!

0    1.098612
1    0.000000
2    1.098612
3    0.000000
4    1.098612
5    1.098612
6    0.000000
7    1.098612
8    1.098612
9    0.693147
Name: pclass, dtype: float64

## Let's put that alongside other transformations

Let's say you want to log the pclass variable but do something else to other variables.

This is where we use `ColumnTransformer`:


In [4]:
# imports - I know we did this above, but I'm treating the code below as if it were a new file...

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# load data - seaborn's bundled titanic dataset

import seaborn as sns
titanic = sns.load_dataset("titanic")

# set up transformations - one for numbers, one for categoricals, one for the log func

def log_func(input_series):
    """Apply ``np.log`` (natural logarithm) to ``input_series`` and return the result.

    Defined as a named function so it can be wrapped in a
    ``FunctionTransformer`` and used as a preprocessing step.
    """
    result = np.log(input_series)
    return result
# Wrap log_func so it can be used as a transformer step.
# (You might put this inside a "fuller" pipeline that deals with missing values too.)
log_transformer = FunctionTransformer(log_func)

# Numeric columns: fill gaps with the median, then standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`,
# and 1.4 removed the old spelling entirely. Try the current name first and
# fall back to the old one so this cell runs on both old and new installs.
# Either way we ask for a dense array so the result drops into a DataFrame.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: label gaps as 'missing', then one-hot encode.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', onehot_encoder)])

# which vars get which trans?

num_features = ['age']
cat_features = ['class']
log_features = ['pclass']

# set up the preprocessor to do all three transformers
# (output columns follow the order of this list: log, then num, then cat)

preprocessor = ColumnTransformer(
    transformers=[
        ('log', log_transformer, log_features),
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])



In [6]:
# see? it works - fit the preprocessor, transform the data, peek at the first rows:
pd.DataFrame(preprocessor.fit_transform(titanic)).head(5)

Unnamed: 0,0,1,2,3,4
0,1.098612,-0.565736,0.0,0.0,1.0
1,0.0,0.663861,1.0,0.0,0.0
2,1.098612,-0.258337,0.0,0.0,1.0
3,0.0,0.433312,1.0,0.0,0.0
4,1.098612,0.433312,0.0,0.0,1.0
