In [1]:
%matplotlib inline

## [Column (Feature) Transformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer)
- Most datasets contain features with different datatypes, so preprocessing seems necessary before handing a dataset to scikit-learn.
- This can be problematic.
    - Incorporated test data stats into a preprocessor introduces the risk of data leakage into the model.
    - You may want to include preprocessor params in a parameter searc.
- Column Transformers solve these problems in a pipeline. They can be used in arrays, sparse matrices and Pandas dataframes.

In [2]:
import pandas as pd
X = pd.DataFrame(
    {'city':          ['London', 'London', 'Paris', 'Sallisaw'],
     'title':         ["His Last Bow", 
                       "How Watson Learned the Trick",
                       "A Moveable Feast", 
                       "The Grapes of Wrath"],
     'expert_rating': [5, 3, 4, 5],
     'user_rating':   [4, 5, 4, 3]
    }
)

In [3]:
# encode 'city' as categorical variable using OneHotEncoder
# -- expects 2D array - provide column as list of strings (['city'])
# apply CountVectorizer to 'title'
# -- expects 1D array

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

column_trans = ColumnTransformer(
    [('city_category', OneHotEncoder(dtype='int'),
      ['city'  ]),
     ('title_bow',     CountVectorizer(),          
      'title')],
    remainder='drop')

column_trans.fit(X)
print(column_trans.get_feature_names())
print(column_trans.transform(X).toarray())

['city_category__x0_London', 'city_category__x0_Paris', 'city_category__x0_Sallisaw', 'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his', 'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable', 'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson', 'title_bow__wrath']
[[1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0]
 [0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1]]


In [4]:
# columns can be a list, integer array, slice, boolean mask or make_column.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector

ct = ColumnTransformer([
      ('scale', 
       StandardScaler(),
       make_column_selector(dtype_include=np.number)),

      ('onehot',
       OneHotEncoder(),
       make_column_selector(pattern='city', 
                           dtype_include=object))])
ct.fit_transform(X)

array([[ 0.90453403,  0.        ,  1.        ,  0.        ,  0.        ],
       [-1.50755672,  1.41421356,  1.        ,  0.        ,  0.        ],
       [-0.30151134,  0.        ,  0.        ,  1.        ,  0.        ],
       [ 0.90453403, -1.41421356,  0.        ,  0.        ,  1.        ]])

In [5]:
# strings can reference columns if input is a DataFrame
# (integers are interpreted as column indexes)
# keep remaining columns with remainder='passthrough'

column_trans = ColumnTransformer(
    [('city_category', 
      OneHotEncoder(dtype='int'),
      ['city']),
     
     ('title_bow', 
      CountVectorizer(), 
      'title')],
    
    remainder='passthrough')

column_trans.fit_transform(X)

array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 4],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 3, 5],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 4],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 5, 3]])

In [6]:
# make_column_transformer example

from sklearn.compose import make_column_transformer

column_trans = make_column_transformer(
    (OneHotEncoder(), ['city']),
    (CountVectorizer(), 'title'),
    #remainder=MinMaxScaler()) <-- not defined in v0.23?
)
column_trans

ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(), ['city']),
                                ('countvectorizer', CountVectorizer(),
                                 'title')])

## HTML Visualization in Jupyter notebooks

In [7]:
from sklearn import set_config
set_config(display='diagram')   
# diplays HTML representation in a jupyter context
column_trans 

In [8]:
# HTML can be written to a file:
from sklearn.utils import estimator_html_repr
from sklearn.svm import SVC

clf = SVC(kernel="linear")

with open('example_estimator_in_html_format.html', 'w') as f:  
    f.write(estimator_html_repr(clf))

In [9]:
!ls *.html

example_estimator_in_html_format.html


In [10]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# Load data from https://www.openml.org/d/40945
X,y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

In [11]:
# train as follows:
# age = float, fare = float
# embarked = category {'C','S','Q'}
# sex = category {'female','male'}
# pclass = ordinal {1,2,3}

In [12]:
# create pipelines
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', 
     SimpleImputer(strategy='median')),
    ('scaler', 
     StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']

categorical_transformer = Pipeline(steps=[
    ('imputer', 
     SimpleImputer(strategy='constant', 
                   fill_value='missing')),
    ('onehot', 
     OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 
         numeric_transformer, 
         numeric_features),
        ('cat', 
         categorical_transformer, 
         categorical_features)])

# Append classifier to preprocessing pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.790


In [13]:
# HTML representation
from sklearn import set_config
set_config(display='diagram')
clf

In [14]:
# example of auto-preprocessing clean data
# by using datatypes to decide whether to treat as number or category.

subset_feature = ['embarked', 'sex', 'pclass', 'age', 'fare']
X = X[subset_feature]
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 5 columns):
embarked    1307 non-null category
sex         1309 non-null category
pclass      1309 non-null float64
age         1046 non-null float64
fare        1308 non-null float64
dtypes: category(2), float64(3)
memory usage: 33.6 KB


In [15]:
# 'embarked','sex' tagged as categories by fetch_openml
# we can use this to send category data to `categorical_transformer`,
# and remaining data to `numerical_transformer`.

from sklearn.compose import make_column_selector as selector

preprocessor = ColumnTransformer(transformers=[
    ('num', 
     numeric_transformer, 
     selector(dtype_exclude="category")),
    ('cat', 
     categorical_transformer, 
     selector(dtype_include="category"))
])

# Reproduce the identical fit/score process
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.794


In [16]:
# using grid search on selected preprocessing steps

param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search.fit(X_train, y_train)

print(("best logistic regression from grid search: %.3f"
       % grid_search.score(X_test, y_test)))

best logistic regression from grid search: 0.794
