# Salary investigation - Feature engineering and machine learning with Pipeline

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# sklearn.set_config(print_changed_only=True)
%matplotlib inline

In [3]:
# Load the Adult census data; the first CSV column becomes the row index.
df = pd.read_csv("data/adult.csv", index_col=0)
# Remove the space characters from the income labels.
df['income'] = df['income'].str.replace(" ", "")

# remove the column education-num because it is similar to education
df = df.drop(columns=['education-num'])

# The last column is the target; every column before it is a feature.
df_X = df[df.columns[:-1]]
y = df[df.columns[-1]]

print(df_X.dtypes)

# dtype kind 'O' marks object columns; anything else is treated as numerical.
kinds = np.array([dt.kind for dt in df_X.dtypes])
all_col = df_X.columns.values
is_num = kinds != 'O'

num_col = all_col[is_num]
print(num_col)

cat_col = all_col[~is_num]
print(cat_col)

# Encode the target labels as integers.
LEncoder = LabelEncoder()
y_encode = LEncoder.fit_transform(y)

age                int64
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
gender            object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object
['age' 'capital-gain' 'capital-loss' 'hours-per-week']
['workclass' 'education' 'marital-status' 'occupation' 'relationship'
 'race' 'gender' 'native-country']


All features:
* workclass (categorical)
* education (categorical)
* marital-status (categorical)
* occupation (categorical)
* relationship (categorical)
* race (categorical)
* gender (categorical)

* native-country (categorical with engineering)


* age (numerical values)
* education-num (numerical values) (deleted)
* hours-per-week (numerical values)
* capital-gain (numerical values)
* capital-loss (numerical values)

## ColumnTransformer technique

The ColumnTransformer takes a list of three-item tuples. The first value in the tuple is a name that labels it, the second is an instantiated estimator, and the third is a list of columns you want to apply the transformation to. The tuple will look like this:

('name', SomeTransformer(parameters), columns)

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder

### Categorical feature processing

In [12]:
# Categorical branch: fill missing entries with the constant 'MISSING',
# then one-hot encode into a dense array (handle_unknown='ignore').
cat_si_step = (
    'si',
    SimpleImputer(strategy='constant', fill_value='MISSING'),
)
cat_ohe_step = (
    'ohe',
    OneHotEncoder(sparse=False, handle_unknown='ignore'),
)

# Steps run in list order: impute first, encode second.
cat_steps = [cat_si_step, cat_ohe_step]
cat_pipe = Pipeline(steps=cat_steps)
# cat_transformers = [('cat', cat_pipe, cat_col)]
# ct = ColumnTransformer(transformers=cat_transformers)


In [13]:
# Numerical branch: median imputation followed by standard scaling.
num_si_step = ('si', SimpleImputer(strategy='median'))
num_ss_step = ('ss', StandardScaler())
num_steps = [num_si_step, num_ss_step]
num_pipe = Pipeline(steps=num_steps)

# ColumnTransformer that processes the numerical columns only.
num_transformers = [('num', num_pipe, num_col)]
num_ct = ColumnTransformer(transformers=num_transformers)

# Combined ColumnTransformer: categorical pipeline on cat_col,
# numerical pipeline on num_col.
comb_transformers = [
    ('cat', cat_pipe, cat_col),
    ('num', num_pipe, num_col),
]
comb_ct = ColumnTransformer(transformers=comb_transformers)

### Combining ColumnTransformer and ML

In [15]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
# Full model: preprocessing ColumnTransformer followed by a decision tree.
ml_pipe = Pipeline(steps=[
    ('transform', comb_ct),
    ('dt', tree.DecisionTreeClassifier()),
])
# X_train, X_test, y_train, y_test = train_test_split(df_X, y_encode, test_size=0.2, random_state=42)
# ml_pipe.score(X_train, y_train)

### Perform Cross-validation

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 5-fold cross-validation with shuffling; the fixed random_state keeps the
# fold assignment the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

# Pass the feature frame df_X: no variable named `X` is defined in the cells
# above, so the previous `X` raised NameError on a fresh kernel.
cr_score = cross_val_score(ml_pipe, df_X, y_encode, cv=kf)
print(cr_score)

### GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Keys follow the '<step>__<sub-step>__<param>' convention:
#   'transform__num__si__strategy' -> ml_pipe step 'transform' (the ColumnTransformer)
#                                     -> its 'num' pipeline -> step 'si' -> strategy
#   'dt__*'                        -> parameters of the DecisionTreeClassifier step
param_grid = {
    'transform__num__si__strategy': ['mean', 'median'],
    'dt__splitter': ['best', 'random'],
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_depth': [2, 6, 10, 20],
    'dt__min_samples_split': [2, 6],
    'dt__min_samples_leaf': [1, 3],
    'dt__max_features': [None, 'log2'],
    }

# Reuses the KFold splitter `kf` created in the cross-validation cell.
gs = GridSearchCV(ml_pipe, param_grid, cv=kf)

# Fit on the feature frame df_X: no variable named `X` is defined in the cells
# above, so the previous `X` raised NameError on a fresh kernel.
gs.fit(df_X, y_encode)

The GridSearchCV results can be converted into a DataFrame to show the score of each parameter combination.


In [None]:
# Best score found by the search and the parameter combination that produced it.
print(gs.best_score_, gs.best_params_, sep='\n')

# Full search results as a table.
pd.DataFrame(gs.cv_results_)

## Step by step demonstration of transform_step -> Pipeline -> ColumnTransformers

### Transformation step demonstration

In [7]:
## Demonstration of how two transform steps (one-hot encoding and SimpleImputer) work on a single column ('education')

edu_train = df[['education']].copy()

# One-hot encoding of the single 'education' column
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
edu_train_transformed = ohe.fit_transform(edu_train)
feat_name = ohe.get_feature_names()

# Show the feature name flagged with 1 in the first encoded row,
# followed by the original row labelled 0 for comparison.
first_row_hot = edu_train_transformed[0] == 1
print(feat_name[first_row_hot])
print(edu_train.loc[0])

# SimpleImputer: blank out the first value, then fill it with the constant 'MISSING'
edu_train.iloc[0, 0] = np.nan
si = SimpleImputer(strategy='constant', fill_value='MISSING')
edu_train_imputed = si.fit_transform(edu_train)
edu_train_imputed

['x0_ Bachelors']
education     Bachelors
Name: 0, dtype: object


array([['MISSING'],
       [' Bachelors'],
       [' HS-grad'],
       ...,
       [' HS-grad'],
       [' HS-grad'],
       [' HS-grad']], dtype=object)

### Pipeline demonstration

In [8]:
# Wrap each transformer as a ('name', transformer) tuple, the format Pipeline expects
si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))
ohe_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))

# Collect the steps in execution order and build the pipeline from that list
steps = [si_step, ohe_step]
pipe = Pipeline(steps=steps)

# Test the pipeline on 'education' with the first value blanked out
edu_train = df[['education']].copy()
edu_train.iloc[0, 0] = np.nan
edu_transformed = pipe.fit_transform(edu_train)
edu_transformed.shape

(32561, 17)

### ColumnTransformer demonstration

In [10]:
# Apply the imputer + one-hot pipeline to the 'education' column only
edu_transformers = [('edu_tf', pipe, ['education'])]
edu_ct = ColumnTransformer(transformers=edu_transformers)
X_edu_processed = edu_ct.fit_transform(df_X)

# Walk back down the fitted objects: ColumnTransformer -> pipeline -> encoder
pl = edu_ct.named_transformers_['edu_tf']
ohe = pl.named_steps['ohe']
print(ohe.get_feature_names())

['x0_ 10th' 'x0_ 11th' 'x0_ 12th' 'x0_ 1st-4th' 'x0_ 5th-6th'
 'x0_ 7th-8th' 'x0_ 9th' 'x0_ Assoc-acdm' 'x0_ Assoc-voc' 'x0_ Bachelors'
 'x0_ Doctorate' 'x0_ HS-grad' 'x0_ Masters' 'x0_ Preschool'
 'x0_ Prof-school' 'x0_ Some-college']


### Combining two or more pipelines into one ColumnTransformer

In [11]:
# Make another ColumnTransformer for numerical feature
num_si_step = ('si', SimpleImputer(strategy='median'))
num_ss_step = ('ss', StandardScaler())

num_steps = [num_si_step, num_ss_step]
num_pipe = Pipeline(num_steps)
num_transformers = [('num', num_pipe, num_col)]
num_ct = ColumnTransformer(transformers=num_transformers)

# Combine both ColumnTransformer
comb_transformers = [('edu', pipe, ['education']), \
                         ('num', num_pipe, num_col)]
comb_ct = ColumnTransformer(transformers=comb_transformers)

X_comb_processed = comb_ct.fit_transform(df_X)
X_comb_processed.shape

(32561, 20)

### Add Machine learning step using Pipeline again

In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_X, y_encode, test_size=0.2, random_state=42
)

# Preprocessing + decision tree in one estimator, fitted on the training split.
ml_pipe = Pipeline(steps=[
    ('transform', comb_ct),
    ('dt', tree.DecisionTreeClassifier()),
])
ml_pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('transform',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('si',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='MISSING',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                              