# Pipelines - to ease preprocessing

In [1]:
# Plotting and data libraries used throughout the notebook.
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display the result of every expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Handle to the running IPython shell; used below to check loaded extensions
from IPython import get_ipython
ipython = get_ipython()

# Render matplotlib figures inline, as SVG
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# The following gives access to the utils folder,
# where the utils package stores shared code.
# PROJECT_ROOT is the parent of the current working directory, so this
# relies on the notebook being run from its own folder.
import os
import sys
PROJECT_ROOT = os.path.abspath(os.path.join(
                  os.getcwd(),
                  os.pardir)
)

# Only add it once, so re-running this cell does not grow sys.path
if (PROJECT_ROOT not in sys.path):
    sys.path.append(PROJECT_ROOT)
    
# Load the autoreload extension unless it is already loaded
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2: reload modules automatically before running code, so edits to
# imported project code are picked up without restarting the kernel
%autoreload 2

## Load our t-shirts


In [2]:
# utils is the project's shared-code package; it is importable because the
# setup cell appended the project root to sys.path.
import utils as ut
# Build the t-shirt order dataframe used in the rest of the notebook and show it.
df = ut.generate_tshirt_order()
df

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,92.107635,small,black,Sally Volkman,14
1,99.982048,small,blue,Sean Lopez,13
2,105.079222,small,black,Karrie Johnson,13
3,84.862209,small,red,Donald Buchanan,16
4,84.110242,small,orange,Diana Garcia,17
...,...,...,...,...,...
295,228.294064,large,orange,Jennifer Freeman,13
296,184.083520,large,green,Joseph Little,10
297,197.654265,large,black,Brian Bridge,16
298,207.124893,large,green,Sarah Elmer,10


## Set up transforms

These are the transforms I want to run on the data. Be careful with the order: convert all categorical columns to numerical before you scale ALL numerical columns.

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
def scale(df, features, scaler):
    '''
    scales the given numerical features using the provided scaler

    The input dataframe is left untouched: the scaled values are written
    into a copy, so the function can be re-run and chained with df.pipe()
    without changing the caller's data.

    df: dataframe to operate on (not modified)
    features: a list of numerical columns to scale
    scaler: object exposing fit_transform(), e.g. sklearn's StandardScaler;
            it is fitted on df[features] as a side effect
    return: a new dataframe with the listed columns replaced by scaled values
    '''
    # Compute the scaled values first so a failing scaler costs no copy.
    scaled = scaler.fit_transform(df[features])

    # Assigning into df itself would silently overwrite the caller's frame,
    # so the result is built on a copy instead.
    result = df.copy()
    result[features] = scaled
    return result


def cat_nominal(df, features, order):
    '''
    replace category labels with numbers using a custom ordering

    Intended for categories that have a meaningful rank (ordinal data),
    e.g. small < medium < large. The input dataframe is left untouched;
    the mapping is applied to a copy, so re-running the call on the same
    raw data always gives the same result.

    df: dataframe to operate on (not modified)
    features: a list of columns to apply to (likely 1)
    order: custom ordering dictionary {label: number}, very likely hand
           generated; labels missing from a plain dict come out as NaN
    return: a new dataframe with the listed columns mapped through order
    '''
    # Mapping in place would overwrite the caller's labels, and a second run
    # would then find no matching keys and turn the whole column into NaN.
    result = df.copy()
    for feat in features:
        result[feat] = result[feat].map(order)
    return result


def cat_getdummies(df, features):
    '''
    one-hot encode each listed feature with pd.get_dummies

    Features are encoded one after another, so the dummy columns of each
    feature are appended in the order the features are listed.

    df: dataframe to operate on
    features: a list of columns to replace with dummy columns
    return: dataframe with each listed column replaced by its dummies
            (df itself when features is empty)
    '''
    encoded = df
    for column in features:
        # get_dummies returns a new frame each time
        encoded = pd.get_dummies(encoded, columns=[column])
    return encoded


## Apply functions in a pipeline
Transform strings and categoricals first, then scale all numerical columns (which will then include the numerically encoded categoricals).

In [5]:
# this is the hand coded bit for nominal cat var
# Hand-written integer codes for t_shirt_size, ranked small < medium < large
vals = {'large': 2, 'medium': 1, 'small': 0}


# Encode the categoricals first, then scale the numeric columns
# (t_shirt_size is numeric by the time scale() runs).
df_clean = (
    df
    .pipe(cat_nominal, ['t_shirt_size'], vals)
    .pipe(cat_getdummies, ['t_shirt_color'])
    .pipe(scale, ['weight', 't_shirt_size', 'Age'], StandardScaler())
)

In [6]:
# Show the transformed frame: t_shirt_size is encoded as a number,
# t_shirt_color is expanded into dummy columns, and weight / t_shirt_size / Age are scaled.
df_clean

Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-1.220145,-1.224745,Sally Volkman,0.403009,1,0,0,0,0
1,-1.030582,-1.224745,Sean Lopez,0.036637,0,1,0,0,0
2,-0.907876,-1.224745,Karrie Johnson,0.036637,1,0,0,0,0
3,-1.394566,-1.224745,Donald Buchanan,1.135752,0,0,0,0,1
4,-1.412668,-1.224745,Diana Garcia,1.502124,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...
295,2.058310,1.224745,Jennifer Freeman,0.036637,0,0,0,1,0
296,0.994017,1.224745,Joseph Little,-1.062478,0,0,1,0,0
297,1.320709,1.224745,Brian Bridge,1.135752,1,0,0,0,0
298,1.548698,1.224745,Sarah Elmer,-1.062478,0,0,1,0,0


In [20]:
df_clean.info()
# Notice that every column is numerical except `name`;
# `name` is not very useful as a feature for an ML algorithm.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   weight                300 non-null    float64
 1   t_shirt_size          300 non-null    float64
 2   name                  300 non-null    object 
 3   Age                   300 non-null    float64
 4   t_shirt_color_black   300 non-null    uint8  
 5   t_shirt_color_blue    300 non-null    uint8  
 6   t_shirt_color_green   300 non-null    uint8  
 7   t_shirt_color_orange  300 non-null    uint8  
 8   t_shirt_color_red     300 non-null    uint8  
dtypes: float64(3), object(1), uint8(5)
memory usage: 11.0+ KB


In [13]:
# Sanity check on the scaling: weight, t_shirt_size and Age should have mean ~0.
# Their std prints as ~1.0017 rather than exactly 1 because describe() uses the
# sample std (ddof=1) while StandardScaler divides by the population std (ddof=0).
df_clean.describe()

Unnamed: 0,weight,t_shirt_size,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,-1.606123e-16,5.921189e-18,-4.366877e-17,0.193333,0.196667,0.213333,0.216667,0.18
std,1.001671,1.001671,1.001671,0.395572,0.398142,0.410346,0.412662,0.384829
min,-1.842373,-1.224745,-1.501336,0.0,0.0,0.0,0.0,0.0
25%,-0.8375717,-1.224745,-0.8168366,0.0,0.0,0.0,0.0,0.0
50%,-0.06819836,0.0,-0.1323367,0.0,0.0,0.0,0.0,0.0
75%,0.7300459,1.224745,0.8944132,0.0,0.0,0.0,0.0,0.0
max,2.772448,1.224745,1.578913,1.0,1.0,1.0,1.0,1.0


## Except for the name column, this dataframe is now ready for ML algorithms