# Pipelines - to ease preprocessing

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# Make the project root importable so the shared-code `utils` package can be found.
# PROJECT_ROOT is the parent of the current working directory, so this assumes the
# notebook is started from a direct subfolder of the project root.
import os
import sys
PROJECT_ROOT = os.path.abspath(os.path.join(
                  os.getcwd(),
                  os.pardir)
)

# Only append once, so re-running this cell does not add duplicate sys.path entries.
if (PROJECT_ROOT not in sys.path):
    sys.path.append(PROJECT_ROOT)
    
# Load the autoreload extension only if the extension manager has not loaded it yet.
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2: reload modules before executing code, so edits to the shared code are picked up.
%autoreload 2

## Load our t-shirts


In [3]:

# utils is the project-local shared-code package (found via PROJECT_ROOT on sys.path).
import utils as ut

# Load the raw t-shirt order; the later cells derive df_clean from this frame.
# NOTE(review): the saved outputs in this notebook show different names at different
# points, so this helper presumably generates random data - confirm, and seed it
# if reproducible output matters.
df = ut.generate_tshirt_order()
df

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,103.261965,small,red,Margaret Jose,17
1,65.149766,small,red,Gregory Balson,17
2,106.252194,small,blue,Michael Latsko,11
3,83.752026,small,black,Evelyne James,10
4,103.276743,small,red,Nathan White,9
...,...,...,...,...,...
295,160.738468,large,red,Melvin Rich,9
296,122.837275,large,red,Virginia Wright,15
297,174.520929,large,black,Charles Ozuna,11
298,140.230779,large,green,Georgia Mose,9


## Set up transforms

These are the transforms I want to run on the data. Be careful of the order: convert all categorical columns to numerical before you scale ALL numerical columns.

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
def scale(df, features, scaler):
    '''
    scales the given numerical features using the provided scaler

    The input dataframe is left untouched: the scaling is applied to a copy,
    so a notebook cell that calls this can be re-run without the raw data
    having been altered by an earlier run.

    df: dataframe to operate on (not modified)
    features: a list of numerical columns to scale
    scaler: object exposing fit_transform(X), e.g. StandardScaler();
            it is fitted on df[features] as part of this call
    return: a new dataframe with the listed columns replaced by scaled values
    '''
    scaled = df.copy()
    if len(features) == 0:
        # nothing to scale; do not hand an empty selection to the scaler
        return scaled
    scaled[features] = scaler.fit_transform(scaled[features])
    return scaled


def cat_nominal(df, features, order):
    '''
    replace category labels with numbers using a hand-built ordering

    Despite the function name, imposing a numerical order only makes sense
    for ordinal features (e.g. small < medium < large).

    The input dataframe is left untouched: the mapping is applied to a copy.
    Mapping in place would make a second run translate the already-numeric
    values through the label-keyed dictionary and turn the column into NaN.

    df: dataframe to operate on (not modified)
    features: a list of columns to apply to (likely 1)
    order: custom ordering dictionary {label: number}, very likely hand generated
    return: a new dataframe with the listed columns mapped through order;
            any label that is not a key of order becomes NaN
    '''
    mapped = df.copy()
    for feat in features:
        mapped[feat] = mapped[feat].map(order)
    return mapped


def cat_getdummies(df, features):
    '''
    one-hot encode the listed columns, one column at a time

    Each listed column is replaced by its dummy (indicator) columns via
    pd.get_dummies; the result of one step feeds the next.

    df: dataframe to operate on
    features: a list of columns to encode
    return: dataframe with each listed column replaced by its dummy columns
            (df itself when features is empty)
    '''
    encoded = df
    for column in features:
        encoded = pd.get_dummies(encoded, columns=[column])
    return encoded


## Apply functions in a pipeline
Transform strings and categoricals first, then scale the numerical columns (which include the ordinal-encoded `t_shirt_size`; the one-hot dummy columns are left as 0/1).

In [6]:
# hand-coded numeric order for the ordinal t_shirt_size labels
vals = {'large': 2, 'medium': 1, 'small': 0}

# Run the transforms as a pipeline: encode the categoricals first, then scale.
# Start from a copy so the raw df is never modified by a pipeline stage;
# re-running this cell then always begins from the original string labels
# rather than from values an earlier run already mapped (which would become NaN).
df_clean = (df.copy().
            pipe(cat_nominal, ['t_shirt_size'], vals).
            pipe(cat_getdummies, ['t_shirt_color']).
            pipe(scale, ['weight', 't_shirt_size', 'Age'], StandardScaler()))

In [7]:
df_clean

Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.905866,-1.224745,Margaret Jose,1.690063,0,0,0,0,1
1,-1.837820,-1.224745,Gregory Balson,1.690063,0,0,0,0,1
2,-0.832746,-1.224745,Michael Latsko,-0.425454,0,1,0,0,0
3,-1.382940,-1.224745,Evelyne James,-0.778040,1,0,0,0,0
4,-0.905505,-1.224745,Nathan White,-1.130626,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
295,0.499601,1.224745,Melvin Rich,-1.130626,0,0,0,0,1
296,-0.427193,1.224745,Virginia Wright,0.984890,0,0,0,0,1
297,0.836622,1.224745,Charles Ozuna,-0.425454,1,0,0,0,0
298,-0.001872,1.224745,Georgia Mose,-1.130626,0,0,1,0,0


In [8]:
# Summarise column dtypes and non-null counts to check the preprocessing result.
df_clean.info()
# Notice that every column is numerical except name; name is not very useful for an ML algorithm.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   weight                300 non-null    float64
 1   t_shirt_size          300 non-null    float64
 2   name                  300 non-null    object 
 3   Age                   300 non-null    float64
 4   t_shirt_color_black   300 non-null    uint8  
 5   t_shirt_color_blue    300 non-null    uint8  
 6   t_shirt_color_green   300 non-null    uint8  
 7   t_shirt_color_orange  300 non-null    uint8  
 8   t_shirt_color_red     300 non-null    uint8  
dtypes: float64(3), object(1), uint8(5)
memory usage: 11.0+ KB


In [9]:
df_clean.describe()

Unnamed: 0,weight,t_shirt_size,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,5.765758e-16,5.921189e-18,-1.160183e-16,0.2,0.166667,0.19,0.213333,0.23
std,1.001671,1.001671,1.001671,0.400668,0.373301,0.392956,0.410346,0.421536
min,-2.053216,-1.224745,-1.483212,0.0,0.0,0.0,0.0,0.0
25%,-0.8117775,-1.224745,-0.77804,0.0,0.0,0.0,0.0,0.0
50%,-0.07112085,0.0,-0.07286779,0.0,0.0,0.0,0.0,0.0
75%,0.634019,1.224745,0.9848905,0.0,0.0,0.0,0.0,0.0
max,2.591706,1.224745,1.690063,1.0,1.0,1.0,1.0,1.0


## Save to feather

In [10]:
# Persist the preprocessed frame; the path is relative, so the file is written
# to the current working directory.
df_clean.to_feather('preprocess.feather')

In [16]:
# Read the saved file back to check the round trip; this rebinds df_clean to
# the frame loaded from disk.
df_clean = pd.read_feather('preprocess.feather')
df_clean

Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-1.046914,-1.224745,Clarence Levy,1.199599,0,0,1,0,0
1,-0.894080,-1.224745,Marc Hale,0.840079,0,1,0,0,0
2,-0.950591,-1.224745,Dale Messina,-1.676563,0,0,1,0,0
3,-1.033505,-1.224745,Thelma Kimball,-1.317043,0,0,0,1,0
4,-1.235069,-1.224745,Eric Warner,0.840079,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
295,0.559246,1.224745,John Choi,1.559120,0,0,0,1,0
296,1.106424,1.224745,Cecile Eady,0.480559,0,0,1,0,0
297,1.568671,1.224745,Lance Mancini,1.199599,1,0,0,0,0
298,0.607570,1.224745,Shirley Snider,1.559120,0,0,0,1,0


## Except for the name column, this dataframe is now ready for ML algorithms