# Pipelines - to ease preprocessing

In [1]:
# Plotting and data libraries; plt.style.use switches matplotlib to the 'ggplot' style
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display the value of every expression in a cell, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Handle to the running IPython shell; used below to check which extensions are loaded
from IPython import get_ipython
ipython = get_ipython()

# Render matplotlib figures inline, in SVG format
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# autoreload extension: load it only once, so re-running this cell does not load it again
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2: reload imported modules before executing each cell, so edits to utils are picked up
%autoreload 2

## Load our t-shirts
Uh-oh. `utils.py` is no longer in this directory. How do we get to it? Simple: make `utils` a package and import it. See the website for a how-to.

In [2]:
# utils lives one directory above this notebook, so its parent directory
# has to be on the import path before `import utils` can succeed.
import os
import sys

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Re-running this cell must not append a duplicate entry
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
# utils is importable because its parent directory was appended to sys.path in the previous cell
import utils as ut
# Build the t-shirt order table that the pipeline below transforms
df = ut.generate_tshirt_order()
# Show the raw frame as the cell output
df

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,123.860231,small,green,Melanie Fullwood,8
1,114.440420,small,blue,Charles Biederman,14
2,115.054365,small,blue,Suzanne Tiburcio,9
3,116.120780,small,green,Leticia Ruff,17
4,92.263158,small,black,Janet Meyer,15
...,...,...,...,...,...
295,200.870539,large,red,Georgia Applen,15
296,237.229934,large,black,Helen Bennett,10
297,190.899820,large,green,Thomas Prey,16
298,218.553672,large,blue,Randy Wadsworth,17


## Set up transforms

These are the transforms I want to run on the data. Be careful with the order: you want to convert all categorical columns to numerical before you scale the numerical columns.

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
def transform(df, features, scaler):
    '''
    Scale the selected columns on a copy of df.

    df: DataFrame to transform; it is left unmodified
    features: list of column names to pass through the scaler
    scaler: object exposing fit_transform (e.g. StandardScaler); it is
            fitted on df[features] as part of this call
    returns a new DataFrame in which the selected columns hold the scaled
    values and every other column is carried over unchanged
    '''
    # Work on a copy so the caller's frame is not overwritten; assigning
    # into df directly would silently change the frame that was passed in.
    scaled = df.copy()
    scaled[features] = scaler.fit_transform(scaled[features])
    return scaled


def cat_t_shirt_size(df, features, order):
    '''
    Replace ordered categorical values with their numeric rank.

    df: DataFrame to encode; it is left unmodified
    features: list of column names to encode
    order: dict mapping each category value to its number,
           e.g. {'small': 0, 'medium': 1, 'large': 2}
    returns a new DataFrame with the listed columns mapped through order;
    values missing from order become NaN (Series.map behaviour)
    '''
    # Encode a copy: mapping in place would overwrite the caller's frame,
    # and a second run would then map the already-encoded numbers to NaN.
    encoded = df.copy()
    for feat in features:
        encoded[feat] = encoded[feat].map(order)
    return encoded


def get_dummies(df, features):
    '''
    One-hot encode the listed columns, one column at a time.

    df: DataFrame holding the categorical columns
    features: list of column names to replace with dummy columns
    returns the frame produced by applying pd.get_dummies for each name
    in turn (df itself when features is empty)
    '''
    encoded = df
    for column in features:
        # each pass replaces `column` with its <column>_<value> indicators
        encoded = pd.get_dummies(encoded, columns=[column])
    return encoded


## Apply in a pipeline
Transform strings and categoricals first, then transform all numerical columns (which will include the converted categoricals).

In [6]:
# Hand-coded numeric encoding for the ordered t-shirt sizes
vals = {'small': 0, 'medium': 1, 'large': 2}

# Encode the categorical columns first, then scale the numeric ones
df_clean = (
    df
    .pipe(cat_t_shirt_size, ['t_shirt_size'], vals)
    .pipe(get_dummies, ['t_shirt_color'])
    .pipe(transform, ['weight', 't_shirt_size', 'Age'], StandardScaler())
)

In [7]:
# Show the pipeline result: sizes encoded, colors one-hot encoded, numeric columns scaled
df_clean

Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.500609,-1.224745,Melanie Fullwood,-1.501336,0,0,1,0,0
1,-0.736763,-1.224745,Charles Biederman,0.552163,0,1,0,0,0
2,-0.721371,-1.224745,Suzanne Tiburcio,-1.159087,0,1,0,0,0
3,-0.694636,-1.224745,Leticia Ruff,1.578913,0,0,1,0,0
4,-1.292743,-1.224745,Janet Meyer,0.894413,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
295,1.430028,1.224745,Georgia Applen,0.894413,0,0,0,0,1
296,2.341552,1.224745,Helen Bennett,-0.816837,1,0,0,0,0
297,1.180063,1.224745,Thomas Prey,1.236663,0,0,1,0,0
298,1.873341,1.224745,Randy Wadsworth,1.578913,0,1,0,0,0


In [12]:
df_clean.info()
# Notice that every column is now numeric except for name, which is still an object column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   weight                300 non-null    float64
 1   t_shirt_size          300 non-null    float64
 2   name                  300 non-null    object 
 3   Age                   300 non-null    float64
 4   t_shirt_color_black   300 non-null    uint8  
 5   t_shirt_color_blue    300 non-null    uint8  
 6   t_shirt_color_green   300 non-null    uint8  
 7   t_shirt_color_orange  300 non-null    uint8  
 8   t_shirt_color_red     300 non-null    uint8  
dtypes: float64(3), object(1), uint8(5)
memory usage: 11.0+ KB


In [13]:
# Summary statistics: the scaled columns (weight, t_shirt_size, Age) should show mean ~0 and std ~1
df_clean.describe()

Unnamed: 0,weight,t_shirt_size,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,-1.606123e-16,5.921189e-18,-4.366877e-17,0.193333,0.196667,0.213333,0.216667,0.18
std,1.001671,1.001671,1.001671,0.395572,0.398142,0.410346,0.412662,0.384829
min,-1.842373,-1.224745,-1.501336,0.0,0.0,0.0,0.0,0.0
25%,-0.8375717,-1.224745,-0.8168366,0.0,0.0,0.0,0.0,0.0
50%,-0.06819836,0.0,-0.1323367,0.0,0.0,0.0,0.0,0.0
75%,0.7300459,1.224745,0.8944132,0.0,0.0,0.0,0.0,0.0
max,2.772448,1.224745,1.578913,1.0,1.0,1.0,1.0,1.0
