# Pipelines - to ease preprocessing

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display the result of every expression statement in a cell, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Handle to the running IPython shell; used below to check which extensions are loaded
from IPython import get_ipython
ipython = get_ipython()

# Render matplotlib figures inline, as SVG
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# Make the parent of the current working directory importable.
# This gives access to the utils folder, where the utils package stores shared code.
# NOTE: PROJECT_ROOT is derived from os.getcwd(), so it depends on where the kernel was started.
import os
import sys
PROJECT_ROOT = os.path.abspath(os.path.join(
                  os.getcwd(),
                  os.pardir)
)

# Only add it once, so re-running this cell does not append duplicates to sys.path
if (PROJECT_ROOT not in sys.path):
    sys.path.append(PROJECT_ROOT)
    
# Load the autoreload extension only if it is not already loaded
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2: reload all modules before executing code, so edits to utils are picked up
%autoreload 2

## Load our t-shirts


In [2]:

import utils as ut

# Load the raw t-shirt order using the helper from the shared utils package
df = ut.generate_tshirt_order()
# Bare expression: display the loaded frame
df

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,100.627405,small,black,Clarice Sonoski,17
1,86.231887,small,black,Micheal Vargas,9
2,98.563183,small,orange,Catherine Garcia,13
3,107.608842,small,blue,Louis Wallace,15
4,103.062907,small,green,Carl Dietrich,10
...,...,...,...,...,...
295,211.129020,large,red,Phyllis Rogers,16
296,145.618181,large,orange,Everett Walters,12
297,156.467627,large,black,Michael Higgins,9
298,176.818893,large,green,Angela Sanchez,11


## Set up transforms

These are the transforms I want to run on the data. Be careful with the order: convert all categorical columns to numerical before you scale ALL numerical columns.

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
def scale(df, features, scaler):
    '''
    Scale the given numerical columns using the provided scaler.

    The input dataframe is NOT modified: the scaled values are written into
    a copy, which is returned. This keeps the function safe to re-run and to
    chain with DataFrame.pipe without corrupting the caller's data.

    df: dataframe to operate on (left untouched)
    features: a list of columns to scale
    scaler: object exposing fit_transform (for example a StandardScaler);
            it is fitted on df[features] as a side effect of this call
    return: new dataframe in which the listed columns hold the scaled values
    '''
    # Work on a copy so the caller's frame keeps its original values.
    scaled = df.copy()
    scaled[features] = scaler.fit_transform(df[features])
    return scaled


def cat_nominal(df, features, order):
    '''
    Replace categorical values with numbers using a custom mapping.

    Because the mapping imposes a numerical order on the categories, the
    features are effectively treated as ordered categories.

    The input dataframe is NOT modified: the encoded columns are written into
    a copy, which is returned. Mutating the input would make a second call on
    the same frame map the already-encoded numbers through `order` again,
    which turns every value into NaN.

    df: dataframe to operate on (left untouched)
    features: a list of columns to apply to (likely 1)
    order: custom ordering dictionary {category: number}, very likely hand
           generated; values missing from it become NaN in the result
    return: new dataframe with the listed columns encoded
    '''
    # Work on a copy so the caller's frame keeps its original categories.
    encoded = df.copy()
    for feat in features:
        encoded[feat] = df[feat].map(order)
    return encoded


def cat_getdummies(df, features):
    '''
    Expand each listed column into dummy (indicator) columns.

    The columns are processed one at a time, in the order given, by calling
    pd.get_dummies on the running result.

    df: dataframe to operate on
    features: a list of columns to expand
    return: dataframe with the listed columns replaced by their dummy columns
            (df itself when features is empty)
    '''
    encoded = df
    for column in features:
        encoded = pd.get_dummies(encoded, columns=[column])
    return encoded


## Apply functions in a pipeline
Transform strings and categoricals first, then scale the numerical columns (which by then include the encoded categoricals).

In [5]:
# Hand-coded mapping that imposes the order small < medium < large on t_shirt_size
vals = {'large': 2, 'medium': 1, 'small': 0}

# Run a pipeline of transforms: encode the categoricals first, then scale.
# Start from a copy of the raw frame: a transform that writes into the frame it
# receives would otherwise alter `df`, and re-running this cell would then
# operate on already-transformed data.
df_clean = (
    df.copy()
    .pipe(cat_nominal, ['t_shirt_size'], vals)
    .pipe(cat_getdummies, ['t_shirt_color'])
    .pipe(scale, ['weight', 't_shirt_size', 'Age'], StandardScaler())
)

In [6]:
df_clean

Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.974078,-1.224745,Clarice Sonoski,1.622773,1,0,0,0,0
1,-1.356981,-1.224745,Micheal Vargas,-1.112238,1,0,0,0,0
2,-1.028984,-1.224745,Catherine Garcia,0.255268,0,0,0,1,0
3,-0.788381,-1.224745,Louis Wallace,0.939021,0,1,0,0,0
4,-0.909297,-1.224745,Carl Dietrich,-0.770361,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
295,1.965127,1.224745,Phyllis Rogers,1.280897,0,0,0,0,1
296,0.222620,1.224745,Everett Walters,-0.086609,0,0,0,1,0
297,0.511202,1.224745,Michael Higgins,-1.112238,1,0,0,0,0
298,1.052520,1.224745,Angela Sanchez,-0.428485,0,0,1,0,0


In [7]:
# Summarise each column's dtype and non-null count.
# Notice that everything is numerical except for name (object dtype);
# name is not very useful for an ML algorithm.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   weight                300 non-null    float64
 1   t_shirt_size          300 non-null    float64
 2   name                  300 non-null    object 
 3   Age                   300 non-null    float64
 4   t_shirt_color_black   300 non-null    uint8  
 5   t_shirt_color_blue    300 non-null    uint8  
 6   t_shirt_color_green   300 non-null    uint8  
 7   t_shirt_color_orange  300 non-null    uint8  
 8   t_shirt_color_red     300 non-null    uint8  
dtypes: float64(3), object(1), uint8(5)
memory usage: 11.0+ KB


In [8]:
df_clean.describe()

Unnamed: 0,weight,t_shirt_size,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,2.73855e-16,5.921189e-18,-1.176836e-16,0.236667,0.22,0.18,0.183333,0.18
std,1.001671,1.001671,1.001671,0.425746,0.414938,0.384829,0.387586,0.384829
min,-1.721838,-1.224745,-1.454114,0.0,0.0,0.0,0.0,0.0
25%,-0.8614983,-1.224745,-0.7703615,0.0,0.0,0.0,0.0,0.0
50%,0.08515078,0.0,-0.08660869,0.0,0.0,0.0,0.0,0.0
75%,0.5543206,1.224745,0.9390205,0.0,0.0,0.0,0.0,0.0
max,3.198118,1.224745,1.622773,1.0,1.0,1.0,1.0,1.0


## Save to feather

In [9]:
# Persist the preprocessed dataframe as a Feather file (path is relative to the working directory)
df_clean.to_feather('preprocess.feather')

In [10]:
# Read the file back (this rebinds df_clean to the reloaded frame) and display it
# to confirm the data survived the round trip
df_clean = pd.read_feather('preprocess.feather')
df_clean

Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.974078,-1.224745,Clarice Sonoski,1.622773,1,0,0,0,0
1,-1.356981,-1.224745,Micheal Vargas,-1.112238,1,0,0,0,0
2,-1.028984,-1.224745,Catherine Garcia,0.255268,0,0,0,1,0
3,-0.788381,-1.224745,Louis Wallace,0.939021,0,1,0,0,0
4,-0.909297,-1.224745,Carl Dietrich,-0.770361,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
295,1.965127,1.224745,Phyllis Rogers,1.280897,0,0,0,0,1
296,0.222620,1.224745,Everett Walters,-0.086609,0,0,0,1,0
297,0.511202,1.224745,Michael Higgins,-1.112238,1,0,0,0,0
298,1.052520,1.224745,Angela Sanchez,-0.428485,0,0,1,0,0


## Except for the name column, this dataframe is now ready for ML algorithms