# Pipelines - Automating data preprocessing



In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

#the following gives access to utils folder
#where utils package stores shared code
import os
import sys
PROJECT_ROOT = os.path.abspath(os.path.join(
                  os.getcwd(),
                  os.pardir)
)

#only add it once
if (PROJECT_ROOT not in sys.path):
    sys.path.append(PROJECT_ROOT)
    
# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

## Load our t-shirts


In [2]:
import utils as ut

# Load the raw t-shirt order from the project's generator helper.
# NOTE(review): the meaning of the three positional 100s, `dups` and
# `percent_nans` is defined in utils.generate_tshirt_order -- confirm there.
df = ut.generate_tshirt_order(100, 100, 100, dups=100, percent_nans=0.2)

# Plant a punctuation-only value in the `name` column of the second row so the
# string-cleaning transforms have something to act on. The column is looked up
# by label instead of the hard-coded position 3, so the assignment still hits
# `name` if the generator ever changes its column order.
df.iloc[1, df.columns.get_loc('name')] = '"-,.."'
df.head()

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,101.359292,,orange,Justin Salter,10
1,112.005713,small,orange,"""-,..""",8
2,111.480172,,green,Anne Ewing,16
3,104.814678,,red,Emily Romig,11
4,106.057529,small,blue,Barbara Kalish,16


# Transforms

Here is a suggested list of steps for automating data preprocessing, given in the order they should be applied.  

1. Impute NaNs (or delete if there are not too many) --have to decide on strategy
2. Process strings
3. Delete duplicates  -- have to decide which fields to consider
4. Determine categorical columns
   a. convert ordinal categorical columns to numeric (may need to generate a dict for this)
   b. convert nominal categorical columns to one hot encoded columns
5. At this point all relevant data is numeric
6. Drop zero-variance columns (columns where `df[col].nunique()` returns 1)
7. Scale data (use standardization)
8. Feature reduction: drop correlated columns
9. Feature reduction: apply PCA

This list is not complete, nor exact; for instance you might have the following columns in your dataset.

![](./43_pipeline_complete_preprocess_img2.png)

LocationDesc is a nominal categorical variable; one-hot encoding it will add 49 additional columns to the dataset. GeoLocation, on the other hand, can be split into just 2 columns, which provide both the state AND its proximity to other states. So drop LocationDesc, and keep and convert GeoLocation.

## Import the transforms that live in `utils.transforms` (the `../utils` folder)

In [3]:
from utils.transforms import *

## Apply transforms


In [4]:
# Hand-coded mapping for the ordinal categorical column: each t_shirt_size
# label is paired with the integer that should replace it. It is handed to
# cat_ordinal / run_pipeline in the cells that follow.
vals = {
    't_shirt_size': {
        'large': 2,
        'medium': 1,
        'small': 0,
    },
}

In [5]:
# Take three independent copies of the raw frame, one per approach shown
# below (dfs: sequential calls, dfp: .pipe chain, dfp2: run_pipeline).
# The transforms could just as easily operate on the original df.
dfs, dfp, dfp2 = (df.copy() for _ in range(3))

### Either sequentially

In [6]:
# Apply the transforms one call at a time; each result feeds the next call.

# Missing values first, then string clean-up on the name column.
dfs = impute_NaNs(dfs)
dfs = ps_lower_strip(dfs, ['name'])
dfs = ps_replace_punctuation(dfs, ['name'])

# Duplicates are judged on the name column only.
dfs = remove_duplicates(dfs, ['name'])

# Ordinal sizes become numbers via the hand-coded `vals` mapping.
dfs = cat_ordinal(dfs, ['t_shirt_size'], vals)
dfs = drop_no_variance_columns(dfs)

# Scaling is placed before the dummy step so the dummy columns are not scaled.
dfs = scale(dfs)
dfs = cat_getdummies(dfs, ['t_shirt_color'])

dfs = drop_correlated_columns(dfs)
dfs.head()

Fixing 63 NaNs using most_frequent strategy
Removing 14 duplicate rows


Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.970431,-0.027524,justin salter,-0.930264,0,0,0,1,0
1,-0.704387,-1.408287,,-1.612382,0,0,0,1,0
2,-0.71752,-0.027524,anne ewing,1.11609,0,0,1,0,0
3,-0.884084,-0.027524,emily romig,-0.589205,0,0,0,0,1
4,-0.853026,-1.408287,barbara kalish,1.11609,0,1,0,0,0


### Or as a pipeline (functionally equivalent to the above). A pipeline is just a way to take the output of one function and feed it into the next, repeated as many times as needed. Just like above, but as a one-liner

In [7]:
# Run the same transforms as a single chained expression. DataFrame.pipe(func,
# *args) calls func(frame, *args), so every .pipe(...) line below mirrors one
# assignment of the sequential version, in the same order.
# ps_lower_strip is given ['name'] explicitly, exactly as in the sequential
# cell, so the two approaches do not depend on that function's default columns.
dfp = (
    dfp
    .pipe(impute_NaNs)
    .pipe(ps_lower_strip, ['name'])
    .pipe(ps_replace_punctuation, ['name'])
    .pipe(remove_duplicates, ['name'])
    .pipe(cat_ordinal, ['t_shirt_size'], vals)
    .pipe(drop_no_variance_columns)
    .pipe(scale)  # before the dummy step so the dummy columns are not scaled
    .pipe(cat_getdummies, ['t_shirt_color'])
    .pipe(drop_correlated_columns)
)
dfp.head()

Fixing 63 NaNs using most_frequent strategy
Removing 14 duplicate rows


Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.970431,-0.027524,justin salter,-0.930264,0,0,0,1,0
1,-0.704387,-1.408287,,-1.612382,0,0,0,1,0
2,-0.71752,-0.027524,anne ewing,1.11609,0,0,1,0,0
3,-0.884084,-0.027524,emily romig,-0.589205,0,0,0,0,1
4,-0.853026,-1.408287,barbara kalish,1.11609,0,1,0,0,0


### Or call the pipeline function in transforms

In [8]:
# Same preprocessing through the run_pipeline helper from utils.transforms.
# Author's note: this does not remove punctuation.
# NOTE(review): the saved output below still shows a blank name in row 1,
# so verify that note against the current run_pipeline implementation.
dfp2 = run_pipeline(
    dfp2,
    dup_features=['name'],
    dummy_features=['t_shirt_color'],
    ordinal_features=['t_shirt_size'],
    ordering_dict=vals,
)
dfp2.head()

Fixing 63 NaNs using most_frequent strategy
Removing 14 duplicate rows


Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.970431,-0.027524,justin salter,-0.930264,0,0,0,1,0
1,-0.704387,-1.408287,,-1.612382,0,0,0,1,0
2,-0.71752,-0.027524,anne ewing,1.11609,0,0,1,0,0
3,-0.884084,-0.027524,emily romig,-0.589205,0,0,0,0,1
4,-0.853026,-1.408287,barbara kalish,1.11609,0,1,0,0,0


In [9]:
# Column dtypes and non-null counts of the frame produced by the .pipe chain.
dfp.info()

# Notice that every column is numerical except `name`; `name` is not very
# useful for an ML algorithm. describe() summarises the numeric columns.
dfp.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   weight                301 non-null    float64
 1   t_shirt_size          301 non-null    float64
 2   name                  301 non-null    object 
 3   Age                   301 non-null    float64
 4   t_shirt_color_black   301 non-null    uint8  
 5   t_shirt_color_blue    301 non-null    uint8  
 6   t_shirt_color_green   301 non-null    uint8  
 7   t_shirt_color_orange  301 non-null    uint8  
 8   t_shirt_color_red     301 non-null    uint8  
dtypes: float64(3), object(1), uint8(5)
memory usage: 11.0+ KB


Unnamed: 0,weight,t_shirt_size,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
count,301.0,301.0,301.0,301.0,301.0,301.0,301.0,301.0
mean,-2.489703e-16,-1.17514e-15,8.704739e-17,0.169435,0.215947,0.222591,0.192691,0.199336
std,1.001665,1.001665,1.001665,0.375761,0.412163,0.416679,0.395069,0.400166
min,-2.045254,-1.408287,-1.612382,0.0,0.0,0.0,0.0,0.0
25%,-0.7755039,-1.408287,-0.930264,0.0,0.0,0.0,0.0,0.0
50%,-0.1821844,-0.02752351,0.09291309,0.0,0.0,0.0,0.0,0.0
75%,0.6909902,1.353239,0.7750311,0.0,0.0,0.0,0.0,0.0
max,3.489642,1.353239,1.457149,1.0,1.0,1.0,1.0,1.0


## Save to feather

In [10]:
# Write the sequentially processed frame to a Feather file; the path is
# relative, so the file lands in the current working directory.
dfs.to_feather('preprocess.feather')

In [11]:
# Read the saved file back in (rebinding `dfs` to the loaded copy) and
# display it to check that the data survived the round trip.
dfs = pd.read_feather('preprocess.feather')
dfs

Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.970431,-0.027524,justin salter,-0.930264,0,0,0,1,0
1,-0.704387,-1.408287,,-1.612382,0,0,0,1,0
2,-0.717520,-0.027524,anne ewing,1.116090,0,0,1,0,0
3,-0.884084,-0.027524,emily romig,-0.589205,0,0,0,0,1
4,-0.853026,-1.408287,barbara kalish,1.116090,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
296,2.379837,1.353239,mary butts,0.092913,0,0,0,1,0
297,1.426952,1.353239,bobby mclean,0.433972,0,0,1,0,0
298,1.083089,1.353239,roger rodriguez,-1.612382,0,1,0,0,0
299,0.704991,1.353239,cynthia schenk,-0.589205,0,1,0,0,0
