# Pipelines - Automating data preprocessing



In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display the result of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Handle to the running IPython shell; used further down to check which
# extensions are already loaded
from IPython import get_ipython
ipython = get_ipython()

# Render matplotlib figures inline in the notebook, as SVG
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

#the following gives access to utils folder
#where utils package stores shared code
import os
import sys
PROJECT_ROOT = os.path.abspath(os.path.join(
                  os.getcwd(),
                  os.pardir)
)

#only add it once
if (PROJECT_ROOT not in sys.path):
    sys.path.append(PROJECT_ROOT)
    
# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

## Load our t-shirts


In [2]:
import utils as ut

# Generate the raw t-shirt order, including duplicate rows and 20% NaNs
df = ut.generate_tshirt_order(100, 100, 100, dups=100, percent_nans=0.2)

# Plant a punctuation-only name in the second row so the string-cleaning
# transforms have something to strip
name_pos = df.columns.get_loc('name')
df.iloc[1, name_pos] = '"-,.."'
df.head()

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,119.002907,small,black,Richard Lucius,8
1,111.310142,small,red,"""-,..""",11
2,102.410028,small,orange,Roxanne Brinson,15
3,76.170022,small,green,Jennifer Kelly,16
4,97.900388,small,orange,Elizabeth Case,16


# Transforms

Here is a suggested list of steps for automating data preprocessing, given in the order they should be applied.  

1. Impute NaNs (or delete if there are not too many) --have to decide on strategy
2. Process strings
3. Delete duplicates  -- have to decide which fields to consider
4. Determine categorical columns
   a. convert ordinal categorical columns to numeric (may need to generate a dict for this)
   b. convert nominal categorical columns to one hot encoded columns
5. At this point all relevant data is numeric
6. Drop no-variance columns (those where `df[col].nunique()` returns 1)
7. Scale data (use standardization)
8. Feature reduction: drop correlated columns
9. Feature reduction: apply PCA

This list is not complete, nor exact; for instance you might have the following columns in your dataset.

![](./43_pipeline_complete_preprocess_img2.png)

LocationDesc is a nominal categorical variable; one-hot encoding it would add 49 additional columns to the dataset. GeoLocation, on the other hand, can be split into just 2 columns that capture both the state and its proximity to other states. So drop LocationDesc, and keep and convert GeoLocation.

## Import Transforms that live in ../utils.transforms folder

Importing them directly is optional; alternatively, reach them through the `ut.` prefix after `import utils as ut`.

In [3]:
# Import the transforms by name so they can be called directly, without the
# ut. prefix. Listing them explicitly (instead of `import *`) keeps the
# notebook namespace clean and shows where each function comes from.
from utils.transforms import (
    cat_getdummies,
    cat_ordinal,
    drop_correlated_columns,
    drop_no_variance_columns,
    impute_NaNs,
    ps_lower_strip,
    ps_replace_punctuation,
    remove_duplicates,
    scale,
)

## Apply transforms


In [6]:
# Independent working copies, one per approach shown below, so the original
# df stays untouched (operating on df directly would work just as well)
dfs, dfp, dfp2 = (df.copy() for _ in range(3))

### Either sequentially

This form works if you imported the functions via `from utils.transforms import *`.

In [5]:
# Fill NaNs, clean up the name strings, then drop rows duplicated on name
dfs=impute_NaNs(dfs)
dfs=ps_lower_strip(dfs,['name'])
dfs=ps_replace_punctuation(dfs,['name'])
dfs=remove_duplicates(dfs,['name'])

# hand-coded mapping for the ordinal categorical variable t_shirt_size
# (small < medium < large)
vals ={'t_shirt_size': {'large': 2, 'medium': 1, 'small': 0}}
dfs=cat_ordinal(dfs, ['t_shirt_size'], vals)
dfs=drop_no_variance_columns(dfs)
dfs=scale(dfs)   #put this here if you do not want to scale the dummies
dfs=cat_getdummies(dfs, ['t_shirt_color'])
dfs=drop_correlated_columns(dfs)
dfs.head()

Fixing 63 NaNs using most_frequent strategy
Removing 15 duplicate rows


Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.522789,-0.941101,richard lucius,-1.743531,0.0,0.0,0.0,0.0
1,-0.716424,-0.941101,,-0.687557,0.0,0.0,0.0,1.0
2,-0.940448,-0.941101,roxanne brinson,0.720409,0.0,0.0,1.0,0.0
3,-1.600934,-0.941101,jennifer kelly,1.072401,0.0,1.0,0.0,0.0
4,-1.05396,-0.941101,elizabeth case,1.072401,0.0,0.0,1.0,0.0


### Or as a pipeline (functionally equivalent to the above). A pipeline is just a way to feed the output of one function into the next, repeatedly. Same as above, but as a one-liner

In [7]:
import utils as ut

# Run the same transforms, in the same order and with the same arguments, as
# the sequential cell, chained with DataFrame.pipe. All functions come from
# the ut namespace. `vals` (the t_shirt_size ordinal mapping) is the dict
# defined in the sequential cell.
dfp2 = (
    dfp2
    .pipe(ut.impute_NaNs)
    # pass ['name'] explicitly so this step matches the sequential version
    .pipe(ut.ps_lower_strip, ['name'])
    .pipe(ut.ps_replace_punctuation, ['name'])
)
#whatever custom work I need here
dfp2 = (
    dfp2
    .pipe(ut.remove_duplicates, ['name'])
    .pipe(ut.cat_ordinal, ['t_shirt_size'], vals)
    .pipe(ut.drop_no_variance_columns)
    .pipe(ut.scale)   # scale before creating dummies so the dummies stay 0/1
    .pipe(ut.cat_getdummies, ['t_shirt_color'])
    .pipe(ut.drop_correlated_columns)
)
dfp2.head()

Fixing 63 NaNs using most_frequent strategy
Removing 15 duplicate rows


Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.522789,-0.941101,richard lucius,-1.743531,0.0,0.0,0.0,0.0
1,-0.716424,-0.941101,,-0.687557,0.0,0.0,0.0,1.0
2,-0.940448,-0.941101,roxanne brinson,0.720409,0.0,0.0,1.0,0.0
3,-1.600934,-0.941101,jennifer kelly,1.072401,0.0,1.0,0.0,0.0
4,-1.05396,-0.941101,elizabeth case,1.072401,0.0,0.0,1.0,0.0


In [17]:
# Column dtypes and non-null counts
dfp2.info()

# Notice that every column is numeric except name, and name is not very
# useful to an ML algorithm. describe() summarises the numeric columns.
dfp2.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   weight                301 non-null    float64
 1   t_shirt_size          301 non-null    float64
 2   name                  301 non-null    object 
 3   Age                   301 non-null    float64
 4   t_shirt_color_blue    301 non-null    float64
 5   t_shirt_color_green   301 non-null    float64
 6   t_shirt_color_orange  301 non-null    float64
 7   t_shirt_color_red     301 non-null    float64
dtypes: float64(7), object(1)
memory usage: 18.9+ KB


Unnamed: 0,weight,t_shirt_size,Age,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
count,301.0,301.0,301.0,301.0,301.0,301.0,301.0
mean,-2.832729e-16,0.0,-1.534395e-16,0.215947,0.146179,0.189369,0.259136
std,1.001665,1.001665,1.001665,0.412163,0.353874,0.392454,0.43889
min,-1.78728,-1.446479,-1.459727,0.0,0.0,0.0,0.0
25%,-0.8250819,-1.446479,-0.777463,0.0,0.0,0.0,0.0
50%,-0.001803638,-0.247057,-0.09519955,0.0,0.0,0.0,0.0
75%,0.6932615,0.952365,0.9281957,0.0,0.0,0.0,1.0
max,2.903851,0.952365,1.610459,1.0,1.0,1.0,1.0


## Save to feather

In [18]:
# Write the preprocessed frame to a feather file, then read it straight back
# to confirm the round trip
feather_path = 'preprocess1.feather'
dfs.to_feather(feather_path)
dfs = pd.read_feather(feather_path)
dfs

Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-1.400050,-1.446479,don west,0.928196,0.0,0.0,1.0,0.0
1,-1.317165,-1.446479,,0.928196,0.0,0.0,0.0,1.0
2,-0.708208,0.952365,james giese,-1.459727,0.0,0.0,0.0,1.0
3,-1.416734,0.952365,bobby conyer,-0.095200,0.0,0.0,1.0,0.0
4,-0.830013,-1.446479,adriana jackson,-0.436331,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
296,1.315866,0.952365,anthony wingerd,-0.095200,0.0,0.0,0.0,0.0
297,2.099913,0.952365,stanley williams,-0.777463,0.0,0.0,0.0,1.0
298,-0.284399,0.952365,john zayicek,-1.118595,0.0,0.0,1.0,0.0
299,1.902469,0.952365,leticia terry,0.245932,0.0,1.0,0.0,0.0
