# Pipelines - Automating data preprocessing



In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

#the following gives access to utils folder
#where utils package stores shared code
import os
import sys
PROJECT_ROOT = os.path.abspath(os.path.join(
                  os.getcwd(),
                  os.pardir)
)

#only add it once
if (PROJECT_ROOT not in sys.path):
    sys.path.append(PROJECT_ROOT)
    
# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

## Load our t-shirts


In [2]:
import utils as ut

#load raw t-shirt order
df = ut.generate_tshirt_order(100,100,100,dups=100, percent_nans=0.2)
df.iloc[1,3]='"-,.."'
df.head()

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,109.523772,small,red,Alex Ridley,8
1,87.855795,small,blue,"""-,..""",16
2,91.398768,small,red,Guadalupe Sanders,12
3,85.307853,small,red,Wayne Wright,16
4,97.579814,small,red,Gary Hyde,15


# Transforms

Here is a suggested list of steps to automate data pre processing, given in the order they should be applied.  

1. Impute NaNs (or delete if there are not too many) --have to decide on strategy
2. Process strings
3. Delete duplicates  -- have to decide which fields to consider
4. Determine categorical columns
   a. convert ordinal categorical columns to numeric (may need to generate a dict for this)
   b. convert nominal categorical columns to one hot encoded columns
5. At this point all relevant data is numeric
6. Drop no variance columns (df.col.nunique returns 1)
7. Scale data (use standardization)
8. Feature Reduction-Dump correlated columns
9. Feature Reduction-apply PCA

This list is not complete, nor exact; for instance you might have the following columns in your dataset.

![](./43_pipeline_complete_preprocess_img2.png)

LocationDesc is a nominal categorical variable, 1 hot encoding it will add 49 additional columns to Dataset.  GeoLocation, OTOH, can be split into just 2 columns which provide both state  AND proximity to other states information. SO drop LocationDesc, keep and convert GeoLocation.

## Import Transforms that live in ../utils.transforms folder

If you want, or just use the ut prefix to get to them after you import utils as ut

In [3]:
#if you import like this you can use the function names directly
#without the ut. prefix
from utils.transforms import *

## Apply transforms


In [4]:
# this is the hand coded bit for ordinal cat var
vals ={'t_shirt_size': {'large': 2, 'medium': 1, 'small': 0}}

In [11]:
#save these to operate on, can just as easily operate on original df
dfs=df.copy()
dfp=df.copy()
dfp2=df.copy()

### Either sequentially

if you import the functions via from utils.transforms import *

In [12]:
dfs=impute_NaNs(dfs)
dfs=ps_lower_strip(dfs,['name'])
dfs=ps_replace_punctuation(dfs,['name'])
dfs=remove_duplicates(dfs,['name'])

# is the hand coded bit for nominal cat var
vals ={'t_shirt_size': {'large': 2, 'medium': 1, 'small': 0}}
dfs=cat_ordinal(dfs, ['t_shirt_size'], vals)
dfs=drop_no_variance_columns(dfs)
dfs=scale(dfs)   #put this here if you do not want to scale the dummies
dfs=cat_getdummies(dfs, ['t_shirt_color'])
dfs=drop_correlated_columns(dfs)
dfs.head()

Fixing 63 NaNs using most_frequent strategy
Removing 14 duplicate rows


Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.805782,-0.951604,alex ridley,-1.482621,False,False,False,False,True
1,-1.3565,-0.951604,,1.259431,False,True,False,False,False
2,-1.266451,-0.951604,guadalupe sanders,-0.111595,False,False,False,False,True
3,-1.421259,-0.951604,wayne wright,1.259431,False,False,False,False,True
4,-1.109352,-0.951604,gary hyde,0.916675,False,False,False,False,True


### Or as a pipeline (functionally equivelent to above). A pipeline is just a way to take the output of 1 function and feed it into another multiple times.  Just like above but in a 1 liner

In [10]:
import utils as ut
#run a pipeline of transforms, note all functions are from ut namespace
dfp2=dfp2.pipe(ut.impute_NaNs).pipe(ut.ps_lower_strip).pipe(ut.ps_replace_punctuation,['name']).pipe(ut.remove_duplicates,['name']).pipe(ut.cat_ordinal,['t_shirt_size'],vals).pipe(ut.drop_no_variance_columns).pipe(ut.scale).pipe(ut.cat_getdummies, ['t_shirt_color']).pipe(ut.drop_correlated_columns)
dfp2.head()

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


KeyError: "None of [Index(['t_shirt_color'], dtype='object')] are in the [columns]"

In [8]:
dfp.info()

#notice that everything is numerical (except for name), name is not very useful for a ML algorithm
dfp.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315 entries, 0 to 314
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   weight         315 non-null    float64
 1   t_shirt_size   252 non-null    object 
 2   t_shirt_color  315 non-null    object 
 3   name           315 non-null    object 
 4   Age            315 non-null    int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 12.4+ KB


Unnamed: 0,weight,Age
count,315.0,315.0
mean,141.23014,12.349206
std,39.377658,2.927787
min,67.423999,8.0
25%,109.523772,10.0
50%,137.790856,12.0
75%,166.583387,15.0
max,240.710805,17.0


## Save to feather

In [9]:
dfs.to_feather('preprocess1.feather')
dfs = pd.read_feather('preprocess1.feather')
dfs

  if _pandas_api.is_sparse(col):


Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.805782,-0.951604,alex ridley,-1.482621,False,False,False,False,True
1,-1.356500,-0.951604,,1.259431,False,True,False,False,False
2,-1.266451,-0.951604,guadalupe sanders,-0.111595,False,False,False,False,True
3,-1.421259,-0.951604,wayne wright,1.259431,False,False,False,False,True
4,-1.109352,-0.951604,gary hyde,0.916675,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...
296,0.772375,1.455395,richard moore,1.259431,False,False,False,False,True
297,1.216679,1.455395,william norris,0.231161,True,False,False,False,False
298,0.975018,1.455395,janet dannenberg,-0.454352,False,False,False,False,True
299,1.717101,1.455395,randy hutchins,-0.454352,True,False,False,False,False
