# Pipelines - Automating data preprocessing



In [85]:
# Notebook-wide setup: plotting/data imports, display options, import path and autoreload.
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display all cell outputs
# ('all' makes every top-level expression in a cell render, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# handle to the running IPython shell, used below to check which extensions are loaded
from IPython import get_ipython
ipython = get_ipython()

# render figures inline, as SVG
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

#the following gives access to utils folder
#where utils package stores shared code
import os
import sys
# PROJECT_ROOT is the parent of the current working directory,
# so the notebook must be started from a direct sub-folder of the project
PROJECT_ROOT = os.path.abspath(os.path.join(
                  os.getcwd(),
                  os.pardir)
)

#only add it once
# (re-running this cell must not append the same path again)
if (PROJECT_ROOT not in sys.path):
    sys.path.append(PROJECT_ROOT)
    
# autoreload extension
# load it only if it is not already loaded, so the cell can be re-run safely
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# mode 2: reload imported modules before running code, so edits to utils are picked up
%autoreload 2

## Load our t-shirts


In [131]:
# utils is the shared-code package made importable by the PROJECT_ROOT path setup above
import utils as ut

#load raw t-shirt order
# NOTE(review): argument meanings live in utils.generate_tshirt_order; presumably the three
# positional values are row counts and dups/percent_nans control injected duplicates/NaNs - confirm
df = ut.generate_tshirt_order(100,100,100,dups=100, percent_nans=0.5)
# overwrite row 1, column 3 (the 'name' column in the output below) with a punctuation-only
# string, presumably so the punctuation-stripping transform has something to clean
df.iloc[1,3]='"-,.."'
df.head()

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,117.429041,small,blue,Pedro Virgil,15
1,103.545668,small,red,"""-,..""",12
2,,small,blue,,13
3,84.783687,small,red,Dwight Lopez,11
4,123.183768,small,red,Norma Garrett,17


# Transforms

Here is a suggested list of steps to automate data preprocessing, given in the order in which they should be applied.

1. Impute NaNs (or delete if there are not too many) --have to decide on strategy
2. Process strings
3. Delete duplicates  -- have to decide which fields to consider
4. Determine categorical columns
   a. convert ordinal categorical columns to numeric (may need to generate a dict for this)
   b. convert nominal categorical columns to one hot encoded columns
5. At this point all relevant data is numeric
6. Drop no variance columns (df.col.nunique returns 1)
7. Scale data (use standardization)
8. Feature Reduction-Dump correlated columns
9. Feature Reduction-apply PCA

This list is not complete, nor exact; for instance you might have the following columns in your dataset.

![](./43_pipeline_complete_preprocess_img2.png)

LocationDesc is a nominal categorical variable; one-hot encoding it would add 49 additional columns to the dataset. GeoLocation, on the other hand, can be split into just 2 columns that capture both the state AND its proximity to other states. So drop LocationDesc, and keep and convert GeoLocation.

## Define Transforms

In [141]:
from sklearn.impute import SimpleImputer

def get_features(df,features, val1='object'):
    '''
    resolve the list of columns a transform should operate on

    df: dataframe to inspect
    features: explicit list of columns, or None to auto-select by dtype
    val1: dtype to match when auto-selecting (defaults to 'object')
    return: features unchanged when given, otherwise every column whose dtype equals val1
    '''
    # an explicit selection (even an empty list) always wins
    if features is not None:
        return features
    return [column for column, dtype in df.dtypes.items() if dtype == val1]
    
def impute_NaNs(df, strategy='most_frequent',verbose=True):
    '''
    fill missing values column by column with a SimpleImputer

    df: dataframe to operate on (modified in place)
    strategy: SimpleImputer strategy used to pick the replacement value
    verbose: when True, report how many NaNs are being fixed
    return: the same df with NaNs replaced (returned untouched if it has none)
    '''
    missing_per_column = df.isnull().sum()
    total_missing = missing_per_column.sum()

    # nothing to impute
    if total_missing == 0:
        return df

    if verbose == True:
        print(f'Fixing {total_missing} NaNs using {strategy} strategy')

    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)  #works with strings

    # only visit the columns that actually contain NaNs
    for column in missing_per_column[missing_per_column > 0].index:
        single_column = df[[column]]
        # fit picks the replacement for this column, transform applies it
        df[[column]] = imputer.fit(single_column).transform(single_column)
    return df
  

def ps_lower_strip(df, features=None):
    '''
    preprocesses strings: lower-case them and strip surrounding whitespace

    df: dataframe to operate on (modified in place)
    features: a list of columns to apply to or all object columns if None
    return: transformed df
    '''
    def normalize(value):
        # NaN/None and any other non-string entry are passed through unchanged;
        # calling str.lower on them would raise a TypeError
        return value.lower().strip() if isinstance(value, str) else value

    features=get_features(df,features)

    for feat in features:
        df[feat] = df[feat].map(normalize)
    return df


import re  #the regular expressions package
def ps_replace_punctuation(df,punc="[!\"#$%&'()*+,\\-./:;<=>?@\\[\\]^_{|}~`]", replace_with='',features=None):
    '''
    preprocesses strings, replace punctuation

    df: dataframe to operate on (modified in place)
    punc: regular expression matching the punctuation to replace; the default is a
          character class of ASCII punctuation (it does not include the backslash)
    replace_with: replacement string
    features: a list of columns to apply to or all object columns if None
    return: transformed df
    '''
    # compile once instead of re-parsing the pattern for every cell
    pattern = re.compile(punc)

    def psp_closure(x):
        # NaN/None and other non-string entries are left as they are;
        # re.sub would raise a TypeError on them
        return pattern.sub(replace_with, x) if isinstance(x, str) else x

    features=get_features(df,features)

    for feat in features:
        df[feat] = df[feat].map(psp_closure)
    return df


def remove_duplicates(df,features, verbose=True):
    '''
    drop duplicate rows, keeping the first occurrence of each

    df: dataframe to operate on (modified in place)
    features: columns whose combined values decide whether two rows are duplicates
    verbose: when True, report how many rows are being removed
    returns: the same df with duplicates removed and the index renumbered from 0
             (returned untouched when there are no duplicates)
    '''
    duplicate_count = df.duplicated(subset=features).sum()

    if duplicate_count != 0:
        if verbose == True:
            print(f'Removing {duplicate_count} duplicate rows')

        df.drop_duplicates(subset=features,inplace=True)
        # close the gaps left in the index by the removed rows
        df.reset_index(drop=True, inplace=True)

    return df


def cat_nominal(df, features, order):
    '''
    replace category labels with the numbers given by a custom ordering

    df: dataframe to operate on (modified in place)
    features: a list of columns to apply to (likely 1)
    order: dictionary mapping each label to its numeric rank, very likely hand generated
    return: transformed df
    '''
    for column in features:
        labels = df[column]
        df[column] = labels.map(order)
    return df


def cat_getdummies(df, features):
    '''
    one hot encode each listed feature

    df: dataframe to operate on (left unmodified, a new frame is built)
    features: a list of columns to apply to
    return: frame in which every listed column is replaced by its dummy columns
    '''
    encoded = df
    # encode one column at a time, feeding each result into the next step
    for column in features:
        encoded = pd.get_dummies(encoded, columns=[column])
    return encoded


from sklearn.preprocessing import StandardScaler
def scale(df,scaler=None, features=None):
    '''
    scales numerical_features using the provided scaler

    df: dataframe to operate on (modified in place)
    scaler: object exposing fit_transform; a fresh StandardScaler is created when None
    features: a list of columns to apply to, or every non-object column if None
    return: transformed df
    '''
    # build the default scaler per call; a default instance in the signature would be
    # created once at definition time and refitted (mutated) by every call
    if scaler is None:
        scaler = StandardScaler()

    if features is None:
        features = [column for column, dtype in df.dtypes.items() if dtype != 'object']

    # nothing to scale (e.g. a frame with only object columns)
    if len(features) == 0:
        return df

    df[features] = scaler.fit_transform(df[features])
    return df

#find extra correlated columns
def get_correlated_columns(df,correlation_threshold ):
    '''
    find pairs of columns whose absolute correlation reaches a threshold

    df: a dataframe; non-numeric columns (e.g. names) are ignored
    correlation_threshold: select all pairs that have an absolute correlation >= to this value
    return: list of tuples of form [ (later_column, earlier_column),...] where "later"
            and "earlier" refer to column order in df; each pair is reported once
    '''
    # restrict to numeric/bool columns explicitly: recent pandas versions no longer drop
    # non-numeric columns inside corr() and raise on strings instead
    numeric = df.select_dtypes(include=['number', 'bool'])

    # abs() so strong negative correlations count as well
    corr = numeric.corr().abs()
    labels = list(corr.columns)

    # the matrix is symmetric with 1's on the diagonal, so only the strict lower
    # triangle (k=-1) is inspected; NaN correlations compare as False and are skipped
    strong = np.tril(corr.to_numpy() >= correlation_threshold, k=-1)

    # nonzero walks the matrix row by row, left to right
    rows, cols = np.nonzero(strong)
    return [(labels[r], labels[c]) for r, c in zip(rows, cols)]

def drop_correlated_columns(df,correlation_threshold = .95, verbose=True):
    '''
    drop one column out of every pair of highly correlated columns

    df: a dataframe (modified in place)
    correlation_threshold: pairs with an absolute correlation >= this value count as correlated
    verbose: when truthy, report every dropped column
    return: df with 1 of each 2 correlated columns dropped
    '''
    # Pairwise correlations do not change when some other column is removed, so the
    # pairs are computed once and walked in order, rather than recomputing the whole
    # correlation matrix after every single drop.
    dropped = []
    gone = set()
    for column, partner in get_correlated_columns(df, correlation_threshold):
        # a pair is moot as soon as either of its columns has been dropped
        if column in gone or partner in gone:
            continue
        if verbose:
            print(f'dropping column {column} which is correlated with {partner}')
        gone.add(column)
        dropped.append(column)

    if dropped:
        df.drop(columns=dropped, inplace=True)
    return df

## Apply transforms


In [90]:
# hand-coded mapping for the ordinal categorical variable t_shirt_size:
# each size label is replaced by its rank (passed to cat_nominal as `order` below)
vals = {'large': 2, 'medium': 1, 'small': 0}

In [145]:
#save these to operate on, can just as easily operate on original df
# dfs is used for the step-by-step (sequential) run, dfp for the pipeline run;
# copying keeps the raw df intact, since several transforms modify their input in place
dfs=df.copy()
dfp=df.copy()

### Either sequentially

In [146]:
# apply the transforms one after another, each taking the previous result as input
dfs=impute_NaNs(dfs)                           # fill NaNs first so later steps see complete data
dfs=ps_lower_strip(dfs)                        # normalise case/whitespace in the object columns
dfs=ps_replace_punctuation(dfs)                # strip punctuation from the object columns
dfs=remove_duplicates(dfs,['name'])            # rows sharing the same cleaned name are duplicates
dfs=cat_getdummies(dfs, ['t_shirt_color'])     # colour is nominal -> one hot encode
dfs=cat_nominal(dfs, ['t_shirt_size'], vals)   # size is ordinal -> map to its rank
dfs=scale(dfs)                                 # scale every non-object column
dfs=drop_correlated_columns(dfs)               # drop one of each pair of highly correlated columns
dfs.head()

Fixing 2 NaNs using most_frequent strategy
Removing 1 duplicate rows


Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.543306,-1.22988,pedro virgil,0.812315,-0.480055,1.86083,-0.464163,-0.474773,-0.542561
1,-0.918111,-1.22988,,-0.237608,-0.480055,-0.537395,-0.464163,-0.474773,1.843111
2,-1.424622,-1.22988,dwight lopez,-0.587582,-0.480055,-0.537395,-0.464163,-0.474773,1.843111
3,-0.387947,-1.22988,norma garrett,1.512263,-0.480055,-0.537395,-0.464163,-0.474773,1.843111
4,-1.446386,-1.22988,fidel bennett,-1.637504,-0.480055,1.86083,-0.464163,-0.474773,-0.542561


### Or as a pipeline (functionally equivalent to the above). A pipeline simply feeds the output of one function into the next, repeatedly. It is the same as above, but written as a one-liner

In [147]:
#run a pipeline of transforms: each .pipe call hands the previous result to the next
#function as its first argument, followed by any extra arguments listed
dfp = (
    dfp
    .pipe(impute_NaNs)
    .pipe(ps_lower_strip)
    .pipe(ps_replace_punctuation)
    .pipe(remove_duplicates, ['name'])
    .pipe(cat_getdummies, ['t_shirt_color'])
    .pipe(cat_nominal, ['t_shirt_size'], vals)
    .pipe(scale)
    .pipe(drop_correlated_columns)
)
dfp.head()

Fixing 2 NaNs using most_frequent strategy
Removing 1 duplicate rows


Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.543306,-1.22988,pedro virgil,0.812315,-0.480055,1.86083,-0.464163,-0.474773,-0.542561
1,-0.918111,-1.22988,,-0.237608,-0.480055,-0.537395,-0.464163,-0.474773,1.843111
2,-1.424622,-1.22988,dwight lopez,-0.587582,-0.480055,-0.537395,-0.464163,-0.474773,1.843111
3,-0.387947,-1.22988,norma garrett,1.512263,-0.480055,-0.537395,-0.464163,-0.474773,1.843111
4,-1.446386,-1.22988,fidel bennett,-1.637504,-0.480055,1.86083,-0.464163,-0.474773,-0.542561


In [148]:
# column dtypes and non-null counts of the pipelined frame
dfp.info()

#notice that everything is numerical (except for name), name is not very useful for a ML algorithm
# summary statistics of the numeric columns; after scaling the means are ~0 and the stds ~1
dfp.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   weight                299 non-null    float64
 1   t_shirt_size          299 non-null    float64
 2   name                  299 non-null    object 
 3   Age                   299 non-null    float64
 4   t_shirt_color_black   299 non-null    float64
 5   t_shirt_color_blue    299 non-null    float64
 6   t_shirt_color_green   299 non-null    float64
 7   t_shirt_color_orange  299 non-null    float64
 8   t_shirt_color_red     299 non-null    float64
dtypes: float64(8), object(1)
memory usage: 21.1+ KB


Unnamed: 0,weight,t_shirt_size,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,8.191144e-16,2.532348e-15,-8.540177000000001e-17,2.933365e-17,1.812003e-16,-7.797553e-18,-5.681074000000001e-17,4.567138e-17
std,1.001676,1.001676,1.001676,1.001676,1.001676,1.001676,1.001676,1.001676
min,-2.041783,-1.22988,-1.637504,-0.4800549,-0.5373947,-0.4641629,-0.4747735,-0.5425609
25%,-0.7767123,-1.22988,-0.9375561,-0.4800549,-0.5373947,-0.4641629,-0.4747735,-0.5425609
50%,-0.08766108,-0.0040996,0.1123663,-0.4800549,-0.5373947,-0.4641629,-0.4747735,-0.5425609
75%,0.6815309,1.221681,0.8123145,-0.4800549,-0.5373947,-0.4641629,-0.4747735,-0.5425609
max,2.607107,1.221681,1.512263,2.083095,1.86083,2.154416,2.106268,1.843111


## Save to feather

In [149]:
# persist the sequentially processed frame to a Feather file in the working directory
dfs.to_feather('preprocess.feather')

In [150]:
# read the file back (replacing the in-memory dfs) to confirm it round-trips
dfs = pd.read_feather('preprocess.feather')
dfs

Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-0.543306,-1.229880,pedro virgil,0.812315,-0.480055,1.860830,-0.464163,-0.474773,-0.542561
1,-0.918111,-1.229880,,-0.237608,-0.480055,-0.537395,-0.464163,-0.474773,1.843111
2,-1.424622,-1.229880,dwight lopez,-0.587582,-0.480055,-0.537395,-0.464163,-0.474773,1.843111
3,-0.387947,-1.229880,norma garrett,1.512263,-0.480055,-0.537395,-0.464163,-0.474773,1.843111
4,-1.446386,-1.229880,fidel bennett,-1.637504,-0.480055,1.860830,-0.464163,-0.474773,-0.542561
...,...,...,...,...,...,...,...,...,...
294,0.904448,1.221681,robin collins,1.512263,-0.480055,1.860830,-0.464163,-0.474773,-0.542561
295,-0.356009,1.221681,andrew kaupp,-1.637504,-0.480055,-0.537395,-0.464163,2.106268,-0.542561
296,1.815888,1.221681,guadalupe bennett,-0.937556,2.083095,-0.537395,-0.464163,-0.474773,-0.542561
297,2.489805,1.221681,april dupuis,-0.587582,-0.480055,-0.537395,-0.464163,-0.474773,1.843111
