# Pipelines - Automating data preprocessing



In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Set max rows and columns displayed in jupyter
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

#the following gives access to utils folder
#where utils package stores shared code
import os
import sys
PROJECT_ROOT = os.path.abspath(os.path.join(
                  os.getcwd(),
                  os.pardir)
)

#only add it once
if (PROJECT_ROOT not in sys.path):
    sys.path.append(PROJECT_ROOT)
    
# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load our t-shirts


In [15]:
import utils as ut

#load raw t-shirt order
#NOTE(review): the meaning of the three positional 100s, dups and percent_nans
#is defined by ut.generate_tshirt_order, which is not shown here -- confirm in utils
df = ut.generate_tshirt_order(100,100,100,dups=100, percent_nans=0.2)
#overwrite row 1, column 3 (the name column in the output below) with a
#punctuation-only string; presumably to exercise the punctuation transform later
df.iloc[1,3]='"-,.."'
df.head()

Unnamed: 0,weight,t_shirt_size,t_shirt_color,name,Age
0,96.978577,,red,Charles Smith,12
1,84.118154,,black,"""-,..""",12
2,115.042778,small,red,Stephanie Torrence,10
3,88.550694,small,orange,Melissa Demarais,15
4,94.092116,,blue,Janet Trevino,15


# Transforms

Here is a suggested list of steps for automating data preprocessing, given in the order they should be applied.

1. Impute NaNs (or delete them if there are not too many) -- you have to decide on a strategy. What do you do if a column has 50% NaNs or more?
2. Process strings
3. Delete duplicates -- you have to decide which fields to consider
4. Determine categorical columns
   - convert ordinal categorical columns to numeric (may need to generate a dict for this)
   - convert nominal categorical columns to one-hot encoded columns
5. At this point all relevant data is numeric
6. Drop no-variance columns (`df.col.nunique()` returns 1)
7. Scale data (use standardization)
8. Feature reduction: drop correlated columns
9. Possible feature reduction: apply PCA. This is fine for plotting data, but if you want to use the data for predictive analytics, PCA may lose important information.

This list is not complete, nor exact; for instance you might have the following columns in your dataset.

![](./43_pipeline_complete_preprocess_img2.png)

LocationDesc is a nominal categorical variable; one-hot encoding it would add 49 additional columns to the dataset. GeoLocation, on the other hand, can be split into just 2 columns that provide both the state AND its proximity to other states. So drop LocationDesc, and keep and convert GeoLocation.

## Functions

Define the functions used to transform data here; later, move them into utils so they are available for use in other notebooks and scripts.

In [7]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer

def get_features(df,features, val1='object'):
    '''
    resolve the list of columns a transform should operate on

    df: dataframe to inspect
    features: an explicit list of columns, or None to select columns by dtype
    val1: dtype compared against each column's dtype when features is None
    return: features unchanged when it is not None, otherwise the list of
            columns of df whose dtype equals val1
    '''
    if features is not None:
        return features
    return [column for column, dtype in df.dtypes.items() if dtype == val1]
    
def impute_NaNs(df, strategy='most_frequent',verbose=True):
    '''
    use sklearn's SimpleImputer to replace NaNs, one column at a time

    Only columns that contain at least one NaN are touched; their values are
    overwritten on df itself. If df has no NaNs it is returned untouched.

    df: dataframe to operate on (modified in place)
    strategy: SimpleImputer strategy; the default 'most_frequent' was noted by
              the original author as working with string columns
    verbose: when True, print how many NaNs are being fixed and with which strategy
    return: transformed df (the same object that was passed in)
    '''
    missing_per_column = df.isnull().sum()
    total_missing = missing_per_column.sum()

    #nothing to do
    if total_missing == 0:
        return df

    if verbose == True:
        print(f'Fixing {total_missing} NaNs using {strategy} strategy')

    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
    for column, n_missing in missing_per_column.items():
        if n_missing > 0:
            #fit determines the replacement value, transform applies it
            df[[column]] = imputer.fit(df[[column]]).transform(df[[column]])
    return df
  

def ps_lower_strip(df, features=None):
    '''
    preprocesses strings: lower-cases them and strips surrounding whitespace

    Values that are not strings (NaN, None, numbers in a mixed column) are
    left unchanged instead of raising a TypeError, so the transform is safe
    to run on columns that still contain missing values.

    df: dataframe to operate on (the listed columns are overwritten on df)
    features: a list of columns to apply to or all object columns if None
    return: transformed df
    '''
    if features is None:
        features = get_features(df, features)

    def _lower_strip(value):
        #only real strings can be lower-cased and stripped
        if isinstance(value, str):
            return value.lower().strip()
        return value

    for feat in features:
        df[feat] = df[feat].map(_lower_strip)
    return df


import re  #the regular expressions package
def ps_replace_punctuation(df,features,punc=r"[!\"#$%&'()*+,\-./:;<=>?@\[\]^_{|}~`]", replace_with=''):
    '''
    preprocesses strings, replace punctuation, be careful not to run this
    after you have generated a order_dict for cat_ordinal

    The default pattern is a raw string with '-', '[' and ']' escaped
    explicitly; it matches the same characters as before (backslash is not
    one of them) without relying on an invalid escape sequence or on an
    accidental ',-.' range. Values that are not strings (NaN, None, ...) are
    left unchanged instead of raising a TypeError.

    df: dataframe to operate on (the listed columns are overwritten on df)
    features: a list of columns to apply transform to
    punc: regular expression matching the punctuation to replace
    replace_with: replacement string
    return: transformed df
    '''
    #compile once instead of once per value
    pattern = re.compile(punc)

    def psp_closure(x):
        if isinstance(x, str):
            return pattern.sub(replace_with, x)
        return x

    for feat in features:
        df[feat] = df[feat].map(psp_closure)
    return df


def remove_duplicates(df,features=None, verbose=True):
    '''
    remove duplicate rows, duplicates determined based on columns in features

    When duplicates are found they are dropped from df itself and the index
    is reset to 0..n-1. When there are none, df is returned as is (its index
    is not reset).

    df: dataframe to operate on (modified in place)
    features: a list of columns to consider for duplicates, if None then all considered
    verbose: when True, print the number of rows being removed
    returns: transformed df (the same object that was passed in)
    '''
    n_duplicates = df.duplicated(subset=features).sum()

    if n_duplicates != 0:
        if verbose == True:
            print(f'Removing {n_duplicates} duplicate rows')

        df.drop_duplicates(subset=features, inplace=True)
        df.reset_index(drop=True, inplace=True)

    return df


def cat_ordinal(df, features, order):
    '''
    replace the values of ordinal columns with the numbers given in order

    Each listed column is mapped through its own dictionary order[column];
    the result overwrites the column on df. Values with no entry in that
    dictionary come out of the mapping as NaN.

    df: dataframe to operate on (the listed columns are overwritten on df)
    features: a list of columns to apply to (likely 1)
    order: custom ordering dictionary of dictionaries, very likely hand generated,
           keyed first by column name and then by category value
    return: transformed df

    ex
    features=['education','day_of_week']
    order={'education':{'illiterate':0,'unknown':1,'basic.4y':2, 'high.school':3},
         'day_of_week':{'mon':1, 'tue':2, 'wed':3, 'thu':4, 'fri':5}}
    df=cat_ordinal(df,features,order)
    '''
    for column in features:
        value_to_rank = order[column]
        df[column] = df[column].map(value_to_rank)
    return df


def cat_getdummies(df, features):
    '''
    one hot encode each listed column with pd.get_dummies

    The columns are encoded one after another; drop_first=True means the
    first category of every column gets no dummy column of its own.

    df: dataframe to operate on
    features: a list of columns to apply to
    return: transformed df (df itself when features is empty)
    '''
    encoded = df
    for column in features:
        encoded = pd.get_dummies(encoded, columns=[column], drop_first=True)
    return encoded


from sklearn.preprocessing import StandardScaler
def scale(df,features=None):
    '''
    scales numerical features
    min_max scales all features that only have 2 values
    standard scales all others (with sklearn's StandardScaler)

    The min-max result is assigned back to df; previously it was computed and
    thrown away, so two-valued columns were never scaled. Standard scaling is
    skipped when no non-binary column is left to scale.

    df: dataframe to operate on (the scaled columns are overwritten on df)
    features: a list of columns to apply to, or None for every column whose
              dtype is not object
    return: transformed df
    '''
    if features is None:
        features = [col for col, dtype in df.dtypes.items() if dtype != 'object']

    #split the requested columns into two-valued ones and the rest
    n_unique = df.nunique()
    two_valued = set(n_unique.index[n_unique == 2])
    bin_columns = [col for col in features if col in two_valued]
    std_columns = [col for col in features if col not in two_valued]

    #standard scale the non-binary columns
    if std_columns:
        df[std_columns] = StandardScaler().fit_transform(df[std_columns])

    def mm(x):
        '''
        min max scaler
        '''
        #check to see if its already scaled 0->1
        if ( x.min()==0 and x.max()==1):
            return x

        return (x-x.min())/(x.max()-x.min())

    #min max scale the binary columns and keep the result
    if bin_columns:
        df[bin_columns] = df[bin_columns].apply(mm, axis=0)

    return df

#find extra correlated columns
def get_correlated_columns(df,correlation_threshold ):
    '''
    find pairs of columns whose absolute correlation reaches a threshold

    Object-dtype columns are ignored. Only the part of the correlation matrix
    below the diagonal is inspected, so each pair is reported once and a
    column is never paired with itself.

    df: a dataframe
    correlation_threshold: select all pairs that have an absolute correlation >= to this value
    return: list of tuples (row label, column label) of the correlation matrix;
            the first element is the column that appears later in df
    '''
    #make sure we do correlations on non-object columns only
    numeric = df.loc[:, df.dtypes != 'object']

    #absolute value, so strong negative correlations count as well
    strength = numeric.corr().abs()

    #the matrix is symmetric: keep the strict lower triangle (k=-1 also removes
    #the diagonal of 1's), everything else becomes NaN and never passes the test
    below_diagonal = np.tril(np.ones_like(strength, dtype=bool), k=-1)
    strength = strength.where(cond=below_diagonal)

    return [
        (row_label, col_label)
        for row_label, row in strength.iterrows()
        for col_label, value in row.items()
        if value >= correlation_threshold
    ]

def drop_correlated_columns(df,correlation_threshold = .95, verbose=True):
    '''
    Drops 1 of each 2 correlated columns
    CAREFUL WITH THIS ONE: YOU WANT TO DROP THE COLUMN WITH THE LEAST INFORMATION,
    but this function simply drops the first element of the first pair reported
    by get_correlated_columns, then recomputes the pairs, until none are left.

    df: a dataframe (columns are dropped from df itself)
    correlation_threshold: passed through to get_correlated_columns
    verbose: when True, print each column that is dropped and its partner
    return: df with 1 of each 2 correlated columns dropped
    '''
    while True:
        pairs = get_correlated_columns(df, correlation_threshold)
        if not pairs:
            return df

        redundant, partner = pairs[0]
        if (verbose==True):
            print(f'dropping column {redundant} which is correlated with {partner}')

        df.drop(columns=[redundant], inplace=True)

def drop_no_variance_columns(df,verbose=True):
    '''
    drops all columns that only have 1 distinct value (as counted by df.nunique())

    df: a dataframe to inspect (columns are dropped from df itself)
    verbose: when True, print the list of columns being dropped
    return: df with the columns that only have 1 value dropped
    '''
    unique_counts = df.nunique()

    #columns holding a single distinct value
    todrop = [column for column, count in unique_counts.items() if count == 1]

    if todrop:
        if(verbose==True):
             print(f'dropping columns {todrop} since each only has 1 value')

        df.drop(columns=todrop, inplace=True)

    return df

def run_pipeline(df,dup_features, dummy_features, ordinal_features, ordering_dict):
    '''
    runs the whole preprocessing pipeline on df

    Steps, in order: impute NaNs, lower/strip strings, replace punctuation,
    remove duplicates, map ordinal columns to numbers, drop no variance
    columns, scale, one hot encode, drop correlated columns.

    df: dataframe to operate on
    dup_features: list of string columns that are cleaned (lower/strip,
                  punctuation removed) and then used to detect duplicate rows
    dummy_features: list of nominal columns to one hot encode (this argument
                    used to be ignored in favour of a hard-coded ['t_shirt_color'])
    ordinal_features: list of ordinal columns to convert to numbers
    ordering_dict: dictionary of dictionaries giving the order for each
                   ordinal column, see cat_ordinal
    return: transformed df
    '''
    return (
        df
        .pipe(impute_NaNs)
        .pipe(ps_lower_strip, dup_features)
        .pipe(ps_replace_punctuation, dup_features)
        .pipe(remove_duplicates, dup_features)
        .pipe(cat_ordinal, ordinal_features, ordering_dict)
        .pipe(drop_no_variance_columns)
        .pipe(scale)
        .pipe(cat_getdummies, dummy_features)
        .pipe(drop_correlated_columns)
    )

# if __name__=='__main__':
#     pass
#     #if running this file as a script (ie python3 transforms1.py')
#     #all code here will run
#     #you can call unit tests from here  

## Apply transforms


In [8]:
#save these to operate on, can just as easily operate on original df
dfs=df.copy()
dfp=df.copy()
dfput=df.copy()

### Either sequentially

In [9]:
#apply the transforms one statement at a time; each call returns the frame
#that is fed to the next call
dfs=impute_NaNs(dfs)
dfs=ps_lower_strip(dfs,['name'])
dfs=ps_replace_punctuation(dfs,['name'])
dfs=remove_duplicates(dfs,['name'])

# hand-coded ordering for the ordinal categorical column t_shirt_size
vals ={'t_shirt_size': {'large': 2, 'medium': 1, 'small': 0}}
dfs=cat_ordinal(dfs, ['t_shirt_size'], vals)  #this will be scaled below
dfs=drop_no_variance_columns(dfs)
dfs=scale(dfs)   #put this here if you do not want to scale the dummies
dfs=cat_getdummies(dfs, ['t_shirt_color'])
dfs=drop_correlated_columns(dfs)  #object-dtype columns such as name are left out of the correlation check
dfs.head()

Fixing 63 NaNs using most_frequent strategy
Removing 14 duplicate rows


Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-1.248775,0.933543,ruth burke,0.738398,False,False,False,False
1,-1.14634,0.933543,,1.412928,False,False,True,False
2,-0.830228,-1.48884,darlene thompson,-1.285193,False,False,True,False
3,-1.359079,-1.48884,joel howard,-1.622458,False,False,False,False
4,-0.788028,-1.48884,jerrod watkins,1.412928,True,False,False,False


### Or as a pipeline (functionally equivalent to above). A pipeline is just a way to take the output of one function and feed it into the next, repeatedly. Just like above, but as a one-liner

In [10]:
#run a pipeline of transforms, one .pipe() step per line
#(ps_lower_strip gets no column list here, so it handles every object column)
dfp = (
    dfp
    .pipe(impute_NaNs)
    .pipe(ps_lower_strip)
    .pipe(ps_replace_punctuation, ['name'])
    .pipe(remove_duplicates, ['name'])
    .pipe(cat_ordinal, ['t_shirt_size'], vals)
    .pipe(drop_no_variance_columns)
    .pipe(scale)
    .pipe(cat_getdummies, ['t_shirt_color'])
    .pipe(drop_correlated_columns)
)
dfp.head()

Fixing 63 NaNs using most_frequent strategy
Removing 14 duplicate rows


Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-1.248775,0.933543,ruth burke,0.738398,False,False,False,False
1,-1.14634,0.933543,,1.412928,False,False,True,False
2,-0.830228,-1.48884,darlene thompson,-1.285193,False,False,True,False
3,-1.359079,-1.48884,joel howard,-1.622458,False,False,False,False
4,-0.788028,-1.48884,jerrod watkins,1.412928,True,False,False,False


## Move all functions into utils

create file called transforms.py in utils package

move everything in functions above into transforms.py

add following line to \__init__.py in utils folder

`from utils.transforms import *`

This will import all the functions from utils.transforms into any
project that imports utils


### Now call the utils versions


In [12]:
import utils as ut

#run a pipeline of transforms, note all functions are from ut namespace
dfput = (
    dfput
    .pipe(ut.impute_NaNs)
    .pipe(ut.ps_lower_strip)
    .pipe(ut.ps_replace_punctuation, ['name'])
    .pipe(ut.remove_duplicates, ['name'])
    .pipe(ut.cat_ordinal, ['t_shirt_size'], vals)
    .pipe(ut.drop_no_variance_columns)
    .pipe(ut.scale)
    .pipe(ut.cat_getdummies, ['t_shirt_color'])
    .pipe(ut.drop_correlated_columns)
)
dfput.head()


Fixing 63 NaNs using most_frequent strategy
Removing 14 duplicate rows


Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-1.248775,0.933543,ruth burke,0.738398,1.0,0.0,0.0,0.0,0.0
1,-1.14634,0.933543,,1.412928,0.0,0.0,0.0,1.0,0.0
2,-0.830228,-1.48884,darlene thompson,-1.285193,0.0,0.0,0.0,1.0,0.0
3,-1.359079,-1.48884,joel howard,-1.622458,1.0,0.0,0.0,0.0,0.0
4,-0.788028,-1.48884,jerrod watkins,1.412928,0.0,1.0,0.0,0.0,0.0


In [13]:
#column dtypes and non-null counts of the pipeline result
dfp.info()

#notice that every column except name is now numeric (the dummy columns show up as bool in the info above);
#name is not very useful for a ML algorithm
dfp.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   weight                301 non-null    float64
 1   t_shirt_size          301 non-null    float64
 2   name                  301 non-null    object 
 3   Age                   301 non-null    float64
 4   t_shirt_color_blue    301 non-null    bool   
 5   t_shirt_color_green   301 non-null    bool   
 6   t_shirt_color_orange  301 non-null    bool   
 7   t_shirt_color_red     301 non-null    bool   
dtypes: bool(4), float64(3), object(1)
memory usage: 10.7+ KB


Unnamed: 0,weight,t_shirt_size,Age
count,301.0,301.0,301.0
mean,-2.832729e-16,0.0,-1.386857e-16
std,1.001665,1.001665,1.001665
min,-1.872141,-1.48884,-1.622458
25%,-0.8390594,-1.48884,-0.9479279
50%,-0.07332273,-0.277649,0.06386748
75%,0.73721,0.933543,0.7383977
max,2.834021,0.933543,1.412928


## Save to feather

In [44]:
#write the sequentially preprocessed frame to the file preprocess.feather
dfs.to_feather('preprocess.feather')

In [45]:
#read the frame back from preprocess.feather (dfs is rebound to the loaded copy) and display it
dfs = pd.read_feather('preprocess.feather')
dfs

Unnamed: 0,weight,t_shirt_size,name,Age,t_shirt_color_black,t_shirt_color_blue,t_shirt_color_green,t_shirt_color_orange,t_shirt_color_red
0,-1.191319,-1.464412,joan tallant,-1.273945,0,0,0,0,1
1,-0.923382,-1.464412,,1.561230,0,0,0,1,0
2,-1.636402,0.950865,alex hicks,-1.628342,0,0,0,1,0
3,-0.928865,0.950865,sara parkins,-0.210754,0,0,0,0,1
4,-1.002538,-1.464412,john cole,1.206833,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
296,1.979977,0.950865,carl hines,-0.565151,0,1,0,0,0
297,0.164696,0.950865,lester riley,-0.210754,0,1,0,0,0
298,0.848406,0.950865,louis atkins,-0.919548,0,1,0,0,0
299,-0.251783,0.950865,alison mcneil,-1.273945,0,0,0,0,1
