## AutoML Tool - Helper Functions

# ==========================================================

- Author: Arjun Arora
- Date: 2/17/2021

This notebook will help automate required and mundane tasks of the model building process - data cleaning and feature engineering. The notebook assumes that some data exploration and data understanding has been done. The automated tasks conducted are as follows:

1. Drop columns with high % of missing values (threshold can be edited)
2. Drop numerical columns with no variance - all 1s
3. Drop categorical columns with all same labels - all 'Yes'
4. Drop categorical columns with too many labels (threshold can be edited)
5. Impute Missing Values for numerical and categorical data (filling technique can be set)
6. Transform numerical values (as-is, square root, log, or square), keeping whichever form correlates most strongly with the label

# ==========================================================

In [63]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

class Automl_tool():
    """
    Objective: Collection of data-cleaning and feature-engineering helpers.

    Every method works on the dataframe it is given (never on a global),
    modifies that dataframe in place and also returns it, so calls can be
    run one after another on the same object.
    """

    # Small constant added before taking the log so that zeros do not give -inf.
    _LOG_OFFSET = 0.00001

    @staticmethod
    def _numeric_columns(df):
        """Return the names of the numeric (int, float, bool) columns of df."""
        return [col for col in df.columns
                if pd.api.types.is_numeric_dtype(df[col])]

    @staticmethod
    def _abs_corr(values, target):
        """Return |Pearson correlation| of values and target, or 0.0 if undefined."""
        # Constant or NaN-containing inputs make corrcoef return NaN (with a
        # RuntimeWarning); such a candidate must never win the comparison.
        with np.errstate(divide='ignore', invalid='ignore'):
            coef = np.corrcoef(values, target)[1][0]
        return abs(coef) if np.isfinite(coef) else 0.0

    # function 1
    def drop_missing_columns(self, df, threshold):
        """
        Objective: Drops columns which have a majority of rows missing

        Inputs:
        1. Dataframe df: Pandas dataframe
        2. threshold: Determines which columns will be dropped.
                      if threshold is .9, the columns with more than 90% missing
                      values will be dropped

        Outputs:
        1. Dataframe df with dropped columns (if no columns are dropped, returns the same dataframe)
        """
        self.df = df
        self.threshold = threshold

        # Share of missing values per column. For a frame without rows this
        # is NaN, which never exceeds the threshold, so nothing is dropped.
        missing_share = df.isna().mean()
        too_sparse = missing_share[missing_share > threshold].index.tolist()
        df.drop(columns=too_sparse, inplace=True)

        return df

    # function 2
    def drop_zero_variance_columns(self, df):
        """
        Objective: Drops numerical columns with zero variance

        Inputs:
        1. Dataframe df: Pandas dataframe

        Outputs:
        1. Dataframe df with dropped columns (if no columns are dropped, return the same dataframe)
        """
        self.df = df

        # A column has zero variance when it holds one distinct value across
        # at least two observations. Counting distinct values avoids the
        # floating point rounding that can make std() of a constant float
        # column come out as a tiny non-zero number.
        constant_cols = [col for col in self._numeric_columns(df)
                         if df[col].nunique() == 1 and df[col].count() > 1]
        df.drop(columns=constant_cols, inplace=True)

        return df

    # function 3
    def drop_zero_cardinality_columns(self, df):
        """
        Objective: Drops categorical columns with same levels, such as a column with all 'yes' values

        Inputs:
        1. Dataframe df: Pandas dataframe

        Outputs:
        1. Dataframe df with dropped columns (if no columns are dropped, return the same dataframe)
        """
        self.df = df

        categorical_cols = df.select_dtypes(include=['object']).columns
        # nunique() ignores missing values, so 'yes' plus NaN counts as one level.
        single_level = [col for col in categorical_cols if df[col].nunique() == 1]
        df.drop(columns=single_level, inplace=True)

        return df

    # function 4
    def drop_high_levels(self, df, threshold):
        """
        Objective: Drops categorical columns that have more levels than allowed

        Inputs:
        1. Dataframe df: Pandas dataframe
        2. threshold: How many levels you want at most

        Outputs:
        1. Dataframe df without the dropped columns (if no columns are dropped, return the same dataframe)
        """
        self.df = df
        self.threshold = threshold

        categorical_cols = df.select_dtypes(include=['object']).columns
        too_many_levels = [col for col in categorical_cols
                           if df[col].nunique() > threshold]
        df.drop(columns=too_many_levels, inplace=True)

        return df

    # function 5
    def replace_missing(self, df, num_val, cat_val='unknown'):
        """
        Objective: Replaces missing values with given values

        Inputs:
        1. Dataframe df: Pandas dataframe
        2. num_val: User decides with what values they want to replace the missing numerical values.
                    'mean', 'median' or 'mode' use that statistic of each column;
                    any other value fills with zero
        3. cat_val: User decides with what value they want to replace the missing categorical values.
                    Defaults to 'unknown'

        Outputs:
        1. Dataframe df with imputed missing values
        """
        self.df = df
        self.num_val = num_val
        self.cat_val = cat_val

        numeric_cols = self._numeric_columns(df)
        if numeric_cols:
            numeric = df[numeric_cols]
            if num_val == 'mode':
                # mode() returns a dataframe (one row per tied mode). Passing
                # it to fillna directly would align on the row index and only
                # fill the first rows, so take the first mode of each column.
                modes = numeric.mode()
                fill_values = modes.iloc[0] if len(modes) else None
            elif num_val == 'mean':
                fill_values = numeric.mean()
            elif num_val == 'median':
                fill_values = numeric.median()
            else:  # make 0
                fill_values = 0

            if fill_values is not None:
                df[numeric_cols] = numeric.fillna(value=fill_values)

        # write the placeholder label for categories
        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
        if categorical_cols:
            df[categorical_cols] = df[categorical_cols].fillna(value=cat_val)

        return df

    # function 6
    def transform(self, df, label_name):
        """
        Objective: Transforms numerical values in a way that it will increase model accuracy.
        For every numeric feature it tries asis, sqrt, log and power(2) of the
        standardised values, and keeps the version with the highest absolute
        correlation with the label. The label column itself is never transformed.

        Inputs:
        1. Dataframe df: Pandas dataframe
        2. label_name: Name of the (numeric) label column in df

        Outputs:
        1. Dataframe df with transformed values
        """
        self.df = df
        self.label_name = label_name

        target = df[label_name]

        for col in self._numeric_columns(df):
            if col == label_name:
                continue

            series = df[col]

            # Insertion order (asis, sqrt, log, pow2) is the tie-break order:
            # max() keeps the first candidate among equal scores.
            candidates = {'asis': series}

            # log and sqrt are only defined for non-negative data
            if (series >= 0).all():
                candidates['sqrt'] = np.sqrt(series)
                candidates['log'] = np.log(series + self._LOG_OFFSET)

            # The square is taken on standardised values so that it measures
            # distance from the mean; it needs a non-degenerate spread.
            if series.nunique() > 1:
                standardised = (series - series.mean()) / series.std()
                candidates['pow2'] = np.power(standardised, 2)

            scores = {name: self._abs_corr(values, target)
                      for name, values in candidates.items()}
            best = max(scores, key=scores.get)

            # Apply exactly the transformation that was scored.
            if best != 'asis':
                df[col] = candidates[best]

        return df

### Test Cases

In [105]:
# Small fixture in which each column exercises one cleaning step:
#   A, B - random numeric features that should survive every step
#   C    - 2 of 10 values missing (20%), above the 10% threshold used below
#   D    - constant numeric column (zero variance)
#   E    - categorical column with a single level
#   F    - categorical column with 5 levels, above the limit of 3 used below
#   G    - numeric column with one missing value; later used as the label
data = pd.DataFrame({"A" : np.random.randint(low=1, high=100, size=10),
                     "B"  : np.random.normal(0.0, 1.0, size=10),
                     "C" : [1,2,3,4,5,6,7,8, np.nan,np.nan],
                     "D" : [5] * 10,
                     "E" : ['Yes'] * 10,
                     "F" : ['no','no1','no2','no3','no4'] * 2,
                     "G" : [1,2,1,2,1,2,1,2, 1,np.nan]
                     })
data

Unnamed: 0,A,B,C,D,E,F,G
0,69,-0.997,1.0,5,Yes,no,1.0
1,14,-0.073836,2.0,5,Yes,no1,2.0
2,55,0.302794,3.0,5,Yes,no2,1.0
3,80,-0.09058,4.0,5,Yes,no3,2.0
4,43,0.717231,5.0,5,Yes,no4,1.0
5,90,2.77449,6.0,5,Yes,no,2.0
6,67,1.40507,7.0,5,Yes,no1,1.0
7,80,0.661566,8.0,5,Yes,no2,2.0
8,5,-0.377771,,5,Yes,no3,1.0
9,53,-2.481673,,5,Yes,no4,


In [106]:
# Create the helper object whose methods are called in the cells below
automl = Automl_tool()

In [107]:
# Drop columns where more than 10% of the values are missing
# (column C has 2 of 10 values missing, so it is removed)
automl.drop_missing_columns(data,0.1)

Unnamed: 0,A,B,D,E,F,G
0,69,-0.997,5,Yes,no,1.0
1,14,-0.073836,5,Yes,no1,2.0
2,55,0.302794,5,Yes,no2,1.0
3,80,-0.09058,5,Yes,no3,2.0
4,43,0.717231,5,Yes,no4,1.0
5,90,2.77449,5,Yes,no,2.0
6,67,1.40507,5,Yes,no1,1.0
7,80,0.661566,5,Yes,no2,2.0
8,5,-0.377771,5,Yes,no3,1.0
9,53,-2.481673,5,Yes,no4,


In [108]:
# Drop numerical columns with no variance (column D is 5 in every row)
automl.drop_zero_variance_columns(data)

Unnamed: 0,A,B,E,F,G
0,69,-0.997,Yes,no,1.0
1,14,-0.073836,Yes,no1,2.0
2,55,0.302794,Yes,no2,1.0
3,80,-0.09058,Yes,no3,2.0
4,43,0.717231,Yes,no4,1.0
5,90,2.77449,Yes,no,2.0
6,67,1.40507,Yes,no1,1.0
7,80,0.661566,Yes,no2,2.0
8,5,-0.377771,Yes,no3,1.0
9,53,-2.481673,Yes,no4,


In [109]:
# Drop categorical columns whose rows all share one label (column E is all 'Yes')
automl.drop_zero_cardinality_columns(data)


Unnamed: 0,A,B,F,G
0,69,-0.997,no,1.0
1,14,-0.073836,no1,2.0
2,55,0.302794,no2,1.0
3,80,-0.09058,no3,2.0
4,43,0.717231,no4,1.0
5,90,2.77449,no,2.0
6,67,1.40507,no1,1.0
7,80,0.661566,no2,2.0
8,5,-0.377771,no3,1.0
9,53,-2.481673,no4,


In [110]:
# Drop categorical columns with more than 3 distinct labels (column F has 5)
automl.drop_high_levels(data, 3)

Unnamed: 0,A,B,G
0,69,-0.997,1.0
1,14,-0.073836,2.0
2,55,0.302794,1.0
3,80,-0.09058,2.0
4,43,0.717231,1.0
5,90,2.77449,2.0
6,67,1.40507,1.0
7,80,0.661566,2.0
8,5,-0.377771,1.0
9,53,-2.481673,


In [112]:
# Impute missing values: numeric columns get their column median here
# (the one missing value in G becomes 1.0); categorical gaps become 'unknown'
automl.replace_missing(data, 'median')


Unnamed: 0,A,B,G
0,69,-0.997,1.0
1,14,-0.073836,2.0
2,55,0.302794,1.0
3,80,-0.09058,2.0
4,43,0.717231,1.0
5,90,2.77449,2.0
6,67,1.40507,1.0
7,80,0.661566,2.0
8,5,-0.377771,1.0
9,53,-2.481673,1.0


In [113]:
# Transform numerical columns: each is compared as-is, as sqrt, as log and squared,
# keeping the form with the highest absolute correlation with the label column 'G'
automl.transform(data, 'G')

Unnamed: 0,A,B,G
0,69,-0.997,1.0
1,14,-0.073836,2.0
2,55,0.302794,1.0
3,80,-0.09058,2.0
4,43,0.717231,1.0
5,90,2.77449,2.0
6,67,1.40507,1.0
7,80,0.661566,2.0
8,5,-0.377771,1.0
9,53,-2.481673,1.0


# =============================================================