## AutoML Tool - Helper Functions

# ==========================================================

This notebook automates the routine, repetitive tasks of the model-building process: data cleaning, feature engineering, and feature selection. It assumes that some data exploration and data understanding have already been done. The automated tasks are as follows:

1. Drop columns with high % of missing values (threshold can be edited)
2. Drop numerical columns with no variance (i.e., all observations are the same)
3. Drop categorical columns with all same labels
4. Drop categorical columns with too many labels (threshold can be edited)
5. Impute Missing Values for numerical and categorical data
6. Encode Target Column to numerical
7. Categorical variables: convert strings to numbers
8. Transform numerical values (as-is, square root, log, or square) so that they correlate more strongly with the target, which may improve model accuracy
9. Remove highly correlated features (similar features provide no additional value)


# ==========================================================

In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

class Automl_tool:
    """
    Objective: Collection of helpers that automate routine data-cleaning,
               feature-engineering and feature-selection steps on a pandas
               dataframe.

    Notes:
    1. The drop_*, replace_missing, encode_target and transform methods modify
       the dataframe they receive in place and also return it.
    2. create_dummies and inspect_multicollinearity return a new dataframe.
    3. Several methods store their arguments on the instance (self.df,
       self.threshold, ...). Nothing in this class reads them back; they are
       kept so that existing callers that inspect them keep working.
    4. Categorical columns are the columns with dtype 'object'.
    """

    # Offset added before taking a logarithm so that zeros do not produce -inf.
    _LOG_OFFSET = 0.00001

    def drop_missing_columns(self, df, threshold):
        """
        Objective: Drops columns most of whose rows are missing

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)
        2. threshold: Determines which columns will be dropped.
                      If threshold is .9, the columns with more than 90% missing
                      values will be dropped

        Outputs:
        1. Dataframe df with dropped columns (if no columns are dropped, the same
           dataframe is returned)
        """
        self.df = df
        self.threshold = threshold

        # Share of missing values per column; an empty frame yields NaN, which
        # never exceeds the threshold, so nothing is dropped in that case.
        missing_share = df.isnull().mean()
        to_drop = missing_share[missing_share > threshold].index.tolist()
        if to_drop:
            df.drop(columns=to_drop, inplace=True)

        return df

    def drop_zero_variance_columns(self, df):
        """
        Objective: Drops numerical columns with zero variance

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)

        Outputs:
        1. Dataframe df with dropped columns (if no columns are dropped, the same
           dataframe is returned)
        """
        self.df = df

        numeric_cols = df.select_dtypes(include='number').columns
        # Counting distinct values instead of testing std() == 0 avoids
        # floating-point rounding (the std of a constant float column is not
        # always exactly 0). Columns that are entirely missing are kept.
        constant_cols = [col for col in numeric_cols
                         if df[col].nunique(dropna=True) == 1]
        if constant_cols:
            df.drop(columns=constant_cols, inplace=True)

        return df

    def drop_zero_cardinality_columns(self, df):
        """
        Objective: Drops categorical columns with a single level, such as a
                   column with all 'yes' values

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)

        Outputs:
        1. Dataframe df with dropped columns (if no columns are dropped, the same
           dataframe is returned)
        """
        self.df = df

        categorical_cols = df.select_dtypes(include=['object']).columns
        # nunique ignores missing values, so labels mixed with NaN are never
        # compared against each other.
        single_level_cols = [col for col in categorical_cols
                             if df[col].nunique(dropna=True) == 1]
        if single_level_cols:
            df.drop(columns=single_level_cols, inplace=True)

        return df

    def drop_high_levels(self, df, threshold):
        """
        Objective: Drops categorical columns that have too many levels

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)
        2. threshold: How many levels a column may have at most

        Outputs:
        1. Dataframe df without the columns that have more than threshold levels
        """
        self.df = df
        self.threshold = threshold

        categorical_cols = df.select_dtypes(include=['object']).columns
        too_many_levels = [col for col in categorical_cols
                           if df[col].nunique(dropna=True) > threshold]
        if too_many_levels:
            df.drop(columns=too_many_levels, inplace=True)

        return df

    def replace_missing(self, df, num_val):
        """
        Objective: Replaces missing values with given values

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)
        2. num_val: Decides what replaces the missing numerical values.
                    'mean', 'median' and 'mode' use that statistic of each
                    column; any other value replaces missing values with 0

        Missing values in categorical columns are always replaced with 'unknown'.

        Outputs:
        1. Dataframe df with imputed missing values
        """
        self.df = df
        self.num_val = num_val

        numeric_cols = df.select_dtypes(include='number').columns.tolist()
        if numeric_cols:
            numeric = df[numeric_cols]
            if num_val == 'mode':
                # mode() returns a dataframe (one row per tied mode); its first
                # row holds one mode per column. Passing the whole dataframe to
                # fillna would align on the row index and fill only row 0.
                modes = numeric.mode()
                fill_value = modes.iloc[0] if len(modes) else None
            elif num_val == 'mean':
                fill_value = numeric.mean()
            elif num_val == 'median':
                fill_value = numeric.median()
            else:
                fill_value = 0

            if fill_value is not None:
                df[numeric_cols] = numeric.fillna(fill_value)

        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
        if categorical_cols:
            df[categorical_cols] = df[categorical_cols].fillna('unknown')

        return df

    def encode_target(self, df, target_name):
        """
        Objective: Encodes the class label if the class column is categorical.
                   If the class column is not of dtype 'object' the dataframe is
                   returned without any change.
                   The class label may have more than 2 levels. Levels are
                   numbered 0, 1, 2, ... from the most to the least frequent.

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)
        2. target_name: Name of the class column

        Outputs:
        1. Dataframe df whose class column holds the level numbers as floats
           (missing labels stay missing)
        """
        if df[target_name].dtype == 'object':
            levels = df[target_name].value_counts().index.tolist()
            codes = {level: code for code, level in enumerate(levels)}
            # Map all levels in one pass: replacing them one after another lets
            # an already written code collide with a label that is still to be
            # replaced (e.g. the labels 1 and 0 stored as objects).
            df[target_name] = df[target_name].map(codes).astype(float)

        return df

    @staticmethod
    def _abs_correlation(values, label):
        """Return |Pearson correlation| of values and label, or 0.0 if undefined."""
        with np.errstate(invalid='ignore', divide='ignore'):
            score = abs(np.corrcoef(values, label)[0, 1])
        return 0.0 if np.isnan(score) else float(score)

    def transform(self, df, label_name):
        """
        Objective: Transforms numerical values in a way that increases their
                   absolute correlation with the label.
                   For every numerical column except the label itself the
                   candidates asis, sqrt, log and pow2 are scored, and the column
                   is replaced by the best scoring candidate.
                   sqrt and log are only tried when all values are non-negative.
                   pow2 is the square of the standardised column,
                   ((x - mean) / std) ** 2, and that same value is what is
                   written back when pow2 wins.

        Inputs:
        1. Dataframe df: Pandas dataframe (modified in place)
        2. label_name: Name of the label column; it must be numerical

        Outputs:
        1. Dataframe df with transformed values
        """
        self.df = df
        self.label_name = label_name

        label = df[label_name]
        numeric_cols = [col for col in df.select_dtypes(include='number').columns
                        if col != label_name]

        for col in numeric_cols:
            values = df[col]

            # Insertion order matters: on a tie max() keeps the first candidate,
            # so the untransformed column wins when nothing is strictly better.
            candidates = {'asis': values}
            if (values >= 0).all():
                candidates['sqrt'] = np.sqrt(values)
                candidates['log'] = np.log(values + self._LOG_OFFSET)
            candidates['pow2'] = np.power((values - values.mean()) / values.std(), 2)

            scores = {name: self._abs_correlation(candidate, label)
                      for name, candidate in candidates.items()}
            best = max(scores, key=scores.get)

            if best != 'asis':
                df[col] = candidates[best]

        return df

    def create_dummies(self, df, label_name):
        """
        Objective: Creates dummy variables for categorical variables - not for
                   the class column. The first level of every variable is
                   dropped. Dummy columns are named '<variable>_<level>' so that
                   equal levels of different variables do not end up as
                   duplicate column names.

        Inputs:
        1. Dataframe df: Pandas dataframe (left unchanged)
        2. label_name: Name of the class column, which is never converted

        Outputs:
        1. New dataframe in which every categorical column is replaced by its
           dummy variables (appended at the end); if there is nothing to
           convert, the same dataframe is returned
        """
        self.df = df
        self.label_name = label_name

        categorical_cols = [col for col in df.select_dtypes(include=['object']).columns
                            if col != label_name]
        if not categorical_cols:
            return df

        # An explicit integer dtype keeps the result numerical regardless of the
        # default dummy dtype of the installed pandas version.
        dummies = [pd.get_dummies(df[col], prefix=col, drop_first=True, dtype=np.uint8)
                   for col in categorical_cols]

        return pd.concat([df.drop(columns=categorical_cols)] + dummies, axis=1)

    def inspect_multicollinearity(self, df, target_name, threshold):
        """
        Objective: Removes independent variables that are multicollinear,
                   judged by their variance inflation factor (VIF).
                   The variable with the highest VIF is removed and the scores
                   are computed again, until no score exceeds the threshold or
                   only one variable is left. Removing one variable at a time
                   keeps one member of a group of correlated variables.

        Inputs:
        1. Dataframe df: Pandas dataframe (left unchanged); all columns except
           the target must be numerical without missing values
        2. target_name: Name of the class column, which is never removed
        3. threshold: Highest VIF score a variable may have

        Outputs:
        1. New dataframe without the removed variables
        """
        # A list keeps the column order, which makes the result deterministic.
        features = [col for col in df.columns if col != target_name]
        dropped = []

        while len(features) > 1:
            exog = df[features].to_numpy(dtype=float)
            scores = np.array(
                [variance_inflation_factor(exog, ix) for ix in range(exog.shape[1])],
                dtype=float,
            )
            if np.isnan(scores).all():
                break

            worst = int(np.nanargmax(scores))
            if not scores[worst] > threshold:
                break
            dropped.append(features.pop(worst))

        return df.drop(columns=dropped)
 

# ==========================================================