In [15]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd
# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
# prepare data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
# models and metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
# standard library
import sys

In [3]:
def split_show_results(model, data, split_size=0.3):
    """Fit `model` on a split of `data`, print its AUC and plot its ROC curve.

    The train/test split is delegated to `my_split`, which is defined
    elsewhere in the notebook.

    Parameters
    --------
        model:
            estimator exposing `fit` and `predict_proba`
        data:
            forwarded unchanged to `my_split`
        split_size (float):
            forwarded to `my_split` as `split_size`

    Return
    --------
        None. The AUC is printed and the ROC curve is shown with matplotlib.
    """
    X_train, X_test, y_train, y_test = my_split(data=data, split_size=split_size)
    model.fit(X_train, y_train)
    # Column 1 of predict_proba is the score handed to roc_auc_score
    positive_scores = model.predict_proba(X_test)[:, 1]
    print('Auc Score: %f' % roc_auc_score(y_test, positive_scores))
    plot_roc_curve(model, X_test, y_test)
    # Keep a space between the model repr and the caption
    # (the title previously rendered as e.g. "LogisticRegression()Roc curve")
    plt.title(str(model) + ' Roc curve')
    plt.show()

In [5]:
def show_results(model, X_train, X_test, y_train, y_test, case='case'):
    """Fit `model`, then show a histogram of its scores and its ROC curve.

    Parameters
    --------
        model:
            estimator exposing `fit` and `predict_proba`
        X_train, y_train:
            data the model is fitted on
        X_test, y_test:
            data used for the histogram and the ROC curve
        case (string):
            label appended to both plot titles

    Return
    --------
        None. Two matplotlib figures are shown; no AUC is printed here.
    """
    model.fit(X_train, y_train)
    # Column 1 of predict_proba is what gets histogrammed
    failure_proba = model.predict_proba(X_test)[:, 1]

    # First figure: distribution of the predicted probabilities
    plt.title('Failure probabilities for ' + case)
    plt.xlabel('Probabilities')
    plt.ylabel('Count')
    plt.hist(failure_proba, bins=100)
    plt.show()

    # Second figure: ROC curve on the held-out data
    plot_roc_curve(model, X_test, y_test)
    plt.title('Roc curve of ' + case)
    plt.show()

In [6]:
def kde_plot(df, feature):
    """Overlay the KDE of `feature` for rows with TARGET == 0 and TARGET == 1.

    Parameters
    --------
        df (dataframe):
            must contain a 'TARGET' column and the `feature` column
        feature:
            column label whose distribution is plotted

    Return
    --------
        None. The curves are drawn on a new 10x8 matplotlib figure.
    """
    plt.figure(figsize=(10, 8))

    # One density curve per TARGET value, drawn in the order 0 then 1
    for target_value, curve_label in ((0, 'target == 0'), (1, 'target == 1')):
        subset = df.loc[df['TARGET'] == target_value, feature]
        sns.kdeplot(subset, label=curve_label)

    # Axis labels and title
    feature_name = str(feature)
    plt.xlabel(feature_name)
    plt.ylabel('Density')
    plt.title('Distribution of ' + feature_name)

In [7]:
def target_corrs(df):
    """Rank every non-target column by its absolute Pearson correlation with TARGET.

    Parameters
    --------
        df (dataframe):
            must contain a 'TARGET' column; every other column is correlated
            against it with `Series.corr(method='pearson')`

    Return
    --------
        dataframe with columns 'feature' and 'abs_corr', sorted by 'abs_corr'
        in descending order (the pre-sort integer index is kept).
    """
    # Absolute correlation of each feature with the target, in column order
    abs_corrs = np.abs([
        df['TARGET'].corr(df[name], method='pearson')
        for name in df.columns
        if name != 'TARGET'
    ])

    # Matching feature names: all columns minus the target
    feature_names = df.columns.tolist()
    feature_names.remove('TARGET')

    ranking = pd.DataFrame({'feature': feature_names, 'abs_corr': abs_corrs})
    return ranking.sort_values(by=['abs_corr'], ascending=False)

In [8]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode every object-dtype column of `df` with `pd.get_dummies`.

    Parameters
    --------
        df (dataframe):
            input frame; it is not modified
        nan_as_category (bool):
            forwarded to `get_dummies` as `dummy_na`

    Return
    --------
        (encoded, added): the encoded dataframe and the list of column
        names that were not present in the input.
    """
    columns_before = set(df.columns)
    object_columns = [name for name in df.columns if df[name].dtype == 'object']

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)

    # Anything that was not in the input was created by the encoding
    added = [name for name in encoded.columns if name not in columns_before]
    return encoded, added

In [9]:
def label_encoder(df):
    """Label-encode, in place, every object column with at most two unique values.

    A single `LabelEncoder` instance is re-fitted for each qualifying column.
    The number and names of the encoded columns are printed.

    Parameters
    --------
        df (dataframe):
            frame to encode; qualifying columns are overwritten

    Return
    --------
        the same dataframe object that was passed in
    """
    encoder = LabelEncoder()
    encoded_columns = []

    for name in df:
        column = df[name]
        # Only object columns with 2 or fewer unique values are encoded
        if column.dtype == 'object' and len(column.unique()) <= 2:
            encoder.fit(column)
            df[name] = encoder.transform(column)
            encoded_columns.append(name)

    print('%d columns were label encoded.' % len(encoded_columns), encoded_columns)
    return df

In [10]:
def return_size(df):
    """Return `sys.getsizeof(df)` expressed in gigabytes, rounded to 2 decimals."""
    size_in_bytes = sys.getsizeof(df)
    size_in_gb = size_in_bytes / 1e9
    return round(size_in_gb, 2)


def _fits_in_int32(series):
    """Return True when casting `series` to int32 cannot overflow.

    Empty and non-numeric series return True so the caller keeps its
    previous behaviour of attempting the cast.
    """
    if series.empty or not pd.api.types.is_numeric_dtype(series):
        return True
    limits = np.iinfo(np.int32)
    return bool(series.min() >= limits.min and series.max() <= limits.max)


def convert_types(df, print_info = False):
    """Shrink the memory footprint of `df` by converting column dtypes in place.

    Each column is matched against the first applicable rule:
      * name contains 'SK_ID' -> NaN filled with 0, cast to int32
        (int64 when the values do not fit in int32)
      * object dtype with fewer unique values than rows -> category
      * unique values are exactly {0, 1}, in any order -> bool
      * float64 -> float32
      * default int dtype -> int32, only when every value fits in int32

    Parameters
    --------
        df (dataframe):
            frame to convert; its columns are overwritten
        print_info (bool):
            when True, print memory usage before and after

    Return
    --------
        the same dataframe object that was passed in
    """
    original_memory = df.memory_usage().sum()

    # Iterate through each column
    for c in df:
        column = df[c]

        # Id columns become integers; fall back to int64 rather than
        # silently wrapping ids that exceed the int32 range
        if ('SK_ID' in c):
            ids = column.fillna(0)
            df[c] = ids.astype(np.int32 if _fits_in_int32(ids) else np.int64)

        # Convert objects to category
        elif (column.dtype == 'object') and (column.nunique() < df.shape[0]):
            df[c] = column.astype('category')

        # 0/1 flags become booleans. A set comparison is used so the result
        # does not depend on which of the two values appears first.
        elif set(column.unique()) == {0, 1}:
            df[c] = column.astype(bool)

        # Float64 to float32
        elif column.dtype == float:
            df[c] = column.astype(np.float32)

        # Int64 to int32, skipped when the values would overflow
        elif column.dtype == int:
            if _fits_in_int32(column):
                df[c] = column.astype(np.int32)

    new_memory = df.memory_usage().sum()

    if print_info:
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')

    return df

In [11]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.

    Parameters
    --------
        df (dataframe):
            the dataframe to calculate the statistics on; it is not modified
        group_var (string):
            the variable by which to group df
        df_name (string):
            the variable used to rename the columns

    Return
    --------
        agg (dataframe):
            a dataframe with the statistics aggregated for
            all numeric columns. Each instance of the grouping variable will have
            the statistics (count, mean, max, min, sum) calculated.
            Columns are named '<df_name>_<column>_<stat>', with `group_var`
            as the first column.

    """
    # Remove id variables other than grouping variable
    other_ids = [col for col in df.columns if col != group_var and 'SK_ID' in col]

    # Work on an explicit copy so adding the grouping column never writes
    # into (or warns about) a view of the caller's frame
    numeric_df = df.drop(columns=other_ids).select_dtypes('number').copy()
    numeric_df[group_var] = df[group_var]

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Name each column from its actual (variable, stat) pair. Rebuilding the
    # names from the MultiIndex levels is unsafe because level order is not
    # guaranteed to match column order, which would mislabel the statistics.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]

    return agg.reset_index()

In [12]:
# the perfect heatmap

def heatmap(x, y, size, color):
    """
    Draw a heatmap as a scatter plot of sized, coloured square markers.

    Built out of a tutorial seen on towardsdatascience.com; intended for
    observing feature correlations.

    Every (x, y) pair gets one square marker. Its size is `size * 500` and
    its colour is looked up in a 256-step diverging palette that spans the
    fixed value range [-1, 1]. A colour legend is drawn in the rightmost
    column of a 1x40 grid.

    Parameters
    --------
        x, y:
            label vectors; must support `.unique()` and `.map()`
            (presumably pandas Series -- confirm against callers)
        size:
            per-point scale, multiplied by 500 and passed to `scatter` as `s`
        color:
            per-point values; must support `.apply()`. Values are not
            clamped before indexing the palette, so they are expected to
            lie in [-1, 1].

    Return
    --------
        None; everything is drawn through the pyplot state machine.

    """
    
    # NOTE(review): this `ax` is rebound by plt.subplot(...) further down;
    # whether the axes created here stays visible may depend on the
    # matplotlib version -- confirm.
    fig, ax = plt.subplots(figsize=(5, 5))
    # Mapping from column names to integer coordinates
    x_labels = [v for v in x.unique()]
    y_labels = [v for v in y.unique()]
    x_to_num = {p[1]: p[0] for p in enumerate(x_labels)}
    y_to_num = {p[1]: p[0] for p in enumerate(y_labels)}
    # Multiplier applied to `size` to get the scatter marker size
    size_scale = 500
    # Use 256 colors for the diverging color palette
    n_colors = 256
    # Create the palette
    palette = sns.diverging_palette(20, 220, n=n_colors)
    # Range of values mapped to the palette, i.e. min and max possible correlation
    color_min, color_max = [-1, 1]

    def value_to_color(val):
        # Position of the value in the input range, relative to the range length
        val_position = float((val - color_min)) / (color_max - color_min)
        # Target index in the color palette (truncated, not rounded)
        ind = int(val_position * (n_colors - 1))
        return palette[ind]

    # Setup a 1x40 Grid
    plot_grid = plt.GridSpec(1, 40, hspace=0.2, wspace=0.1)
    # Use the leftmost 39 columns of the grid for the main plot
    ax = plt.subplot(plot_grid[:, :-1])

    ax.scatter(
        x=x.map(x_to_num),  # Use mapping for x
        y=y.map(y_to_num),  # Use mapping for y
        s=size * size_scale,  # Vector of square sizes
        c=color.apply(value_to_color),  # Vector of square color values
        marker='s'  # Use square as scatterplot marker
    )

    # Show column labels on the axes
    ax.set_xticks([x_to_num[v] for v in x_labels])
    ax.set_xticklabels(x_labels, rotation=45, horizontalalignment='right')
    ax.set_yticks([y_to_num[v] for v in y_labels])
    ax.set_yticklabels(y_labels)
    # Grid lines only on the minor ticks, which are placed half a step after
    # each major tick (i.e. between the squares)
    ax.grid(False, 'major')
    ax.grid(True, 'minor')
    ax.set_xticks([t + 0.5 for t in ax.get_xticks()], minor=True)
    ax.set_yticks([t + 0.5 for t in ax.get_yticks()], minor=True)

    # Pad the limits by half a step on each side of the integer coordinates
    ax.set_xlim([-0.5, max([v for v in x_to_num.values()]) + 0.5])
    ax.set_ylim([-0.5, max([v for v in y_to_num.values()]) + 0.5])

    # Add color legend on the right side of the plot
    ax = plt.subplot(plot_grid[:, -1])  # Use the rightmost column of the plot

    # Fixed x coordinate for the bars
    col_x = [0]*len(palette)
    # y coordinates for each of the n_colors bars
    bar_y = np.linspace(color_min, color_max, n_colors)

    # Height of one bar = spacing between consecutive y coordinates
    bar_height = bar_y[1] - bar_y[0]
    ax.barh(
        y=bar_y,
        width=[5]*len(palette),  # Make bars 5 units wide
        left=col_x,  # Make bars start at 0
        height=bar_height,
        color=palette,
        linewidth=0
    )
    # Bars are going from 0 to 5, so lets crop the plot somewhere in the middle
    ax.set_xlim(1, 2)
    # Hide grid
    ax.grid(False)
    # Make background white
    ax.set_facecolor('white')
    # Remove horizontal ticks
    ax.set_xticks([])
    # Show vertical ticks for min, middle and max
    ax.set_yticks(np.linspace(min(bar_y), max(bar_y), 3))
    # Show vertical ticks on the right
    ax.yaxis.tick_right()


In [13]:
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarise the missing values of `df`, one row per affected column.

    Also prints the number of columns in `df` and how many of them have
    missing values.

    Parameters
    --------
        df (dataframe):
            frame to inspect

    Return
    --------
        dataframe indexed by column name with the columns 'Missing Values'
        and '% of Total Values' (rounded to 1 decimal), restricted to
        columns with a non-zero percentage and sorted by percentage,
        descending.
    """
    # Count of nulls per column and the same figure as a share of the rows
    missing_count = df.isnull().sum()
    missing_percent = 100 * missing_count / len(df)

    # Side-by-side table with the final column names
    summary = pd.concat(
        [missing_count, missing_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only the columns that have missing values, worst first
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False).round(1)

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(summary.shape[0]) +
          " columns that have missing values.")

    return summary

In [14]:
def count_categorical(df, group_var, df_name):
    """Computes counts and normalized counts for each observation
    of `group_var` of each unique category in every categorical variable

    Parameters
    --------
    df : dataframe
        The dataframe to calculate the value counts for.

    group_var : string
        The variable by which to group the dataframe. For each unique
        value of this variable, the final dataframe will have one row

    df_name : string
        Variable added to the front of column names to keep track of columns


    Return
    --------
    categorical : dataframe
        A dataframe with counts and normalized counts of each unique category in every categorical variable
        with one row for every unique value of the `group_var` (which is the index).
        Columns are named '<df_name>_<dummy column>_count' and
        '<df_name>_<dummy column>_count_norm'.

    """

    # One-hot encode the object-dtype columns
    categorical = pd.get_dummies(df.select_dtypes('object'))

    # Make sure to put the identifying id on the column
    categorical[group_var] = df[group_var]

    # Groupby the group var and calculate the sum and mean
    categorical = categorical.groupby(group_var).agg(['sum', 'mean'])

    # The sum of a dummy column is a count, its mean a normalized count
    stat_labels = {'sum': 'count', 'mean': 'count_norm'}

    # Name each column from its actual (variable, stat) pair. Rebuilding the
    # names from the MultiIndex levels is unsafe because level order is not
    # guaranteed to match column order, which would mislabel the columns.
    categorical.columns = [
        '%s_%s_%s' % (df_name, var, stat_labels[stat])
        for var, stat in categorical.columns
    ]

    return categorical