In [ ]:
import pandas as pd 
import numpy as np 
import sklearn
import matplotlib.pyplot as plt
import gc

In [ ]:
# Load the preprocessed training set (the path is relative to the notebook's working directory)
train=pd.read_csv('../../data/train.csv')

In [ ]:
#MOTIVATION: We are not loan's domain experts, thus ... (see notes on ipad)

In [ ]:
from sklearn.preprocessing import LabelEncoder

def onehot_binenc(df, type = 'object'):
    """Encode the categorical columns of ``df`` as numbers.

    Columns whose dtype equals ``type`` and that hold at most two distinct
    values (``unique()`` counts a missing value as a value) are label-encoded
    to integer codes. If at least one column was not label-encoded, the frame
    is then passed through ``pd.get_dummies`` to one-hot encode the remaining
    categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is left untouched: the encoding runs on a copy.
    type : str, default 'object'
        Dtype that marks a column as categorical; should be 'object' or
        'category' depending on whether the frame was already converted to
        categories. (The name shadows the builtin but is kept for callers.)

    Returns
    -------
    pandas.DataFrame
        The encoded frame.
    """
    # Work on a copy: assigning the encoded columns straight into the argument
    # would silently rewrite the caller's data (and triggers
    # SettingWithCopyWarning when the argument is a slice of another frame).
    df = df.copy()

    le = LabelEncoder()
    # counter for the binary categorical features that were label-encoded
    bcount = 0
    # find features with (at most) two categories and map them to integer codes
    for col in df.columns:
        if df[col].dtype == type and len(df[col].unique()) <= 2:
            df[col] = le.fit_transform(df[col])
            bcount += 1

    # one-hot encode the remaining k-categorical features (k > 2), if there are any
    if bcount < df.shape[1]:
        df = pd.get_dummies(df)
    return df
        

In [ ]:
#### 4. Merging ####
#define a function to left join two datasets by handling separately numerical features and categorical ones
def join_w_stats(id, df1, df2, df2_name):
    """Left-join per-``id`` summary statistics of ``df2`` onto ``df1``.

    ``df2`` is grouped on ``id`` and reduced to one row per key with:
        i) mean, max, min and sum of every numeric column;
        ii) mean of every column produced by encoding the 'category' columns
            with ``onehot_binenc``.
    Columns of any other dtype (e.g. object or bool) are not aggregated.
    The new columns are named ``<df2_name>_<column>_<stat>``.

    Parameters
    ----------
    id : str
        Name of the key column, present in both frames. (The name shadows the
        builtin but is kept for callers.)
    df1 : pandas.DataFrame
        Left frame; every row is kept.
    df2 : pandas.DataFrame
        Frame to aggregate. Every other column whose name starts with 'SK_ID'
        is dropped before aggregating.
    df2_name : str
        Prefix for the names of the generated columns.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df1`` plus the aggregated columns. When ``df2`` has
        nothing to aggregate, a copy of ``df1`` is returned.
    """
    # Any other SK_ID* column only identifies rows of df2; it is not a feature.
    other_ids = [col for col in df2.columns if col.startswith('SK_ID') and col != id]
    df2 = df2.drop(columns=other_ids)

    stats = []

    # statistics of the numerical features, if there are any besides the key
    numeric_cols = [col for col in df2.select_dtypes(include='number').columns if col != id]
    if numeric_cols:
        num_stats = df2.groupby(id)[numeric_cols].agg(['mean', 'max', 'min', 'sum'])
        # Build the flat names from the actual (column, stat) pairs, so a name
        # can never be attached to the wrong values.
        num_stats.columns = [f'{df2_name}_{col}_{stat}' for col, stat in num_stats.columns]
        stats.append(num_stats)

    # mean of the encoded categorical features, if there are any
    categoricaldf2 = df2.select_dtypes(include='category').drop(columns=[id], errors='ignore')
    if categoricaldf2.shape[1] > 0:
        onehotdf2 = onehot_binenc(categoricaldf2, 'category')
        # Group on the key values by position; the encoded frame keeps df2's row order.
        cat_stats = onehotdf2.groupby(df2[id].to_numpy()).mean()
        cat_stats.columns = [f'{df2_name}_{col}_mean' for col in cat_stats.columns]
        stats.append(cat_stats)
        del onehotdf2
    del categoricaldf2

    # nothing to aggregate: hand back df1's content unchanged
    if not stats:
        return df1.copy()

    # both blocks are indexed by the key values, so they line up on the index
    merged = pd.concat(stats, axis=1)
    # left join on id df1 w/ merged statistics of df2
    df1joindf2 = df1.join(merged, on=id)

    # release the intermediate frames before returning (the inputs can be large)
    del stats, merged
    gc.collect()

    return df1joindf2

In [ ]:
#Shrink the size of the dataframes as much as we can.
#Note that every numerical value lies within the range representable by a 32-bit float/int.
#Moreover, it is wise to convert every object feature into a category one, especially if the number of unique values is far below the number of rows.

import sys

def _int32_or_int64(values):
    """Return ``np.int32`` when every value of ``values`` fits in it, else ``np.int64``."""
    if len(values) and pd.api.types.is_numeric_dtype(values.dtype):
        limits = np.iinfo(np.int32)
        if values.min() < limits.min or values.max() > limits.max:
            return np.int64
    return np.int32


def convert_types(df, print_info = False):
    """Shrink the memory footprint of ``df`` by narrowing its column dtypes.

    Conversions, applied per column (first matching rule wins):
      * name contains 'SK_ID': missing values become 0, then cast to int32;
      * object/string column with fewer distinct values than rows: category;
      * column whose distinct values are exactly 1 then 0, in order of
        appearance: bool;
      * float64: float32;
      * int64: int32.
    An integer column that does not fit in 32 bits is kept as int64 rather
    than being wrapped around silently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.
    print_info : bool, default False
        When true, print the memory usage before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, converted.
    """
    original_memory = df.memory_usage().sum()

    # Iterate through each column
    for c in df:
        dtype = df[c].dtype

        # Ids to integers (a missing id becomes 0)
        if 'SK_ID' in c:
            ids = df[c].fillna(0)
            df[c] = ids.astype(_int32_or_int64(ids))

        # Text columns to category; pandas' dedicated string dtype counts as text too
        elif (dtype == 'object' or isinstance(dtype, pd.StringDtype)) and (df[c].nunique() < df.shape[0]):
            df[c] = df[c].astype('category')

        # 1/0 flags to booleans.
        # NOTE(review): unique() keeps order of appearance, so this only matches
        # columns whose first value is 1; 0-first flags stay numeric.
        elif list(df[c].unique()) == [1, 0]:
            df[c] = df[c].astype(bool)

        # Float64 to float32 (explicit width: the builtin `float` alias hides it)
        elif dtype == np.float64:
            df[c] = df[c].astype(np.float32)

        # Int64 to int32 (explicit width: the builtin `int` alias is
        # platform-dependent), unless the values would overflow
        elif dtype == np.int64:
            df[c] = df[c].astype(_int32_or_int64(df[c]))

    new_memory = df.memory_usage().sum()

    if print_info:
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')

    return df

In [ ]:
# Stage 1: previous applications + per-SK_ID_PREV statistics of their installment payments
previousApplication = pd.read_csv('../../data/previous_application.csv')
installmentsPayments = pd.read_csv('../../data/installments_payments.csv')

In [ ]:
# narrow the dtypes before joining to keep memory usage down
previousApplication = convert_types(previousApplication, print_info=True)

In [ ]:
installmentsPayments = convert_types(installmentsPayments, print_info=True)

In [ ]:
previousJOINcashBalance = join_w_stats('SK_ID_PREV', previousApplication, installmentsPayments, 'installments') # NOTE: misnomer - this holds previous applications joined with installments, it should be named previousJOINinstallments

In [ ]:
import gc
# Remove the source frames, now merged into previousJOINcashBalance, to free memory
gc.enable()
del previousApplication, installmentsPayments 
gc.collect()

In [ ]:
# Stage 2: add per-SK_ID_PREV statistics of the POS/cash balance records
cashBalance = pd.read_csv('../../data/POS_CASH_balance.csv')

In [ ]:
# narrow the dtypes before joining to keep memory usage down
cashBalance = convert_types(cashBalance, print_info=True)

In [ ]:
# new columns are prefixed with 'cash'
previousJOINcashBalanceJOINinstallments = join_w_stats('SK_ID_PREV', previousJOINcashBalance, cashBalance, 'cash')

In [ ]:
import gc
# Remove the frames that were just merged to free memory
gc.enable()
del previousJOINcashBalance, cashBalance
gc.collect()

In [ ]:
# Stage 3: add per-SK_ID_PREV statistics of the credit card balance records
creditCardBalance = pd.read_csv('../../data/credit_card_balance.csv')

In [ ]:
# quick look at the raw columns
creditCardBalance.head()

In [ ]:
# narrow the dtypes before joining to keep memory usage down
creditCardBalance = convert_types(creditCardBalance, print_info=True)

In [ ]:
# new columns are prefixed with 'creditcard'
previousJOINcashBalanceJOINinstallmentsJOINcreditCardBalance = join_w_stats('SK_ID_PREV', previousJOINcashBalanceJOINinstallments, creditCardBalance, 'creditcard')

In [ ]:
# Remove the frames that were just merged to free memory
gc.enable()
del previousJOINcashBalanceJOINinstallments, creditCardBalance
gc.collect()

In [ ]:
# sanity check on the size of the fully joined previous-application frame
previousJOINcashBalanceJOINinstallmentsJOINcreditCardBalance.shape

In [ ]:
# store the joined previous loans data since it will be merged to test as well
previousJOINcashBalanceJOINinstallmentsJOINcreditCardBalance.to_csv('../../data/previousJoined.csv', index = False)

In [ ]:
# Aggregate the joined previous-application data per SK_ID_CURR and attach it to train (columns prefixed with 'prev')
trainJOINprev = join_w_stats('SK_ID_CURR', train, previousJOINcashBalanceJOINinstallmentsJOINcreditCardBalance, 'prev')

In [ ]:
# Both inputs now live in trainJOINprev (the previous-loans frame was also saved to disk): free them
gc.enable()
del previousJOINcashBalanceJOINinstallmentsJOINcreditCardBalance, train
gc.collect()

In [ ]:
# remove all features with more than 60% of N.a.N
def remove_missing_columns(df, threshold = 60):
    """Return ``df`` without the columns that are mostly missing.

    A column is dropped when its percentage of null values is strictly
    greater than ``threshold``. The number of dropped columns is printed.
    The input frame itself is not modified.
    """
    # share of nulls per column, expressed as a percentage of the row count
    null_counts = df.isnull().sum()
    pct_missing = 100 * null_counts / len(df)

    # names of the columns above the threshold
    too_sparse = pct_missing[pct_missing > threshold].index.tolist()

    print('There are %d columns with greater than %d%% missing values.' % (len(too_sparse), threshold))

    return df.drop(columns = too_sparse)

In [ ]:
# Drop from trainJOINprev every column that is mostly missing (default threshold: more than 60% null), which we consider uninformative
trainJOINprev = remove_missing_columns(trainJOINprev)


In [ ]:
# Bureau stage: bureau records + per-SK_ID_BUREAU statistics of their balance records
bureauBalance = pd.read_csv('../../data/bureau_balance.csv')
bureau = pd.read_csv('../../data/bureau.csv')

In [ ]:
# narrow the dtypes before joining to keep memory usage down
bureauBalance = convert_types(bureauBalance, print_info=True)
bureau = convert_types(bureau, print_info=True)

In [ ]:
# new columns are prefixed with 'bureauBalance'
bureauJOINbureauBalance = join_w_stats('SK_ID_BUREAU', bureau, bureauBalance, 'bureauBalance')

In [ ]:
import gc

# Remove the source frames, now merged into bureauJOINbureauBalance, to free memory
gc.enable()
del bureau, bureauBalance
gc.collect()

In [ ]:
# Aggregate the joined bureau data per SK_ID_CURR and attach it to the train frame (columns prefixed with 'bureau')
trainJoined = join_w_stats('SK_ID_CURR', trainJOINprev, bureauJOINbureauBalance, 'bureau')

In [ ]:
# Store the joined bureau data since it is merged to test as well.
# index = False keeps the row index out of the file; otherwise it is read back
# as an extra 'Unnamed: 0' column and aggregated as if it were a feature.
bureauJOINbureauBalance.to_csv('../../data/bureauJoined.csv', index = False)

In [ ]:
# Both inputs are now merged into trainJoined: free them
gc.enable()
del trainJOINprev, bureauJOINbureauBalance
gc.collect()

In [ ]:
# drop the columns that are mostly missing after the bureau join (default threshold: more than 60% null)
trainJoined = remove_missing_columns(trainJoined)

In [ ]:
# store the final train
trainJoined.to_csv('../../data/trainjoined.csv', index = False)

In [ ]:
## apply the same logic to the test ##
test = pd.read_csv('../../data/test.csv')
test = convert_types(test, print_info=True)

In [ ]:
# reload the joined previous-application data stored earlier
prevJoined = pd.read_csv('../../data/previousJoined.csv')

In [ ]:
# CSV files do not store dtypes: pandas infers them again on every read, so what we converted to category comes back as object and the downcast numericals come back as 64-bit. Thus convert again.
prevJoined = convert_types(prevJoined, print_info=True)

In [ ]:
# same join as for train: per-SK_ID_CURR statistics, columns prefixed with 'prev'
testJOINprev = join_w_stats('SK_ID_CURR', test, prevJoined, 'prev')

In [ ]:
# drop the columns that are mostly missing (default threshold: more than 60% null)
testJOINprev = remove_missing_columns(testJOINprev)

In [ ]:
# As before, since we are reading from CSV we need to convert the types again
bureauJOINbureauBalance = pd.read_csv('../../data/bureauJoined.csv')
bureauJOINbureauBalance = convert_types(bureauJOINbureauBalance, print_info=True)

In [ ]:
# same join as for train: per-SK_ID_CURR statistics, columns prefixed with 'bureau'
testJoined = join_w_stats('SK_ID_CURR', testJOINprev, bureauJOINbureauBalance, 'bureau')

In [ ]:
# drop the columns that are mostly missing (default threshold: more than 60% null)
testJoined = remove_missing_columns(testJoined)

In [ ]:
# Reload the final train stored earlier (it was deleted from memory) and narrow its dtypes again, since CSV does not keep them
trainJoined = pd.read_csv('../../data/trainjoined.csv')
trainJoined = convert_types(trainJoined, print_info=True)

In [ ]:
## it seems like there were more sparse features in train than in test; this is reasonable given the larger number of ids in train
# print both shapes: a notebook cell only displays the value of its last expression
print(trainJoined.shape)
print(testJoined.shape)

In [ ]:
# need to align as we did in homedefault_traintest

# Set the label aside: the inner alignment below keeps only the columns present
# in both frames (presumably test has no TARGET column), so it is re-attached afterwards
target = trainJoined['TARGET']

#Align the training and testing data, keep only columns present in both dataframes
trainJoined, testJoined = trainJoined.align(testJoined, join = 'inner', axis = 1)

#Add the target back in
trainJoined['TARGET'] = target

# print both shapes: a notebook cell only displays the value of its last expression
print(trainJoined.shape)
print(testJoined.shape)

In [ ]:
trainJoined.to_csv('trainjoined.csv', index= False)

In [ ]:
gc.enable()
del trainJoined
gc.collect()

In [ ]:
testJoined.to_csv('testjoined.csv', index= False)

In [ ]:
# Continue to the feat engineering phase