In [1]:
import pandas as pd 
import numpy as np 
import sklearn
import matplotlib.pyplot as plt
import gc

In [2]:
#Shrink down as much as we can the size of the dataframes.
#Note that every numerical value lies within the range indexed with a float/int of 32 bits
#Moreover is wise to convert every object feature into a category one, especially if the number of unique values is far from the number of rows

import sys

def convert_types(df, print_info = False):
    
    original_memory = df.memory_usage().sum()
    
    # Iterate through each column
    for c in df:
        
        # Convert ids to integers
        if ('SK_ID' in c):
            df[c] = df[c].fillna(0).astype(np.int32)
            
        # Convert objects to category
        elif (df[c].dtype == 'object'):
            df[c] = df[c].astype('category')
        
        # Float64 to float32
        elif df[c].dtype == float:
            df[c] = df[c].astype(np.float32)
            
        # Int64 to int32
        elif df[c].dtype == int:
            df[c] = df[c].astype(np.int32)
        
    new_memory = df.memory_usage().sum()
    
    if print_info:
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')
        
    return df

In [3]:
#Load preprocessed train and convert types and sanity check
train=pd.read_csv('../../data/train.csv')

In [4]:
train=convert_types(train, print_info=True)

Original Memory Usage: 0.46 gb.
New Memory Usage: 0.23 gb.


In [5]:
#MOTIVATION: We are not loan's domain experts, thus ... (see notes on ipad)

In [6]:
from sklearn.preprocessing import LabelEncoder

def onehot_binenc(df, type = 'object'):
    """ dtype should be 'object' or 'category' depending on the dataframe being converted or not """
    
    le = sklearn.preprocessing.LabelEncoder()
    #counter for binary categorical features
    bcount = 0
    #find feaures w two categories and transform them either to 0 or 1
    for col in df.columns:
        if df[col].dtype == type and len(df[col].unique()) <= 2 :
            le.fit(df[col])
            df[col]=le.transform(df[col])
            bcount+=1
    
    #one hot encoding of the remaining k-categorical features, w/ k>2. If there's any
    if (bcount < df.shape[1]):
        df = pd.get_dummies(df)
    return df
        

In [7]:
#### 4. Merging ####
#define a function to left join two datasets by handling separately numerical features and categorical ones
def join_w_stats(id, df1, df2, df2_name):
    """ Merge two dataframes (df1 and df2) by grouping df2 on id and computing the following statistics:
            i) Mean, Min and Max and sum for numeric features
            ii) Mean for categorical features 
        In this way, indeed, we hope to preserve the essence of the information stored in each feature after groub by"""

    #drop from df2 the id column since it is not necessary and won't be used anymore
    df2 = df2.drop([col for col in df2.columns if col.startswith('SK_ID') and col != id], axis=1)
    newcolumns = []
    
    
    #compute statistics for numerical feats, if there's any
    numericaldf2 = df2.select_dtypes(include='number')
    count_numericalcols = len(numericaldf2.columns)
    if count_numericalcols > 1: #1 is the id
        
        numericaldf2[id] = df2[id]
        numstatsdf2 = numericaldf2.groupby(id).agg(['mean', 'max', 'min', 'sum']).reset_index()

        #create new columns names for each numerical feature_stat
        for col in numstatsdf2.columns.levels[0]: 
            if col != id:
                #loop through every subcolumn name
                for stat in numstatsdf2.columns.levels[1][:-1]:
                    newcolumns.append('%s_%s_%s' % (df2_name, col, stat))

   
    #compute mean for categorical feats, if there's any
    categorical = False
    if (len(df2.columns) - count_numericalcols) > 0:
        categoricaldf2 = df2.select_dtypes(include='category')
        categorical = True
        onehotdf2 = onehot_binenc(categoricaldf2, 'category')
        onehotdf2[id] = df2[id]
        onehotstatsdf2 = onehotdf2.groupby(id).agg(['mean']).reset_index()
    
        #create new columns names for each categorical feature_stat
        for col in onehotstatsdf2.columns.levels[0]: 
            if col != id:
                #for categoricals the only subcolumn is the mean
                newcolumns.append('%s_%s_mean' % (df2_name, col))

    # df2 no longer needed. Free memory
    gc.enable()
    del df2
    gc.collect()

    #merge both numerical and categorical (if there is any) statistics dsets grouped by id. And then with df1
    if categorical == True:
        numstatsdf2 = numstatsdf2.join(onehotstatsdf2.set_index(id), on=id)
        
    #add new columns names    
    numstatsdf2.columns = [id]+newcolumns 
    #left join on id df1 w/ merged statistics of df2
    df1joindf2 = df1.join(numstatsdf2.set_index(id), on=id)


    # df1 no longer needed. Free memory
    gc.enable()
    del df1
    gc.collect()

    # some cast might happen during merge and groupby, thus convert again
    convert_types(df1joindf2)

    return df1joindf2

In [8]:
previousApplication = pd.read_csv('../../data/previous_application.csv')
installmentsPayments = pd.read_csv('../../data/installments_payments.csv')

In [9]:
previousApplication = convert_types(previousApplication, print_info=True)

Original Memory Usage: 0.49 gb.
New Memory Usage: 0.17 gb.


In [10]:
installmentsPayments = convert_types(installmentsPayments, print_info=True)

Original Memory Usage: 0.87 gb.
New Memory Usage: 0.44 gb.


In [11]:
previousJOINinstallments = join_w_stats('SK_ID_PREV', previousApplication, installmentsPayments, 'installments')

In [12]:
cashBalance = pd.read_csv('../../data/POS_CASH_balance.csv')

In [13]:
cashBalance = convert_types(cashBalance, print_info=True)

Original Memory Usage: 0.64 gb.
New Memory Usage: 0.29 gb.


In [14]:
previousJOINcashBalanceJOINinstallments = join_w_stats('SK_ID_PREV', previousJOINinstallments, cashBalance, 'cash')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [15]:
creditCardBalance = pd.read_csv('../../data/credit_card_balance.csv')

In [16]:
creditCardBalance = convert_types(creditCardBalance, print_info=True)

Original Memory Usage: 0.71 gb.
New Memory Usage: 0.34 gb.


In [17]:
previousJOINcashBalanceJOINinstallmentsJOINcreditCardBalance = join_w_stats('SK_ID_PREV', previousJOINcashBalanceJOINinstallments, creditCardBalance, 'creditcard')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [18]:
#every time we are saving a csv, dtypes are lost by default. Define the following read and write function to preserve converted types in the first row to avoid another conversion after every loading.

import os
import json

def to_csv(df, path):
    
    dtypes = df.dtypes.apply(lambda x: x.name).to_dict()
    jtypes = json.dumps(dtypes)

    fileName = os.path.splitext(path)

    # save df as usual along with a json representation of the dictionary
    df.to_csv(path, index=False)

    f = open(fileName[0]+'Types',"w")
    f.write(jtypes)
    f.close()

    # free memory
    gc.enable()
    del df
    gc.collect()

def read_csv(path):
    
    fileName = os.path.splitext(path)
    
    jtypes = json.load(open(fileName[0]+'Types'))
    
    return pd.read_csv(path, dtype=jtypes)

In [19]:
trainJOINprev = join_w_stats('SK_ID_CURR', train, previousJOINcashBalanceJOINinstallmentsJOINcreditCardBalance, 'prev')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [20]:
# store joined previous loans dara since it will be merged to test as well
to_csv(previousJOINcashBalanceJOINinstallmentsJOINcreditCardBalance, '../../data/previousJoined.csv')

In [21]:
# remove all features with more than 60% of N.a.N
def remove_missing_columns(df, threshold = 60):
    # Calculate missing stats for df (remember to calculate a percent!)
    df_miss = pd.DataFrame(df.isnull().sum())
    df_miss['percent'] = 100 * df_miss[0] / len(df)
    
    
    # list of missing columns for df
    missing_df_columns = list(df_miss.index[df_miss['percent'] > threshold])
    
    # Print information
    print('There are %d columns with greater than %d%% missing values.' % (len(missing_df_columns), threshold))
    
    # Drop the missing columns and return
    df = df.drop(columns = missing_df_columns)
    
    return df

In [22]:
#drop all the new computed features that we consider no-influent from trainJOINprev
trainJOINprev = remove_missing_columns(trainJOINprev)

There are 267 columns with greater than 60% missing values.


In [23]:
bureauBalance = pd.read_csv('../../data/bureau_balance.csv')
bureau = pd.read_csv('../../data/bureau.csv')

In [24]:
bureauBalance = convert_types(bureauBalance, print_info=True)
bureau = convert_types(bureau, print_info=True)

Original Memory Usage: 0.66 gb.
New Memory Usage: 0.25 gb.
Original Memory Usage: 0.23 gb.
New Memory Usage: 0.1 gb.


In [25]:
bureauJOINbureauBalance = join_w_stats('SK_ID_BUREAU', bureau, bureauBalance, 'bureauBalance')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [26]:
trainJoined = join_w_stats('SK_ID_CURR', trainJOINprev, bureauJOINbureauBalance, 'bureau')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [27]:
trainJoined = remove_missing_columns(trainJoined)

There are 39 columns with greater than 60% missing values.


In [28]:
to_csv(bureauJOINbureauBalance, '../../data/bureauJoined.csv')

In [29]:
# store the final train
to_csv(trainJoined, '../../data/trainjoined.csv')

In [30]:
## apply the same logic to the test ##
test = pd.read_csv('../../data/test.csv')
test = convert_types(test, print_info=True)

Original Memory Usage: 0.07 gb.
New Memory Usage: 0.04 gb.


In [31]:
# previously stored, read infering the right types
prevJoined = read_csv('../../data/previousJoined.csv')

In [32]:
testJOINprev = join_w_stats('SK_ID_CURR', test, prevJoined, 'prev')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [33]:
testJOINprev = remove_missing_columns(testJOINprev)

There are 267 columns with greater than 60% missing values.


In [34]:
bureauJOINbureauBalance = read_csv('../../data/bureauJoined.csv')

In [35]:
testJoined = join_w_stats('SK_ID_CURR', testJOINprev, bureauJOINbureauBalance, 'bureau')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [36]:
testJoined = remove_missing_columns(testJoined)

There are 0 columns with greater than 60% missing values.


In [37]:
trainJoined = read_csv('../../data/trainjoined.csv')

In [38]:
## it seems like there were more sparse features in train than in test that were removed, this is reasonable due to the larger ids in train
print(len(trainJoined.columns))
print(len(testJoined.columns))

778
816


In [39]:
# need to align as we did in homedefault_traintest

target = trainJoined['TARGET']

#Align the training and testing data, keep only columns present in both dataframes
trainJoined, testJoined = trainJoined.align(testJoined, join = 'inner', axis = 1)

#Add the target back in
trainJoined['TARGET'] = target

trainJoined.shape

(307511, 778)

In [40]:
testJoined.shape

(48744, 777)

In [41]:
to_csv(trainJoined, '../../data/trainjoined.csv')

In [42]:
to_csv(testJoined, '../../data/testjoined.csv')

In [43]:
# Continue to the feat engineering phase