In [1]:
import pandas as pd 
import numpy as np 
import sklearn
import matplotlib.pyplot as plt
import gc
import utilities

In [2]:
#Load preprocessed train and convert types and sanity check
train=pd.read_csv('../../datatmp/data/train.csv')

In [3]:
train.shape

(307511, 162)

In [5]:
# Shrink down size by converting types
train=utilities.convert_types(train, print_info=True)

Original Memory Usage: 0.4 gb.
New Memory Usage: 0.2 gb.


In [6]:
# We are not loan's domain experts, therefore we don't speculate too much on which features are salient or not without looking at correlations. Note that it would also be extremly time consuming given the amount of variables we are dealing with. Indeed, we will collect as many features as possible by merging all datasets and, applying general features reduction techniques and feature importance, we will let our models learn on such data

In [2]:
from sklearn.preprocessing import LabelEncoder

def onehot_binenc(df, type = 'object'):
    """ dtype should be 'object' or 'category' depending on the dataframe being converted or not """
    
    le = sklearn.preprocessing.LabelEncoder()
    #counter for binary categorical features
    bcount = 0
    #find feaures w two categories and transform them either to 0 or 1
    for col in df.columns:
        if df[col].dtype == type and len(df[col].unique()) <= 2 :
            le.fit(df[col])
            df[col]=le.transform(df[col])
            bcount+=1
    
    #one hot encoding of the remaining k-categorical features, w/ k>2. If there's any
    if (bcount < df.shape[1]):
        df = pd.get_dummies(df)
    return df
        

In [3]:
## Merging ##
#define a function to left join two datasets by handling separately numerical features and categorical ones
def join_w_stats(id, df1, df2, df2_name):
    """ Merge two dataframes (df1 and df2) by grouping df2 on id and computing the following statistics:
            i) Mean, Min and Max and sum for numeric features
            ii) Mean for categorical features 
        In this way, indeed, we hope to preserve the essence of the information stored in each feature after groub by"""

    #drop from df2 the id column since it is not necessary and won't be used anymore
    df2 = df2.drop([col for col in df2.columns if col.startswith('SK_ID') and col != id], axis=1)
    newcolumns = []
    
    
    #compute statistics for numerical feats, if there's any
    numericaldf2 = df2.select_dtypes(include='number')
    count_numericalcols = len(numericaldf2.columns)
    if count_numericalcols > 1: #1 is the id
        
        numericaldf2[id] = df2[id]
        numstatsdf2 = numericaldf2.groupby(id).agg(['mean', 'max', 'min', 'sum']).reset_index()

        #create new columns names for each numerical feature_stat
        for col in numstatsdf2.columns.levels[0]: 
            if col != id:
                #loop through every subcolumn name
                for stat in numstatsdf2.columns.levels[1][:-1]:
                    newcolumns.append('%s_%s_%s' % (df2_name, col, stat))

   
    #compute mean for categorical feats, if there's any
    categorical = False
    if (len(df2.columns) - count_numericalcols) > 0:
        categoricaldf2 = df2.select_dtypes(include='category')
        categorical = True
        onehotdf2 = onehot_binenc(categoricaldf2, 'category')
        onehotdf2[id] = df2[id]
        onehotstatsdf2 = onehotdf2.groupby(id).agg(['mean']).reset_index()
    
        #create new columns names for each categorical feature_stat
        for col in onehotstatsdf2.columns.levels[0]: 
            if col != id:
                #for categoricals the only subcolumn is the mean
                newcolumns.append('%s_%s_mean' % (df2_name, col))

    # df2 no longer needed. Free memory
    gc.enable()
    del df2
    gc.collect()

    #merge both numerical and categorical (if there is any) statistics dsets grouped by id. And then with df1
    if categorical == True:
        numstatsdf2 = numstatsdf2.join(onehotstatsdf2.set_index(id), on=id)
        
    #add new columns names    
    numstatsdf2.columns = [id]+newcolumns 
    #left join on id df1 w/ merged statistics of df2
    df1joindf2 = df1.join(numstatsdf2.set_index(id), on=id)


    # df1 no longer needed. Free memory
    gc.enable()
    del df1
    gc.collect()

    # some cast might happen during merge and groupby, thus convert again
    utilities.convert_types(df1joindf2)

    return df1joindf2

In [9]:
previousApplication = pd.read_csv('../../data/previous_application.csv')
installmentsPayments = pd.read_csv('../../data/installments_payments.csv')

In [10]:
previousApplication = utilities.convert_types(previousApplication, print_info=True)

Original Memory Usage: 0.49 gb.
New Memory Usage: 0.17 gb.


In [11]:
# Note: the same anom outlier of DAYS_EMPLOYED occurs in previousapplication as well! handle anomalies in the same way
from sklearn.preprocessing import LabelEncoder
le = sklearn.preprocessing.LabelEncoder()

previousApplication['DAYS_FIRST_DRAWING_ANOM'] = previousApplication["DAYS_FIRST_DRAWING"] == 365243
previousApplication['DAYS_FIRST_DRAWING_ANOM']=le.fit_transform(previousApplication['DAYS_FIRST_DRAWING_ANOM'])
#Replace the anomalous values with median
previousApplication['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
previousApplication['DAYS_FIRST_DRAWING'].fillna(previousApplication['DAYS_FIRST_DRAWING'].median(), inplace=True)

previousApplication['DAYS_FIRST_DUE_ANOM'] = previousApplication["DAYS_FIRST_DUE"] == 365243
previousApplication['DAYS_FIRST_DUE_ANOM']=le.fit_transform(previousApplication['DAYS_FIRST_DUE_ANOM'])
previousApplication['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
previousApplication['DAYS_FIRST_DUE'].fillna(previousApplication['DAYS_FIRST_DUE'].median(), inplace=True)

previousApplication['DAYS_LAST_DUE_1ST_VERSION_ANOM'] = previousApplication["DAYS_LAST_DUE_1ST_VERSION"] == 365243
previousApplication['DAYS_LAST_DUE_1ST_VERSION_ANOM']=le.fit_transform(previousApplication['DAYS_LAST_DUE_1ST_VERSION_ANOM'])
previousApplication['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
previousApplication['DAYS_LAST_DUE_1ST_VERSION'].fillna(previousApplication['DAYS_LAST_DUE_1ST_VERSION'].median(), inplace=True)

previousApplication['DAYS_LAST_DUE_ANOM'] = previousApplication["DAYS_LAST_DUE"] == 365243
previousApplication['DAYS_LAST_DUE_ANOM']=le.fit_transform(previousApplication['DAYS_LAST_DUE_ANOM'])
previousApplication['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
previousApplication['DAYS_LAST_DUE'].fillna(previousApplication['DAYS_LAST_DUE'].median(), inplace=True)

previousApplication['DAYS_TERMINATION_ANOM'] = previousApplication["DAYS_TERMINATION"] == 365243
previousApplication['DAYS_TERMINATION_ANOM']=le.fit_transform(previousApplication['DAYS_TERMINATION_ANOM'])
previousApplication['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
previousApplication['DAYS_TERMINATION'].fillna(previousApplication['DAYS_TERMINATION'].median(), inplace=True)


# from https://www.kaggle.com/jsaguiar/lightgbm-with-simple-features
# Add feature: value ask / value received percentage
# Note: it does create inf values
previousApplication['APP_CREDIT_PERC'] = previousApplication['AMT_APPLICATION'] / previousApplication['AMT_CREDIT']
# Replace inf values with nan
previousApplication['APP_CREDIT_PERC'].replace([np.inf, -np.inf], np.nan, inplace=True)

In [15]:
installmentsPayments = utilities.convert_types(installmentsPayments, print_info=True)

Original Memory Usage: 0.44 gb.
New Memory Usage: 0.44 gb.


In [16]:
previousJOINinstallments = join_w_stats('SK_ID_PREV', previousApplication, installmentsPayments, 'installments')

In [17]:
cashBalance = pd.read_csv('../../data/POS_CASH_balance.csv')

In [18]:
cashBalance = utilities.convert_types(cashBalance, print_info=True)

Original Memory Usage: 0.64 gb.
New Memory Usage: 0.29 gb.


In [19]:
previousJOINcashBalanceJOINinstallments = join_w_stats('SK_ID_PREV', previousJOINinstallments, cashBalance, 'cash')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
creditCardBalance = pd.read_csv('../../data/credit_card_balance.csv')

In [21]:
creditCardBalance = utilities.convert_types(creditCardBalance, print_info=True)

Original Memory Usage: 0.71 gb.
New Memory Usage: 0.34 gb.


In [22]:
previousJOINcashBalanceJOINinstallmentsJOINcreditCardBalance = join_w_stats('SK_ID_PREV', previousJOINcashBalanceJOINinstallments, creditCardBalance, 'creditcard')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [23]:
trainJOINprev = join_w_stats('SK_ID_CURR', train, previousJOINcashBalanceJOINinstallmentsJOINcreditCardBalance, 'prev')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [24]:
# store joined previous loans dara since it will be merged to test as well
utilities.to_csv(previousJOINcashBalanceJOINinstallmentsJOINcreditCardBalance, '../../datatmp/data/previousJoined.csv')

In [25]:
# drop all the new computed features that we consider no-influent from trainJOINprev
trainJOINprev = utilities.remove_missing_columns(trainJOINprev)

There are 267 columns with greater than 70% missing values.


In [26]:
bureauBalance = pd.read_csv('../../data/bureau_balance.csv')
bureau = pd.read_csv('../../data/bureau.csv')

In [27]:
bureauBalance = utilities.convert_types(bureauBalance, print_info=True)
bureau = utilities.convert_types(bureau, print_info=True)

Original Memory Usage: 0.66 gb.
New Memory Usage: 0.25 gb.
Original Memory Usage: 0.23 gb.
New Memory Usage: 0.1 gb.


In [28]:
bureauJOINbureauBalance = join_w_stats('SK_ID_BUREAU', bureau, bureauBalance, 'bureauBalance')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [29]:
trainJoined = join_w_stats('SK_ID_CURR', trainJOINprev, bureauJOINbureauBalance, 'bureau')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [30]:
trainJoined = utilities.remove_missing_columns(trainJoined)

There are 39 columns with greater than 70% missing values.


In [31]:
utilities.to_csv(bureauJOINbureauBalance, '../../datatmp/data/bureauJoined.csv')

In [32]:
# store the final train
utilities.to_csv(trainJoined, '../../datatmp/data/trainjoined.csv')

In [4]:
## apply the same logic to the test ##
test = pd.read_csv('../../datatmp/data/test.csv')
test = utilities.convert_types(test, print_info=True)

Original Memory Usage: 0.06 gb.
New Memory Usage: 0.03 gb.


In [5]:
# previously stored, read and infer the right types
prevJoined = utilities.read_csv('../../datatmp/data/previousJoined.csv')

In [6]:
testJOINprev = join_w_stats('SK_ID_CURR', test, prevJoined, 'prev')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [7]:
testJOINprev = utilities.remove_missing_columns(testJOINprev)

There are 267 columns with greater than 70% missing values.


In [8]:
bureauJOINbureauBalance = utilities.read_csv('../../datatmp/data/bureauJoined.csv')

In [9]:
testJoined = join_w_stats('SK_ID_CURR', testJOINprev, bureauJOINbureauBalance, 'bureau')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
testJoined = utilities.remove_missing_columns(testJoined)

There are 0 columns with greater than 70% missing values.


In [11]:
trainJoined = utilities.read_csv('../../datatmp/data/trainjoined.csv')

In [12]:
## it seems like there were more sparse features in train than in test that were removed, this is reasonable due to the larger ids in train
print(len(trainJoined.columns))
print(len(testJoined.columns))

777
815


In [13]:
# need to align as we did in homedefault_traintest

target = trainJoined['TARGET']

#Align the training and testing data, keep only columns present in both dataframes
trainJoined, testJoined = trainJoined.align(testJoined, join = 'inner', axis = 1)

#Add the target back in
trainJoined['TARGET'] = target

trainJoined.shape

(307511, 777)

In [14]:
testJoined.shape

(48744, 776)

In [15]:
utilities.to_csv(trainJoined, '../../datatmp/data/trainjoined.csv')

In [16]:
utilities.to_csv(testJoined, '../../datatmp/data/testjoined.csv')

In [17]:
# Continue to the feat engineering phase