In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, recall_score,\
accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np

## Gather and merge data

In [2]:
# Load the installments-payments CSV (path is relative to the working directory).
inst_pay = pd.read_csv('installments_payments.csv')

In [3]:
inst_pay.shape

(13605401, 8)

## 13,605,401 rows in installments_payments

In [4]:
# Load the application training CSV; it carries the TARGET label per SK_ID_CURR.
applic_df = pd.read_csv('application_train.csv')

In [5]:
applic_df.head(2)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Keep only the join key and the label from the application table.
scores = applic_df.loc[:, ['SK_ID_CURR', 'TARGET']]

In [7]:
# Inner join: attach TARGET to every installment row whose SK_ID_CURR has a label.
inst_scores = inst_pay.merge(scores, how='inner', on='SK_ID_CURR')

In [8]:
inst_scores.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,TARGET
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36,0
1,2452854,161674,1.0,21,-546.0,-552.0,11302.605,11302.605,0
2,1054186,161674,1.0,2,-1300.0,-1307.0,6948.36,6948.36,0
3,1682318,161674,1.0,2,-240.0,-243.0,7374.51,7374.51,0
4,2452854,161674,1.0,10,-876.0,-882.0,11302.605,11302.605,0


In [9]:
inst_scores.shape

(11591592, 9)

#### The inner merge dropped about 2 million rows (13,605,401 → 11,591,592). These are presumably installments for `SK_ID_CURR` values that belong to the test dataset, which does not include the target variable.

In [10]:
# Installment rows belonging to applications with TARGET == 1.
inst_scores.loc[inst_scores['TARGET'].eq(1)]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,TARGET
1380,1995642,134723,1.0,38,-1277.0,-1282.0,12949.200,12949.200,1
1381,2835050,134723,1.0,4,-215.0,-241.0,16109.415,16109.415,1
1382,2835050,134723,1.0,3,-245.0,-261.0,16109.415,16109.415,1
1383,1969812,134723,0.0,33,-1667.0,-1676.0,7875.000,7875.000,1
1384,1281882,134723,1.0,10,-2398.0,-2417.0,11584.620,11584.620,1
...,...,...,...,...,...,...,...,...,...
11591481,1266186,400041,1.0,1,-295.0,-304.0,5567.940,5567.940,1
11591482,1266186,400041,1.0,2,-265.0,-274.0,5567.940,5567.940,1
11591483,1266186,400041,1.0,3,-235.0,-244.0,5567.940,5567.940,1
11591541,2356737,426345,2.0,1,-718.0,-725.0,89077.320,89077.320,1


In [11]:
inst_scores.isna().sum()

SK_ID_PREV                   0
SK_ID_CURR                   0
NUM_INSTALMENT_VERSION       0
NUM_INSTALMENT_NUMBER        0
DAYS_INSTALMENT              0
DAYS_ENTRY_PAYMENT        2583
AMT_INSTALMENT               0
AMT_PAYMENT               2583
TARGET                       0
dtype: int64

In [12]:
# DAYS_ENTRY_PAYMENT and AMT_PAYMENT are the only columns with nulls.
# A null AMT_PAYMENT probably indicates non-payment, so it is filled with 0.
# DAYS_ENTRY_PAYMENT counts backwards in time (negative offsets), so it is
# filled with 0 as well.
# NOTE(review): with a fill value of 0, days_late for these rows becomes
# -DAYS_INSTALMENT (thousands of days) -- confirm this is the intended encoding.
#
# A single non-inplace fillna is used instead of
# `inst_scores[col].fillna(0, inplace=True)`: that form is chained assignment
# on a temporary Series, which warns in pandas 2.x and has no effect on the
# DataFrame once Copy-on-Write is enabled.
inst_scores = inst_scores.fillna({'DAYS_ENTRY_PAYMENT': 0, 'AMT_PAYMENT': 0})

In [13]:
inst_scores.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,TARGET
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36,0
1,2452854,161674,1.0,21,-546.0,-552.0,11302.605,11302.605,0
2,1054186,161674,1.0,2,-1300.0,-1307.0,6948.36,6948.36,0
3,1682318,161674,1.0,2,-240.0,-243.0,7374.51,7374.51,0
4,2452854,161674,1.0,10,-876.0,-882.0,11302.605,11302.605,0


In [14]:
# New column: number of days late for each installment.
# DAYS_INSTALMENT (due day) and DAYS_ENTRY_PAYMENT (actual payment day) both
# appear to count backwards in time (negative values), so a payment is late
# when DAYS_ENTRY_PAYMENT is greater than DAYS_INSTALMENT, i.e. when the
# difference below is positive.
inst_scores['days_late'] = inst_scores['DAYS_ENTRY_PAYMENT'].sub(
    inst_scores['DAYS_INSTALMENT']
)

In [15]:
inst_scores.days_late

0             -7.0
1             -6.0
2             -7.0
3             -3.0
4             -6.0
             ...  
11591587    2788.0
11591588    2087.0
11591589    2757.0
11591590    2727.0
11591591    2057.0
Name: days_late, Length: 11591592, dtype: float64

In [16]:
# Boolean flag: True when the payment arrived after the due day (days_late > 0).
inst_scores['late_payment'] = inst_scores['days_late'].gt(0)

In [17]:
inst_scores.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,TARGET,days_late,late_payment
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36,0,-7.0,False
1,2452854,161674,1.0,21,-546.0,-552.0,11302.605,11302.605,0,-6.0,False
2,1054186,161674,1.0,2,-1300.0,-1307.0,6948.36,6948.36,0,-7.0,False
3,1682318,161674,1.0,2,-240.0,-243.0,7374.51,7374.51,0,-3.0,False
4,2452854,161674,1.0,10,-876.0,-882.0,11302.605,11302.605,0,-6.0,False


In [18]:
# Rows that were NOT paid late (late_payment is a boolean column, so negate it).
inst_scores.loc[~inst_scores['late_payment']]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,TARGET,days_late,late_payment
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.360,6948.360,0,-7.0,False
1,2452854,161674,1.0,21,-546.0,-552.0,11302.605,11302.605,0,-6.0,False
2,1054186,161674,1.0,2,-1300.0,-1307.0,6948.360,6948.360,0,-7.0,False
3,1682318,161674,1.0,2,-240.0,-243.0,7374.510,7374.510,0,-3.0,False
4,2452854,161674,1.0,10,-876.0,-882.0,11302.605,11302.605,0,-6.0,False
...,...,...,...,...,...,...,...,...,...,...,...
11591553,1756785,440887,1.0,1,-280.0,-283.0,11534.310,11534.310,0,-3.0,False
11591554,1739722,420608,1.0,1,-11.0,-19.0,15447.015,15447.015,0,-8.0,False
11591555,2631378,427056,1.0,5,-2908.0,-2912.0,6987.690,6987.690,0,-4.0,False
11591556,2631378,427056,1.0,6,-2878.0,-2884.0,6977.925,6977.925,0,-6.0,False


In [19]:
# Rows that were paid late (the boolean column is used directly as the mask).
inst_scores.loc[inst_scores['late_payment']]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,TARGET,days_late,late_payment
105,1989818,151639,1.0,9,-523.0,-522.0,26197.47,26197.470,0,1.0,True
119,2240513,151639,1.0,3,-2556.0,-2555.0,6789.96,6789.960,0,1.0,True
126,1989818,151639,1.0,13,-403.0,-401.0,26197.47,26067.465,0,2.0,True
144,1989818,151639,1.0,24,-73.0,-72.0,26197.47,26197.470,0,1.0,True
163,1474382,151639,1.0,2,-1158.0,-1156.0,12653.46,10263.825,0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...
11591587,1847946,405063,0.0,16,-2788.0,0.0,0.09,0.000,0,2788.0,True
11591588,1847946,405063,0.0,39,-2087.0,0.0,0.09,0.000,0,2087.0,True
11591589,1847946,405063,0.0,17,-2757.0,0.0,0.09,0.000,0,2757.0,True
11591590,1847946,405063,0.0,18,-2727.0,0.0,0.09,0.000,0,2727.0,True


### Train test split

In [20]:
def split_data(df, random_seed=4233, target='TARGET', train_size=0.8):
    '''
    Split a DataFrame into train and test sets, stratified on a label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; must contain the column named by `target`.
    random_seed : int, default 4233
        Seed passed to train_test_split so the split is reproducible.
    target : str, default 'TARGET'
        Name of the column used for stratification.
    train_size : float, default 0.8
        Fraction of rows placed in the train set.

    Returns
    -------
    train, test : pandas.DataFrame
        The two splits (no separate validation set is produced).

    Raises
    ------
    KeyError
        If `target` is not a column of `df`.

    NOTE(review): rows are split independently of each other. If several rows
    share an entity id (e.g. one customer with many installments), the same
    entity can end up in both train and test -- consider a group-aware split.
    '''
    if target not in df.columns:
        raise KeyError(f"stratify column {target!r} not found in DataFrame")

    # stratify keeps the class proportions of `target` equal in both splits
    train, test = train_test_split(
        df,
        train_size=train_size,
        random_state=random_seed,
        stratify=df[target],
    )
    return train, test

In [21]:
# Stratified 80/20 row-level split of the merged installments table (default seed).
train, test = split_data(inst_scores)

In [22]:
train.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,TARGET,days_late,late_payment
76712,2445573,164657,2.0,4,-1123.0,-1162.0,94727.16,94727.16,0,-39.0,False
2978261,2295593,167615,1.0,3,-43.0,-62.0,10475.19,10475.19,0,-19.0,False
9775084,1055179,321365,1.0,2,-1584.0,-1606.0,6087.15,6087.15,0,-22.0,False
3248441,1171084,243947,0.0,13,-30.0,-43.0,262.26,180.9,0,-13.0,False
1628721,1113004,126511,0.0,10,-139.0,-139.0,280.17,280.17,1,0.0,False


In [23]:
test.head(1)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,TARGET,days_late,late_payment
2169134,1688860,170744,1.0,10,-1204.0,-1228.0,13041.945,13041.945,0,-24.0,False


In [24]:
# Columns that must not be used as features:
#   - TARGET is the label itself.
#   - SK_ID_PREV / SK_ID_CURR are identifiers. TARGET is constant per SK_ID_CURR
#     and the split above is row-level, so the same id appears in both train and
#     test; leaving the ids in lets a model memorise id -> label (leakage).
non_feature_cols = ['TARGET', 'SK_ID_PREV', 'SK_ID_CURR']

X_train = train.drop(columns=non_feature_cols)
y_train = train['TARGET']
X_test = test.drop(columns=non_feature_cols)
y_test = test['TARGET']

## Create models

In [25]:
# create the classifier object; a fixed random_state makes the fitted tree
# reproducible across kernel restarts (same seed as the split_data default)
dt = DecisionTreeClassifier(random_state=4233)
# fit the thing
dt.fit(X_train, y_train)
# use the thing -- these are in-sample predictions on the training rows
preds = dt.predict(X_train)

In [29]:
# first in-sample prediction (`preds.[0]` is a syntax error; index directly)
preds[0]

0