##The objective of this challenge is to create a machine learning model to predict which individuals are most likely to default on their loans, based on their loan repayment behaviour and ecommerce transaction activity.

##The original problem can be found on Zindi via this link: https://zindi.africa/competitions/sbtic-xente-credit-scoring-challenge

## Feature Engineering section adapted from [Lyle Okoth](https://github.com/LyleOkoth)

##Import Libraries

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Accessing training and test files

### Uploading files from computer

In [0]:
from google.colab import files
files.upload()

Saving Test.csv to Test.csv
Saving Train.csv to Train.csv
Saving unlinked_masked_final.csv to unlinked_masked_final.csv
Saving VariableDefinitions.csv to VariableDefinitions.csv


{'Test.csv': b'"CustomerId","TransactionStartTime","Value","Amount","TransactionId","BatchId","SubscriptionId","CurrencyCode","CountryCode","ProviderId","ProductId","ProductCategory","ChannelId","TransactionStatus","IssuedDateLoan","LoanId","InvestorId","LoanApplicationId","ThirdPartyId"\n"CustomerId_310",2019-03-31 13:33:05,14000,-14000,"TransactionId_925","BatchId_1144","SubscriptionId_7","UGX","256","ProviderId_1","ProductId_7","airtime","ChannelId_1","1",2019-03-31 13:33:04,"LoanId_1027","InvestorId_1","LoanApplicationId_825","ThirdPartyId_1175"\n"CustomerId_243",2019-03-31 15:04:09,1000,-1000,"TransactionId_1080","BatchId_1214","SubscriptionId_7","UGX","256","ProviderId_1","ProductId_8","data_bundles","ChannelId_1","1",2019-03-31 15:04:08,"LoanId_768","InvestorId_1","LoanApplicationId_68","ThirdPartyId_604"\n"CustomerId_142",2019-03-31 17:31:11,2500,-2500,"TransactionId_2315","BatchId_2150","SubscriptionId_7","UGX","256","ProviderId_1","ProductId_7","airtime","ChannelId_1","1",201

### Mounting google drive so you can pick files from your Google Drive

In [0]:
from google.colab import drive
drive.mount('/content/drive')

## Data Exploration

In [0]:
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

In [0]:
len(train_data.columns)

27

In [0]:
len(test_data.columns)

19

In [0]:
test_data_columns = ['TransactionStartTime', 'Value', 'Amount', 'TransactionId', 'BatchId',
                    'SubscriptionId', 'CurrencyCode', 'CountryCode', 'ProviderId', 
                     'ProductId', 'ProductCategory', 'ChannelId', 'TransactionStatus', 
                    'IssuedDateLoan', 'LoanId', 'InvestorId', 'LoanApplicationId', 
                     'ThirdPartyId', 'IsDefaulted']
train_data = train_data[test_data_columns]

In [0]:
train_data.head()

Unnamed: 0,TransactionStartTime,Value,Amount,TransactionId,BatchId,SubscriptionId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,TransactionStatus,IssuedDateLoan,LoanId,InvestorId,LoanApplicationId,ThirdPartyId,IsDefaulted
0,2018-09-21 12:17:39,550.0,-550.0,TransactionId_1683,BatchId_641,SubscriptionId_2,UGX,256,ProviderId_1,ProductId_7,airtime,ChannelId_1,0,,,,,,
1,2018-09-25 09:20:29,550.0,-550.0,TransactionId_2235,BatchId_820,SubscriptionId_2,UGX,256,ProviderId_1,ProductId_7,airtime,ChannelId_1,0,,,,,,
2,2018-09-25 10:33:31,550.0,-550.0,TransactionId_1053,BatchId_210,SubscriptionId_4,UGX,256,ProviderId_1,ProductId_7,airtime,ChannelId_1,0,,,,,,
3,2018-09-27 10:26:41,1000.0,-1000.0,TransactionId_2633,BatchId_876,SubscriptionId_4,UGX,256,ProviderId_1,ProductId_7,airtime,ChannelId_1,0,,,,,,
4,2018-09-27 12:44:21,500.0,-500.0,TransactionId_71,BatchId_1362,SubscriptionId_4,UGX,256,ProviderId_1,ProductId_7,airtime,ChannelId_1,0,,,,,,


In [0]:
train_data['TransactionStartTime'] = pd.to_datetime(train_data['TransactionStartTime'])
train_data['IssuedDateLoan'] = pd.to_datetime(train_data['IssuedDateLoan'])

In [0]:
test_data['TransactionStartTime'] = pd.to_datetime(test_data['TransactionStartTime'])
test_data['IssuedDateLoan'] = pd.to_datetime(test_data['IssuedDateLoan'])

In [0]:
train_data.isnull().sum()

TransactionStartTime      0
Value                     0
Amount                    0
TransactionId             0
BatchId                   0
SubscriptionId            0
CurrencyCode              0
CountryCode               0
ProviderId                0
ProductId                 0
ProductCategory           0
ChannelId                 0
TransactionStatus         0
IssuedDateLoan          612
LoanId                  612
InvestorId              612
LoanApplicationId       617
ThirdPartyId            614
IsDefaulted             612
dtype: int64

In [0]:
test_data.isnull().sum()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905 entries, 0 to 904
Data columns (total 14 columns):
CustomerId              905 non-null object
TransactionStartTime    905 non-null datetime64[ns]
Value                   905 non-null float64
Amount                  905 non-null float64
TransactionId           905 non-null int64
BatchId                 905 non-null int64
SubscriptionId          905 non-null int64
CountryCode             905 non-null int64
ProviderId              905 non-null int64
ProductId               905 non-null int64
ProductCategory         905 non-null object
IssuedDateLoan          478 non-null datetime64[ns]
LoanId                  478 non-null object
InvestorId              478 non-null object
dtypes: datetime64[ns](2), float64(2), int64(6), object(4)
memory usage: 99.1+ KB


In [0]:
train_data = train_data.dropna(subset=['IsDefaulted'])

In [0]:
train_data = train_data.drop(['LoanApplicationId'], axis=1)
train_data = train_data.drop(['ThirdPartyId'], axis=1)

In [0]:
test_data = test_data.drop(['LoanApplicationId'], axis=1)
test_data = test_data.drop(['ThirdPartyId'], axis=1)

## Feature Engineering

In [0]:
pattern = r'\d+'

In [0]:
transaction_ids = [int(re.search(pattern, str(value)).group()) for value in train_data['TransactionId']]
batch_ids = [int(re.search(pattern, str(value)).group()) for value in train_data['BatchId']]
subscription_ids = [int(re.search(pattern, str(value)).group()) for value in train_data['SubscriptionId']]
provider_ids = [int(re.search(pattern, str(value)).group()) for value in train_data['ProviderId']]
product_ids = [int(re.search(pattern, str(value)).group()) for value in train_data['ProductId']]
channel_ids = [int(re.search(pattern, str(value)).group()) for value in train_data['ChannelId']]

In [0]:
transaction_ids = [int(re.search(pattern, str(value)).group()) for value in test_data['TransactionId']]
batch_ids = [int(re.search(pattern, str(value)).group()) for value in test_data['BatchId']]
subscription_ids = [int(re.search(pattern, str(value)).group()) for value in test_data['SubscriptionId']]
provider_ids = [int(re.search(pattern, str(value)).group()) for value in test_data['ProviderId']]
product_ids = [int(re.search(pattern, str(value)).group()) for value in test_data['ProductId']]
channel_ids = [int(re.search(pattern, str(value)).group()) for value in test_data['ChannelId']]

In [0]:
train_data['TransactionId'] = transaction_ids
train_data['BatchId'] = batch_ids
train_data['SubscriptionId'] = subscription_ids
train_data['ProviderId'] = provider_ids
train_data['ProductId'] = product_ids
train_data['ChannelId'] = channel_ids

In [0]:
train_data.drop(['CurrencyCode'], axis=1, inplace=True)
train_data.drop(['ChannelId'], axis=1, inplace=True)
train_data.drop(['TransactionStatus'], axis=1, inplace=True)

In [0]:
test_data['TransactionId'] = transaction_ids
test_data['BatchId'] = batch_ids
test_data['SubscriptionId'] = subscription_ids
test_data['ProviderId'] = provider_ids
test_data['ProductId'] = product_ids
test_data['ChannelId'] = channel_ids

In [0]:
test_data.drop(['CurrencyCode'], axis=1, inplace=True)
test_data.drop(['ChannelId'], axis=1, inplace=True)
test_data.drop(['TransactionStatus'], axis=1, inplace=True)

In [0]:
train_data.head()

Unnamed: 0,TransactionStartTime,Value,Amount,TransactionId,BatchId,SubscriptionId,CountryCode,ProviderId,ProductId,ProductCategory,IssuedDateLoan,LoanId,InvestorId,IsDefaulted
9,2018-10-18 16:11:55,10000.0,-10000.0,1041,1970,4,256,1,7,airtime,2018-10-18 16:11:53,62,3,0.0
10,2018-10-18 16:11:55,10000.0,-10000.0,1041,1970,4,256,1,7,airtime,2018-10-18 16:11:53,62,3,0.0
11,2018-10-18 16:11:55,10000.0,-10000.0,1041,1970,4,256,1,7,airtime,2018-10-18 16:11:53,62,3,0.0
13,2018-10-19 10:18:33,5150.0,-5150.0,1598,1458,4,256,1,7,airtime,2018-10-19 10:18:31,22,3,0.0
14,2018-10-19 11:01:30,515.0,-515.0,2115,1672,4,256,1,7,airtime,2018-10-19 11:01:28,368,3,0.0


In [0]:
train_data.ProductCategory.unique()

array(['airtime', 'data_bundles', 'tv', 'utility_bill', 'movies',
       'retail', 'financial_services'], dtype=object)

In [0]:
prod_cat_enc = pd.get_dummies(train_data.ProductCategory)
prod_cat_enc

Unnamed: 0,airtime,data_bundles,financial_services,movies,retail,tv,utility_bill
9,1,0,0,0,0,0,0
10,1,0,0,0,0,0,0
11,1,0,0,0,0,0,0
13,1,0,0,0,0,0,0
14,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...
2095,0,1,0,0,0,0,0
2096,1,0,0,0,0,0,0
2097,1,0,0,0,0,0,0
2098,0,1,0,0,0,0,0


In [0]:
test_prod_cat_enc = pd.get_dummies(test_data.ProductCategory)
test_prod_cat_enc

Unnamed: 0,airtime,data_bundles,financial_services,movies,retail,ticket,tv,utility_bill
0,1,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
900,1,0,0,0,0,0,0,0
901,1,0,0,0,0,0,0,0
902,1,0,0,0,0,0,0,0
903,1,0,0,0,0,0,0,0


In [0]:
dates_ = pd.DataFrame()
dates_['TransactionDate'] = [d.date() for d in test_data['TransactionStartTime']]
dates_['TransactionTime'] = [d.time() for d in test_data['TransactionStartTime']]

In [0]:
times_ = pd.DataFrame()
times_['TransactionYear'] = [d.year for d in dates_['TransactionDate']]
times_['TransactionMonth'] = [d.month for d in dates_['TransactionDate']]
times_['TransactionDay'] = [d.day for d in dates_['TransactionDate']]

In [0]:
times_['TransactionHour'] = [d.hour for d in dates_['TransactionTime']]

In [0]:
dates = pd.DataFrame()
dates['TransactionDate'] = [d.date() for d in train_data['TransactionStartTime']]
dates['TransactionTime'] = [d.time() for d in train_data['TransactionStartTime']]

In [0]:
times = pd.DataFrame()
times['TransactionYear'] = [d.year for d in dates['TransactionDate']]
times['TransactionMonth'] = [d.month for d in dates['TransactionDate']]
times['TransactionDay'] = [d.day for d in dates['TransactionDate']]

In [0]:
times['TransactionHour'] = [d.hour for d in dates['TransactionTime']]

In [0]:
def period(hour):
    if hour >= 6 and hour < 12:
        return 'Morning'
    elif hour >= 12 and hour < 18:
        return 'Afternoon'
    elif hour >= 18 and hour < 24:
        return 'Evening'
    else:
        return 'Night'

In [0]:
times['Period'] = times['TransactionHour'].apply(period)

In [0]:
times_['Period'] = times_['TransactionHour'].apply(period)

In [0]:
times.head()

Unnamed: 0,TransactionYear,TransactionMonth,TransactionDay,TransactionHour,Period
0,2018,10,18,16,Afternoon
1,2018,10,18,16,Afternoon
2,2018,10,18,16,Afternoon
3,2018,10,19,10,Morning
4,2018,10,19,11,Morning


In [0]:
period_enc = pd.get_dummies(times.Period)
period_enc.head()

Unnamed: 0,Afternoon,Evening,Morning,Night
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [0]:
test_period_enc = pd.get_dummies(times_.Period)
test_period_enc.head()

Unnamed: 0,Afternoon,Evening,Morning,Night
0,1,0,0,0
1,1,0,0,0
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [0]:
train_data.head()

Unnamed: 0,TransactionStartTime,Value,Amount,TransactionId,BatchId,SubscriptionId,CountryCode,ProviderId,ProductId,ProductCategory,IssuedDateLoan,LoanId,InvestorId,IsDefaulted
9,2018-10-18 16:11:55,10000.0,-10000.0,1041,1970,4,256,1,7,airtime,2018-10-18 16:11:53,62,3,0.0
10,2018-10-18 16:11:55,10000.0,-10000.0,1041,1970,4,256,1,7,airtime,2018-10-18 16:11:53,62,3,0.0
11,2018-10-18 16:11:55,10000.0,-10000.0,1041,1970,4,256,1,7,airtime,2018-10-18 16:11:53,62,3,0.0
13,2018-10-19 10:18:33,5150.0,-5150.0,1598,1458,4,256,1,7,airtime,2018-10-19 10:18:31,22,3,0.0
14,2018-10-19 11:01:30,515.0,-515.0,2115,1672,4,256,1,7,airtime,2018-10-19 11:01:28,368,3,0.0


In [0]:
train_data_columns = ['Value', 'Amount', 'TransactionId', 'BatchId', 'SubscriptionId',
                     'ProviderId', 'ProductId', 'IsDefaulted']
train_data = train_data[train_data_columns]
train_data.head()

Unnamed: 0,Value,Amount,TransactionId,BatchId,SubscriptionId,ProviderId,ProductId,IsDefaulted
9,10000.0,-10000.0,1041,1970,4,1,7,0.0
10,10000.0,-10000.0,1041,1970,4,1,7,0.0
11,10000.0,-10000.0,1041,1970,4,1,7,0.0
13,5150.0,-5150.0,1598,1458,4,1,7,0.0
14,515.0,-515.0,2115,1672,4,1,7,0.0


In [0]:
test_data_columns = ['Value', 'Amount', 'TransactionId', 'BatchId', 'SubscriptionId',
                     'ProviderId', 'ProductId']
test_data = test_data[test_data_columns]
test_data.head()

Unnamed: 0,Value,Amount,TransactionId,BatchId,SubscriptionId,ProviderId,ProductId
0,14000.0,-14000.0,925,1144,7,1,7
1,1000.0,-1000.0,1080,1214,7,1,8
2,2500.0,-2500.0,2315,2150,7,1,7
3,500.0,-500.0,1466,1071,7,1,7
4,1000.0,-1000.0,337,2477,7,1,7


In [0]:
times_columns = ['TransactionYear', 'TransactionMonth', 'TransactionDay', 
                 'TransactionHour']
times = times[times_columns]
times.head()

Unnamed: 0,TransactionYear,TransactionMonth,TransactionDay,TransactionHour
0,2018,10,18,16
1,2018,10,18,16
2,2018,10,18,16
3,2018,10,19,10
4,2018,10,19,11


In [0]:
test_times_columns = ['TransactionYear', 'TransactionMonth', 'TransactionDay', 
                 'TransactionHour']
times_ = times_[test_times_columns]
times_.head()

Unnamed: 0,TransactionYear,TransactionMonth,TransactionDay,TransactionHour
0,2019,3,31,13
1,2019,3,31,15
2,2019,3,31,17
3,2019,3,31,17
4,2019,3,31,17


In [0]:
merged_data = pd.concat([train_data, times, period_enc], ignore_index=True, sort=False)

In [0]:
merged_ = pd.concat([test_data, times_, test_period_enc], ignore_index=True, sort=False)

In [0]:
merged_data.head()

Unnamed: 0,Value,Amount,TransactionId,BatchId,SubscriptionId,ProviderId,ProductId,LoanId,InvestorId,IsDefaulted,TransactionYear,TransactionMonth,TransactionDay,TransactionHour,Afternoon,Evening,Morning,Night
0,10000.0,-10000.0,1041.0,1970.0,4.0,1.0,7.0,62.0,3.0,0.0,,,,,,,,
1,10000.0,-10000.0,1041.0,1970.0,4.0,1.0,7.0,62.0,3.0,0.0,,,,,,,,
2,10000.0,-10000.0,1041.0,1970.0,4.0,1.0,7.0,62.0,3.0,0.0,,,,,,,,
3,5150.0,-5150.0,1598.0,1458.0,4.0,1.0,7.0,22.0,3.0,0.0,,,,,,,,
4,515.0,-515.0,2115.0,1672.0,4.0,1.0,7.0,368.0,3.0,0.0,,,,,,,,


In [0]:
data_array = np.c_[times.values, period_enc.values, train_data.values]
merged_data = pd.DataFrame(data_array, columns=['TransactionYear', 'TransactionMonth', 
'TransactionDay', 'TransactionHour', 'Morning', 'AfterNoon', 'Evening', 'Night', 'Value',
'Amount', 'TransactionId', 'BatchId', 'SubscriptionId', 'ProviderId', 'ProductId', 
'IsDefaulted'])
merged_data.head()

Unnamed: 0,TransactionYear,TransactionMonth,TransactionDay,TransactionHour,Morning,AfterNoon,Evening,Night,Value,Amount,TransactionId,BatchId,SubscriptionId,ProviderId,ProductId,IsDefaulted
0,2018.0,10.0,18.0,16.0,0.0,0.0,0.0,1.0,10000.0,-10000.0,1041.0,1970.0,4.0,1.0,7.0,0.0
1,2018.0,10.0,18.0,16.0,0.0,0.0,0.0,1.0,10000.0,-10000.0,1041.0,1970.0,4.0,1.0,7.0,0.0
2,2018.0,10.0,18.0,16.0,0.0,0.0,0.0,1.0,10000.0,-10000.0,1041.0,1970.0,4.0,1.0,7.0,0.0
3,2018.0,10.0,19.0,10.0,0.0,0.0,0.0,1.0,5150.0,-5150.0,1598.0,1458.0,4.0,1.0,7.0,0.0
4,2018.0,10.0,19.0,11.0,0.0,0.0,0.0,1.0,515.0,-515.0,2115.0,1672.0,4.0,1.0,7.0,0.0


In [0]:
test_data_array = np.c_[times_.values, test_period_enc.values, test_data.values]
merged_ = pd.DataFrame(test_data_array, columns=['TransactionYear', 'TransactionMonth', 
'TransactionDay', 'TransactionHour', 'Morning', 'AfterNoon', 'Evening', 'Night', 'Value',
'Amount', 'TransactionId', 'BatchId', 'SubscriptionId', 'ProviderId', 'ProductId'])
merged_.head()

Unnamed: 0,TransactionYear,TransactionMonth,TransactionDay,TransactionHour,Morning,AfterNoon,Evening,Night,Value,Amount,TransactionId,BatchId,SubscriptionId,ProviderId,ProductId
0,2019.0,3.0,31.0,13.0,1.0,0.0,0.0,0.0,14000.0,-14000.0,925.0,1144.0,7.0,1.0,7.0
1,2019.0,3.0,31.0,15.0,1.0,0.0,0.0,0.0,1000.0,-1000.0,1080.0,1214.0,7.0,1.0,8.0
2,2019.0,3.0,31.0,17.0,0.0,0.0,0.0,1.0,2500.0,-2500.0,2315.0,2150.0,7.0,1.0,7.0
3,2019.0,3.0,31.0,17.0,0.0,0.0,0.0,1.0,500.0,-500.0,1466.0,1071.0,7.0,1.0,7.0
4,2019.0,3.0,31.0,17.0,0.0,0.0,0.0,1.0,1000.0,-1000.0,337.0,2477.0,7.0,1.0,7.0


In [0]:
scaler = StandardScaler()
merged_data['Value'] = scaler.fit_transform(np.array(merged_data['Value']).reshape(-1,1)) 
merged_data['Amount'] = scaler.fit_transform(np.array(merged_data['Amount']).reshape(-1,1))

In [0]:
merged_['Value'] = scaler.fit_transform(np.array(merged_['Value']).reshape(-1,1)) 
merged_['Amount'] = scaler.fit_transform(np.array(merged_['Amount']).reshape(-1,1))

In [0]:
merged_.columns

Index(['TransactionYear', 'TransactionMonth', 'TransactionDay',
       'TransactionHour', 'Morning', 'AfterNoon', 'Evening', 'Night', 'Value',
       'Amount', 'TransactionId', 'BatchId', 'SubscriptionId', 'ProviderId',
       'ProductId'],
      dtype='object')

In [0]:
merged_data.columns

Index(['TransactionYear', 'TransactionMonth', 'TransactionDay',
       'TransactionHour', 'Morning', 'AfterNoon', 'Evening', 'Night', 'Value',
       'Amount', 'TransactionId', 'BatchId', 'SubscriptionId', 'ProviderId',
       'ProductId', 'IsDefaulted'],
      dtype='object')

In [0]:
merged_data.columns
isDefaulted = merged_data['IsDefaulted']
X = merged_data.drop('IsDefaulted', axis=1)

## Model Training & Predictions

### Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg_model = log_reg.fit(X, isDefaulted)
log_reg_model.score(X, isDefaulted)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9274193548387096

### Random Forest Classifier

In [0]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=2)
rf = random_forest.fit(X, isDefaulted)
print (rf.score(X, isDefaulted))
y_pred = random_forest.predict(merged_)
submission = pd.DataFrame({'customerid':merged_['TransactionId'],'IsDefaulted':y_pred})
filename = 'submission_rfcc.csv'
submission.to_csv(filename,index=False)
print('Saved file: ' + filename)

0.9348118279569892
Saved file: submission_rfcc.csv


### XGBoost Classifer

In [0]:
import xgboost as xgb
from xgboost import XGBClassifier
classifier = XGBClassifier(colsample_bylevel= 0.9,
                    colsample_bytree = 0.8, 
                    gamma=0.99,
                    max_depth= 5,
                    min_child_weight= 1,
                    n_estimators= 10,
                    nthread= 4,
                    random_state= 2,
                    silent= True)
classifier.fit(X, isDefaulted).score(X, isDefaulted)

0.9717741935483871

### Gradient Boost Classifier

In [0]:
from sklearn.ensemble import GradientBoostingClassifier
model= GradientBoostingClassifier()
model.fit(X, isDefaulted).score(X, isDefaulted)

0.9899193548387096

### Decision Tree Classifier

In [0]:
from sklearn.tree import DecisionTreeClassifier
### Decision Tree

##Building the Decision Tree without pruning
dtree=DecisionTreeClassifier()
dtree_model = dtree.fit(X, isDefaulted)
print (dtree_model.score(X, isDefaulted))

dtree=DecisionTreeClassifier(criterion = "gini", splitter = 'random', max_leaf_nodes = 10, min_samples_leaf = 10, max_depth= 5)
dtree_model = dtree.fit(X, isDefaulted)
print (dtree_model.score(X, isDefaulted))

1.0
0.9307795698924731


## Create Submission File

In [0]:
y_pred = random_forest.predict(merged_)
submission = pd.DataFrame({'customerid':merged_['TransactionId'],'IsDefaulted':y_pred})
filename = 'submission_inclass.csv'
submission.to_csv(filename,index=False)
print('Saved file: ' + filename)

Saved file: submission_inclass.csv
