In [88]:
import pandas as pd, numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
import os
cwd = os.path.abspath(os.getcwd())

loan_file_path = cwd + '/assets/loan_train.csv'
final_file_path = cwd + '/assets/loan_test.csv'
account_file_path = cwd + '/assets/account.csv'
district_file_path = cwd + '/assets/district.csv'
client_file_path = cwd + '/assets/client.csv'
disp_file_path = cwd + '/assets/disp.csv'

loan_data = pd.read_csv(loan_file_path, sep=";")
final_data = pd.read_csv(final_file_path, sep=";")
account_data = pd.read_csv(account_file_path, sep=";")
district_data = pd.read_csv(district_file_path, sep=";")
client_data = pd.read_csv(client_file_path, sep=";")
disp_data = pd.read_csv(disp_file_path, sep=";")

district_data['unemploymant rate \'95 '] = np.where(district_data['unemploymant rate \'95 '] == '?', district_data['unemploymant rate \'96 '], district_data['unemploymant rate \'95 '])
district_data['unemploymant rate \'95 '] = pd.to_numeric(district_data['unemploymant rate \'95 '], errors='coerce')
district_data['no. of commited crimes \'95 '] = np.where(district_data['no. of commited crimes \'95 '] == '?', district_data['no. of commited crimes \'96 '], district_data['no. of commited crimes \'95 '])
district_data['no. of commited crimes \'95 '] = pd.to_numeric(district_data['no. of commited crimes \'95 '], errors='coerce')

acc_type_resolve = {'issuance after transaction': 0, 'monthly issuance': 1, 'weekly issuance': 2 }
account_data['frequency'] = account_data['frequency'].map(acc_type_resolve)

disp_data = disp_data.merge(client_data, on='client_id')

account_data['associated_client_count'] = 0
#account_data['clients_live_together'] = 0
account_data['owner_birth_number'] = 0
account_data['district_mismatch'] = 0
account_data['owner_district'] = 0

for i, row in account_data.iterrows():
    cur_id = row['account_id']
    account_client_list = disp_data[disp_data['account_id'] == cur_id]
    a = account_client_list['district_id'].to_numpy()
    account_data.at[i,'associated_client_count'] = len(account_client_list)
    #account_data.at[i,'clients_live_together'] = int((a[0] == a).all())
    account_data.at[i,'district_mismatch'] = int((a[0] != row['district_id']))
    account_data.at[i,'owner_district_id'] = a[0]
    account_data.at[i,'owner_birth_number'] = account_client_list[account_client_list['type'] == 'OWNER']['birth_number']

temp_data = district_data.add_prefix('owner_')
account_data = account_data.merge(temp_data, left_on='owner_district_id', right_on='owner_code ')
account_data = account_data.drop(columns=['owner_code ', 'owner_name ', 'owner_region'])
#account_data[account_data['district_mismatch'] == 0]

training_data = account_data.merge(loan_data, on='account_id', suffixes=['_account','_loan'])
training_data = district_data.merge(training_data, left_on='code ', right_on='district_id')
training_data = training_data.drop(columns=['account_id', 'code ', 'name ', 'region', 'loan_id'])


average_score = 0
average_accuracy = 0
average_precision = 0
average_recall = 0
n_test = 0
total_test = 2000
total_test_loop = total_test
best_score = 0
best_accuracy = 0
best_precision = 0
best_recall = 0
best_cm = [0,0,0,0]
best_success = 0

for x, temp in training_data.groupby('status'):
    if x == -1:
        t1 = temp
    if x == 1:
        t2 = temp
        
ntd = t1.append(t2.sample(n=len(t1.index) + 140))

print('Processing...')
#for x in range(n_test):
while n_test < total_test_loop:

    sampled_data = np.array_split(ntd.sample(frac=1),2)
    train_data = sampled_data[0]
    test_data = sampled_data[1]

    y = train_data.status
    x = train_data.drop(columns=['status'])

    #criterion="gini", splitter="random", max_depth=5
    dtree = DecisionTreeClassifier(max_depth=5, splitter="best", random_state=0)
    dtree = dtree.fit(x,y)
    y_pred = dtree.predict(test_data.drop(columns=['status']))
    y_test = test_data.status

    cur_cm = confusion_matrix(y_test,y_pred)
    
    cur_accuracy = accuracy_score(y_test, y_pred)
    cur_precision = precision_score(y_test, y_pred)
    cur_recall = recall_score(y_test, y_pred)
    
    cur_score = 2.0 * (cur_precision * cur_recall)/(cur_precision + cur_recall)
    
    average_score += cur_score
    average_accuracy += cur_accuracy
    average_precision += cur_precision
    average_recall += cur_recall
    
    y_pred = dtree.predict(training_data.drop(columns=['status']))
    y_test = training_data.status
    
    [tn,fn],[fp,tp] = confusion_matrix(y_test,y_pred)
    
    error = (fp+fn)/(tp+tn+fp+fn)
    bias = abs((fn - fp)/(tp+tn+fp+fn))
    
    success = cur_score - bias
    
    if success > best_success:
        btree = dtree
        best_score = cur_score
        best_accuracy = cur_accuracy
        best_precision = cur_precision
        best_recall = cur_recall
        best_success = success
        best_cm = cur_cm
        print(success)
        total_test += n_test
        n_test = 0
        
    n_test += 1


average_score /= total_test
average_accuracy /= total_test
average_precision /= total_test
average_recall /= total_test

y_pred = btree.predict(training_data.drop(columns=['status']))
y_test = training_data.status

cur_accuracy = accuracy_score(y_test, y_pred)
cur_precision = precision_score(y_test, y_pred)
cur_recall = recall_score(y_test, y_pred)
cur_score = 2.0 * (cur_precision * cur_recall)/(cur_precision + cur_recall)

print(best_cm)
best_cm = confusion_matrix(y_test,y_pred)
print(best_cm)


print(f'Done!\nEstimated score: {average_score:.2%}')
print(f'Estimated accuracy: {average_accuracy:.2%}')
print(f'Estimated precision: {average_precision:.2%}')
print(f'Estimated recall: {average_recall:.2%}')
print(f'Final result accuracy: {best_accuracy:.2%}')
print(f'Final result precision: {best_precision:.2%}')
print(f'Final result recall: {best_recall:.2%}')
print(f'Final result score: {best_score:.2%}')
print(f'Complete accuracy: {cur_accuracy:.2%}')
print(f'Complete precision: {cur_precision:.2%}')
print(f'Complete recall: {cur_recall:.2%}')
print(f'Complete score: {cur_score:.2%}')

predict_data = account_data.merge(final_data, on='account_id', suffixes=['_account','_loan'])
predict_data = district_data.merge(predict_data, left_on='code ', right_on='district_id')
predict_data = predict_data.drop(columns=['account_id', 'code ', 'name ', 'region'])

predict_data['status'] = btree.predict(predict_data.drop(columns=['status','loan_id']))

from six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(btree, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names = test_data.drop(columns=['status']).columns,class_names=['-1','1'])
try:
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
    graph.write_png('result.png')
    print(predict_data[['loan_id','status']])
    predict_data[['loan_id','status']].rename(columns={'loan_id': 'Id','status': 'Predicted'}).to_csv('result.csv', index=False)
except:
    print('Graph failed!')
    print(predict_data[['loan_id','status']])
    predict_data[['loan_id','status']].rename(columns={'loan_id': 'Id','status': 'Predicted'}).to_csv('result.csv', index=False)

if graph is not None:
    Image(graph.create_png())

Processing...
0.6626473006303096
0.8558882417182381
0.881373330064959
0.8986959671576913
0.900523560209424
[[11  8]
 [11 86]]
[[ 24  22]
 [ 22 260]]
Done!
Estimated score: 82.79%
Estimated accuracy: 72.17%
Estimated precision: 81.85%
Estimated recall: 84.13%
Final result accuracy: 83.62%
Final result precision: 91.49%
Final result recall: 88.66%
Final result score: 90.05%
Complete accuracy: 86.59%
Complete precision: 92.20%
Complete recall: 92.20%
Complete score: 92.20%
4 [label=<no. of municipalities with inhabitants < 499  &le; 26.5<br/>gini = 0.37<br/>samples = 49<br/>value = [12, 37]<br/>class = 1>, fillcolor="#79bded"] ;
  ^
Expected "}", found '['  (at char 726), (line:11, col:3)
Graph failed!
     loan_id  status
0       6346       1
1       6913       1
2       6014       1
3       5117       1
4       5419       1
..       ...     ...
349     7200      -1
350     7171      -1
351     6470      -1
352     5933       1
353     5841       1

[354 rows x 2 columns]
