# Microsoft Capstone Classification Project

In [13]:
import catboost
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier as catclass
import io

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv
import sys
import os.path
from pathlib import Path

home = str(Path.home()) # setting home directory


In [17]:
data_dir = os.path.join(home, 'Analysis','capstone')
train_values = pd.read_csv(os.path.join(data_dir, 'train_values.csv'))
test_values = pd.read_csv(os.path.join(data_dir, 'test_values.csv'))
train_labels = pd.read_csv(os.path.join(data_dir,'train_labels.csv'))


In [21]:
train_values.head(n = 2)

Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant
0,0,3,1,1,1,70.0,3,18,37,246,...,1,24.0,6203.0,44.23,60588.0,50.933,716.0,2642.0,4536,False
1,1,1,1,3,1,178.0,3,369,52,299,...,1,57.0,5774.0,15.905,54821.0,100.0,1622.0,2108.0,2458,False


In [22]:
train_labels.head(n=2)

Unnamed: 0,row_id,accepted
0,0,1
1,1,0


In [23]:
test_values.head(n=2)

Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant
0,0,2,1,3,1,115.0,3,101,16,276,...,1,,6329.0,59.536,69889.0,85.78,1874.0,2410.0,3791,True
1,1,1,1,1,1,252.0,2,87,20,68,...,1,107.0,2473.0,8.05,65313.0,100.0,947.0,1214.0,2839,True


In [24]:

#Make rows Strings
train_values["row_id"]=train_values["row_id"].astype("object")
test_values["row_id"]=test_values["row_id"].astype("object")

train_values["co_applicant"]=train_values["co_applicant"].astype("int64")
test_values["co_applicant"]=test_values["co_applicant"].astype("int64")

In [25]:
# Select numeric columns
numcols = ['population', 'minority_population_pct',
       'ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
       'number_of_owner-occupied_units', 'number_of_1_to_4_family_units']

numcols1 = ['loan_amount','applicant_income']

In [28]:
for col in numcols1:
    bins_train = [train_values[col].min(), train_values[col].quantile(.50), train_values[col].quantile(.75), train_values[col].max()]
    bins_test  = [test_values[col].min(), test_values[col].quantile(.50),test_values[col].quantile(.75), test_values[col].max()]
  
    train_values[col] = pd.cut(train_values[col],bins_train, labels =["1","2","3"]).astype("category")
    
    test_values[col] = pd.cut(test_values[col],bins_test, labels =["1","2","3"]).astype("category")

In [29]:
test_values.head()

Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant
0,0,2,1,3,1,1,3,101,16,276,...,1,,6329.0,59.536,69889.0,85.78,1874.0,2410.0,3791,1
1,1,1,1,1,1,2,2,87,20,68,...,1,2.0,2473.0,8.05,65313.0,100.0,947.0,1214.0,2839,1
2,2,1,1,1,1,3,1,-1,-1,-1,...,2,3.0,,,,,,,4701,0
3,3,2,1,1,1,2,2,376,20,11,...,2,1.0,4795.0,29.676,57766.0,100.0,1426.0,1765.0,2153,1
4,4,2,1,1,1,1,2,254,48,156,...,3,1.0,5246.0,5.11,63332.0,100.0,1452.0,2092.0,5710,0


In [30]:
train_values.head(n=2)

Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant
0,0,3,1,1,1,1,3,18,37,246,...,1,1,6203.0,44.23,60588.0,50.933,716.0,2642.0,4536,0
1,1,1,1,3,1,2,3,369,52,299,...,1,1,5774.0,15.905,54821.0,100.0,1622.0,2108.0,2458,0


In [31]:
for col in numcols1:
    train_values[col] = pd.to_numeric(train_values[col], errors='coerce')
    test_values[col] = pd.to_numeric(test_values[col], errors='coerce')

In [32]:
#Replace Missing Values for numeric columns with mean of each
for numcol in numcols:
    train_values[numcol] = train_values[numcol].fillna(-999)
     
    test_values[numcol] = test_values[numcol].fillna(-999)
  
    
    
for numcol in numcols1:
    train_values[numcol] = train_values[numcol].fillna(-1)
     
    test_values[numcol] = test_values[numcol].fillna(-1) 

In [34]:
train_values.isnull().sum()[0:10]

row_id           0
loan_type        0
property_type    0
loan_purpose     0
occupancy        0
loan_amount      0
preapproval      0
msa_md           0
state_code       0
county_code      0
dtype: int64

In [35]:
test_values_clean = test_values
train_values_clean =train_values

In [37]:
all_train = pd.merge(train_values_clean, train_labels, on="row_id", how="inner")

y_labs = train_labels["accepted"]

train_values_clean_1 = train_values_clean.loc[:, ~train_values_clean.columns.isin(['row_id'])]
test_values_clean_1 = test_values_clean.loc[:, ~test_values_clean.columns.isin(['row_id'])]

In [38]:
#Names of categorical columns in a list for looping on two datasets
catcols = ['loan_type', 'property_type', 'loan_purpose', 'occupancy','preapproval', 'msa_md', 'state_code', 'county_code',
       'applicant_ethnicity', 'applicant_race', 'lender','applicant_sex', 'co_applicant', 'loan_amount', 'applicant_income']

In [39]:
category_index =  [train_values_clean_1.columns.get_loc(c) for c in catcols if c!= "accepted"]; category_index

[0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 19, 11, 20, 4, 12]

In [40]:
xtrain, xtest, ytrain, ytest = train_test_split(train_values_clean_1, y_labs, test_size = 0.22)

In [41]:
model_cat = CatBoostClassifier(iterations=1000, depth=8, learning_rate=0.4, eval_metric='Accuracy', use_best_model=True, random_seed=2019)
model_cat.fit(xtrain, ytrain, cat_features = category_index, eval_set = (xtest, ytest))

0:	learn: 0.7009385	test: 0.7018273	best: 0.7018273 (0)	total: 2.88s	remaining: 48m 1s
1:	learn: 0.7026692	test: 0.7037455	best: 0.7037455 (1)	total: 5.64s	remaining: 46m 56s
2:	learn: 0.7047974	test: 0.7047091	best: 0.7047091 (2)	total: 8.38s	remaining: 46m 25s
3:	learn: 0.7059615	test: 0.7061364	best: 0.7061364 (3)	total: 11.2s	remaining: 46m 36s
4:	learn: 0.7073231	test: 0.7080818	best: 0.7080818 (4)	total: 13.2s	remaining: 43m 45s
5:	learn: 0.7093282	test: 0.7104727	best: 0.7104727 (5)	total: 16.1s	remaining: 44m 24s
6:	learn: 0.7103923	test: 0.7103091	best: 0.7104727 (5)	total: 19.2s	remaining: 45m 18s
7:	learn: 0.7109923	test: 0.7117364	best: 0.7117364 (7)	total: 22.3s	remaining: 46m 9s
8:	learn: 0.7116692	test: 0.7124545	best: 0.7124545 (8)	total: 25s	remaining: 45m 50s
9:	learn: 0.7122974	test: 0.7134545	best: 0.7134545 (9)	total: 28.1s	remaining: 46m 20s
10:	learn: 0.7124641	test: 0.7139727	best: 0.7139727 (10)	total: 31.2s	remaining: 46m 42s
11:	learn: 0.7129923	test: 0.71433

<catboost.core.CatBoostClassifier at 0x1b281d7b898>

In [42]:
pred = model_cat.predict(test_values_clean_1)

In [43]:
import sklearn.metrics as metmod

In [44]:
def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])

def key_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = metmod.precision_recall_fscore_support(labels, scores)
    conf = metmod.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    {}'.format(conf[0,0]) + '             {}'.format(conf[0,1]))
    print('Actual negative     {}'.format(conf[1,0]) + '             {}'.format(conf[1,1]))
    print('')
    print('Accuracy        {}'.format(round(metmod.accuracy_score(labels, scores), 2)))
    print('AUC              {}'.format(round(metmod.roc_auc_score(labels, probs[:,1]), 2)))
    print('Macro precision {}'.format(round(float((float(metrics[0][0]) + float(metrics[0][1]))/2.0), 2)))
    print('Macro recall    {}'.format(round(float((float(metrics[1][0]) + float(metrics[1][1]))/2.0), 2)))
    print(' ')
    print('           Positive      Negative')
    print('Num case   {}'.format(round(metrics[3][0], 2)) + '        {}'.format(round(metrics[3][1], 2)))
    print('Precision  {}'.format(round(metrics[0][0], 2)) + '         {}'.format(round(metrics[0][1], 2)))
    print('Recall      {}'.format(round(metrics[1][0],2)) + '        {}'.format(round(metrics[1][1], 2)))
    print('F1         {}'.format(round(metrics[2][0], 2)) + '         {}' .format(round(metrics[2][1],2)))
    

In [45]:
probabilities = model_cat.predict_proba(xtest)
key_metrics(ytest, probabilities, 0.5)  

                 Confusion matrix
                 Score positive    Score negative
Actual positive    36750             18402
Actual negative     12255             42593

Accuracy        0.72
AUC              0.8
Macro precision 0.72
Macro recall    0.72
 
           Positive      Negative
Num case   55152        54848
Precision  0.75         0.7
Recall      0.67        0.78
F1         0.71         0.74


In [47]:
pred_df  = pd.DataFrame({'row_id':test_values.row_id ,'accepted': pred})

pred_df.to_csv(os.path.join(data_dir,'predicted_2.csv'), index=False) 
#files.download('predicted.csv')