# Quick Model for Classifying Rejection.
I am going to give XGBoost-style gradient boosting a quick go at classifying whether a drug claim has been rejected.

### Import Libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

### Import data and split into train/test groups

In [2]:
# Read the raw pharmacy transactions, drop exact duplicate rows, and hold out
# 20% of what remains as a test set (seeded so the split is repeatable).
# NOTE(review): the CSV path is absolute and machine-specific; point it at
# your own copy of pharmacy_tx.csv before running.
cmm_data = pd.read_csv('/home/ruggiec/Downloads/pharmacy_tx.csv')
cmm_data_no_dups = cmm_data.drop_duplicates()
cmm_train, cmm_test = train_test_split(
    cmm_data_no_dups,
    test_size=0.2,
    shuffle=True,
    random_state=614,
)

### Feature Engineering

In [3]:
def is_it_generic(text):
    """Return 1 if a drug description is for a generic drug, otherwise 0.

    Only the first whitespace-separated token of ``text`` is inspected, so
    ``'generic foxivelule'`` gives 1 and ``'branded mamate'`` gives 0.

    Parameters
    ----------
    text : str
        Drug description.

    Returns
    -------
    int
        1 when the first token is exactly ``'generic'``; 0 otherwise,
        including for an empty or whitespace-only string.
    """
    # Slice instead of indexing so a string with no tokens yields [] rather
    # than raising IndexError.
    first_token = text.split(maxsplit=1)[:1]
    return int(first_token == ['generic'])
    
def drug_name(text):
    """Return the second whitespace-separated token of a drug description.

    For ``'generic foxivelule'`` this is ``'foxivelule'``. Raises IndexError
    when the description has fewer than two tokens.
    """
    tokens = text.split()
    return tokens[1]

def pharm_split(text):
    """Return, as an int, the text between the first and second '#'.

    For ``'Pharmacy #29'`` this is ``29``. Raises IndexError when there is no
    '#' and ValueError when the extracted text is not an integer.
    """
    after_hash = text.split('#')[1]
    return int(after_hash)

In [4]:
# Derive model features as new columns on the training frame (in place).
# 'generic': 1 when the drug description starts with 'generic', else 0.
cmm_train['generic'] = cmm_train.drug.apply(is_it_generic)
# 'name': second word of the drug description.
cmm_train['name'] = cmm_train['drug'].apply(drug_name)
# 'private': 1 when the 'group' field is missing, else 0.
cmm_train['private'] = cmm_train.group.isna().astype(int)
# 'popularity': number of training rows (with a non-null tx_date) per drug name.
cmm_train['popularity'] = cmm_train.groupby('name')[['tx_date']].transform('count')
cmm_train['rejected'] = cmm_train['rejected'].astype(int)
# 'rejected_count': number of rejected training claims per drug name.
# NOTE(review): this is computed from the target column itself, so using it
# as a model input leaks label information — confirm before relying on it.
cmm_train['rejected_count'] = cmm_train.groupby('name')['rejected'].transform('sum')
# 'pharm_num': the integer after '#' in the pharmacy label.
cmm_train['pharm_num'] = cmm_train.pharmacy.apply(pharm_split)
# Parse the date so the .dt accessors below are available.
cmm_train['tx_date'] = pd.to_datetime(cmm_train.tx_date)
cmm_train['weekday'] = cmm_train.tx_date.dt.day_of_week
cmm_train['day_num'] = cmm_train.tx_date.dt.day_of_year

#### Encoding Categorical Variables with Label Encoder

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Learn an integer code for every drug name seen in training, then store the
# codes alongside the original names.
le_name = LabelEncoder().fit(cmm_train['name'])
cmm_train['name_encoded'] = le_name.transform(cmm_train['name'])

Names are transformed to numerical data above. Below, we can transform numerical data back to a name.

In [7]:
# Sanity check: decode integer code 19 back to its drug name.
le_name.inverse_transform([19])

array(['foxivelule'], dtype=object)

In [8]:
# Learn an integer code for every diagnosis seen in training and store it.
le_diagnosis = LabelEncoder().fit(cmm_train['diagnosis'])
cmm_train['diagnosis_encoded'] = le_diagnosis.transform(cmm_train['diagnosis'])

In [9]:
# Sanity check: decode integer code 66 back to its diagnosis value.
le_diagnosis.inverse_transform([66])

array(['M31.63'], dtype=object)

In [10]:
# Learn an integer code for every BIN value seen in training and store it.
le_bin = LabelEncoder().fit(cmm_train['bin'])
cmm_train['bin_encoded'] = le_bin.transform(cmm_train['bin'])

In [11]:
# Sanity check: decode integer code 2 back to its original BIN value.
le_bin.inverse_transform([2])

array([322463])

In [12]:
# Learn an integer code for every PCN value seen in training and store it.
le_pcn = LabelEncoder().fit(cmm_train['pcn'])
cmm_train['pcn_encoded'] = le_pcn.transform(cmm_train['pcn'])

In [13]:
# Sanity check: decode integer code 4 back to its original PCN value.
le_pcn.inverse_transform([4])

array(['3O71UTS'], dtype=object)

### What do we have?

In [14]:
# List every column now present on the training frame (raw + engineered).
cmm_train.columns

Index(['tx_date', 'pharmacy', 'diagnosis', 'drug', 'bin', 'pcn', 'group',
       'rejected', 'patient_pay', 'generic', 'name', 'private', 'popularity',
       'rejected_count', 'pharm_num', 'weekday', 'day_num', 'name_encoded',
       'diagnosis_encoded', 'bin_encoded', 'pcn_encoded'],
      dtype='object')

In [15]:
# Preview the first rows of the training frame with the new columns.
cmm_train.head()

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,...,private,popularity,rejected_count,pharm_num,weekday,day_num,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded
3962160,2022-04-23,Pharmacy #29,M31.63,branded mamate,322463,3O71UTS,,0,25.75,0,...,1,23539,4480,29,5,113,38,66,2,4
1161541,2022-02-06,Pharmacy #7,Z20.23,branded vivafastat,725700,327CKV,IOEAN1DWVV3Y,0,180.52,0,...,0,43263,14072,7,6,37,75,127,8,2
10643305,2022-10-12,Pharmacy #4,U41.19,generic foxivelule,322463,3O71UTS,,0,12.22,1,...,1,367822,0,4,2,285,19,101,2,4
10629491,2022-10-11,Pharmacy #31,K87.68,generic ribosatharin,664344,YFVIA,AJK5MZ25T9IA,0,12.37,1,...,0,121908,8960,31,1,284,56,59,5,44
10928718,2022-10-18,Pharmacy #6,G99.93,generic simarol,96934,S76J7V6,,0,13.93,1,...,1,144501,7380,6,1,291,61,37,0,34


In [16]:
# Report how imbalanced the target is: rejected count, total count, and share.
n_rejected = cmm_train['rejected'].sum()
n_claims = len(cmm_train)
print('The number of rejected claims is', n_rejected)
print('The total number of claims is', n_claims)
print('The percentage of rejected claims is', n_rejected/n_claims*100, '%')

The number of rejected claims is 839691
The total number of claims is 10049476
The percentage of rejected claims is 8.355569981957268 %


The rejected category is imbalanced so some consideration should be given to the metrics used to score the classifier.

In [17]:
# Nested feature sets, each one adding columns in front of the previous set.
base_features = [
    'diagnosis_encoded',
    'bin_encoded',
    'pcn_encoded',
    'private',
    'generic',
    'name_encoded',
]
# Base features plus the pharmacy number.
pharm_features = ['pharm_num'] + base_features
# Pharmacy features plus the day of the week.
day_features = ['weekday'] + pharm_features
# Everything: day of week, day of year, pharmacy, and the base features.
full_features = ['weekday', 'day_num'] + pharm_features

### Importing the HistGradientBoostingClassifier
This should be faster than XGBoost for large datasets.

Using cross validation with recall, precision, and accuracy for metrics

In [18]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, precision_score, accuracy_score

In [19]:
# 5-fold splitter (shuffled, seeded) reused by the cross-validation loops.
kf = KFold(n_splits=5, shuffle=True, random_state=614)
# Seed the classifier as well: its binning subsample and its internal
# early-stopping validation split are random, so without a seed the scores
# below would differ from run to run.
clf = HistGradientBoostingClassifier(random_state=614)

In [20]:
# Cross-validate on the imbalanced training frame using the base features.
# One row per fold; columns are 0 = recall, 1 = precision, 2 = accuracy.
metrics = np.zeros((5, 3))
for fold, (tt_index, ho_index) in enumerate(kf.split(cmm_train)):
    fold_train = cmm_train.iloc[tt_index]
    fold_holdout = cmm_train.iloc[ho_index]
    clf.fit(fold_train[base_features], fold_train['rejected'])
    predictions = clf.predict(fold_holdout[base_features])
    metrics[fold, 0] = recall_score(fold_holdout['rejected'], predictions)
    metrics[fold, 1] = precision_score(fold_holdout['rejected'], predictions)
    metrics[fold, 2] = accuracy_score(fold_holdout['rejected'], predictions)

In [21]:
# Average each metric column over the five folds.
avg_recall, avg_precision, avg_accuracy = metrics.mean(axis=0)
print('average recall is', avg_recall)
print('average precision is', avg_precision)
print('average accuracy is', avg_accuracy)

average recall is 3.932275284816957e-05
average precision is 0.22924972369995816
average accuracy is 0.9164352449541298


The first go at classification of rejected claims failed to give any useful information

High accuracy with low precision and recall implies that almost every claim is being classified as not rejected.

### Let's try Down Sampling

In [22]:
from sklearn.utils import resample

In [23]:
# Randomly keep only as many non-rejected claims as there are rejected ones.
# Sampling is without replacement and seeded so the balanced set (and every
# score computed from it) is reproducible across kernel restarts.
n_rejected = len(cmm_train[cmm_train['rejected']==1])
temp = resample(cmm_train[cmm_train['rejected']==0],
                replace=False,
                n_samples=n_rejected,
                random_state=614)

In [24]:
# Balanced frame: all rejected claims first, followed by the sampled
# non-rejected claims held in `temp`.
downsample = pd.concat([cmm_train[cmm_train['rejected']==1], temp])

In [25]:
# Display the balanced frame.
downsample

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,...,private,popularity,rejected_count,pharm_num,weekday,day_num,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded
8270129,2022-08-14,Pharmacy #15,P07.55,branded semufolic,664344,YFVIA,AJK5MZ25T9IA,1,0.00,0,...,0,237813,41930,15,6,226,60,79,5,44
7009332,2022-07-14,Pharmacy #14,Y51.55,generic sorine,160389,RB7UU,RS5RB3YA,1,0.00,1,...,0,115708,13891,14,3,195,63,124,1,31
1316687,2022-02-10,Pharmacy #37,Z20.23,branded vivafastat,160389,RB7UU,RS5RB3YA,1,0.00,0,...,0,43263,14072,37,3,41,75,127,1,31
9652363,2022-09-17,Pharmacy #7,I38.43,branded colifunene,725700,,DYGBI610ZY,1,0.00,0,...,0,179278,14364,7,5,260,7,49,8,48
7415161,2022-07-23,Pharmacy #43,H36.57,branded semufolic,725700,9C5MOR3,S2QKZ0OFNWS6X,1,0.00,0,...,0,237813,41930,43,5,204,60,41,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407331,2022-01-14,Pharmacy #30,W50.87,generic tanoclolol,725700,327CKV,IOEAN1DWVV3Y,0,6.50,1,...,0,381064,0,30,4,14,67,115,8,2
958726,2022-01-28,Pharmacy #47,S15.62,branded oxasoted,571569,W7L3,V96T9QL5,0,8.80,0,...,0,226597,0,47,4,28,48,92,4,41
9458353,2022-09-13,Pharmacy #37,N48.90,branded bovirol,664344,K5KDJ7G,1N5IRQ,0,19.82,0,...,0,158499,2376,37,1,256,1,74,5,20
11095428,2022-10-21,Pharmacy #44,P07.55,generic colade,322463,3O71UTS,,0,7.61,1,...,1,95485,85,44,4,294,6,79,2,4


In [26]:
# Row count of the balanced frame (rejected rows plus an equal-sized sample).
len(downsample)

1679382

In [26]:
# Cross-validate on the balanced (downsampled) frame using the base features.
# One row per fold; columns are 0 = recall, 1 = precision, 2 = accuracy.
metrics_down = np.zeros((5, 3))
for fold, (tt_index, ho_index) in enumerate(kf.split(downsample)):
    fold_train = downsample.iloc[tt_index]
    fold_holdout = downsample.iloc[ho_index]
    clf.fit(fold_train[base_features], fold_train['rejected'])
    predictions = clf.predict(fold_holdout[base_features])
    metrics_down[fold, 0] = recall_score(fold_holdout['rejected'], predictions)
    metrics_down[fold, 1] = precision_score(fold_holdout['rejected'], predictions)
    metrics_down[fold, 2] = accuracy_score(fold_holdout['rejected'], predictions)

In [27]:
# Average each metric column over the five folds.
avg_recall, avg_precision, avg_accuracy = metrics_down.mean(axis=0)
print('average recall is', avg_recall)
print('average precision is', avg_precision)
print('average accuracy is', avg_accuracy)

average recall is 0.9772685038894913
average precision is 0.7969731650716227
average accuracy is 0.8641565759622676


Added the pharmacy and day of week as features in this model

In [28]:
# Cross-validate on the balanced frame using the day-of-week feature set.
# One row per fold; columns are 0 = recall, 1 = precision, 2 = accuracy.
metrics_down = np.zeros((5, 3))
for fold, (tt_index, ho_index) in enumerate(kf.split(downsample)):
    fold_train = downsample.iloc[tt_index]
    fold_holdout = downsample.iloc[ho_index]
    clf.fit(fold_train[day_features], fold_train['rejected'])
    predictions = clf.predict(fold_holdout[day_features])
    metrics_down[fold, 0] = recall_score(fold_holdout['rejected'], predictions)
    metrics_down[fold, 1] = precision_score(fold_holdout['rejected'], predictions)
    metrics_down[fold, 2] = accuracy_score(fold_holdout['rejected'], predictions)

In [29]:
# Average each metric column over the five folds.
avg_recall, avg_precision, avg_accuracy = metrics_down.mean(axis=0)
print('average recall is', avg_recall)
print('average precision is', avg_precision)
print('average accuracy is', avg_accuracy)

average recall is 0.9774287009551322
average precision is 0.7966661326544616
average accuracy is 0.8639797257066422


full feature set

In [30]:
# Cross-validate on the balanced frame using the full feature set.
# One row per fold; columns are 0 = recall, 1 = precision, 2 = accuracy.
metrics_down = np.zeros((5, 3))
for fold, (tt_index, ho_index) in enumerate(kf.split(downsample)):
    fold_train = downsample.iloc[tt_index]
    fold_holdout = downsample.iloc[ho_index]
    clf.fit(fold_train[full_features], fold_train['rejected'])
    predictions = clf.predict(fold_holdout[full_features])
    metrics_down[fold, 0] = recall_score(fold_holdout['rejected'], predictions)
    metrics_down[fold, 1] = precision_score(fold_holdout['rejected'], predictions)
    metrics_down[fold, 2] = accuracy_score(fold_holdout['rejected'], predictions)

In [31]:
# Average each metric column over the five folds.
avg_recall, avg_precision, avg_accuracy = metrics_down.mean(axis=0)
print('average recall is', avg_recall)
print('average precision is', avg_precision)
print('average accuracy is', avg_accuracy)

average recall is 0.9775001584202838
average precision is 0.797086335718706
average accuracy is 0.8643292589064743


This is somewhat promising. Below a classifier trained on the under sampled data will be used to predict on the full training data set.

In [32]:
from sklearn.metrics import confusion_matrix

base features

In [33]:
# Fit on the whole balanced frame, then score against the full training frame.
# NOTE(review): the balanced frame is drawn from cmm_train, so these scores
# include rows the model was fitted on.
pred_down = clf.fit(downsample[base_features], downsample['rejected']).predict(cmm_train[base_features])

y_train = cmm_train['rejected']
print('recall is',recall_score(y_train, pred_down))
print('precision is',precision_score(y_train, pred_down))
print('accuracy is',accuracy_score(y_train, pred_down))

recall is 0.9781312411351318
precision is 0.2628161293780529
accuracy is 0.768929245664152


In [34]:
# Confusion matrix for the latest predictions; rows are true labels and
# columns are predicted labels.
cf_mat = confusion_matrix(cmm_train['rejected'], pred_down)
row_labels = ['Actual 0', 'Actual 1']
col_labels = ['Predicted 0', 'Predicted 1']
pd.DataFrame(cf_mat, index=row_labels, columns=col_labels)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6906008,2303777
Actual 1,18363,821328


In [37]:
# Keep the second column of predict_proba: the predicted probability of the
# positive (rejected == 1) class for every training row.
prob_rejected = clf.predict_proba(cmm_train[base_features])[:,1]

In [38]:
# Work on a copy so the probability columns do not end up on cmm_train.
cmm_with_preds = cmm_train.copy()
cmm_with_preds['probs_base'] = prob_rejected

In [39]:
# Display the training frame with the base-feature probabilities attached.
cmm_with_preds

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,...,popularity,rejected_count,pharm_num,weekday,day_num,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded,probs_base
3962160,2022-04-23,Pharmacy #29,M31.63,branded mamate,322463,3O71UTS,,0,25.75,0,...,23539,4480,29,5,113,38,66,2,4,0.729853
1161541,2022-02-06,Pharmacy #7,Z20.23,branded vivafastat,725700,327CKV,IOEAN1DWVV3Y,0,180.52,0,...,43263,14072,7,6,37,75,127,8,2,0.835070
10643305,2022-10-12,Pharmacy #4,U41.19,generic foxivelule,322463,3O71UTS,,0,12.22,1,...,367822,0,4,2,285,19,101,2,4,0.001492
10629491,2022-10-11,Pharmacy #31,K87.68,generic ribosatharin,664344,YFVIA,AJK5MZ25T9IA,0,12.37,1,...,121908,8960,31,1,284,56,59,5,44,0.011734
10928718,2022-10-18,Pharmacy #6,G99.93,generic simarol,96934,S76J7V6,,0,13.93,1,...,144501,7380,6,1,291,61,37,0,34,0.004682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12922697,2022-12-05,Pharmacy #43,G99.93,branded pucomalol,664344,,52H8KH0F83K,0,14.14,0,...,417336,10707,43,0,339,54,37,5,48,0.276727
2681870,2022-03-19,Pharmacy #33,P07.55,branded tricatripride,757349,MSCXSG,DGLGRYP,1,0.00,0,...,31737,6996,33,5,78,73,79,10,24,0.740330
3518530,2022-04-13,Pharmacy #44,G99.93,generic tanoclolol,664344,TPJD,,0,6.56,1,...,381064,0,44,2,103,67,37,5,38,0.002497
9525850,2022-09-14,Pharmacy #13,K32.86,branded tafistitrisin,96934,9D24,VC81HUO7ZD,0,10.17,0,...,139317,933,13,2,257,66,57,0,9,0.033269


train on features that include day of week.

In [40]:
# Refit on the balanced frame with the day-of-week feature set and predict
# for every row of the full training frame.
pred_down = clf.fit(downsample[day_features], downsample['rejected']).predict(cmm_train[day_features])

In [41]:
# Score the latest predictions against the true labels of the training frame.
y_train = cmm_train['rejected']
print('recall is',recall_score(y_train, pred_down))
print('precision is',precision_score(y_train, pred_down))
print('accuracy is',accuracy_score(y_train, pred_down))

recall is 0.979282855240797
precision is 0.2633084785013068
accuracy is 0.7693377246734059


In [42]:
# Confusion matrix for the latest predictions; rows are true labels and
# columns are predicted labels.
cf_mat = confusion_matrix(cmm_train['rejected'], pred_down)
row_labels = ['Actual 0', 'Actual 1']
col_labels = ['Predicted 0', 'Predicted 1']
pd.DataFrame(cf_mat, index=row_labels, columns=col_labels)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6909146,2300639
Actual 1,17396,822295


In [43]:
# Predicted class probabilities from the day-of-week model; the second column
# is the positive (rejected == 1) class.
day_probabilities = clf.predict_proba(cmm_train[day_features])
prob_rejected = day_probabilities[:, 1]
cmm_with_preds['probs_day'] = prob_rejected
cmm_with_preds.head()

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,...,rejected_count,pharm_num,weekday,day_num,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded,probs_base,probs_day
3962160,2022-04-23,Pharmacy #29,M31.63,branded mamate,322463,3O71UTS,,0,25.75,0,...,4480,29,5,113,38,66,2,4,0.729853,0.713333
1161541,2022-02-06,Pharmacy #7,Z20.23,branded vivafastat,725700,327CKV,IOEAN1DWVV3Y,0,180.52,0,...,14072,7,6,37,75,127,8,2,0.83507,0.835485
10643305,2022-10-12,Pharmacy #4,U41.19,generic foxivelule,322463,3O71UTS,,0,12.22,1,...,0,4,2,285,19,101,2,4,0.001492,0.001079
10629491,2022-10-11,Pharmacy #31,K87.68,generic ribosatharin,664344,YFVIA,AJK5MZ25T9IA,0,12.37,1,...,8960,31,1,284,56,59,5,44,0.011734,0.011886
10928718,2022-10-18,Pharmacy #6,G99.93,generic simarol,96934,S76J7V6,,0,13.93,1,...,7380,6,1,291,61,37,0,34,0.004682,0.004394


How many features are needed to make a decent classifier?

In [58]:
def examine_features(features_list):
    """Fit on the balanced sample with ``features_list`` and print scores.

    Uses the notebook-level ``clf``, ``downsample`` and ``cmm_train``: the
    classifier is refitted on ``downsample`` restricted to ``features_list``
    and then scored (recall, precision, accuracy) on all of ``cmm_train``.

    Parameters
    ----------
    features_list : list of str
        Column names to use as model inputs.

    Returns
    -------
    None
        The scores are printed, not returned. ``clf`` is left fitted on
        ``features_list`` as a side effect.
    """
    y_true = cmm_train['rejected']
    fitted = clf.fit(downsample[features_list], downsample['rejected'])
    pred_down = fitted.predict(cmm_train[features_list])

    recall = recall_score(y_true, pred_down)
    precision = precision_score(y_true, pred_down)
    accuracy = accuracy_score(y_true, pred_down)
    print(features_list,'\n', 'recall:', recall,
          'precision:', precision,
          'accuracy:', accuracy, '\n')

In [60]:
# Feature subsets to compare, starting from the drug name alone and growing
# towards the full feature set.
features = [
    ['name_encoded'],
    ['name_encoded', 'diagnosis_encoded'],
    ['name_encoded', 'diagnosis_encoded', 'generic'],
    ['name_encoded', 'diagnosis_encoded', 'bin_encoded'],
    ['name_encoded', 'diagnosis_encoded', 'bin_encoded', 'pcn_encoded'],
    ['diagnosis_encoded', 'bin_encoded', 'pcn_encoded', 'private', 'generic',
     'name_encoded'],
    ['weekday', 'pharm_num', 'diagnosis_encoded', 'bin_encoded',
     'pcn_encoded', 'private', 'generic', 'name_encoded'],
    ['day_num', 'pharm_num', 'diagnosis_encoded', 'bin_encoded',
     'pcn_encoded', 'private', 'generic', 'name_encoded'],
    ['weekday', 'diagnosis_encoded', 'bin_encoded', 'private', 'generic',
     'name_encoded'],
    ['weekday', 'day_num', 'diagnosis_encoded', 'bin_encoded', 'pcn_encoded',
     'private', 'generic', 'name_encoded'],
    ['weekday', 'day_num', 'pharm_num', 'diagnosis_encoded', 'bin_encoded',
     'pcn_encoded', 'private', 'generic', 'name_encoded'],
]

# Fit and score once per subset; each call prints its own results.
for feature_list in features:
    examine_features(feature_list)

['name_encoded'] 
 recall: 0.8798962951847763 precision: 0.25709843560734047 accuracy: 0.7775231265789381 

['name_encoded', 'diagnosis_encoded'] 
 recall: 0.8860592765672134 precision: 0.25321532300935856 accuracy: 0.7721340893793865 

['name_encoded', 'diagnosis_encoded', 'generic'] 
 recall: 0.9183985537537023 precision: 0.2579381166063272 accuracy: 0.7724158951173176 

['name_encoded', 'diagnosis_encoded', 'bin_encoded'] 
 recall: 0.920395716995895 precision: 0.2446209212596182 accuracy: 0.755871350904266 

['name_encoded', 'diagnosis_encoded', 'bin_encoded', 'pcn_encoded'] 
 recall: 0.9292537373867291 precision: 0.24781195552961188 accuracy: 0.7584131749754912 

['diagnosis_encoded', 'bin_encoded', 'pcn_encoded', 'private', 'generic', 'name_encoded'] 
 recall: 0.9788052986158003 precision: 0.26198830782177396 accuracy: 0.7678443134746528 

['weekday', 'pharm_num', 'diagnosis_encoded', 'bin_encoded', 'pcn_encoded', 'private', 'generic', 'name_encoded'] 
 recall: 0.977708466566868 p

In [61]:
# Same as the base feature set but with 'rejected_count' added.
# NOTE(review): 'rejected_count' is the per-drug sum of the 'rejected' target
# over the training frame, so this run feeds label information to the model —
# treat its scores with caution.
examine_features(['diagnosis_encoded', 'bin_encoded', 'pcn_encoded', 
                  'rejected_count', 'private', 'generic', 'name_encoded'] )

['diagnosis_encoded', 'bin_encoded', 'pcn_encoded', 'rejected_count', 'private', 'generic', 'name_encoded'] 
 recall: 0.9887720601983349 precision: 0.26490867693548614 accuracy: 0.7698075999186426 



Downsampling looks to be an efficient way to train the classifier to predict rejected claims but reduces the training data for non-rejected claims. Perhaps upsampling will fix that problem.

In [39]:
# Target number of rows per class: half the size of the training frame.
number = int(len(cmm_train)*0.5)

In [40]:
# Build a balanced frame with `number` rows per class. Rejected claims are
# drawn with replacement (so rows repeat); non-rejected claims are drawn
# without replacement. Both draws are seeded so the frame, and every score
# computed from it, is reproducible across kernel restarts.
minority = resample(cmm_train[cmm_train['rejected']==1],
                    replace=True,
                    n_samples=number,
                    random_state=614)

majority = resample(cmm_train[cmm_train['rejected']==0],
                    replace=False,
                    n_samples=number,
                    random_state=614)

oversample = pd.concat([majority, minority])
oversample.head()

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,...,private,popularity,rejected_count,pharm_num,weekday,day_num,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded
4289270,2022-05-04,Pharmacy #49,I68.27,branded hidizuzunib,757349,MSCXSG,DGLGRYP,0,85.1,0,...,0,334720,110660,49,2,124,29,52,10,24
8027369,2022-08-09,Pharmacy #49,U27.71,branded colifunene,96934,9D24,VC81HUO7ZD,0,10.02,0,...,0,179278,14364,49,1,221,7,100,0,9
1946560,2022-02-26,Pharmacy #22,Q85.91,branded rulfalol,664344,CS8580,,0,5.87,0,...,1,531018,0,22,5,57,57,89,5,15
12094467,2022-11-15,Pharmacy #36,G99.93,branded dienulol,96934,S76J7V6,,0,13.93,0,...,1,323566,0,36,1,319,14,37,0,34
4833958,2022-05-17,Pharmacy #8,Q72.66,generic ratin,664344,,52H8KH0F83K,0,6.11,1,...,0,262785,12316,8,1,137,55,86,5,48


In [None]:
# Cross-validate the resampling recipe without leaking rows between folds.
# The split is made on the original training frame and the rebalancing is
# applied to the training part of each fold only. Splitting an already
# oversampled frame would let copies of the same rejected claim
# (replace=True) land in both the training and hold-out parts and inflate the
# scores. Each hold-out fold keeps its original class balance.
# One row per fold; columns are 0 = recall, 1 = precision, 2 = accuracy.
metrics_over = np.zeros((5, 3))
for fold, (tt_index, ho_index) in enumerate(kf.split(cmm_train)):
    fold_train = cmm_train.iloc[tt_index]
    fold_holdout = cmm_train.iloc[ho_index]

    # Same recipe as the full-frame resampling: half the fold size per class.
    per_class = int(len(fold_train)*0.5)
    fold_minority = resample(fold_train[fold_train['rejected']==1],
                             replace=True,
                             n_samples=per_class,
                             random_state=614)
    fold_majority = resample(fold_train[fold_train['rejected']==0],
                             replace=False,
                             n_samples=per_class,
                             random_state=614)
    fold_balanced = pd.concat([fold_majority, fold_minority])

    clf.fit(fold_balanced[base_features], fold_balanced['rejected'])
    predictions = clf.predict(fold_holdout[base_features])
    metrics_over[fold, 0] = recall_score(fold_holdout['rejected'], predictions)
    metrics_over[fold, 1] = precision_score(fold_holdout['rejected'], predictions)
    metrics_over[fold, 2] = accuracy_score(fold_holdout['rejected'], predictions)

In [None]:
# Average each metric column over the five folds.
avg_recall, avg_precision, avg_accuracy = metrics_over.mean(axis=0)
print('average recall is', avg_recall)
print('average precision is', avg_precision)
print('average accuracy is', avg_accuracy)

In [37]:
# Fit on the whole resampled frame, then score against the full training frame.
# NOTE(review): the resampled frame is drawn from cmm_train, so these scores
# include rows the model was fitted on.
pred_over = clf.fit(oversample[base_features], oversample['rejected']).predict(cmm_train[base_features])

y_train = cmm_train['rejected']
print('recall is',recall_score(y_train, pred_over))
print('precision is',precision_score(y_train, pred_over))
print('accuracy is',accuracy_score(y_train, pred_over))

recall is 0.9770487000575212
precision is 0.263040127118155
accuracy is 0.7693570291625156


In [38]:
# Confusion matrix for the resampled-model predictions; rows are true labels
# and columns are predicted labels.
cf_mat = confusion_matrix(cmm_train['rejected'], pred_over)
row_labels = ['Actual 0', 'Actual 1']
col_labels = ['Predicted 0', 'Predicted 1']
pd.DataFrame(cf_mat, index=row_labels, columns=col_labels)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6911216,2298569
Actual 1,19272,820419


# Let's Compare to some other Algorithms

AdaBoost and perhaps random forest

In [40]:
from sklearn.ensemble import AdaBoostClassifier

In [41]:
# AdaBoost ensemble of 100 estimators, seeded for repeatable results.
ada_class = AdaBoostClassifier(n_estimators=100, random_state=0)

In [29]:
# Cross-validate AdaBoost on the balanced (downsampled) frame, base features.
# Requires `downsample` from the down-sampling section to exist in the kernel.
# One row per fold; columns are 0 = recall, 1 = precision, 2 = accuracy.
metrics_down = np.zeros((5, 3))
for fold, (tt_index, ho_index) in enumerate(kf.split(downsample)):
    fold_train = downsample.iloc[tt_index]
    fold_holdout = downsample.iloc[ho_index]
    ada_class.fit(fold_train[base_features], fold_train['rejected'])
    predictions = ada_class.predict(fold_holdout[base_features])
    metrics_down[fold, 0] = recall_score(fold_holdout['rejected'], predictions)
    metrics_down[fold, 1] = precision_score(fold_holdout['rejected'], predictions)
    metrics_down[fold, 2] = accuracy_score(fold_holdout['rejected'], predictions)

NameError: name 'downsample' is not defined

In [45]:
# Average each metric column over the five folds.
avg_recall, avg_precision, avg_accuracy = metrics_down.mean(axis=0)
print('average recall is', avg_recall)
print('average precision is', avg_precision)
print('average accuracy is', avg_accuracy)

average recall is 0.8758974919919822
average precision is 0.7827331190677065
average accuracy is 0.8163854313882792


Train adaboost classifier on the downsampled data but predict on the train set

In [47]:
# Fit AdaBoost on the whole balanced frame and predict for every row of the
# full training frame.
pred_down = ada_class.fit(downsample[base_features], downsample['rejected']).predict(cmm_train[base_features])

In [48]:
# Score the AdaBoost predictions against the true labels of the training frame.
y_train = cmm_train['rejected']
print('recall is',recall_score(y_train, pred_down))
print('precision is',precision_score(y_train, pred_down))
print('accuracy is',accuracy_score(y_train, pred_down))

recall is 0.8768499364647233
precision is 0.24978271150261866
accuracy is 0.7696577413588529


In [49]:
# Confusion matrix of true labels (rows) against AdaBoost predictions (columns).
cf_mat = confusion_matrix(cmm_train['rejected'], pred_down)

In [50]:
# Label the matrix so rows read as actual classes and columns as predictions.
pd.DataFrame(cf_mat, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6998374,2211411
Actual 1,103408,736283
