In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
sns.set(color_codes = True)
pd.set_option('display.max_columns', 50)

import matplotlib.pyplot as plt
#matplotlib inline

import pylab 
import scipy.stats as stats

from sklearn.metrics import accuracy_score,confusion_matrix,recall_score
from sklearn.metrics import classification_report 

In [3]:
data=pd.read_csv(r"LOL.csv",error_bad_lines=False,parse_dates=True,warn_bad_lines=False)

In [4]:
data.shape

(193482, 150)

In [5]:
d_copy = data.copy(True)
data.replace(['\\N','NaT'],np.nan,inplace=True)

In [6]:
data['is_valid'] = np.where(((data['fk_lu_invalid_deduction_identifier_id'].isnull() == False)  | (data['correspondence_flag'] == 1)),0,1)

In [7]:
outcols=['fk_lu_invalid_deduction_identifier_id','correspondence_flag','fk_action_code_id','fk_resolution_type_id',"is_valid","vendor_id"]

In [8]:
st_cols=[]
for i in data.columns:
    if(data[i].nunique())==1:
        st_cols.append(i)
result = [x for x in st_cols if x not in outcols]
data.drop(result,axis=1,inplace=True)

In [9]:
a = data.isnull().sum()
a = a.drop(outcols)
b = a[a>(0.8*data.shape[0])].index
data.drop(b,axis=1,inplace=True)

In [10]:
data['is_valid'].value_counts()

1    182722
0     10760
Name: is_valid, dtype: int64

In [11]:
data['pk_deduction_id'].nunique()

193470

In [12]:
data['pk_deduction_id']= pd.to_numeric(data['pk_deduction_id'], errors='coerce')
data['fk_account_id']= pd.to_numeric(data['fk_account_id'], errors='coerce')
data['fk_customer_map_id']= pd.to_numeric(data['fk_customer_map_id'], errors='coerce')
data['fk_reason_code_map_id']= pd.to_numeric(data['fk_reason_code_map_id'], errors='coerce')
data['fk_action_code_id']= pd.to_numeric(data['fk_action_code_id'], errors='coerce')
data['fk_document_type_map_id']= pd.to_numeric(data['fk_document_type_map_id'], errors='coerce')
data['fk_deduction_type_id']= pd.to_numeric(data['fk_deduction_type_id'], errors='coerce')
data['fk_resolution_type_id']= pd.to_numeric(data['fk_resolution_type_id'], errors='coerce')

In [13]:
data['pk_deduction_id'].isnull().sum()
data = data[pd.notnull(data['pk_deduction_id'])]
data.shape

(193464, 43)

In [14]:
data = data[pd.notnull(data['fk_customer_map_id'])]
data.shape

(193445, 43)

In [15]:
data = data[pd.notnull(data['original_dispute_amount']>0)]
data.shape

(193445, 43)

In [16]:
data['deduction_created_date']=pd.to_datetime(data['deduction_created_date'])
data['deduction_closed_date']=pd.to_datetime(data['deduction_closed_date'])

In [17]:
data.sort_values(by='deduction_created_date',inplace=True)

In [18]:
data.tail(3)

Unnamed: 0,pk_deduction_id,fk_account_id,fk_customer_map_id,fk_reason_category_id,fk_reason_code_map_id,fk_action_code_id,fk_document_type_map_id,company_code,ext_reference,current_dispute_amount,original_dispute_amount,original_dispute_number,reference_number2,reference_number4,reference_number10,deduction_closed_date,deduction_created_date,vendor_id,fk_deduction_type_id,accounting_doc_number,payer,check_number,posting_date,merge_status,invoice_link_status,correspondence_flag,comments,ar_reason_code,check_date,cal_credited_amount,cal_write_off_amount,cal_repay_amount,cal_promotion_settlement_amount,cal_reinstate_amount,cal_unresolved_amount,check_amount,fund_number,fk_lu_invalid_deduction_identifier_id,reference_number12,reference_number14,reference_number15,fk_resolution_type_id,is_valid
193409,29811025.0,263.0,95104170.0,4,1839.0,543.0,-1.0,1,157635-BB,0,1.5,1708699,FSV,64407000034500,2194179,2019-01-18 20:38:09,2019-01-18 00:00:00,,0.0,1708699,1474427,739430,2019-01-18,0,0,0,,ZM,2019-01-17,0,1.5,0,0,0,0,23736.6,,,JDE,1000 - TOTAL FOODSERVICE,4055 - WAYPOINT CENTRAL STREET,,1
193481,29811154.0,263.0,97537202.0,174,1842.0,543.0,-1.0,1,3781421,0,93.73,1708565,MIC,77105000042634,2597867,2019-01-18 20:39:16,2019-01-18 00:00:00,,0.0,1708565,2586666,2586666 011819,2019-01-18,0,0,0,,ZS,2019-01-18,0,93.73,0,0,0,0,0.0,,,JDE,1002 - TOTAL RETAIL FOODS,4038 - AMAZON_E,,1
193367,29810948.0,263.0,95101302.0,4,1854.0,831.0,-1.0,1,SPDN010200,0,18.0,1696491,RET,53000000073491,1497434,2019-01-18 09:41:15,2019-01-18 09:39:42,,0.0,1696491,1998474,1998474 112018,2018-11-20,0,0,0,emailed Jim,ZM,2018-11-20,0,18.0,0,0,0,0,9557.21,,,JDE,1002 - TOTAL RETAIL FOODS,4064 - WEST AM,,1


In [19]:
test_set=data[data['deduction_created_date']>='2018-9-18']
train_val_set=data[data['deduction_created_date']<'2018-9-18']

In [20]:
val_set=train_val_set[train_val_set['deduction_created_date']>='2018-06-18']
train_set=train_val_set[train_val_set['deduction_created_date']<'2018-06-18']

In [21]:
print("Size of validation set is:",val_set.shape )
print("Size of Training set is:",train_set.shape )
print("Size of Test Set is:",test_set.shape)

Size of validation set is: (17524, 43)
Size of Training set is: (153866, 43)
Size of Test Set is: (22055, 43)


In [22]:
data_copy=data.copy()
data_copy.shape

(193445, 43)

In [23]:
train_set['original_dispute_amount']= pd.to_numeric(train_set['original_dispute_amount'], errors='coerce')
val_set['original_dispute_amount']= pd.to_numeric(val_set['original_dispute_amount'], errors='coerce')
test_set['original_dispute_amount']= pd.to_numeric(test_set['original_dispute_amount'], errors='coerce')

In [24]:
train_set['original_dispute_amount_log']=np.log(train_set["original_dispute_amount"])
val_set['original_dispute_amount_log']=np.log(val_set["original_dispute_amount"])
test_set['original_dispute_amount_log']=np.log(test_set["original_dispute_amount"])

In [25]:
train_set['fk_customer_map_id'].nunique()

632

### Feature

#### FK_CUSTOMER_MAP_ID_HISTORY

In [26]:
valid_deduction=train_set.groupby('fk_customer_map_id')['is_valid'].sum()
total_deduction=train_set.groupby('fk_customer_map_id')['is_valid'].count()
invalid_deduction=total_deduction-valid_deduction

In [27]:
train_set['fk_customer_map_id_hist']=train_set['fk_customer_map_id'].map(invalid_deduction/total_deduction)
val_set['fk_customer_map_id_hist']=val_set['fk_customer_map_id'].map(invalid_deduction/total_deduction)
test_set['fk_customer_map_id_hist']=test_set['fk_customer_map_id'].map(invalid_deduction/total_deduction)

In [28]:
train_set['fk_customer_map_id_hist'].fillna(train_set['fk_customer_map_id_hist'].mean(),inplace=True)
test_set['fk_customer_map_id_hist'].fillna(train_set['fk_customer_map_id_hist'].mean(),inplace=True)
val_set['fk_customer_map_id_hist'].fillna(train_set['fk_customer_map_id_hist'].mean(),inplace=True)

#### AR REASON CODE HISTORY

In [29]:
ar_total=train_set.groupby('ar_reason_code')['is_valid'].count()
ar_valid=train_set.groupby('ar_reason_code')['is_valid'].sum()
ar_invalid=ar_total-ar_valid

In [30]:
train_set['ar_reason_code_hist']=train_set['ar_reason_code'].map(ar_invalid/ar_total)
val_set['ar_reason_code_hist']=val_set['ar_reason_code'].map(ar_invalid/ar_total)
test_set['ar_reason_code_hist']=test_set['ar_reason_code'].map(ar_invalid/ar_total)

In [31]:
train_set['ar_reason_code_hist'].fillna(train_set['ar_reason_code_hist'].mean(),inplace=True)
test_set['ar_reason_code_hist'].fillna(train_set['ar_reason_code_hist'].mean(),inplace=True)
val_set['ar_reason_code_hist'].fillna(train_set['ar_reason_code_hist'].mean(),inplace=True)

#### REFERENCE ID 15 HISTORY

In [32]:
train_set['reference_number15'].fillna("others",inplace=True)
val_set['reference_number15'].fillna("others",inplace=True)
test_set['reference_number15'].fillna("others",inplace=True)

In [33]:
total_df=train_set.groupby(['fk_customer_map_id','reference_number15'])['is_valid'].agg(['count']).reset_index()
valid_df=train_set.groupby(['fk_customer_map_id','reference_number15'])['is_valid'].agg(['sum']).reset_index()

train_set=pd.merge(train_set,total_df,on=['fk_customer_map_id','reference_number15'], how='left')
train_set=pd.merge(train_set,valid_df,on=['fk_customer_map_id','reference_number15'], how='left')

train_set.rename(columns={'count':'total_deductions_ref','sum':'valid_deductions_ref'},inplace=True)
train_set['invalid_deductions_ref']=train_set['total_deductions_ref']-train_set['valid_deductions_ref']

val_set=pd.merge(val_set,total_df,on=['fk_customer_map_id','reference_number15'], how='left')
val_set=pd.merge(val_set,valid_df,on=['fk_customer_map_id','reference_number15'], how='left')
val_set.rename(columns={'count':'total_deductions_ref','sum':'valid_deductions_ref'},inplace=True)
val_set['invalid_deductions_ref']=val_set['total_deductions_ref']-val_set['valid_deductions_ref']

test_set=pd.merge(test_set,total_df,on=['fk_customer_map_id','reference_number15'], how='left')
test_set=pd.merge(test_set,valid_df,on=['fk_customer_map_id','reference_number15'], how='left')
test_set.rename(columns={'count':'total_deductions_ref','sum':'valid_deductions_ref'},inplace=True)
test_set['invalid_deductions_ref']=test_set['total_deductions_ref']-test_set['valid_deductions_ref']

In [34]:
train_set['ref_num15_fk_customer_map_id_history']=train_set['invalid_deductions_ref']/train_set['total_deductions_ref']
val_set['ref_num15_fk_customer_map_id_history']=val_set['invalid_deductions_ref']/val_set['total_deductions_ref']
test_set['ref_num15_fk_customer_map_id_history']=test_set['invalid_deductions_ref']/test_set['total_deductions_ref']

train_set['ref_num15_fk_customer_map_id_history'].fillna(train_set['ref_num15_fk_customer_map_id_history'].mean(),inplace=True)
val_set['ref_num15_fk_customer_map_id_history'].fillna(val_set['ref_num15_fk_customer_map_id_history'].mean(),inplace=True)
test_set['ref_num15_fk_customer_map_id_history'].fillna(test_set['ref_num15_fk_customer_map_id_history'].mean(),inplace=True)

### B_VALUE

In [35]:
train_set['total_number_deduction']=train_set['fk_customer_map_id'].map(total_deduction)
val_set['total_number_deduction']=val_set['fk_customer_map_id'].map(total_deduction)
test_set['total_number_deduction']=test_set['fk_customer_map_id'].map(total_deduction)

In [36]:
train_set['cust_label']=np.where(train_set['total_number_deduction']>500,1,0) 
val_set['cust_label']=np.where(val_set['total_number_deduction']>500,1,0) 
test_set['cust_label']=np.where(test_set['total_number_deduction']>500,1,0)

In [37]:
unique_cust_id = train_set['fk_customer_map_id'].unique().tolist()
train_set['avg_invalid_ded_amt']=0
m=[]

for k in unique_cust_id:
    temp=pd.DataFrame()
    temp=train_set.loc[(train_set['fk_customer_map_id']==k) & (train_set['is_valid']==0), :]
    temp = temp.reset_index()
    avg=temp['original_dispute_amount'].mean()
    m.append(avg)


avg_amt=dict(zip(unique_cust_id,m))
train_set['avg_invalid_ded_amt']=train_set['fk_customer_map_id'].map(avg_amt)
val_set['avg_invalid_ded_amt']=val_set['fk_customer_map_id'].map(avg_amt)
test_set['avg_invalid_ded_amt']=test_set['fk_customer_map_id'].map(avg_amt)

In [38]:
train_set['avg_invalid_ded_amt'].fillna(train_set['fk_customer_map_id'].mean(),inplace=True)
val_set['avg_invalid_ded_amt'].fillna(val_set['fk_customer_map_id'].mean(),inplace=True)
test_set['avg_invalid_ded_amt'].fillna(test_set['fk_customer_map_id'].mean(),inplace=True)

In [39]:
train_set['b_value']=np.where(train_set['cust_label']==1,train_set['avg_invalid_ded_amt']/train_set['original_dispute_amount'],train_set['original_dispute_amount']/train_set['avg_invalid_ded_amt'])
val_set['b_value']=np.where(val_set['cust_label']==1,val_set['avg_invalid_ded_amt']/val_set['original_dispute_amount'],val_set['original_dispute_amount']/val_set['avg_invalid_ded_amt'])
test_set['b_value']=np.where(test_set['cust_label']==1,test_set['avg_invalid_ded_amt']/test_set['original_dispute_amount'],test_set['original_dispute_amount']/test_set['avg_invalid_ded_amt'])

In [40]:
train_set['b_value']=np.where(train_set['b_value']<1,1,train_set['b_value']) 
val_set['b_value']=np.where(val_set['b_value']<1,1,val_set['b_value'])
test_set['b_value']=np.where(test_set['b_value']<1,1,test_set['b_value'])

In [41]:
train_set['b_value']=np.where(train_set['b_value']>50,50,train_set['b_value']) 
val_set['b_value']=np.where(val_set['b_value']>50,50,val_set['b_value'])
test_set['b_value']=np.where(test_set['b_value']>50,50,test_set['b_value'])

In [42]:
train_set['avg_invalid_ded_amt'].isnull().sum()

0

In [43]:
train_set['month_of_deduction']=train_set['deduction_created_date'].dt.month
test_set['month_of_deduction']=test_set['deduction_created_date'].dt.month
val_set['month_of_deduction']=val_set['deduction_created_date'].dt.month

In [44]:
total=train_set.groupby('payer')['is_valid'].count()
valid=train_set.groupby('payer')['is_valid'].sum()
invalid=total-valid

In [45]:
train_set['payer_invalid_ratio']=train_set['payer'].map(invalid/total)
val_set['payer_invalid_ratio']=val_set['payer'].map(invalid/total)
test_set['payer_invalid_ratio']=test_set['payer'].map(invalid/total)

In [46]:
train_set['payer_invalid_ratio'].fillna(train_set['payer_invalid_ratio'].mean(),inplace=True)
val_set['payer_invalid_ratio'].fillna(train_set['payer_invalid_ratio'].mean(),inplace=True)
test_set['payer_invalid_ratio'].fillna(train_set['payer_invalid_ratio'].mean(),inplace=True)

In [47]:
train_set['invalid_propensity']=train_set['fk_customer_map_id'].map((train_set.query('is_valid==0').groupby('fk_customer_map_id')['original_dispute_amount'].agg(['mean'])['mean'])/(train_set.groupby('fk_customer_map_id')['original_dispute_amount'].agg(['mean'])['mean']))
val_set['invalid_propensity']=val_set['fk_customer_map_id'].map((train_set.query('is_valid==0').groupby('fk_customer_map_id')['original_dispute_amount'].agg(['mean'])['mean'])/(train_set.groupby('fk_customer_map_id')['original_dispute_amount'].agg(['mean'])['mean']))
test_set['invalid_propensity']=test_set['fk_customer_map_id'].map((train_set.query('is_valid==0').groupby('fk_customer_map_id')['original_dispute_amount'].agg(['mean'])['mean'])/(train_set.groupby('fk_customer_map_id')['original_dispute_amount'].agg(['mean'])['mean']))
train_set['invalid_propensity'].fillna(train_set['invalid_propensity'].mean(),inplace=True)
val_set['invalid_propensity'].fillna(train_set['invalid_propensity'].mean(),inplace=True)
test_set['invalid_propensity'].fillna(train_set['invalid_propensity'].mean(),inplace=True)

### Minimum and Maximum amount per  Customer

In [48]:
train_set['max_invalid_amount']=train_set['fk_customer_map_id'].map((train_set.query('is_valid==0').groupby('fk_customer_map_id')['original_dispute_amount'].agg(['max'])['max']))
test_set['max_invalid_amount']=test_set['fk_customer_map_id'].map((train_set.query('is_valid==0').groupby('fk_customer_map_id')['original_dispute_amount'].agg(['max'])['max']))
val_set['max_invalid_amount']=val_set['fk_customer_map_id'].map((train_set.query('is_valid==0').groupby('fk_customer_map_id')['original_dispute_amount'].agg(['max'])['max']))


In [49]:
train_set['max_invalid_amount'].fillna(train_set['max_invalid_amount'].mean(),inplace=True)
test_set['max_invalid_amount'].fillna(train_set['max_invalid_amount'].mean(),inplace=True)
val_set['max_invalid_amount'].fillna(train_set['max_invalid_amount'].mean(),inplace=True)


In [50]:
train_set['min_invalid_amount']=train_set['fk_customer_map_id'].map((train_set.query('is_valid==0').groupby('fk_customer_map_id')['original_dispute_amount'].agg(['min'])['min']))
test_set['min_invalid_amount']=test_set['fk_customer_map_id'].map((train_set.query('is_valid==0').groupby('fk_customer_map_id')['original_dispute_amount'].agg(['min'])['min']))
val_set['min_invalid_amount']=val_set['fk_customer_map_id'].map((train_set.query('is_valid==0').groupby('fk_customer_map_id')['original_dispute_amount'].agg(['min'])['min']))


In [51]:
train_set['min_invalid_amount'].fillna(train_set['min_invalid_amount'].mean(),inplace=True)
test_set['min_invalid_amount'].fillna(train_set['min_invalid_amount'].mean(),inplace=True)
val_set['min_invalid_amount'].fillna(train_set['min_invalid_amount'].mean(),inplace=True)


In [52]:
features=['original_dispute_amount','b_value','ref_num15_fk_customer_map_id_history','ar_reason_code_hist','fk_customer_map_id_hist','invalid_propensity','max_invalid_amount','min_invalid_amount']

In [53]:
X_train=train_set[features]
Y_train=train_set['is_valid']
X_test=test_set[features]
Y_test=test_set['is_valid']
X_val=val_set[features]
Y_val=val_set['is_valid']

In [54]:
X_train=pd.concat([X_train,X_val])
Y_train=pd.concat([Y_train,Y_val])

In [55]:
X_train.isnull().sum()

original_dispute_amount                 0
b_value                                 0
ref_num15_fk_customer_map_id_history    0
ar_reason_code_hist                     0
fk_customer_map_id_hist                 0
invalid_propensity                      0
max_invalid_amount                      0
min_invalid_amount                      0
dtype: int64

## Random Forest

In [56]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(
            n_estimators=200,
            class_weight={0:3,1:1},
            max_features=0.7,
            max_depth=7,
            min_samples_split=700,
            random_state=8
        )
model.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight={0: 3, 1: 1},
            criterion='gini', max_depth=7, max_features=0.7,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=700, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=None, oob_score=False, random_state=8,
            verbose=0, warm_start=False)

In [57]:
y_pred1_rf=model.predict(X_test)
print("Confusion Matrix: ",confusion_matrix(Y_test, y_pred1_rf)) 
print("Accuracy using random forest: ",accuracy_score(Y_test,y_pred1_rf)*100)
print("Recall Score using random forest: ", recall_score(Y_test, y_pred1_rf)) 
print("classification report")
print(classification_report(Y_test, y_pred1_rf)) 

Confusion Matrix:  [[  128   203]
 [  731 20993]]
Accuracy using random forest:  95.76513262298798
Recall Score using random forest:  0.9663505800036826
classification report
              precision    recall  f1-score   support

           0       0.15      0.39      0.22       331
           1       0.99      0.97      0.98     21724

   micro avg       0.96      0.96      0.96     22055
   macro avg       0.57      0.68      0.60     22055
weighted avg       0.98      0.96      0.97     22055



In [361]:
l=model.predict_proba(X_test[features])
thres=0.9
y_pred1_rf=l[:,1]>thres

In [362]:
def proba_(pred_pro1,x_test1,y_test1):  
    y_t_rf=[None]*pred_pro1.shape[0]
    for i in range(0,pred_pro1.shape[0]):
        if(pred_pro1[i][1]>0.9 and x_test1.iloc[i]['original_dispute_amount']<10000):
                y_t_rf[i]=1
        else:
               y_t_rf[i]=0
    return y_t_rf

In [363]:
thresh_proba=proba_(l,X_test,Y_test)

In [364]:
print("Confusion Matrix: ",confusion_matrix(Y_test, thresh_proba)) 
print ("Accuracy:",accuracy_score(Y_test,thresh_proba)*100) 
print("Recall Score: ", recall_score(Y_test, thresh_proba)) 
print("classification report")
print(classification_report(Y_test, thresh_proba)) 

Confusion Matrix:  [[  306    25]
 [ 6122 15603]]
Accuracy: 72.13003264417846
Recall Score:  0.718204833141542
classification report
              precision    recall  f1-score   support

           0       0.05      0.92      0.09       331
           1       1.00      0.72      0.84     21725

   micro avg       0.72      0.72      0.72     22056
   macro avg       0.52      0.82      0.46     22056
weighted avg       0.98      0.72      0.82     22056



In [365]:
a=zip(features,model.feature_importances_)
a=list(a)
srt=sorted(a, key = lambda x: x[1])
srt

[('max_invalid_amount', 0.00842538404051156),
 ('invalid_propensity', 0.010720330177079051),
 ('b_value', 0.020910899796701507),
 ('min_invalid_amount', 0.02780699780645524),
 ('fk_customer_map_id_hist', 0.11151185241156293),
 ('ar_reason_code_hist', 0.1385579392295352),
 ('original_dispute_amount', 0.18137054711993003),
 ('ref_num15_fk_customer_map_id_history', 0.5006960494182242)]

## LGBM

In [339]:
from lightgbm import LGBMClassifier
lgbm=LGBMClassifier(boosting_type='gbdt',max_depth=5,n_estimators=200,learning_rate=0.07,class_weight={0:3,1:1})

In [340]:
lgbm.fit(X_train,Y_train)

LGBMClassifier(boosting_type='gbdt', class_weight={0: 3, 1: 1},
        colsample_bytree=1.0, importance_type='split', learning_rate=0.07,
        max_depth=5, min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=200, n_jobs=-1, num_leaves=31,
        objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

In [341]:
y_pred1_lgbm=lgbm.predict(X_test[features])
print("Confusion Matrix: ",confusion_matrix(Y_test, y_pred1_lgbm)) 
print("Accuracy using random forest: ",accuracy_score(Y_test,y_pred1_lgbm)*100)
print("Recall Score using random forest: ", recall_score(Y_test, y_pred1_lgbm)) 
print("classification report")
print(classification_report(Y_test, y_pred1_lgbm)) 

Confusion Matrix:  [[  135   196]
 [  510 21215]]
Accuracy using random forest:  96.79905694595575
Recall Score using random forest:  0.9765247410817031
classification report
              precision    recall  f1-score   support

           0       0.21      0.41      0.28       331
           1       0.99      0.98      0.98     21725

   micro avg       0.97      0.97      0.97     22056
   macro avg       0.60      0.69      0.63     22056
weighted avg       0.98      0.97      0.97     22056



In [342]:
def proba_(pred_pro1,x_test1,y_test1):  
    y_t_lgbm=[None]*pred_pro1.shape[0]
    for i in range(0,pred_pro1.shape[0]):
        if(pred_pro1[i][1]>=0.9 and x_test1.iloc[i]['original_dispute_amount']<=10000):
                y_t_lgbm[i]=1
        else:
               y_t_lgbm[i]=0
    return y_t_lgbm

In [343]:
l=lgbm.predict_proba(X_test[features])

In [344]:
thresh_proba=proba_(l,X_test,Y_test)

In [345]:
print("Confusion Matrix: ",confusion_matrix(Y_test, thresh_proba)) 
print ("Accuracy:",accuracy_score(Y_test,thresh_proba)*100) 
print("Recall Score: ", recall_score(Y_test, thresh_proba)) 
print("classification report")
print(classification_report(Y_test, thresh_proba)) 

Confusion Matrix:  [[  285    46]
 [ 5534 16191]]
Accuracy: 74.70076169749727
Recall Score:  0.7452704257767548
classification report
              precision    recall  f1-score   support

           0       0.05      0.86      0.09       331
           1       1.00      0.75      0.85     21725

   micro avg       0.75      0.75      0.75     22056
   macro avg       0.52      0.80      0.47     22056
weighted avg       0.98      0.75      0.84     22056



In [346]:
a=zip(features,lgbm.feature_importances_)
a=list(a)
srt=sorted(a, key = lambda x: x[1])
srt

[('fk_customer_map_id_hist', 363),
 ('max_invalid_amount', 433),
 ('invalid_propensity', 454),
 ('min_invalid_amount', 564),
 ('b_value', 614),
 ('ar_reason_code_hist', 812),
 ('ref_num15_fk_customer_map_id_history', 972),
 ('original_dispute_amount', 1333)]

## XGBoost

In [352]:
from xgboost.sklearn import XGBClassifier

In [353]:
xgb=XGBClassifier(learning_rate =0.05,n_estimators=200,max_depth=4,gamma=0,nthread=5,scale_pos_weight=0.19,seed=27)
xgb.fit(X_train,Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.05,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=200, n_jobs=1, nthread=5, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=0.19,
       seed=27, silent=None, subsample=1, verbosity=1)

In [354]:
y_pred1_xgb=xgb.predict(X_test[features])
print("Confusion Matrix: ",confusion_matrix(Y_test, y_pred1_xgb)) 
print("Accuracy using random forest: ",accuracy_score(Y_test,y_pred1_xgb)*100)
print("Recall Score using random forest: ", recall_score(Y_test, y_pred1_xgb)) 
print("classification report")
print(classification_report(Y_test, y_pred1_xgb)) 

Confusion Matrix:  [[  185   146]
 [ 1368 20357]]
Accuracy using random forest:  93.13565469713456
Recall Score using random forest:  0.9370310701956271
classification report
              precision    recall  f1-score   support

           0       0.12      0.56      0.20       331
           1       0.99      0.94      0.96     21725

   micro avg       0.93      0.93      0.93     22056
   macro avg       0.56      0.75      0.58     22056
weighted avg       0.98      0.93      0.95     22056



In [355]:
def proba_(pred_pro1,x_test1,y_test1):  
    y_t_lgbm=[None]*pred_pro1.shape[0]
    for i in range(0,pred_pro1.shape[0]):
        if(pred_pro1[i][1]>=0.9 and x_test1.iloc[i]['original_dispute_amount']<=10000):
                y_t_lgbm[i]=1
        else:
               y_t_lgbm[i]=0
    return y_t_lgbm

In [356]:
l=xgb.predict_proba(X_test[features])

In [357]:
thresh_proba=proba_(l,X_test,Y_test)

In [358]:
print("Confusion Matrix: ",confusion_matrix(Y_test, thresh_proba)) 
print ("Accuracy:",accuracy_score(Y_test,thresh_proba)*100) 
print("Recall Score: ", recall_score(Y_test, thresh_proba)) 
print("classification report")
print(classification_report(Y_test, thresh_proba)) 

Confusion Matrix:  [[  305    26]
 [ 6563 15162]]
Accuracy: 70.12604280014509
Recall Score:  0.6979056386651323
classification report
              precision    recall  f1-score   support

           0       0.04      0.92      0.08       331
           1       1.00      0.70      0.82     21725

   micro avg       0.70      0.70      0.70     22056
   macro avg       0.52      0.81      0.45     22056
weighted avg       0.98      0.70      0.81     22056



## Decision Tree

In [347]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier(criterion='entropy',max_depth=5,class_weight={0:3,1:1}, random_state=8, max_features = 0.7,
                         min_samples_split = 400)
dt.fit(X_train,Y_train)

DecisionTreeClassifier(class_weight={0: 3, 1: 1}, criterion='entropy',
            max_depth=5, max_features=0.7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=400,
            min_weight_fraction_leaf=0.0, presort=False, random_state=8,
            splitter='best')

In [348]:
y_pred1_dt=dt.predict(X_test[features])
print("Confusion Matrix: ",confusion_matrix(Y_test, y_pred1_dt)) 
print("Accuracy using random forest: ",accuracy_score(Y_test,y_pred1_dt)*100)
print("Recall Score using random forest: ", recall_score(Y_test, y_pred1_dt)) 
print("classification report")
print(classification_report(Y_test, y_pred1_dt)) 

Confusion Matrix:  [[  188   143]
 [ 1568 20157]]
Accuracy using random forest:  92.24247370330069
Recall Score using random forest:  0.927825086306099
classification report
              precision    recall  f1-score   support

           0       0.11      0.57      0.18       331
           1       0.99      0.93      0.96     21725

   micro avg       0.92      0.92      0.92     22056
   macro avg       0.55      0.75      0.57     22056
weighted avg       0.98      0.92      0.95     22056



In [349]:
l=dt.predict_proba(X_test[features])

In [350]:
thresh_proba=proba_(l,X_test,Y_test)

In [351]:
print("Confusion Matrix: ",confusion_matrix(Y_test, thresh_proba)) 
print ("Accuracy:",accuracy_score(Y_test,thresh_proba)*100) 
print("Recall Score: ", recall_score(Y_test, thresh_proba)) 
print("classification report")
print(classification_report(Y_test, thresh_proba)) 

Confusion Matrix:  [[  303    28]
 [ 6295 15430]]
Accuracy: 71.33206383750453
Recall Score:  0.7102416570771001
classification report
              precision    recall  f1-score   support

           0       0.05      0.92      0.09       331
           1       1.00      0.71      0.83     21725

   micro avg       0.71      0.71      0.71     22056
   macro avg       0.52      0.81      0.46     22056
weighted avg       0.98      0.71      0.82     22056



## VC

In [382]:
from sklearn.ensemble import VotingClassifier
vc = VotingClassifier(estimators=[
        ('lr', lgbm), ('rf', model), ('dt', dt), ('xgb', xgb)],
       voting='soft', weights=[1,2,2,1],flatten_transform=True)
vc.fit(X_train,Y_train)

VotingClassifier(estimators=[('lr', LGBMClassifier(boosting_type='gbdt', class_weight={0: 3, 1: 1},
        colsample_bytree=1.0, importance_type='split', learning_rate=0.07,
        max_depth=5, min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=200, n_jobs=-1, num_leaves=31,
   ...pha=0, reg_lambda=1, scale_pos_weight=0.19,
       seed=27, silent=None, subsample=1, verbosity=1))],
         flatten_transform=True, n_jobs=None, voting='soft',
         weights=[1, 2, 2, 1])

In [383]:
y_pred1_vc=vc.predict(X_test[features])
print("Confusion Matrix: ",confusion_matrix(Y_test, y_pred1_vc)) 
print("Accuracy using random forest: ",accuracy_score(Y_test,y_pred1_vc)*100)
print("Recall Score using random forest: ", recall_score(Y_test, y_pred1_vc)) 
print("classification report")
print(classification_report(Y_test, y_pred1_vc)) 

Confusion Matrix:  [[  139   192]
 [  739 20986]]
Accuracy using random forest:  95.77892636924193
Recall Score using random forest:  0.9659838895281934
classification report
              precision    recall  f1-score   support

           0       0.16      0.42      0.23       331
           1       0.99      0.97      0.98     21725

   micro avg       0.96      0.96      0.96     22056
   macro avg       0.57      0.69      0.60     22056
weighted avg       0.98      0.96      0.97     22056



In [384]:
l_vc=vc.predict_proba(X_test[features])
thresh_proba=proba_(l_vc,X_test,Y_test)

In [385]:
print("Confusion Matrix: ",confusion_matrix(Y_test, thresh_proba)) 
print ("Accuracy:",accuracy_score(Y_test,thresh_proba)*100) 
print("Recall Score: ", recall_score(Y_test, thresh_proba)) 
print("classification report")
print(classification_report(Y_test, thresh_proba)) 

Confusion Matrix:  [[  307    24]
 [ 6205 15520]]
Accuracy: 71.7582517228872
Recall Score:  0.7143843498273879
classification report
              precision    recall  f1-score   support

           0       0.05      0.93      0.09       331
           1       1.00      0.71      0.83     21725

   micro avg       0.72      0.72      0.72     22056
   macro avg       0.52      0.82      0.46     22056
weighted avg       0.98      0.72      0.82     22056



In [376]:
test_set['predictions']=thresh_proba

In [377]:
test_set[(test_set['is_valid']==0) & (test_set['predictions']==1)]['original_dispute_amount'].sum()/test_set['original_dispute_amount'].sum()  

0.0001571131158982515

In [378]:
test_set[(test_set['is_valid']==0) & (test_set['predictions']==1)]['original_dispute_amount'].count()/test_set['original_dispute_amount'].count()  

0.0011334784185709104

In [379]:
test_set[(test_set['is_valid']==1) & (test_set['predictions']==1)]['original_dispute_amount'].count()/test_set['original_dispute_amount'].count()

0.7054769677185346

In [380]:
test_set[(test_set['is_valid']==1) & (test_set['predictions']==1)]['original_dispute_amount'].sum()/test_set['original_dispute_amount'].sum()

0.102931052283051

In [381]:
test_set[(test_set['is_valid']==1) & (test_set['predictions']==0)]['original_dispute_amount'].count()/test_set['original_dispute_amount'].count()

0.2795157780195865