In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

sns.set_theme(style='whitegrid')

In [2]:
data = pd.read_csv('../data/processed_data/processed_data.csv')

### Linear Discriminant Analysis

In [3]:

#importing the model for train test split
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

In [4]:
#making train test split

data_trian, data_test = train_test_split(data,shuffle=True, test_size=0.2,stratify=data.pharmacy_claim_approved , random_state=345)

In [5]:
#importing classification model

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 

#import metric to measure performance of the classification model

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [57]:
#more metrics

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [6]:
#making model object 

lda = LinearDiscriminantAnalysis()

In [7]:
#choosing the feature for classification 


X_train = data_trian[['bin','drug']].copy()

y_train = data_trian['pharmacy_claim_approved'].copy()

In [8]:
#Chaning categorical variables to numerical variables 

X_train['bin_class']=pd.factorize(X_train['bin'])[0]
X_train['drug_class']=pd.factorize(X_train['drug'])[0]

In [9]:
#cross validation of linear discreminant analysis

#make kfold object

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=323) 

#define array to store cross validation results

roc_cvs = np.zeros(5)
acs_cvs = np.zeros(5) 

i=0 
for train_index, test_index in kfold.split(X_train, y_train):
    X_train_train = X_train.iloc[train_index]
    X_holdout = X_train.iloc[test_index]
    y_train_train = y_train.iloc[train_index]
    y_holdout = y_train.iloc[test_index] 
    
    lda_clone = clone(lda)
    lda_clone.fit(X_train_train[['bin_class','drug_class']],y_train_train)
    y_pred = lda_clone.predict(X_holdout[['bin_class','drug_class']])
    y_pred_pro = lda_clone.predict_proba(X_holdout[['bin_class','drug_class']])
    roc_cvs[i] = roc_auc_score(pd.get_dummies(y_holdout),y_pred_pro)
    acs_cvs[i] = accuracy_score(y_holdout,y_pred)
    i=i+1


In [10]:
roc_cvs

array([0.49061966, 0.48977679, 0.49247392, 0.48922582, 0.48740927])

In [11]:
acs_cvs

array([0.58373734, 0.58373734, 0.58373734, 0.58373734, 0.58373734])

In [12]:
np.unique(y_pred)

array([1])

### KNN classification

In [13]:
#import the model

from sklearn.neighbors import KNeighborsClassifier

In [15]:
#make model object

knn = KNeighborsClassifier()

In [58]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=323) 

#define array to store cross validation results

knn_roc_cvs = np.zeros(5)
knn_acs_cvs = np.zeros(5)
knn_prs_cvs = np.zeros(5)

i=0 
for train_index, test_index in kfold.split(X_train, y_train):
    X_train_train = X_train.iloc[train_index]
    X_holdout = X_train.iloc[test_index]
    y_train_train = y_train.iloc[train_index]
    y_holdout = y_train.iloc[test_index] 
    
    knn_clone = clone(knn)
    knn_clone.fit(X_train_train[['bin_class','drug_class']],y_train_train)
    y_pred = knn_clone.predict(X_holdout[['bin_class','drug_class']])
    y_pred_pro = knn_clone.predict_proba(X_holdout[['bin_class','drug_class']])
    knn_roc_cvs[i] = roc_auc_score(pd.get_dummies(y_holdout),y_pred_pro)
    knn_acs_cvs[i] = accuracy_score(y_holdout,y_pred)
    knn_prs_cvs[i] = precision_score(y_holdout,y_pred)
    i=i+1

In [59]:
knn_roc_cvs.mean()

0.9222776027343098

In [60]:
knn_acs_cvs.mean()

0.9353490069820115

In [61]:
knn_prs_cvs.mean()

0.9002897943796848

In [63]:
print('the average cross validation ROC AUC score for KNN classification is', np.round(knn_roc_cvs.mean(),4))
print('')
print('the average cross validation accuracy score for KNN classification is', np.round(knn_acs_cvs.mean(),4))
print('')
print('the average cross validation precision score for KNN classification is', np.round(knn_prs_cvs.mean(),4))

the average cross validation ROC AUC score for KNN classification is 0.9223

the average cross validation accuracy score for KNN classification is 0.9353

the average cross validation precision score for KNN classification is 0.9003


In [22]:
#measuring other metrices for the classification on the last train test split of cross validation

conf_mat = confusion_matrix(y_holdout,y_pred)

prs = precision_score(y_holdout,y_pred) 

recall = recall_score(y_holdout,y_pred)

In [23]:
prs

0.9004612752564445

In [24]:
recall

1.0

### Decision tree classifier

In [27]:
#import the model 

from sklearn import tree

from sklearn.tree import DecisionTreeClassifier

In [64]:
depthes = [1,2,3,4,5,6,7,8,9,10]

dt_acs = np.zeros((5,len(depthes)))
dt_roc_aucs = np.zeros((5,len(depthes)))
dt_prs = np.zeros((5,len(depthes)))

i=0 
for train_index, test_index in kfold.split(X_train, y_train):
    X_train_train = X_train.iloc[train_index]
    X_holdout = X_train.iloc[test_index]
    y_train_train = y_train.iloc[train_index]
    y_holdout = y_train.iloc[test_index] 
    
    j=0
    for depth in depthes:
        
        tree_clf = DecisionTreeClassifier(max_depth=depth) 
        
        tree_clf.fit(X_train_train[['bin_class','drug_class']], y_train_train) 

        y_pred = tree_clf.predict(X_holdout[['bin_class','drug_class']]) 
        y_pred_prob = tree_clf.predict_proba(X_holdout[['bin_class','drug_class']]) 
        dt_acs[i,j]=accuracy_score(y_pred,y_holdout)
        dt_roc_aucs[i,j]=roc_auc_score(pd.get_dummies(y_holdout),y_pred_prob)
        dt_prs[i,j]=precision_score(y_pred,y_holdout)
        j=j+1
    i=i+1
    

In [44]:
np.mean(dt_acs,axis=0)

array([0.7161803 , 0.79987271, 0.8433961 , 0.90874998, 0.93534901,
       0.93534901, 0.93534901, 0.93534901, 0.93534901, 0.93534901])

In [55]:
print('the maximum depth that gives best accuracy score is', depthes[np.argmax(np.mean(dt_acs,axis=0))])
print('')
print('the highest accuracy score is', np.round(np.max(np.mean(dt_acs,axis=0)),4))

the maximum depth that gives best accuracy score is 5

the highest accuracy score is 0.9353


In [51]:
np.mean(dt_roc_aucs,axis=0)

array([0.67591048, 0.85390572, 0.89751601, 0.92063663, 0.92243017,
       0.92243017, 0.92243017, 0.92243017, 0.92243017, 0.92243017])

In [56]:
print('the maximum depth that gives best ROC AUC score is', depthes[np.argmax(np.mean(dt_roc_aucs,axis=0))])
print('')
print('the highest ROC AUC score is', np.round(np.max(np.mean(dt_roc_aucs,axis=0)),4))

the maximum depth that gives best ROC AUC score is 5

the highest ROC AUC score is 0.9224


In [66]:
np.mean(dt_prs,axis=0)

array([0.91636364, 0.73921437, 0.82285073, 0.94873176, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ])

In [67]:
print('the maximum depth that gives best precision score is', depthes[np.argmax(np.mean(dt_prs,axis=0))])
print('')
print('the highest precision score is', np.round(np.max(np.mean(dt_prs,axis=0)),4))

the maximum depth that gives best precision score is 5

the highest precision score is 1.0
