**Classification Methods II**

This notebook assesses the following classification algorithms:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
##Import data
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [3]:
train = pd.read_csv('train.csv')

In [None]:
train.head()

Unnamed: 0.1,Unnamed: 0,date_val,calendar_year,calendar_month,calendar_day,day_of_week,is_weekday,is_workday,is_holiday,dim_claim_id,bin,drug,reject_code,pharmacy_claim_approved,dim_pa_id,correct_diagnosis,tried_and_failed,contraindication,pa_approved,dim_date_id
0,106328,2017-04-07,2017,4,7,6,1,1,0,106329,999001,A,0,1,,,,,,
1,31702,2017-01-30,2017,1,30,2,1,1,0,31703,999001,C,0,1,,,,,,
2,1270970,2019-11-11,2019,11,11,2,1,1,0,1270971,417740,B,70,0,528977.0,1.0,1.0,0.0,0.0,1045.0
3,1093403,2019-06-28,2019,6,28,6,1,1,0,1093404,999001,A,76,0,454463.0,1.0,0.0,0.0,1.0,909.0
4,61846,2017-02-27,2017,2,27,2,1,1,0,61847,999001,A,0,1,,,,,,


In [4]:
df = train[['drug', 'bin', 'calendar_month', 
          'calendar_day', 'day_of_week']]
X = pd.get_dummies(df, columns=['drug', 'bin'], drop_first=False)
y = train['pharmacy_claim_approved']

In [5]:
X.head()


Unnamed: 0,calendar_month,calendar_day,day_of_week,drug_A,drug_B,drug_C,bin_417380,bin_417614,bin_417740,bin_999001
0,4,7,6,1,0,0,0,0,0,1
1,1,30,2,0,0,1,0,0,0,1
2,11,11,2,0,1,0,0,0,1,0
3,6,28,6,1,0,0,0,0,0,1
4,2,27,2,1,0,0,0,0,0,1


In [6]:
from sklearn.model_selection import train_test_split

In [None]:
X_train_train, X_val, y_train_train, y_val = train_test_split(X, y,
                                                              test_size=.2,
                                                              random_state=5438,
                                                              shuffle=True,
                                                              stratify=y
                                                              )



*   **SGD Classifier**



In [12]:
## Import stochastic gradient descent
from sklearn.linear_model import SGDClassifier

In [None]:
sgd_classifier = SGDClassifier(max_iter=100, tol=-np.infty)
sgd_classifier.fit(X_train_train, y_train_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=100, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=-inf,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
preds_sgd_val = sgd_classifier.predict(X_val)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score

In [None]:
## Roc Auc
np.round(roc_auc_score(y_val, preds_sgd_val)*100, 2)

81.15

In [None]:
## Accuracy
np.round(100*accuracy_score(y_val, preds_sgd_val),2)

79.92



*   **SGD Classifier in several subsamples**










In [11]:
##Import Kfold
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

In [18]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4124)
sgd_classifier = SGDClassifier(max_iter=100, tol=-np.infty)

cv_accs = np.zeros((5,1))
cv_aucs = np.zeros((5,1))

i = 0
for train_index, test_index in kfold.split(X, y):
    print(i)
    # this gets the training and holdout sets
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_val = X.iloc[test_index]
    y_val = y.iloc[test_index]

    ## make model clones
    sgd_clone = clone(sgd_classifier)
    sgd_clone.fit(X_train, y_train)

    ## record the performances
    cv_accs[i,0] = accuracy_score(y_val, sgd_clone.predict(X_val))
    cv_aucs[i,0] = roc_auc_score(y_val, sgd_clone.predict(X_val))

    i = i + 1


0
1
2
3
4


In [20]:
## Accuracy
100*np.mean(cv_accs, axis=1)

array([80.18014494, 79.85215151, 79.8197433 , 79.99575341, 79.87059067])

In [21]:
## Roc Auc
100*np.mean(cv_aucs, axis=1) 

array([81.38301134, 81.08656714, 81.05340934, 81.22516192, 81.08069381])