## Goal in this notebook:

1. Apply machine learning algorithms onto the regular, non-resampled dataset.
2. Apply the machine learning algorithms on different features chosen from selectKbest
3. Evaluate metrics such as precision, recall, and accuracy
4. Repeat for the resampled data

In [62]:
### Reading the files
import pandas as pd
import numpy as np
X_train = pd.read_csv("X_train", delimiter = ",")
X_test = pd.read_csv("X_test", delimiter = ",")
t_train = pd.read_csv("t_train", delimiter = ",")
t_test = pd.read_csv("t_test",  delimiter = ",")
X_train_rs = pd.read_csv("X_train_rs", delimiter = ",")
t_train_rs = pd.read_csv("t_train_rs", delimiter = ",")

In [63]:
### Selected Features for K = 8
k_eight_features = ['patient_nbr', 'time_in_hospital', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'diabetesMed']
k_sixteen_features = ['patient_nbr', 'race', 'age', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_3', 'number_diagnoses', 'metformin', 'change', 'diabetesMed']
k_thirty_two_features = ['patient_nbr', 'race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'metformin', 'repaglinide', 'nateglinide', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'glipizide-metformin', 'glimepiride-pioglitazone', 'change', 'diabetesMed']
feature_selection = [k_eight_features, k_sixteen_features, k_thirty_two_features]

In [64]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

## Training and Testing Keeping All Features:

In [65]:
t_train.value_counts()

readmitted
0             49431
1             42158
dtype: int64

In [66]:
t_test.value_counts()

readmitted
0             5433
1             4744
dtype: int64

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn import metrics

lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors = 3)
dt = DecisionTreeClassifier(max_depth = 15)
rf = RandomForestClassifier(n_estimators = 100, max_depth = 15)
nb = GaussianNB()
models = [lr, dt, rf, nb, knn]
name = ['lr', 'dt', 'rf', 'nb', 'knn']

k = 0
for model in models:
    pipe = Pipeline([('scaler', StandardScaler()), (name[0], model)])
    pipe.fit(X_train, t_train)
    t_pred = pipe.predict(X_test)
    print(name[k])
    print("Training Score: ", pipe.score(X_train, t_train))
    print("Testing Score: ", pipe.score(X_test, t_test))
    print(classification_report(t_test, t_pred))
    fpr, tpr, thresholds = metrics.roc_curve(t_test, t_pred, pos_label=1)
    print("AUC ", metrics.auc(fpr, tpr))
    k += 1

lr
Training Score:  0.6158818198691983
Testing Score:  0.621008155645082
              precision    recall  f1-score   support

           0       0.62      0.77      0.69      5433
           1       0.63      0.44      0.52      4744

    accuracy                           0.62     10177
   macro avg       0.62      0.61      0.60     10177
weighted avg       0.62      0.62      0.61     10177

AUC  0.6098332546498523
dt
Training Score:  0.7427092773149614
Testing Score:  0.612262945858308
              precision    recall  f1-score   support

           0       0.63      0.66      0.65      5433
           1       0.59      0.55      0.57      4744

    accuracy                           0.61     10177
   macro avg       0.61      0.61      0.61     10177
weighted avg       0.61      0.61      0.61     10177

AUC  0.6085127456375674
rf
Training Score:  0.7700815600126653
Testing Score:  0.6469490026530411
              precision    recall  f1-score   support

           0       0.65

## Training and Testing Using SelectKBestFeatures:

In [68]:
lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors = 3)
#svm = SVC(kernel = 'linear')
dt = DecisionTreeClassifier(max_depth = 15)
rf = RandomForestClassifier(n_estimators = 100, max_depth = 15)
nb = GaussianNB()
models = [lr, dt, rf, nb, knn]
name = ['lr', 'dt', 'rf', 'nb', 'knn']
feature_names = ['K = 8', 'K = 16', 'K = 32']
i = 0
for feature in feature_selection:
    print(feature_names[i])
    k = 0
    i += 1
    for model in models:
        pipe = Pipeline([('scaler', StandardScaler()), (name[k], model)])
        pipe.fit(X_train[feature], t_train)
        t_pred = pipe.predict(X_test[feature])
        print(name[k])
        print("Training Score: ", pipe.score(X_train[feature], t_train))
        print("Testing Score: ", pipe.score(X_test[feature], t_test))
        print(classification_report(t_test, t_pred))
        fpr, tpr, thresholds = metrics.roc_curve(t_test, t_pred, pos_label=1)
        print(metrics.auc(fpr, tpr))
        k += 1

K = 8
lr
Training Score:  0.6124207055432421
Testing Score:  0.6154072909501818
              precision    recall  f1-score   support

           0       0.61      0.78      0.68      5433
           1       0.63      0.43      0.51      4744

    accuracy                           0.62     10177
   macro avg       0.62      0.60      0.60     10177
weighted avg       0.62      0.62      0.60     10177

0.6033444863675824
dt
Training Score:  0.7019511076657677
Testing Score:  0.6087255576299498
              precision    recall  f1-score   support

           0       0.63      0.67      0.65      5433
           1       0.59      0.54      0.56      4744

    accuracy                           0.61     10177
   macro avg       0.61      0.60      0.60     10177
weighted avg       0.61      0.61      0.61     10177

0.6045580859459508
rf
Training Score:  0.733548788609986
Testing Score:  0.6190429399626609
              precision    recall  f1-score   support

           0       0.63   

### Model Evaluation Using the Resampled Data

In [69]:
print("------- KEEPING ALL FEATURES -------")
lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors = 3)
#svm = SVC(kernel = 'linear')
dt = DecisionTreeClassifier(max_depth = 15)
rf = RandomForestClassifier(n_estimators = 100, max_depth = 15)
nb = GaussianNB()
models = [lr, dt, rf, nb, knn]
name = ['lr', 'dt', 'rf', 'nb', 'knn']

k = 0
for model in models:
    pipe = Pipeline([('scaler', StandardScaler()), (name[k], model)])
    pipe.fit(X_train_rs, t_train_rs)
    t_pred = pipe.predict(X_test)
    print(name[k])
    print("Training Score: ", pipe.score(X_train_rs, t_train_rs))
    print("Testing Score: ", pipe.score(X_test, t_test))
    fpr, tpr, thresholds = metrics.roc_curve(t_test, t_pred, pos_label=1)
    print(classification_report(t_test, t_pred))
    print(metrics.auc(fpr, tpr))
    k += 1


lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors = 3)
#svm = SVC(kernel = 'linear')
dt = DecisionTreeClassifier(max_depth = 15)
rf = RandomForestClassifier(n_estimators = 100, max_depth = 15)
nb = GaussianNB()
models = [lr, dt, rf, nb, knn]
name = ['lr', 'dt', 'rf', 'nb', 'knn']
feature_names = ['K = 8', 'K = 16', 'K = 32']
i = 0
for feature in feature_selection:
    print(feature_names[i])
    k = 0
    i += 1
    for model in models:
        pipe = Pipeline([('scaler', StandardScaler()), (name[k], model)])
        pipe.fit(X_train_rs[feature], t_train_rs)
        t_pred = pipe.predict(X_test[feature])
        print(name[k])
        print("Training Score: ", pipe.score(X_train_rs[feature], t_train_rs))
        print("Testing Score: ", pipe.score(X_test[feature], t_test))
        print(classification_report(t_test, t_pred))
        fpr, tpr, thresholds = metrics.roc_curve(t_test, t_pred, pos_label=1)
        print(metrics.auc(fpr, tpr))
        k += 1

------- KEEPING ALL FEATURES -------
lr
Training Score:  0.6150998361352188
Testing Score:  0.6107890340964921
              precision    recall  f1-score   support

           0       0.63      0.67      0.65      5433
           1       0.59      0.54      0.57      4744

    accuracy                           0.61     10177
   macro avg       0.61      0.61      0.61     10177
weighted avg       0.61      0.61      0.61     10177

0.6065174520581705
dt
Training Score:  0.7382917602314337
Testing Score:  0.615702073302545
              precision    recall  f1-score   support

           0       0.64      0.64      0.64      5433
           1       0.59      0.58      0.59      4744

    accuracy                           0.62     10177
   macro avg       0.61      0.61      0.61     10177
weighted avg       0.62      0.62      0.62     10177

0.6136986194540949
rf
Training Score:  0.7721065727984463
Testing Score:  0.6498968261766729
              precision    recall  f1-score   supp

### Does oversampling hurt or improve our model?

In general, it appears that SMOTE resampling doesn't really make any improvements to our model.

Still, the random forest classifier seems to be the best. In the next notebook, I will focus on tuning random forest parameters in order to improve our AUC curve