In [1]:
import pandas as pd
from joblib import dump, load
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import NearMiss
from matplotlib import pyplot as plt
from IPython.display import display
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import GridSearchCV

In [2]:
def synthetic_minority_over_sampling(output, all_columns_train):
    print('Original dataset shape {}'.format(Counter(output)))
    sm = SMOTE(random_state=20)
    train_input_new, train_output_new = sm.fit_resample(all_columns_train, output)
    print('New dataset shape {}'.format(Counter(train_output_new)))
    return train_input_new, train_output_new

In [8]:
def calc_prevalence(y_actual):
    # function to calculate if there is class impalance and prints it
    #
    # input : the output variable
    # 
    print(y_actual.value_counts()/len(y_actual))
    print(y_actual.value_counts())

In [39]:
all_columns_train = np.load('all_columns_train.npy')
all_columns_test = np.load('all_columns_test.npy')
output = pd.read_pickle('train_output.pkl')

In [40]:
all_columns_train.shape

(47249, 142)

In [41]:
train_input_new, train_output_new = synthetic_minority_over_sampling(output, all_columns_train)

Original dataset shape Counter({'NO': 30860, '>30': 12818, '<30': 3571})
New dataset shape Counter({'>30': 30860, 'NO': 30860, '<30': 30860})


In [42]:
features_train,features_valid,labels_train,labels_valid = train_test_split(train_input_new,train_output_new,test_size = 0.2,train_size=0.8)

In [9]:
calc_prevalence(labels_train)

NO     0.333936
<30    0.333189
>30    0.332875
Name: readmitted, dtype: float64
NO     27705
<30    27643
>30    27617
Name: readmitted, dtype: int64


In [10]:
calc_prevalence(labels_valid)

>30    0.335165
<30    0.333912
NO     0.330923
Name: readmitted, dtype: float64
>30    6952
<30    6926
NO     6864
Name: readmitted, dtype: int64


In [30]:
def print_metrics_and_return_f1_score(model, features, labels):
    predictions = model.predict(features)
    f1 = f1_score(labels, predictions, average='micro')
    print("f1 is {0:.5f}".format(f1))
    print("Precision is {0:.5f}".format(precision_score(labels, predictions, average='micro')))
    print("Recall is {0:.5f}".format(recall_score(labels, predictions, average='micro')))
    y_true = label_binarize(labels, classes=np.unique(labels))
    y_predict = label_binarize(predictions, classes=np.unique(labels))
    print("AUC is {0:.5f}".format(roc_auc_score(y_true, y_predict,average='micro')))
    return f1

In [45]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(X=features_train,y =labels_train)
print_metrics(lr_model, features_train, labels_train)
print_metrics(lr_model, features_valid, labels_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


f1 is 0.47981
Precision is 0.47981
Recall is 0.47981
AUC is 0.60986
f1 is 0.48040
Precision is 0.48040
Recall is 0.48040
AUC is 0.61030


## Random forest

In [49]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(features_train,labels_train)
print_metrics(rf_model, features_train, labels_train)
print_metrics(rf_model, features_valid, labels_valid)

f1 is 1.00000
Precision is 1.00000
Recall is 1.00000
AUC is 1.00000
f1 is 0.81200
Precision is 0.81200
Recall is 0.81200
AUC is 0.85900


In [26]:
max_f1 = 0

for min_samples_leaf in [3, 4, 5]:
    for n_estimators in [100, 200, 300]:        
        gs_model = RandomForestClassifier(min_samples_leaf = min_samples_leaf, n_estimators = n_estimators)
        gs_model.fit(features_train,labels_train)
        print('min_samples_leaf: ', min_samples_leaf)
        print('n_estimators: ', n_estimators)
        f1 = print_metrics_and_return_f1_score(gs_model, features_valid, labels_valid)
        if f1 > max_f1:
            max_f1 = f1
            best_model = gs_model
            best_min_samples_leaf = min_samples_leaf
            best_n_estimators = n_estimators
        print('=============================')

min_samples_leaf:  3
n_estimators:  100
f1 is 0.77591
Precision is 0.77591
Recall is 0.77591
AUC is 0.83194
min_samples_leaf:  3
n_estimators:  200
f1 is 0.77857
Precision is 0.77857
Recall is 0.77857
AUC is 0.83392
min_samples_leaf:  3
n_estimators:  300
f1 is 0.78049
Precision is 0.78049
Recall is 0.78049
AUC is 0.83537
min_samples_leaf:  4
n_estimators:  100
f1 is 0.76859
Precision is 0.76859
Recall is 0.76859
AUC is 0.82644
min_samples_leaf:  4
n_estimators:  200
f1 is 0.76772
Precision is 0.76772
Recall is 0.76772
AUC is 0.82579
min_samples_leaf:  4
n_estimators:  300
f1 is 0.77095
Precision is 0.77095
Recall is 0.77095
AUC is 0.82821
min_samples_leaf:  5
n_estimators:  100
f1 is 0.76126
Precision is 0.76126
Recall is 0.76126
AUC is 0.82094
min_samples_leaf:  5
n_estimators:  200
f1 is 0.76159
Precision is 0.76159
Recall is 0.76159
AUC is 0.82120
min_samples_leaf:  5
n_estimators:  300
f1 is 0.76323
Precision is 0.76323
Recall is 0.76323
AUC is 0.82243


## gradient boosting 

In [51]:
from sklearn.ensemble import GradientBoostingClassifier
gbc =GradientBoostingClassifier(random_state=42)
gbc.fit(features_train,labels_train)
print_metrics(gbc, features_train, labels_train)
print_metrics(gbc, features_valid, labels_valid)

f1 is 0.67768
Precision is 0.67768
Recall is 0.67768
AUC is 0.75826
f1 is 0.66899
Precision is 0.66899
Recall is 0.66899
AUC is 0.75174


## Generating submission file

In [50]:
pred = rf_model.predict(all_columns_test)
test_data = pd.read_csv("test.csv")
test_data['readmitted'] = pred
test_data[['encounter_id', 'readmitted']].to_csv('submission.csv', index=False)