In [1]:
import pandas as pd
from joblib import dump, load
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import NearMiss
from matplotlib import pyplot as plt
from IPython.display import display
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import GridSearchCV

In [2]:
def synthetic_minority_over_sampling(output, all_columns_train):
    print('Original dataset shape {}'.format(Counter(output)))
    sm = SMOTE(random_state=20)
    train_input_new, train_output_new = sm.fit_resample(all_columns_train, output)
    print('New dataset shape {}'.format(Counter(train_output_new)))
    return train_input_new, train_output_new

In [3]:
def calc_prevalence(y_actual):
    # function to calculate if there is class impalance and prints it
    #
    # input : the output variable
    # 
    print(y_actual.value_counts()/len(y_actual))
    print(y_actual.value_counts())

In [4]:
all_columns_train = np.load('all_columns_train.npy')
all_columns_test = np.load('all_columns_test.npy')
output = pd.read_pickle('train_output.pkl')

In [5]:
all_columns_train.shape

(47249, 107)

In [6]:
train_input_new, train_output_new = synthetic_minority_over_sampling(output, all_columns_train)

Original dataset shape Counter({'NO': 30860, '>30': 12818, '<30': 3571})
New dataset shape Counter({'>30': 30860, 'NO': 30860, '<30': 30860})


In [7]:
features_train,features_valid,labels_train,labels_valid = train_test_split(train_input_new,train_output_new,test_size = 0.2,train_size=0.8)

In [8]:
calc_prevalence(labels_train)

<30    0.334454
>30    0.333279
NO     0.332267
Name: readmitted, dtype: float64
<30    24771
>30    24684
NO     24609
Name: readmitted, dtype: int64


In [9]:
calc_prevalence(labels_valid)

NO     0.337600
>30    0.333549
<30    0.328851
Name: readmitted, dtype: float64
NO     6251
>30    6176
<30    6089
Name: readmitted, dtype: int64


In [10]:
def print_metrics_and_return_f1_score(model, features, labels):
    predictions = model.predict(features)
    f1 = f1_score(labels, predictions, average='micro')
    print("f1 is {0:.5f}".format(f1))
    print("Precision is {0:.5f}".format(precision_score(labels, predictions, average='micro')))
    print("Recall is {0:.5f}".format(recall_score(labels, predictions, average='micro')))
    y_true = label_binarize(labels, classes=np.unique(labels))
    y_predict = label_binarize(predictions, classes=np.unique(labels))
    print("AUC is {0:.5f}".format(roc_auc_score(y_true, y_predict,average='micro')))
    return f1

In [11]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(X=features_train,y =labels_train)
print_metrics_and_return_f1_score(lr_model, features_train, labels_train)
print_metrics_and_return_f1_score(lr_model, features_valid, labels_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


f1 is 0.45546
Precision is 0.45546
Recall is 0.45546
AUC is 0.59159
f1 is 0.45102
Precision is 0.45102
Recall is 0.45102
AUC is 0.58826


0.45101533808597977

## Random forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(features_train,labels_train)
print_metrics_and_return_f1_score(rf_model, features_train, labels_train)
print_metrics_and_return_f1_score(rf_model, features_valid, labels_valid)

f1 is 1.00000
Precision is 1.00000
Recall is 1.00000
AUC is 1.00000
f1 is 0.80228
Precision is 0.80228
Recall is 0.80228
AUC is 0.85171


0.8022791099589546

In [13]:
max_f1 = 0

for min_samples_leaf in [3, 4, 5]:
    for n_estimators in [100, 200, 300]:        
        gs_model = RandomForestClassifier(min_samples_leaf = min_samples_leaf, n_estimators = n_estimators)
        gs_model.fit(features_train,labels_train)
        print('min_samples_leaf: ', min_samples_leaf)
        print('n_estimators: ', n_estimators)
        f1 = print_metrics_and_return_f1_score(gs_model, features_valid, labels_valid)
        if f1 > max_f1:
            max_f1 = f1
            best_model = gs_model
            best_min_samples_leaf = min_samples_leaf
            best_n_estimators = n_estimators
        print('=============================')

min_samples_leaf:  3
n_estimators:  100
f1 is 0.78224
Precision is 0.78224
Recall is 0.78224
AUC is 0.83668
min_samples_leaf:  3
n_estimators:  200
f1 is 0.78564
Precision is 0.78564
Recall is 0.78564
AUC is 0.83923
min_samples_leaf:  3
n_estimators:  300
f1 is 0.78602
Precision is 0.78602
Recall is 0.78602
AUC is 0.83952
min_samples_leaf:  4
n_estimators:  100
f1 is 0.77393
Precision is 0.77393
Recall is 0.77393
AUC is 0.83044
min_samples_leaf:  4
n_estimators:  200
f1 is 0.77630
Precision is 0.77630
Recall is 0.77630
AUC is 0.83223
min_samples_leaf:  4
n_estimators:  300
f1 is 0.77668
Precision is 0.77668
Recall is 0.77668
AUC is 0.83251
min_samples_leaf:  5
n_estimators:  100
f1 is 0.76388
Precision is 0.76388
Recall is 0.76388
AUC is 0.82291
min_samples_leaf:  5
n_estimators:  200
f1 is 0.76766
Precision is 0.76766
Recall is 0.76766
AUC is 0.82575
min_samples_leaf:  5
n_estimators:  300
f1 is 0.76771
Precision is 0.76771
Recall is 0.76771
AUC is 0.82579


## gradient boosting 

In [14]:
"""from sklearn.ensemble import GradientBoostingClassifier
gbc =GradientBoostingClassifier(random_state=42)
gbc.fit(features_train,labels_train)
print_metrics(gbc, features_train, labels_train)
print_metrics(gbc, features_valid, labels_valid)
"""

'from sklearn.ensemble import GradientBoostingClassifier\ngbc =GradientBoostingClassifier(random_state=42)\ngbc.fit(features_train,labels_train)\nprint_metrics(gbc, features_train, labels_train)\nprint_metrics(gbc, features_valid, labels_valid)\n'

## Generating submission file

In [15]:
pred = rf_model.predict(all_columns_test)
test_data = pd.read_csv("test.csv")
test_data['readmitted'] = pred
test_data[['encounter_id', 'readmitted']].to_csv('submission.csv', index=False)