In [1]:
import pandas as pd
from joblib import dump, load
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import NearMiss
from matplotlib import pyplot as plt
from IPython.display import display
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import GridSearchCV

In [2]:
def synthetic_minority_over_sampling(output, all_columns_train):
    """Rebalance a training set with SMOTE and report the class counts.

    The class counts are printed before and after resampling so the effect
    of the oversampling is visible in the cell output.

    Parameters
    ----------
    output : class labels belonging to ``all_columns_train``.
    all_columns_train : feature matrix to resample.

    Returns
    -------
    tuple
        ``(resampled_features, resampled_labels)`` as returned by
        ``SMOTE.fit_resample``.
    """
    class_counts_before = Counter(output)
    print('Original dataset shape {}'.format(class_counts_before))

    # Fixed seed so the synthetic samples are the same on every run.
    oversampler = SMOTE(random_state=20)
    resampled_features, resampled_labels = oversampler.fit_resample(all_columns_train, output)

    class_counts_after = Counter(resampled_labels)
    print('New dataset shape {}'.format(class_counts_after))
    return resampled_features, resampled_labels

In [3]:
def calc_prevalence(y_actual):
    """Print how often each class occurs, to reveal class imbalance.

    Two tables are printed: first the fraction of samples per class, then
    the absolute number of samples per class.

    Parameters
    ----------
    y_actual : the output variable; must support ``value_counts()`` and
        ``len()`` (e.g. a pandas Series).

    Returns
    -------
    None
    """
    class_counts = y_actual.value_counts()
    class_fractions = class_counts / len(y_actual)
    print(class_fractions)
    print(class_counts)

In [43]:
# Load the train/test feature matrices (stored as NumPy .npy files) and the
# training target (stored as a pickle) from the working directory.
# Presumably these were produced by an earlier preprocessing step — TODO confirm.
all_columns_train = np.load('all_columns_train.npy')
all_columns_test = np.load('all_columns_test.npy')
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
output = pd.read_pickle('train_output.pkl')

In [44]:
# Sanity check: dimensions of the training feature matrix.
all_columns_train.shape

(50362, 43)

In [64]:
# Hold out 20% of the labelled data for validation.
# - stratify=output keeps the class proportions the same in both splits.
# - random_state pins the shuffle so the split (and every score computed from
#   it further down) is reproducible across kernel restarts.
features_train, features_valid, labels_train, labels_valid = train_test_split(
    all_columns_train,
    output,
    test_size=0.2,
    train_size=0.8,
    stratify=output,
    random_state=42,
)

In [65]:
# SMOTE oversampling of the training split is currently disabled; uncomment the
# line below to rebalance the classes before fitting the models.
#features_train, labels_train = synthetic_minority_over_sampling(labels_train, features_train)

In [66]:
# Class distribution of the training split.
calc_prevalence(labels_train)

NO     0.641366
>30    0.280002
<30    0.078632
Name: readmitted, dtype: float64
NO     25840
>30    11281
<30     3168
Name: readmitted, dtype: int64


In [67]:
# Class distribution of the validation split; the split was stratified, so this
# should closely match the training distribution.
calc_prevalence(labels_valid)

NO     0.641318
>30    0.280056
<30    0.078626
Name: readmitted, dtype: float64
NO     6460
>30    2821
<30     792
Name: readmitted, dtype: int64


In [17]:
def print_metrics_and_return_f1_score(model, features, labels, average='micro'):
    """Print F1, precision, recall and ROC AUC for ``model`` and return the F1.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    features : inputs passed to ``model.predict``.
    labels : true class labels corresponding to ``features``.
    average : str, default ``'micro'``
        Averaging mode forwarded to all four metric functions (e.g.
        ``'micro'``, ``'macro'``, ``'weighted'``). The default reproduces the
        previous hard-coded behaviour.

    Returns
    -------
    float
        The F1 score computed with ``average``.

    Notes
    -----
    NOTE(review): for single-label multiclass targets, micro-averaged
    precision, recall and F1 are all equal to plain accuracy (see the
    scikit-learn model evaluation guide), which is easy to misread on an
    imbalanced target; pass ``average='macro'`` for a class-balanced view.

    NOTE(review): the AUC below is computed from binarized hard predictions,
    not from probabilities or decision scores, so it is not a ranking-based
    ROC AUC.
    """
    predictions = model.predict(features)

    f1 = f1_score(labels, predictions, average=average)
    print("f1 is {0:.5f}".format(f1))
    print("Precision is {0:.5f}".format(precision_score(labels, predictions, average=average)))
    print("Recall is {0:.5f}".format(recall_score(labels, predictions, average=average)))

    # Binarize truth and predictions against the same class list so that the
    # indicator columns line up.
    classes = np.unique(labels)
    y_true = label_binarize(labels, classes=classes)
    y_predict = label_binarize(predictions, classes=classes)
    print("AUC is {0:.5f}".format(roc_auc_score(y_true, y_predict, average=average)))
    return f1

In [68]:
from sklearn.linear_model import SGDClassifier

# Linear baseline trained with stochastic gradient descent. random_state pins
# the data shuffling so the reported scores are reproducible.
# NOTE(review): per the scikit-learn docs SGDClassifier defaults to hinge loss
# (a linear SVM), so despite the `lr_` prefix this is not logistic regression
# unless a logistic loss is requested — confirm the intended loss.
lr_model = SGDClassifier(random_state=42)
lr_model.fit(X=features_train, y=labels_train)

# Compare train and validation scores to spot over- or under-fitting.
print_metrics_and_return_f1_score(lr_model, features_train, labels_train)
print_metrics_and_return_f1_score(lr_model, features_valid, labels_valid)

f1 is 0.64129
Precision is 0.64129
Recall is 0.64129
AUC is 0.73097
f1 is 0.64092
Precision is 0.64092
Recall is 0.64092
AUC is 0.73069


0.6409212746947285

## Random forest

In [69]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest. class_weight='balanced' re-weights the classes
# inversely to their frequency; random_state pins the bootstrap and feature
# sampling so the reported scores are reproducible.
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_model.fit(features_train, labels_train)

# Compare train and validation scores to spot over-fitting.
print_metrics_and_return_f1_score(rf_model, features_train, labels_train)
print_metrics_and_return_f1_score(rf_model, features_valid, labels_valid)

f1 is 0.99993
Precision is 0.99993
Recall is 0.99993
AUC is 0.99994
f1 is 0.63923
Precision is 0.63923
Recall is 0.63923
AUC is 0.72943


0.6392335947582647

In [70]:
# Manual grid search over random-forest hyper-parameters, scored by F1 on the
# validation split. The best candidate is kept in `best_model`.
max_f1 = 0
# Initialise the "best" slots so they are always defined, even if no candidate
# scores above 0.
best_model = None
best_min_samples_leaf = None
best_n_estimators = None

for min_samples_leaf in [3, 4, 5]:
    for n_estimators in [100, 200, 300]:
        # random_state makes every candidate reproducible, so differences in
        # score come from the hyper-parameters rather than from seed noise.
        # NOTE(review): unlike the baseline forest, class_weight='balanced' is
        # not set here — confirm that this is intentional.
        gs_model = RandomForestClassifier(
            min_samples_leaf=min_samples_leaf,
            n_estimators=n_estimators,
            random_state=42,
        )
        gs_model.fit(features_train, labels_train)
        print('min_samples_leaf: ', min_samples_leaf)
        print('n_estimators: ', n_estimators)
        f1 = print_metrics_and_return_f1_score(gs_model, features_valid, labels_valid)
        if f1 > max_f1:
            max_f1 = f1
            best_model = gs_model
            best_min_samples_leaf = min_samples_leaf
            best_n_estimators = n_estimators
        print('=============================')

min_samples_leaf:  3
n_estimators:  100
f1 is 0.64847
Precision is 0.64847
Recall is 0.64847
AUC is 0.73635
min_samples_leaf:  3
n_estimators:  200
f1 is 0.64916
Precision is 0.64916
Recall is 0.64916
AUC is 0.73687
min_samples_leaf:  3
n_estimators:  300
f1 is 0.64767
Precision is 0.64767
Recall is 0.64767
AUC is 0.73575
min_samples_leaf:  4
n_estimators:  100
f1 is 0.64837
Precision is 0.64837
Recall is 0.64837
AUC is 0.73628
min_samples_leaf:  4
n_estimators:  200
f1 is 0.64747
Precision is 0.64747
Recall is 0.64747
AUC is 0.73561
min_samples_leaf:  4
n_estimators:  300
f1 is 0.64817
Precision is 0.64817
Recall is 0.64817
AUC is 0.73613
min_samples_leaf:  5
n_estimators:  100
f1 is 0.64737
Precision is 0.64737
Recall is 0.64737
AUC is 0.73553
min_samples_leaf:  5
n_estimators:  200
f1 is 0.64757
Precision is 0.64757
Recall is 0.64757
AUC is 0.73568
min_samples_leaf:  5
n_estimators:  300
f1 is 0.64906
Precision is 0.64906
Recall is 0.64906
AUC is 0.73680


## Gradient boosting

In [14]:
"""from sklearn.ensemble import GradientBoostingClassifier
gbc =GradientBoostingClassifier(random_state=42)
gbc.fit(features_train,labels_train)
print_metrics(gbc, features_train, labels_train)
print_metrics(gbc, features_valid, labels_valid)
"""

'from sklearn.ensemble import GradientBoostingClassifier\ngbc =GradientBoostingClassifier(random_state=42)\ngbc.fit(features_train,labels_train)\nprint_metrics(gbc, features_train, labels_train)\nprint_metrics(gbc, features_valid, labels_valid)\n'

## Generating submission file

In [70]:
# Predict with the model selected by the grid search (`best_model`) rather than
# the untuned baseline forest, so the submission reflects the tuning above.
pred = best_model.predict(all_columns_test)

# Assumes the rows of all_columns_test are in the same order as test.csv —
# TODO confirm against the preprocessing step. pandas raises if the lengths
# differ, but it cannot detect a reordering.
test_data = pd.read_csv("test.csv")
test_data['readmitted'] = pred
test_data[['encounter_id', 'readmitted']].to_csv('submission.csv', index=False)