In [63]:
import pandas as pd
from joblib import dump, load
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import NearMiss
from matplotlib import pyplot as plt
from IPython.display import display
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

In [47]:
def synthetic_minority_over_sampling(output, all_columns_train, random_state=20):
    """Resample a training set with SMOTE and report the class counts.

    The class counts are printed before and after resampling.

    Parameters
    ----------
    output : array-like
        Class labels of the training samples. Note the argument order:
        labels come first, features second.
    all_columns_train : array-like
        Feature matrix whose rows correspond to the entries of ``output``.
    random_state : int, default 20
        Seed forwarded to ``SMOTE``. The default reproduces the value that
        used to be hard-coded here.

    Returns
    -------
    tuple
        ``(train_input_new, train_output_new)``: the resampled features and
        labels, exactly as returned by ``SMOTE.fit_resample``.
    """
    print('Original dataset shape {}'.format(Counter(output)))
    sampler = SMOTE(random_state=random_state)
    # fit_resample takes (features, labels), the reverse of this function's signature.
    train_input_new, train_output_new = sampler.fit_resample(all_columns_train, output)
    print('New dataset shape {}'.format(Counter(train_output_new)))
    return train_input_new, train_output_new

In [48]:
def calc_prevalence(y_actual):
    """Print the class distribution of a label vector to expose class imbalance.

    Two tables are printed: the share of each class (count divided by the
    number of samples), followed by the raw count of each class.

    Parameters
    ----------
    y_actual : pandas.Series or 1-D array-like
        The output (target) variable. Objects that already provide
        ``value_counts`` are used as-is; anything else (list, NumPy array)
        is wrapped in a ``pandas.Series`` first, so the function keeps
        working on label arrays that are no longer Series.

    Returns
    -------
    None
    """
    labels = y_actual if hasattr(y_actual, 'value_counts') else pd.Series(y_actual)
    counts = labels.value_counts()
    print(counts / len(labels))
    print(counts)

In [67]:
# Seed NumPy's global random number generator so later steps that draw from it
# start from a known state.
np.random.seed(0)
# Feature matrices stored as .npy files; in the recorded run the training
# matrix has shape (71236, 60).
all_columns_train = np.load('all_columns_train.npy')
all_columns_test = np.load('all_columns_test.npy')
# Training target (the recorded outputs show it as a Series named 'readmitted').
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
output = pd.read_pickle('train_output.pkl')

In [68]:
# Sanity check: dimensions of the loaded training feature matrix.
all_columns_train.shape

(71236, 60)

In [69]:
# Hold out 10% of the training data for validation. The split is stratified on
# the target so both parts keep the same class proportions.
# random_state is pinned so the split no longer depends on the state of NumPy's
# global RNG, i.e. on which cells were executed (and in what order) beforehand.
features_train, features_valid, labels_train, labels_valid = train_test_split(
    all_columns_train,
    output,
    train_size=0.9,
    stratify=output,
    random_state=0,
)

In [52]:
# SMOTE oversampling of the training split is currently disabled (left commented out):
#features_train, labels_train = synthetic_minority_over_sampling(labels_train, features_train)

In [53]:
# Class balance of the training split: proportions first, then raw counts.
calc_prevalence(labels_train)

NO     0.539131
>30    0.349267
<30    0.111602
Name: readmitted, dtype: float64
NO     30724
>30    19904
<30     6360
Name: readmitted, dtype: int64


In [54]:
# Class balance of the validation split: proportions first, then raw counts.
calc_prevalence(labels_valid)

NO     0.539093
>30    0.349312
<30    0.111595
Name: readmitted, dtype: float64
NO     7681
>30    4977
<30    1590
Name: readmitted, dtype: int64


In [55]:
def print_metrics_and_return_f1_score(model, features, labels, average='micro'):
    """Score a fitted classifier with F1, print the score, and return it.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(features)``.
    features : array-like
        Samples to predict on.
    labels : array-like
        True labels for ``features``.
    average : str, default 'micro'
        Averaging mode forwarded to ``sklearn.metrics.f1_score``. The default
        keeps the previous behaviour. Note that for single-label multiclass
        targets micro-averaged F1 equals plain accuracy, so pass ``'macro'``
        or ``'weighted'`` when minority-class performance matters.

    Returns
    -------
    float
        The F1 score that was printed.
    """
    predictions = model.predict(features)
    f1 = f1_score(labels, predictions, average=average)
    print("f1 is {0:.5f}".format(f1))
    return f1

In [70]:
from sklearn.linear_model import SGDClassifier
# Hyper-parameter grid for the linear SGD classifier:
# 2 losses x 3 penalties x 4 alphas = 24 candidates.
loss = ['hinge', 'log_loss']
penalty = ['l2', 'l1', 'elasticnet']
alpha_values = [10, 1.0, 0.1, 0.01]
grid = dict(penalty=penalty,alpha=alpha_values, loss=loss)
# NOTE: despite its name, lr_model is an SGDClassifier; which linear model it
# amounts to depends on the `loss` picked from the grid.
lr_model = SGDClassifier(random_state=0)
# 5 stratified folds repeated twice -> 10 fits per candidate (240 fits in total).
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
# No `scoring` is passed, so candidates are presumably ranked by the estimator's
# own default score - TODO confirm this is the intended selection metric.
grid_search = GridSearchCV(estimator=lr_model, param_grid=grid, n_jobs=-1, cv=cv, verbose=1)
grid_result = grid_search.fit(features_train, labels_train)

# Report F1 of the best configuration on the training split, then on the
# validation split, followed by the winning hyper-parameters.
print_metrics_and_return_f1_score(grid_result.best_estimator_, features_train, labels_train)
print_metrics_and_return_f1_score(grid_result.best_estimator_, features_valid, labels_valid)
print(grid_result.best_params_)

Fitting 10 folds for each of 24 candidates, totalling 240 fits
f1 is 0.57697
f1 is 0.57650
{'alpha': 0.01, 'loss': 'log_loss', 'penalty': 'l2'}


## Random forest

In [71]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid for the random forest: 3 x 2 x 3 = 18 candidates.
n_estimators = [10, 100, 500]
max_features = ['sqrt', 'log2']
min_samples_split = [10, 20, 5]
# NOTE(review): `grid`, `grid_search` and `grid_result` rebind the names used
# by the SGD search; after this cell they refer to the random-forest search.
grid = dict(n_estimators=n_estimators,max_features=max_features, min_samples_split=min_samples_split)
# NOTE(review): GridSearchCV fits clones of the estimator it is given, so
# `rf_model` itself presumably stays unfitted; the tuned, refitted forest is
# `grid_result.best_estimator_`.
rf_model = RandomForestClassifier(random_state=0)
# No `cv` is passed here (the recorded run used 5 folds per candidate).
grid_search = GridSearchCV(estimator=rf_model, param_grid=grid, n_jobs=-1, verbose=1)
grid_result = grid_search.fit(features_train, labels_train)

# Report F1 of the best forest on the training split, then on the validation
# split, followed by the winning hyper-parameters.
print_metrics_and_return_f1_score(grid_result.best_estimator_, features_train, labels_train)
print_metrics_and_return_f1_score(grid_result.best_estimator_, features_valid, labels_valid)
print(grid_result.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits




f1 is 0.75956
f1 is 0.58478
{'max_features': 'sqrt', 'min_samples_split': 20, 'n_estimators': 500}


In [72]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost baseline with library-default hyper-parameters (only the seed is set).
adamodel = AdaBoostClassifier(random_state=0)
adamodel.fit(features_train,labels_train)
# Only the validation split is scored here; as the last expression, the
# returned F1 is also displayed as the cell output.
print_metrics_and_return_f1_score(adamodel, features_valid, labels_valid)

f1 is 0.57748


0.5774845592363841

In [73]:
from xgboost import XGBClassifier
from sklearn import preprocessing

# XGBoost is trained on integer class ids, so the string labels are encoded first.
# The encoded labels are stored under NEW names instead of overwriting
# `labels_train` / `labels_valid`. Overwriting made this cell non-idempotent:
# a second run would fit the encoder on already-encoded integers, after which
# `le.inverse_transform` returns integers instead of the original class strings.
# It also silently changed the labels seen by any earlier cell that is re-run.
le = preprocessing.LabelEncoder()
labels_train_encoded = le.fit_transform(labels_train)
labels_valid_encoded = le.transform(labels_valid)

xgb_model = XGBClassifier(max_depth=4, random_state=0)
xgb_model.fit(features_train, labels_train_encoded)
# Validation score of the fitted model; as the last expression it is displayed
# as the cell output.
xgb_model.score(features_valid, labels_valid_encoded)

0.5854856822010107

## Generating submission file

In [60]:
# XGBoost submission: the model predicts encoded class ids, so map them back to
# the original labels with the encoder that was fitted on the training labels.
pred = le.inverse_transform(xgb_model.predict(all_columns_test))
test_data = pd.read_csv("test.csv")
test_data['readmitted'] = pred
# NOTE(review): the random-forest submission cell writes to the same
# 'submission.csv' path, so whichever of the two cells runs last wins.
test_data[['encounter_id', 'readmitted']].to_csv('submission.csv', index=False)

In [61]:
# Random-forest submission.
# `rf_model` is only the unfitted template that was handed to GridSearchCV, which
# fits clones of it, so calling `rf_model.predict` fails on a fresh
# Restart & Run All. The tuned forest, refitted on the whole training split, is
# `grid_result.best_estimator_` (the random-forest search is the last cell that
# assigns `grid_result`).
best_rf_model = grid_result.best_estimator_
# Guard against hidden state: fail loudly if `grid_result` belongs to another search.
assert isinstance(best_rf_model, RandomForestClassifier), (
    "grid_result does not hold the random-forest search; re-run the random-forest cell"
)
pred = best_rf_model.predict(all_columns_test)
test_data = pd.read_csv("test.csv")
test_data['readmitted'] = pred
# NOTE(review): this overwrites the 'submission.csv' written by the XGBoost
# submission cell; the file on disk reflects whichever cell ran last.
test_data[['encounter_id', 'readmitted']].to_csv('submission.csv', index=False)