In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

from important_features import select_feats

Import the dataset

In [2]:
# Read the raw training table, then separate the target column
# ('hospital_death') from the predictor columns.
data = pd.read_csv('training_v2.csv')
y = data['hospital_death']
X = data.drop(columns='hospital_death')

Create a dataset with important features determined through random sampling

In [3]:
# Restrict X to the selection given by select_feats (imported from the
# important_features module).
# NOTE(review): assumes select_feats is a list of column labels — confirm.
X = X[select_feats]

Clean the data
- Fill missing values with -1
- Label encode categorical variables

In [5]:
### Preprocessing
# Build `train_fill`, a copy of X in which missing values are replaced by the
# sentinel -1 and every non-numeric column is replaced by integer codes.

# Fill NaNs with -1; fillna returns a new frame, X is not modified.
train_fill = X.fillna(-1)

# Split the columns by dtype with the public select_dtypes API. Numeric and
# boolean columns count as numeric; everything else is treated as categorical.
# cat_cols follows X's column order, so the loop below runs deterministically.
cols = X.columns
num_cols = X.select_dtypes(include=['number', 'bool']).columns  # numeric
numeric_names = set(num_cols)
cat_cols = [col for col in cols if col not in numeric_names]  # categorical

# Cast categoricals to str so the -1 sentinel and the original labels share a
# single type that the encoder can sort.
train_fill[cat_cols] = train_fill[cat_cols].astype('str')

# Label-encode each categorical column. LabelEncoder numbers the sorted unique
# labels from 0; the +1 shifts the codes so they start at 1.
for col in cat_cols:
    le = LabelEncoder()
    train_fill[col] = le.fit_transform(train_fill[col]) + 1

Create the XGBoost model with hyperparameters selected through grid search

In [6]:
# Instantiate the XGBoost classifier with fixed hyperparameters.
# All arguments are passed by keyword, one per line, so individual settings
# are easy to spot and diff.
xgb = XGBClassifier(
    colsample_bytree=0.3,
    eta=0.02,
    gamma=0.1,
    max_depth=6,
    min_child_weight=6,
)

Train and test the model

In [7]:
# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed random_state so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_fill, y, stratify=y, test_size=0.2, random_state=42)

# Fit the model on the training split only.
xgb.fit(X_train, y_train)

# predict_proba returns one probability column per class.
preds = xgb.predict_proba(X_test)

# Compute ROC AUC on the held-out split. Column 1 is passed as a 1-D array,
# the shape roc_auc_score documents for binary targets.
# NOTE(review): assumes column 1 corresponds to hospital_death == 1 — confirm
# against xgb.classes_.
roc_auc_score(y_test, preds[:, 1])

0.9077115217322473