In [1]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

## SUPPORT2 dataset

In [2]:
# Load data.
support2_df = pd.read_csv("../Datasets/support2.csv", na_values="?")

In [3]:
support2_df.columns

Index(['age', 'death', 'sex', 'hospdead', 'slos', 'd.time', 'dzgroup',
       'dzclass', 'num.co', 'edu', 'income', 'scoma', 'charges', 'totcst',
       'totmcst', 'avtisst', 'race', 'sps', 'aps', 'surv2m', 'surv6m', 'hday',
       'diabetes', 'dementia', 'ca', 'prg2m', 'prg6m', 'dnr', 'dnrday',
       'meanbp', 'wblc', 'hrt', 'resp', 'temp', 'pafi', 'alb', 'bili', 'crea',
       'sod', 'ph', 'glucose', 'bun', 'urine', 'adlp', 'adls', 'sfdm2',
       'adlsc'],
      dtype='object')

In [4]:
support2_df.hospdead.describe()

count    9105.000000
mean        0.259198
std         0.438219
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: hospdead, dtype: float64

In [6]:
# Change path as necessary.
%cd ~/Desktop/ExpO/Code

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from MedicalData import Support2DataManager

/home/gregory/Desktop/ExpO/Code


In [7]:
# Load data.
dm = Support2DataManager("../Datasets/support2.csv")

In [10]:
print(dm.X_train.shape)
print(dm.X_val.shape)
print(dm.X_test.shape)
print(4552 + 2276 * 2)

(4552, 51)
(2276, 51)
(2277, 51)
9104


## Fit some simple baseline

In [46]:
# Fit and predict using a simple model (L1-regularized logistic regression).
model = LogisticRegression(penalty="l1", C=0.05, multi_class="multinomial", solver="saga", max_iter=5000, verbose=1)
model.fit(dm.X_train, np.nonzero(dm.y_train)[1])

p_train = model.predict(dm.X_train)
p_val = model.predict(dm.X_val)
p_test = model.predict(dm.X_test)

convergence after 33 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


In [47]:
# Evaluate.
print("Train acc:", accuracy_score(np.nonzero(dm.y_train)[1], p_train))
print("Validation acc:", accuracy_score(np.nonzero(dm.y_val)[1], p_val))
print("Test acc:", accuracy_score(np.nonzero(dm.y_test)[1], p_test))

Train acc: 0.812829525483304
Validation acc: 0.7943760984182777
Test acc: 0.8146684233640755


## Fit random forest

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [96]:
# Fit and predict using a random forest.
model = RandomForestClassifier(n_estimators=1000, max_depth=12, min_samples_split=8, min_samples_leaf=4, n_jobs=4, verbose=1)
model.fit(dm.X_train, np.nonzero(dm.y_train)[1])

p_train = model.predict(dm.X_train)
p_val = model.predict(dm.X_val)
p_test = model.predict(dm.X_test)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    2.5s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:

In [97]:
# Evaluate.
print("Train acc:", accuracy_score(np.nonzero(dm.y_train)[1], p_train))
print("Validation acc:", accuracy_score(np.nonzero(dm.y_val)[1], p_val))
print("Test acc:", accuracy_score(np.nonzero(dm.y_test)[1], p_test))

Train acc: 0.9389279437609842
Validation acc: 0.7930579964850615
Test acc: 0.8111550285463329


## Fit XGBoost

In [98]:
from sklearn.ensemble import GradientBoostingClassifier

In [136]:
# Fit and predict using xgboost.
model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3, min_samples_split=10, min_samples_leaf=2, verbose=1)
model.fit(dm.X_train, np.nonzero(dm.y_train)[1])

p_train = model.predict(dm.X_train)
p_val = model.predict(dm.X_val)
p_test = model.predict(dm.X_test)

      Iter       Train Loss   Remaining Time 
         1           1.1821            1.62s
         2           1.1270            1.62s
         3           1.0810            1.51s
         4           1.0423            1.54s
         5           1.0094            1.78s
         6           0.9811            1.75s
         7           0.9570            1.72s
         8           0.9348            1.70s
         9           0.9157            1.69s
        10           0.8987            1.64s
        20           0.7966            1.66s
        30           0.7579            1.64s
        40           0.7301            1.51s
        50           0.7117            1.37s
        60           0.6958            1.27s
        70           0.6826            1.17s
        80           0.6709            1.04s
        90           0.6607            0.94s
       100           0.6501            0.85s
       200           0.5652            0.00s


In [137]:
# Evaluate.
print("Train acc:", accuracy_score(np.nonzero(dm.y_train)[1], p_train))
print("Validation acc:", accuracy_score(np.nonzero(dm.y_val)[1], p_val))
print("Test acc:", accuracy_score(np.nonzero(dm.y_test)[1], p_test))

Train acc: 0.8811511423550088
Validation acc: 0.7939367311072056
Test acc: 0.8028107158541942
