In [1]:
import pandas as pd
import os
from matplotlib import pyplot as plt

In [2]:
df = pd.read_parquet("/h/dsmith/physionet_data/preppedf/physionet_data.parquet", engine="pyarrow")
npts = 10
df

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,SepsisEver,id
0,77.0,98.0,37.060000,155.0,101.67,64.263019,20.0,33.017979,-0.441724,25.0,...,238.0,76.30,0.0,1.0,0.0,-0.03,1.0,0.0,0,7002
1,77.0,98.0,37.060000,155.0,101.67,64.263019,20.0,33.017979,-0.441724,25.0,...,238.0,76.30,0.0,1.0,0.0,-0.03,2.0,0.0,0,7002
2,73.0,99.0,37.060000,140.0,88.00,64.263019,22.0,33.017979,-0.441724,25.0,...,238.0,76.30,0.0,1.0,0.0,-0.03,3.0,0.0,0,7002
3,66.0,95.0,37.060000,132.0,84.00,64.263019,19.0,33.017979,-0.441724,25.0,...,238.0,76.30,0.0,1.0,0.0,-0.03,4.0,0.0,0,7002
4,66.0,98.0,37.021667,147.0,89.00,64.263019,20.0,33.017979,-0.441724,25.0,...,238.0,76.30,0.0,1.0,0.0,-0.03,5.0,0.0,0,7002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937423,86.0,99.0,36.225000,104.0,64.00,64.263019,26.0,33.017979,-0.441724,28.0,...,211.0,77.26,1.0,1.0,0.0,-0.03,16.0,0.0,0,16000
937424,89.0,98.0,36.280000,110.0,66.67,64.263019,26.0,33.017979,-0.441724,28.0,...,211.0,77.26,1.0,1.0,0.0,-0.03,17.0,0.0,0,16000
937425,89.0,99.0,36.335000,100.0,65.33,64.263019,24.0,33.017979,-0.441724,28.0,...,211.0,77.26,1.0,1.0,0.0,-0.03,18.0,0.0,0,16000
937426,86.0,99.0,36.390000,99.0,66.33,64.263019,26.0,33.017979,-0.441724,28.0,...,211.0,77.26,1.0,1.0,0.0,-0.03,19.0,0.0,0,16000


In [3]:
badcols = ["ICULOS", "SepsisLabel"]
slim = df[[i for i in df.columns if i not in badcols]]
variations = slim.groupby("id").std().mean()
static_vars = variations[variations == 0].index.tolist()
puny_vars = variations[(0 < variations) & (variations < 0.05)].index.tolist()
moving_vars = [i for i in variations.index.tolist() if i not in puny_vars + static_vars]

In [4]:
data_groups = {
    "labels": slim.groupby("id")["SepsisEver"].tail(1),
    "skew": slim[moving_vars + ["id"]].groupby("id").tail(npts).groupby("id").skew().reset_index(),
    "std": slim[moving_vars + ["id"]].groupby("id").tail(npts).groupby("id").std().reset_index(),
    "mean": slim[moving_vars + ["id"]].groupby("id").tail(npts).groupby("id").mean().reset_index(),
    "static": slim[static_vars + ["id"]].groupby("id").tail(1).reset_index(),
}

In [5]:
var_sets = {
    "kitchen_sink": ["std", "mean", "static", "skew"],
    "static": ["static"],
    "moving": ["mean", "std", "kurt", "skew"],
    "tail": ["mean"]
}

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_validate

class BoostModel(object):
    
    def __init__(self, X, y, group="none", nparts=10):
        self.Xdata = X
        self.ydata = y
        self.nparts = nparts
        self.results = pd.DataFrame()
        self.dgroup = group
        
    def optimize(self):
        depths = [2, 3, 5, 7, 10]
        min_leafs = [2, 8, 32, 128, 256]
        
        for i in depths:
            for j in min_leafs:
                model = RFC(max_depth=i, min_samples_leaf=j, class_weight="balanced")
                cv_results = cross_validate(model, self.Xdata, self.ydata, cv=self.nparts, scoring=("f1", "accuracy"))
                self.results = self.results.append(
                    {
                        "depth": i,
                        "min_leafs": j,
                        "data_group": self.dgroup,
                        "test_acc": cv_results["test_accuracy"].mean(),
                        "test_acc_std": cv_results["test_accuracy"].std(),
                        "test_f1": cv_results["test_accuracy"].mean(),
                        "test_f1_std": cv_results["test_accuracy"].std(),
                    },
                    ignore_index=True
                )
                print(
                    "Random Forest with depth {} and min_leafs {} got accuracy = {} +/- {} on {}.".format(
                        i, j, cv_results["test_accuracy"].mean(), cv_results["test_accuracy"].std(), self.dgroup
                    )
                )
        
        
def assemble_data(var_set):
    chosen_sets = var_sets[var_set]
    
    dataset = data_groups[chosen_sets[0]].copy()
    for i in range(1, len(chosen_sets)):
        dataset = dataset.merge(data_groups[chosen_sets[i]], on="id")
    
    del dataset["id"]
    return dataset

 
all_results = pd.DataFrame()
for i in var_sets:
    print("\nPreparing {}...".format(i))
    xdata = assemble_data(i)
    print("Data prepped. Running models.")
    BM = BoostModel(xdata, data_groups["labels"], group=i)
    BM.optimize()
    all_results = all_results.append(BM.results, ignore_index=True)
    
all_results.to_csv("model_opt_results.csv", index=False)


Preparing kitchen_sink...
Data prepped. Running models.
Random Forest with depth 2 and min_leafs 2 got accuracy = 0.5932604184588528 +/- 0.07904975650584353 on kitchen_sink.
Random Forest with depth 2 and min_leafs 8 got accuracy = 0.5746047049344741 +/- 0.07896453159798568 on kitchen_sink.
