In [None]:
import pandas as pd
import sklearn

In [None]:
# Demo-only loading step: it mimics how a caller would feed data to the model
# and is not meant to be part of the Model API itself.
data = pd.read_csv(filepath_or_buffer='../data/dataset.csv')
data.head(n=5)

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
data.shape

In [None]:
# Separate the prediction target from the feature columns.
true_values = data['hospital_death']
train_data = data.drop(columns='hospital_death')

<h5>Split for Train and Test</h5>

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is stratified on the
# target so both parts keep the same class ratio, and the seed is fixed so
# that a fresh "Restart & Run All" reproduces the exact same split.
x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    true_values,
    test_size=0.2,
    stratify=true_values,
    random_state=42,
)

# Training and Testing

### Initializing models and model parameters

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics

import time


# Iteration cap applied to every LogisticRegression instance below.
logisticRegressionIterNum = 100

# Seed shared by all estimators that accept `random_state`, so repeated runs
# of the notebook produce the same fitted models.
modelsRandomState = 42

# Base estimators combined by the VotingClassifier.
estimators = [
    ('lr', LogisticRegression(max_iter=logisticRegressionIterNum, random_state=modelsRandomState)),
    ('dtc', DecisionTreeClassifier(random_state=modelsRandomState)),
    ('rfc', RandomForestClassifier(random_state=modelsRandomState)),
    ('knc', KNeighborsClassifier())
]

# Single source of truth pairing each display label with its model. `labels`
# and `classifiers` are derived from it so the two lists cannot drift apart.
labelledClassifiers = [
    ("LR", LogisticRegression(max_iter=logisticRegressionIterNum, random_state=modelsRandomState)),
    ("RandomForest", RandomForestClassifier(random_state=modelsRandomState)),
    ("ExtraTrees", ExtraTreesClassifier(random_state=modelsRandomState)),
    ("GradientBoosting", GradientBoostingClassifier(random_state=modelsRandomState)),
    ("Voting", VotingClassifier(estimators=estimators)),
    ("SVC", SVC(random_state=modelsRandomState))
]

classifiers = [classifier for _, classifier in labelledClassifiers]

labels = [label for label, _ in labelledClassifiers]

Copied this from Kaggle to solve our lack of one-hot encoding.

In [None]:
from imblearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Categorical feature columns; these are one-hot encoded.
cat_col = [
    'ethnicity',
    'gender',
    'icu_admit_source',
    'icu_stay_type',
    'icu_type',
    'apache_3j_bodysystem',
    'apache_2_bodysystem'
]

# Numerical feature columns; missing values are filled with the column median.
# The target 'hospital_death' is deliberately NOT listed: it is dropped from
# the feature frame before the split, so selecting it here would make the
# ColumnTransformer fail on a missing column (and would leak the label if it
# were present).
num_col = [
    'age',
    'bmi',
    'elective_surgery',
    'height',
    'icu_id',
    'pre_icu_los_days',
    'weight',
    'apache_2_diagnosis',
    'apache_3j_diagnosis',
    'apache_post_operative',
    'arf_apache',
    'gcs_eyes_apache',
    'gcs_motor_apache',
    'gcs_unable_apache',
    'gcs_verbal_apache',
    'heart_rate_apache',
    'intubated_apache',
    'map_apache',
    'resprate_apache',
    'temp_apache',
    'ventilated_apache',
    'd1_diasbp_max',
    'd1_diasbp_min',
    'd1_diasbp_noninvasive_max',
    'd1_diasbp_noninvasive_min',
    'd1_heartrate_max',
    'd1_heartrate_min',
    'd1_mbp_max',
    'd1_mbp_min',
    'd1_mbp_noninvasive_max',
    'd1_mbp_noninvasive_min',
    'd1_resprate_max',
    'd1_resprate_min',
    'd1_spo2_max',
    'd1_spo2_min',
    'd1_sysbp_max',
    'd1_sysbp_min',
    'd1_sysbp_noninvasive_max',
    'd1_sysbp_noninvasive_min',
    'd1_temp_max',
    'd1_temp_min',
    'h1_diasbp_max',
    'h1_diasbp_min',
    'h1_diasbp_noninvasive_max',
    'h1_diasbp_noninvasive_min',
    'h1_heartrate_max',
    'h1_heartrate_min',
    'h1_mbp_max',
    'h1_mbp_min',
    'h1_mbp_noninvasive_max',
    'h1_mbp_noninvasive_min',
    'h1_resprate_max',
    'h1_resprate_min',
    'h1_spo2_max',
    'h1_spo2_min',
    'h1_sysbp_max',
    'h1_sysbp_min',
    'h1_sysbp_noninvasive_max',
    'h1_sysbp_noninvasive_min',
    'd1_glucose_max',
    'd1_glucose_min',
    'd1_potassium_max',
    'd1_potassium_min',
    'apache_4a_hospital_death_prob',
    'apache_4a_icu_death_prob',
    'aids',
    'cirrhosis',
    'diabetes_mellitus',
    'hepatic_failure',
    'immunosuppression',
    'leukemia',
    'lymphoma',
    'solid_tumor_with_metastasis'
]

# Column-wise preprocessing applied before every classifier.
# handle_unknown='ignore' keeps transform() from raising when the test split
# contains a category value that never appeared in the training split.
preprocess_pipeline = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), num_col),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_col),
    ]
)



## Train

In [None]:

# Fit durations (formatted like '1.23s'), one entry per model in the same
# order as `classifiers`.
fit_running_times = []

def train(x_train, y_train):
    """Fit every model in `classifiers` behind the shared preprocessing step.

    Each classifier is wrapped in a Pipeline together with
    `preprocess_pipeline` and fitted on the training split. The wall-clock
    duration of each fit is recorded in the module-level `fit_running_times`.

    Parameters
    ----------
    x_train : feature rows used for fitting.
    y_train : target values aligned with `x_train`.

    Returns
    -------
    list
        The fitted pipelines, in the same order as `classifiers`.
    """
    # Start from an empty list so that calling train() more than once does not
    # leave more timing entries than there are classifiers.
    fit_running_times.clear()

    fitted_pipelines = []
    for classifier in classifiers:
        clf_pipeline = Pipeline([("preprocessor", preprocess_pipeline), ("classifier", classifier)])

        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments.
        start_time = time.perf_counter()
        clf_pipeline.fit(x_train, y_train)
        elapsed = time.perf_counter() - start_time

        fit_running_times.append(f'{round(elapsed, 2)}s')
        fitted_pipelines.append(clf_pipeline)

    return fitted_pipelines

fitted_pipelines = train(x_train=x_train, y_train=y_train)


fit_running_times

## Test

In [None]:

def test(x_test, y_test):
    """Score every fitted classifier on the hold-out split.

    Parameters
    ----------
    x_test : raw (not yet preprocessed) feature rows of the test split.
    y_test : true target values aligned with `x_test`.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with the columns 'Method', 'precision_score',
        'recall_score', 'f1_score', 'accuracy_score' and 'time ' (the fit
        duration taken from `fit_running_times`).
    """
    # The classifiers were fitted on the output of `preprocess_pipeline`, so
    # the raw test frame must pass through the same, already fitted,
    # transformer before predicting. Calling predict() on the raw frame would
    # hand the models unencoded and unimputed columns.
    x_test_prepared = preprocess_pipeline.transform(x_test)

    rows = []
    for label, classifier in zip(labels, classifiers):
        # Predict once and reuse the result for every metric.
        y_pred = classifier.predict(x_test_prepared)
        rows.append({
            'Method': f'{label}',
            'precision_score': metrics.precision_score(y_test, y_pred),
            'recall_score': metrics.recall_score(y_test, y_pred),
            'f1_score': metrics.f1_score(y_test, y_pred),
            'accuracy_score': metrics.accuracy_score(y_test, y_pred),
        })

    results_dataFrame = pd.DataFrame(
        rows,
        columns=['Method', 'precision_score', 'recall_score', 'f1_score', 'accuracy_score'],
    )
    # Assigned as a whole column so that a length mismatch with the number of
    # classifiers raises instead of being silently truncated.
    results_dataFrame['time '] = fit_running_times

    return results_dataFrame

pred_results = test(x_test=x_test, y_test=y_test)
pred_results