In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Random dataset

In [2]:
def random_bool(size):
    """Return `size` random booleans as a NumPy array.

    Each entry is True when a draw from np.random.rand exceeds 0.5.
    """
    draws = np.random.rand(size)
    return np.greater(draws, 0.5)

def random_int(min, max, size):
    """Return `size` random integers drawn with np.random.randint.

    `min` is passed as the lower bound and `max` as the upper bound of
    np.random.randint.
    """
    values = np.random.randint(low=min, high=max, size=size)
    return values

def random_float(min, max, size):
    """Return `size` random floats drawn with np.random.uniform.

    `min` and `max` are passed as the low and high bounds of
    np.random.uniform.
    """
    values = np.random.uniform(low=min, high=max, size=size)
    return values


def index_id(max):
    """Build the identifier labels 'ID1', 'ID2', ..., 'ID<max - 1>'.

    The upper bound is exclusive: `index_id(4)` yields three labels.

    Returns:
        A list of strings, empty when `max` is 1 or less.
    """
    return ['ID' + str(number) for number in range(1, max)]

In [7]:
# Synthetic pre-operative dataset: one row per patient, every value random.
# The row count lives in one place so the ID column and the feature columns
# cannot drift apart (index_id's upper bound is exclusive, hence the + 1).
n_rows = 50

pre_op = pd.DataFrame({
    'IDno': index_id(n_rows + 1),
    'Død inden for 1 år af operation': random_bool(n_rows),
    'age': random_int(18, 89, n_rows),
    'sex': random_bool(n_rows),
    'Respiratory Disease': random_bool(n_rows),
    'Circulatory Organ Disease': random_bool(n_rows),
    'Type 1 Diabetes': random_bool(n_rows),
    'Type 2 Diabetes': random_bool(n_rows),
    'Other metabolic diseases': random_bool(n_rows),
    'Other operation': random_bool(n_rows),
    'Genital or urine related diseases': random_bool(n_rows),
    'Vægt': random_float(35, 190, n_rows),
    'KFN': random_bool(n_rows),
    'KFX': random_bool(n_rows),
    'KFK': random_bool(n_rows),
    'KFM': random_bool(n_rows),
    'KFF': random_bool(n_rows),
    'KFP': random_bool(n_rows),
    'KFC': random_bool(n_rows),
    'KFW': random_bool(n_rows),
    'KFJ': random_bool(n_rows),
    'Hæmoglobin': random_float(4, 12, n_rows),
    'Leukocytter': random_float(1, 125, n_rows),
    'Trombocytter': random_float(20, 1000, n_rows),
})

# Encode the boolean flags as 0/1 integers with an explicit cast. The earlier
# `.replace(True, 1).replace(False, 0)` chain depended on `replace` downcasting
# its result back to an integer dtype, which recent pandas releases deprecate.
bool_cols = pre_op.select_dtypes(include='bool').columns
pre_op = pre_op.astype({col: 'int64' for col in bool_cols})

# Features: everything except the outcome and the identifier column.
X = pre_op.drop(columns=['Død inden for 1 år af operation', 'IDno'])
# Target: the 'Død inden for 1 år af operation' column, now encoded as 0/1.
y = pre_op['Død inden for 1 år af operation']

In [14]:
def standardize_scale(df, scale_cols):
    """Standardize the given columns of `df` in place with StandardScaler.

    Each column in `scale_cols` is fitted and transformed on its own, and the
    result overwrites the original column.

    Args:
        df: DataFrame whose columns are rescaled. It is modified in place.
        scale_cols: Iterable of column names to standardize.

    Returns:
        The same `df` object, after modification.
    """
    scaler = StandardScaler()
    for col in scale_cols:
        # fit_transform returns an (n_rows, 1) array. Assign it positionally:
        # wrapping it in a new DataFrame would give it a fresh 0..n-1 index,
        # and pandas would then align on index labels, producing NaN or
        # misplaced values whenever `df` does not carry that exact index
        # (for example a shuffled or filtered subset).
        df[col] = scaler.fit_transform(df[[col]]).ravel()
    return df

In [15]:
# Columns handed to standardize_scale; the remaining columns of X are left
# untouched.
scale_cols = [
    'age',
    'Vægt',
    'Hæmoglobin',
    'Leukocytter',
    'Trombocytter',
]
# standardize_scale modifies X in place and returns it, so the cell output
# shows the rescaled frame.
standardize_scale(X, scale_cols)


Unnamed: 0,age,sex,Respiratory Disease,Circulatory Organ Disease,Type 1 Diabetes,Type 2 Diabetes,Other metabolic diseases,Other operation,Genital or urine related diseases,Vægt,...,KFK,KFM,KFF,KFP,KFC,KFW,KFJ,Hæmoglobin,Leukocytter,Trombocytter
0,0.647561,1,0,0,1,1,1,0,1,1.046883,...,0,1,1,0,0,1,1,1.133735,-1.014517,-1.30372
1,0.29941,0,1,0,1,0,1,1,1,-0.645066,...,0,1,0,1,0,0,0,-1.511527,1.569545,0.012319
2,-0.993723,0,1,0,1,1,1,0,0,-0.612627,...,0,0,1,1,1,0,0,0.719156,-0.480324,0.452553
3,-1.690025,0,0,1,0,0,0,1,0,0.040288,...,0,1,0,0,0,1,0,0.723602,-1.521522,-0.997132
4,0.995712,0,0,1,1,1,0,0,1,0.997283,...,1,1,0,0,1,1,0,-2.090775,1.249766,0.337474
5,0.498354,0,0,1,0,0,1,0,1,-1.14247,...,1,0,1,1,0,0,0,1.009766,1.796889,0.155709
6,-1.540818,1,0,0,1,0,1,0,0,0.045982,...,1,1,1,0,0,0,0,1.071954,0.848349,-0.956228
7,-1.590554,1,0,1,0,0,0,1,0,-1.629307,...,1,1,1,0,1,0,0,1.314073,-0.508281,0.952457
8,1.194656,1,0,1,1,1,1,0,1,-1.045929,...,1,1,1,0,0,1,1,0.19673,-0.912019,-1.430634
9,0.945976,1,1,1,0,1,1,1,0,0.846025,...,0,1,1,0,0,1,0,-1.031236,1.03484,1.463924


# Predicting Mortality with pre_op data

## LR

In [17]:
from sklearn.linear_model import LogisticRegression

#TODO other params?
def lr_pred(X_train, y_train, X_test):
    """Grid-search a LogisticRegression and predict probabilities for X_test.

    The search uses 5-fold cross-validation scored by accuracy over solver,
    C and max_iter. Predictions come from the search's `best_estimator_`.

    Args:
        X_train: Training features passed to GridSearchCV.fit.
        y_train: Training labels passed to GridSearchCV.fit.
        X_test: Features to predict on.

    Returns:
        Column 1 of `predict_proba(X_test)` from the best estimator.
    """
    # TODO: decide whether to keep predicting through best_estimator_ or to
    # call predict_proba on the fitted search object directly.
    search = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid={
            'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
            'C': [100, 10, 1.0, 0.1, 0.01],
            'max_iter': [100, 500, 1000],
        },
        scoring='accuracy',
        cv=5,
    )
    search.fit(X_train, y_train)

    probabilities = search.best_estimator_.predict_proba(X_test)
    return probabilities[:, 1]

## Random forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

def rf_pred(X_train, y_train, X_test):
    """Grid-search a RandomForestClassifier and predict probabilities for X_test.

    The search uses 5-fold cross-validation scored by accuracy over max_depth,
    max_features and n_estimators. Predictions come from the search's
    `best_estimator_`.

    Args:
        X_train: Two-dimensional training features passed to GridSearchCV.fit.
        y_train: Training labels passed to GridSearchCV.fit.
        X_test: Features to predict on.

    Returns:
        Column 1 of `predict_proba(X_test)` from the best estimator.
    """
    # The max_features candidates were written for a 22-column matrix. Clamp
    # them to the actual column count so that no candidate asks for more
    # features than exist; inputs with 22 or more columns keep the original
    # grid [2, 5, 10, 15, 22].
    # NOTE(review): some scikit-learn releases reject max_features values
    # above the feature count — confirm against the installed version.
    n_features = np.shape(X_train)[1]
    max_features_grid = sorted({min(m, n_features) for m in (2, 5, 10, 15, 22)})

    rf = RandomForestClassifier()
    rf_grid = {
        'max_depth': [80, 90, 100, 110],
        'max_features': max_features_grid,
        'n_estimators': [100, 200, 300],
    }

    grid_result = GridSearchCV(estimator=rf, param_grid=rf_grid, scoring='accuracy', cv=5)
    grid_result.fit(X_train, y_train)

    best_model = grid_result.best_estimator_

    rf_preds = best_model.predict_proba(X_test)[:, 1]
    return rf_preds
    

## Gradient boost


In [25]:
from sklearn.ensemble import GradientBoostingClassifier

def gbc_pred(X_train, y_train, X_test):
    """Grid-search a GradientBoostingClassifier and predict probabilities.

    The search uses 5-fold cross-validation scored by accuracy over
    n_estimators, learning_rate and max_depth. Predictions come from the
    search's `best_estimator_`.

    Args:
        X_train: Training features passed to GridSearchCV.fit.
        y_train: Training labels passed to GridSearchCV.fit.
        X_test: Features to predict on.

    Returns:
        Column 1 of `predict_proba(X_test)` from the best estimator.
    """
    candidate_params = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
    }
    search = GridSearchCV(
        estimator=GradientBoostingClassifier(),
        param_grid=candidate_params,
        scoring='accuracy',
        cv=5,
    )
    search.fit(X_train, y_train)

    probabilities = search.best_estimator_.predict_proba(X_test)
    return probabilities[:, 1]

## MLP

In [22]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

def mlp_pred(X_train, y_train, X_test):
    """Grid-search an MLPClassifier and predict probabilities for X_test.

    The search uses 5-fold cross-validation scored by accuracy over
    hidden_layer_sizes, activation and alpha. Predictions come from the
    search's `best_estimator_`.

    Args:
        X_train: Training features passed to GridSearchCV.fit.
        y_train: Training labels passed to GridSearchCV.fit.
        X_test: Features to predict on.

    Returns:
        Column 1 of `predict_proba(X_test)` from the best estimator.
    """
    candidate_params = {
        'hidden_layer_sizes': [(50,), (100,), (200,)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
    }
    search = GridSearchCV(
        estimator=MLPClassifier(),
        param_grid=candidate_params,
        scoring='accuracy',
        cv=5,
    )
    search.fit(X_train, y_train)

    probabilities = search.best_estimator_.predict_proba(X_test)
    return probabilities[:, 1]

In [27]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set. The held-out labels are not used
# here, so they are bound to `_`.
X_train, X_test, y_train, _ = train_test_split(
    X,
    y,
    test_size=0.3,
)
# Smoke test: tune the random forest on the training split and show the
# predicted probabilities for the held-out rows.
rf_pred(X_train, y_train, X_test)

array([0.57, 0.44, 0.35, 0.35, 0.34, 0.17, 0.36, 0.49, 0.59, 0.36, 0.52,
       0.5 , 0.49, 0.36, 0.51])

# Grouped cross-validation

In [43]:
from perfomance_eval import evaluate_model

# NOTE(review): the recorded output of this cell is
# "TypeError: evaluate_model() got an unexpected keyword argument 'X_withid'",
# so this call does not match the imported evaluate_model signature. Confirm
# the parameter names expected by perfomance_eval.evaluate_model and fix the
# keywords before re-running.
evaluate_model(X_withid = pre_op, X = X, y = y, pred_func = rf_pred, model_name = 'rf')

TypeError: evaluate_model() got an unexpected keyword argument 'X_withid'