In [94]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense,Input
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import (SimpleImputer, KNNImputer, IterativeImputer)
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
train_features = pd.read_csv('train_features.csv', index_col=0)
train_labels   = pd.read_csv('train_labels.csv', index_col=0)
test_features  = pd.read_csv('test_features.csv', index_col=0)

x_train = train_features.values[:,1:]
y_train = train_labels.values
x_test = test_features.values[:,1:]

Id_pat = train_features.index
Id_lab = train_labels.index
Id_pat_test = test_features.index

In [95]:
train_mean = np.nanmean(x_train, axis = 0)
test_mean = np.nanmean(x_test, axis = 0)

unique_ids_train = pd.unique(Id_pat)
unique_lab_ids_train = pd.unique(Id_lab)
unique_ids_test = pd.unique(Id_pat_test)


In [96]:
# helper functions

def pat_data_arr(index):
    return x_train[index*12:index*12+12, :]

def pat_data_test_arr(index):
    return x_test[index*12:index*12+12, :]

In [97]:
# training data imputation
x_imputed = np.zeros((int(x_train.shape[0]/12), 35))

for pid_idx in range(len(unique_ids_train)):
    # get training data for a patient (dim 12 x d)
    x_patient = pat_data_arr(pid_idx)
    x_patient_imputed = np.array([0.]*35)
    
    # deal with column seperately
    for col in range(len(x_patient[1, :])):
        # Count number of Nan in each column
        nan_count = np.isnan(x_patient[:, col]).sum()
        
        if nan_count<12:
            x_patient_imputed[col]=np.nanmean(x_patient, axis = 0)[col]
        else:
            x_patient_imputed[col] = -1           
    # Concatenate imputed data for every patient
    x_imputed[pid_idx, :] = x_patient_imputed
    

    
print(pd.DataFrame(x_imputed).to_csv("train_data_imp.csv"))

  x_patient_imputed[col]=np.nanmean(x_patient, axis = 0)[col]
None


In [98]:
# test data imputation
x_test_imputed = np.zeros((int(x_test.shape[0]/12), 35))

for pid_idx in range(len(unique_ids_test)):
    # get training data for a patient (dim 12 x d)
    x_test_patient = pat_data_test_arr(pid_idx)
    x_test_patient_imputed = np.array([0.]*35)
    
    # deal with column seperately
    for col in range(len(x_test_patient[1, :])):
        # Count number of Nan in each column
        nan_count = np.isnan(x_test_patient[:, col]).sum()
        
        if nan_count<12:
            x_test_patient_imputed[col]=np.nanmean(x_test_patient, axis = 0)[col]
        else:
            x_test_patient_imputed[col] = -1           
    # Concatenate imputed data for every patient
    x_test_imputed[pid_idx, :] = x_test_patient_imputed
    
print(pd.DataFrame(x_test_imputed).to_csv("test_data_imp.csv"))

  x_test_patient_imputed[col]=np.nanmean(x_test_patient, axis = 0)[col]
None


In [99]:
Labels=['pid','LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2','LABEL_Sepsis','LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
output = pd.DataFrame(np.array([[0]*16]), columns=Labels)

In [100]:
# task 1 classification
dataset = pd.read_csv("train_data_imp.csv")
X = dataset
y = y_train[:, 0:11]
# model = Sequential()
# model.add(Input(shape = (None,36)))
# model.add(Dense(10,input_dim = 35, activation = "relu"))
# model.add(Dense(10, activation = "relu"))
# model.compile(loss="binary_crossentropy", optimizer = "adam", metrics = ["auc"])
# model.fit(X, y, epochs = 150, batch_size = 10)
# predictions = model.predict(X)

classifier = RandomForestClassifier(min_samples_leaf=20, class_weight="balanced", n_estimators = 100)
ovr_c = OneVsRestClassifier(classifier)
ovr_c.fit(X,y)
result_test = ovr_c.predict_proba(pd.read_csv("test_data_imp.csv"))



# Write in the output dataframe the obtained values
TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2','LABEL_Sepsis']
i=0
for pid in unique_ids_test:
    output.loc[i,'pid']=pid
    output.loc[i,TESTS]=result_test[i,:]
    i+=1
pd.DataFrame(output).to_csv("result.csv")