In [79]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense,Input
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.impute import (SimpleImputer, KNNImputer, IterativeImputer)
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MinMaxScaler
train_features = pd.read_csv('train_features.csv', index_col=0)
train_labels   = pd.read_csv('train_labels.csv', index_col=0)
test_features  = pd.read_csv('test_features.csv', index_col=0)


scaler = MinMaxScaler()

np.set_printoptions(precision = 6)


x_train = train_features.values[:,1:]
y_train = train_labels.values
x_test = test_features.values[:,1:]

Id_pat = train_features.index
Id_lab = train_labels.index
Id_pat_test = test_features.index

In [80]:
train_mean = np.nanmean(x_train, axis = 0)
test_mean = np.nanmean(x_test, axis = 0)

unique_ids_train = pd.unique(Id_pat)
unique_lab_ids_train = pd.unique(Id_lab)
unique_ids_test = pd.unique(Id_pat_test)


In [81]:
# helper functions

def pat_data_arr(index):
    return x_train[index*12:index*12+12, :]

def pat_data_test_arr(index):
    return x_test[index*12:index*12+12, :]

In [82]:
# training data imputation
x_imputed = np.zeros((int(x_train.shape[0]/12), 35))

for pid_idx in range(len(unique_ids_train)):
    # get training data for a patient (dim 12 x d)
    x_patient = pat_data_arr(pid_idx)
    x_patient_imputed = np.array([0.]*35)
    
    # deal with column seperately
    for col in range(len(x_patient[1, :])):
        # Count number of Nan in each column
        nan_count = np.isnan(x_patient[:, col]).sum()
        
        if nan_count<12:
            x_patient_imputed[col]=np.nanmean(x_patient, axis = 0)[col]
        else:
            x_patient_imputed[col] = -1           
    # Concatenate imputed data for every patient
    x_imputed[pid_idx, :] = x_patient_imputed
    

    
print(pd.DataFrame(x_imputed).to_csv("train_data_imp.csv"))

  x_patient_imputed[col]=np.nanmean(x_patient, axis = 0)[col]
None


In [83]:
# test data imputation
x_test_imputed = np.zeros((int(x_test.shape[0]/12), 35))

for pid_idx in range(len(unique_ids_test)):
  
    x_test_patient = pat_data_test_arr(pid_idx)
    x_test_patient_imputed = np.array([0.]*35)
    
   
    for col in range(len(x_test_patient[1, :])):
  
        nan_count = np.isnan(x_test_patient[:, col]).sum()
        
        if nan_count<12:
            x_test_patient_imputed[col]=np.nanmean(x_test_patient, axis = 0)[col]
        else:
            x_test_patient_imputed[col] = -1           
 
    x_test_imputed[pid_idx, :] = x_test_patient_imputed
    
print(pd.DataFrame(x_test_imputed).to_csv("test_data_imp.csv"))

  x_test_patient_imputed[col]=np.nanmean(x_test_patient, axis = 0)[col]
None


In [84]:
Labels=['pid','LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2','LABEL_Sepsis','LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
output = pd.DataFrame(np.array([[0]*16]), columns=Labels)


In [85]:
# task 1/2 classification
dataset = pd.read_csv("train_data_imp.csv")
X = dataset
y = y_train[:, 0:11]

# --------------------------------------------- NEural Network Trial ----------------------------------
# model = Sequential()
# model.add(Input(shape = (None,36)))
# model.add(Dense(10,input_dim = 35, activation = "relu"))
# model.add(Dense(10, activation = "relu"))
# model.compile(loss="binary_crossentropy", optimizer = "adam", metrics = ["auc"])
# model.fit(X, y, epochs = 150, batch_size = 10)
# predictions = model.predict(X)
# -----------------------------------------------------------------------------------------------------

classifier = RandomForestClassifier(min_samples_leaf=20, class_weight="balanced", n_estimators = 100)
ovr_c = OneVsRestClassifier(classifier)
ovr_c.fit(X,y)
result_test = ovr_c.predict_proba(pd.read_csv("test_data_imp.csv"))



# Write in the output dataframe the obtained values
TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2','LABEL_Sepsis']
i=0
for pid in unique_ids_test:
    output.loc[i,'pid']=pid
    output.loc[i,TESTS]=result_test[i,:]
    i+=1


In [88]:
# TASk C
def taskC_regressor(alpha_):
    c=Ridge(alpha=alpha_)


    num_fold = 10
    kf = KFold(n_splits=num_fold)
    score = 0
    for train_index, val_index in kf.split(x_imputed): 
        x_train = x_imputed[train_index]
        x_train_scaled = scaler.fit_transform(x_train)
        x_val = x_imputed[val_index]
        x_val_scaled = scaler.transform(x_val)
        
        y_train_set = y_train[train_index, 11:15]
        y_val = y_train[val_index, 11:15]
 
        c.fit(x_train_scaled, y_train_set) 
        y_est = np.transpose(c.predict(x_val_scaled)) 
        
        score += np.mean([0.5 + 0.5*np.maximum(0, r2_score(y_val[:,k],y_est[k,:])) for k in range(4)])
        print(np.mean([0.5 + 0.5*np.maximum(0, r2_score(y_val[:,k],y_est[k,:])) for k in range(4)]))


    avg_score = score/num_fold

    return avg_score

alpha_values = np.logspace(-3, 3, num=50, base=10.) 

avg_scores = [taskC_regressor(alpha) for alpha in alpha_values]


best_alpha = alpha_values[np.argmax(avg_scores)]



x_tot_scaled = scaler.fit_transform(x_imputed)
y_tot = y_train[:, 11:15]

c=Ridge(alpha=best_alpha)
c.fit(x_tot_scaled, y_tot)

x_test_scaled=scaler.transform(x_test_imputed)
y_est_test=(c.predict(x_test_scaled))



i=0
for pid in unique_ids_test:
    output.loc[i,'pid']=int(pid)
    output.loc[i,['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']]=y_est_test[i, :]
    i+=1

pd.DataFrame(output).to_csv("result.csv", float_format='%.3f')


[1.000000e-03 1.325711e-03 1.757511e-03 2.329952e-03 3.088844e-03
 4.094915e-03 5.428675e-03 7.196857e-03 9.540955e-03 1.264855e-02
 1.676833e-02 2.222996e-02 2.947052e-02 3.906940e-02 5.179475e-02
 6.866488e-02 9.102982e-02 1.206793e-01 1.599859e-01 2.120951e-01
 2.811769e-01 3.727594e-01 4.941713e-01 6.551286e-01 8.685114e-01
 1.151395e+00 1.526418e+00 2.023590e+00 2.682696e+00 3.556480e+00
 4.714866e+00 6.250552e+00 8.286428e+00 1.098541e+01 1.456348e+01
 1.930698e+01 2.559548e+01 3.393222e+01 4.498433e+01 5.963623e+01
 7.906043e+01 1.048113e+02 1.389495e+02 1.842070e+02 2.442053e+02
 3.237458e+02 4.291934e+02 5.689866e+02 7.543120e+02 1.000000e+03]
0.7085706765883185
0.6999179167391355
0.7093291396871811
0.7036721689088992
0.6972513745778621
0.7120183498531045
0.6942910560400724
0.520018107825911
0.6954636250882666
0.7018219022964406
0.7085706204897425
0.6999180342450095
0.7093290761373627
0.7036721154048511
0.697251507244249
0.7120181836440312
0.6942913039529075
0.5200182150979813

In [99]:

pd.DataFrame(output.drop(output.columns[1])).to_csv("result1.csv", float_format='%.3f')


KeyError: "['LABEL_BaseExcess'] not found in axis"