In [101]:
import numpy as np
import pandas as pd
from sklearn.impute import (SimpleImputer, KNNImputer)
from sklearn.feature_selection import SelectKBest

In [244]:

df_train_features = pd.read_csv('./train_features.csv')
df_train_labels   = pd.read_csv('./train_labels.csv')
df_test_features  = pd.read_csv('./test_features.csv')
X = df_train_features.values
label=df_train_labels.values
X_test = df_test_features.values
X.shape

(227940, 37)

In [245]:
idf=df_train_features.iloc[:,0]
ifl=df_train_labels.iloc[:,0]
idt=df_test_features.iloc[:,0]
idf_unique = pd.unique(idf)
idt_unique=pd.unique(idt)

In [246]:
def preprocessing(X, n_neighbors=75, method='KNN'):

    X_std= np.nanstd(X, axis=0, keepdims=True)
    X_mean= np.nanmean(X, axis=0, keepdims=True)
    X_norm= (X- X_mean)/X_std

    imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights='distance') if method =='KNN'\
        else SimpleImputer (missing_values=np.nan, strategy='median')

    X_norm_fixed= imputer.fit_transform(X_norm)

    return X_norm_fixed

In [247]:
X_norm_fixed = preprocessing(X, method = 'median')
X_test_fixed = preprocessing(X_test, method = 'median')
print(X_norm_fixed)
pid_idx = np.where(X[:,0]==10)
print(pid_idx)

[[-1.7250923  -0.85121278 -1.70642596 ... -0.28285627  0.84344393
  -0.50054209]
 [-1.7250923  -0.63917285 -1.70642596 ... -0.28285627  0.113008
  -0.50054209]
 [-1.7250923  -0.42713292 -1.70642596 ... -0.28285627 -0.53149428
   0.03721993]
 ...
 [-0.63263872  0.63306673  1.39353544 ... -0.28285627  0.67157665
   0.03721993]
 [-0.63263872  0.84510666  1.39353544 ... -0.28285627  0.113008
   0.03721993]
 [-0.63263872  1.05714659  1.39353544 ... -0.28285627  0.24190846
   0.03721993]]
(array([12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], dtype=int64),)


In [248]:
def get_patient_data(pid):
    pid_idx = np.where(X[:,0]==pid)
    return X_norm_fixed[pid_idx, :].reshape([12,37])

def get_patient_data_test(pid_index):
    pid_idx = np.where(X_test[:,0]==pid_index)
    return X_test_fixed[pid_idx, :].reshape([12,37])

In [124]:
X_1=get_patient_data(10)
X_1.shape

(12, 37)

In [121]:
X_norm_fixed[12,:]

array([-1.7241089 , -1.27529265,  0.54256564,  0.01498224, -0.30311906,
       -0.30926529, -0.31286483,  0.16895924, -0.06179241,  0.09353697,
        0.05707343, -0.03058222, -0.22179091, -0.1535099 , -0.13268913,
       -0.31388775, -0.12494276, -0.23059474, -0.00822398, -0.15041668,
        0.3664487 , -0.21389641, -0.1285392 , -0.00948796, -0.078682  ,
       -0.14473443,  0.36943647, -0.21013043,  0.12079291, -0.39083107,
        0.12505594, -0.06638969, -0.0862856 , -0.25924228, -0.28285627,
       -0.14479291,  0.03721993])

Sub-task 1
<font color = blue>
    

In [249]:
#--training data preprocessing
x_imputed = np.zeros((int(df_train_features.shape[0]/12), 35))

for pid_idx in range(len(idf_unique)):
    # get training data for a patient (dim 12 x d)
    x_patient = get_patient_data(idf_unique[pid_idx])
    x_patient_imputed = np.array([0.]*35)
    
    # deal with column seperately
    for col in range(len(x_patient[1, 2:])):

         x_patient_imputed[col]=np.mean(x_patient, axis = 0)[col]
         
    # Concatenate imputed data for every patient
    
    x_imputed[pid_idx, :] = x_patient_imputed
    
print(x_imputed)


[[-1.7250923   0.31500683 -1.70642596 ... -0.42163303 -0.25924228
  -0.28285627]
 [-1.7241089  -0.10907303  0.54256564 ... -0.30355295 -0.24639742
  -0.2833198 ]
 [-1.71427485  0.1029669   0.36021497 ...  1.39207704 -0.25924228
  -0.28285627]
 ...
 [-0.63296652  1.16316656 -0.55153838 ...  0.79223022 -0.25924228
  -0.28285627]
 [-0.63274798 -0.10907303  1.63666967 ...  0.66470373 -0.25924228
  -0.27388363]
 [-0.63263872 -0.10907303  1.39353544 ... -0.33661537 -0.25924228
  -0.28285627]]


In [179]:
# test data preprocessing
x_test_imputed = np.zeros((int(df_test_features.shape[0]/12), 35))

for pid_idx in range(len(idt_unique)):
   
    x_test_patient = get_patient_data_test(idt_unique[pid_idx])
    x_test_patient_imputed = np.array([0.]*35)
    
    # deal with column seperately
    for col in range(len(x_test_patient[1, 2:])):
      
        x_test_patient_imputed[col]=np.mean(x_test_patient, axis = 0)[col]
            
    # Concatenate imputed data for every patient
    x_test_imputed[pid_idx, :] = x_test_patient_imputed
    
print(x_test_imputed)

[[-1.74229257 -0.11283712 -1.41719456 ... -0.02989306  0.91381053
  -0.2958202 ]
 [-0.64566007 -0.11283712 -0.0094439  ...  1.2601508  -0.26411003
  -0.2958202 ]
 [-0.64544076 -0.11283712  0.6026216  ...  0.33056037 -0.26411003
  -0.2958202 ]
 ...
 [-0.64664694  0.33801295 -0.68271596 ... -0.14846327 -0.26411003
  -0.2958202 ]
 [-0.64642763  0.11258791 -0.62150941 ...  0.89495456 -0.2514714
  -0.2958202 ]
 [-0.64609868 -0.11283712 -0.31547666 ...  0.117134   -0.26411003
  -0.2958202 ]]


In [180]:
x_imputed.shape

(18995, 35)

In [250]:
label_2=df_train_labels.iloc[:,11]
label_2

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
18990    0.0
18991    0.0
18992    0.0
18993    0.0
18994    0.0
Name: LABEL_Sepsis, Length: 18995, dtype: float64

Subtask-1 and 2

In [251]:
Labels=['pid','LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2','LABEL_Sepsis','LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
result = pd.DataFrame(np.array([[0]*16]), columns=Labels)

In [221]:
y_train.shape

(18995, 11)

In [242]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import (GridSearchCV,
    cross_val_score, KFold)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import (HistGradientBoostingClassifier,RandomForestClassifier)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics


for i, label in enumerate(task12_labels):
    pipeline = make_pipeline(
                        StandardScaler(),
                        RandomForestClassifier(min_samples_leaf=20, class_weight='balanced', n_estimators=100))
    scores = cross_val_score(pipeline, x_imputed, y_train[:, i],
                                cv=5,
                                scoring='roc_auc')
    print("Cross-validation score is {score:.3f},"
          " standard deviation is {err:.3f}"
          .format(score = scores.mean(), err = scores.std()))

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [252]:
l=label[:,11]
print(l)
df_train_labels.iloc[:,11]

[0. 0. 0. ... 0. 0. 0.]


0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
18990    0.0
18991    0.0
18992    0.0
18993    0.0
18994    0.0
Name: LABEL_Sepsis, Length: 18995, dtype: float64

In [257]:
from tqdm import tqdm

In [None]:
# Fit the Random Forest on the whole training set
y_test=np.zeros((int(df_test_features.shape[0]/12), 11))
for i in tqdm(range(12)):
    y_train = label[:, i]

    c=RandomForestClassifier(min_samples_leaf=20, class_weight='balanced', n_estimators=100)


#     brm_clf = OneVsRestClassifier(c)
    c.fit(x_imputed, y_train)

    # Predict the Labels for the test features

    y_test[:,i]=brm_clf.predict_proba(x_test_imputed)


  0%|          | 0/12 [00:00<?, ?it/s]

In [255]:
y_tot.shape

(18995, 12)

In [253]:
x_tot = x_imputed
y_tot = label[:,0:12]

c=RandomForestClassifier(min_samples_leaf=20, class_weight='balanced', n_estimators=100)
# c.fit(x_tot,y_tot)

brm_clf = OneVsRestClassifier(c)
brm_clf.fit(x_tot, y_tot)

# Predict the Labels for the test features
x_test_tasksA_B = x_test_imputed
y_test2=brm_clf.predict_proba(x_test_tasksA_B)

ValueError: Multioutput target data is not supported with label binarization

In [241]:
y_tot

array([0., 0., 0., ..., 0., 0., 0.])

In [237]:
Task2 = ['LABEL_Sepsis']
i=0
for pid in idt_unique:
    
    result.loc[i,Task2]=y_test2[i,:]
    i+=1

ValueError: Must have equal len keys and value when setting with an iterable

In [None]:
TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2','LABEL_Sepsis']
i=0
for pid in idt_unique:
    result.loc[i,'pid']=pid
    result.loc[i,TESTS]=y_est_test[i,:]
    i+=1

In [None]:
df = pd.DataFrame({'pid': df_test_features.iloc[0::12, 0].values})
for i, label in enumerate(task1_labels_ids):
    pipeline = pipeline.fit(x_train, y_train[:, i].ravel())
    print("Training score:", metrics.roc_auc_score(y_train[:, i], pipeline.predict_proba(x_train)[:, 1]))
    predictions = pipeline.predict_proba(x_test)[:, 1]
    df[label] = predictions

In [203]:
# Save the output (predicted labels for the test features)
# output.to_csv('prediction2.csv', index=False, float_format='%.3f')
compression_opts = dict(method='zip', archive_name='prediction.csv')
result.to_csv('prediction.zip', index=False, float_format='%.3f', compression=compression_opts)