# Support vector machines

In [None]:
# import libraries

import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

from matplotlib.backends.backend_pdf import PdfPages
from sklearn.decomposition import PCA

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.feature_selection import GenericUnivariateSelect, mutual_info_classif, mutual_info_regression, f_regression

from sklearn import preprocessing
from sklearn.svm import LinearSVR
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.metrics.pairwise import pairwise_kernels

from sklearn import model_selection

from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer

## Data pre-processing

In [89]:
# load training data

# load data from csv file
df_train_features = pd.read_csv ('train_features.csv')
df_train_labels = pd.read_csv('train_labels.csv')

# Load test data
df_test_features = pd.read_csv ('test_features.csv')

### Sorting labels

In [90]:
df_train_labels = df_train_labels.sort_values(by = 'pid')
df_train_features = df_train_features.sort_values(by = 'pid')

# Droping time
df_train_features = df_train_features.drop('Time', axis = 1)
df_test_features = df_test_features.drop('Time', axis = 1)

 ### Histogram of the output labels 

We should check for class imbalance.

In [None]:
df_train_labels.hist()

# with PdfPages("./Results/Labels_histogram.pdf") as export_pdf:
#     for i in list(df_train_labels)[1:]:
#         df_train_labels.hist(column = i, bins = 100)
#         export_pdf.savefig()

One can see the class imbalance problem here. Other observations:
  * Heartrate, RRate, ABPm,  distribution is similar to a normal distribution
  * SpO2 is like a censored normal distribution. 
  * For all of the other features, class imbalance is an obvious problem.

A basic strategy that could be used here: Upsample both classes! Do the upsampling efficiently, not just replicating the datapoints

### Boxplot over features

In [None]:
# data inspection: 
#############################################
# range of the provided data?
print(df_train_features.agg([min, max]))

# Boxplotting the data
# fig2, ax2 = plt.subplots()
# ax2.set_title('BUN')
# ax2.boxplot(df_train_features.iloc[:,5], notch=True)

plt.figure(figsize=(16, 16))
ax = sns.boxplot(data = df_train_features.iloc[:,1:])
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=90,
    horizontalalignment='right'
);

# with PdfPages("./Results/Train_columns_boxplot.pdf") as export_pdf:
#     for i in list(df_train_labels)[1:]:
#         df_train_labels.hist(column = i, bins = 100)
#         export_pdf.savefig()

In [None]:
# calculate the correlation matrix
corr = df_train_features.corr()

# plot the heatmap
plt.figure(figsize=(16, 16))
ax = sns.heatmap(corr, 
        xticklabels=corr.columns,
        yticklabels=corr.columns, 
        vmin=-1, vmax=1, center=0, 
           cmap=sns.diverging_palette(20, 220, n=200))
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

### Visualizing pattern of missing values

In [None]:
# how much missing data? 
print("Percentage of missing values:")
print(df_train_features.isnull().sum(axis=0) / len(df_train_features))

msno.matrix(df_train_features)

# Plotting the correlation between the missing values
msno.heatmap(df_train_features)

### Train data pre-processing

In [107]:
# Which columns have less than a percent NA
NA_percent = 0.8
NA_percent_severe = 0.91

sel_features = df_train_features.isnull().sum(axis=0) < (NA_percent * df_train_features.shape[0])
inds = np.where(sel_features == True)

sel_features_2 = (df_train_features.isnull().sum(axis=0) < (NA_percent_severe * df_train_features.shape[0])) & (df_train_features.isnull().sum(axis=0) > (NA_percent * df_train_features.shape[0]))        
inds_2 = np.where(sel_features_2 == True)

sel_features_3 = df_train_features.isnull().sum(axis=0) > (NA_percent_severe * df_train_features.shape[0])
inds_3 = np.where(sel_features_3 == True)

print(inds[0])
print(inds_2[0])
print(inds_3[0])

present_columns = df_train_features.iloc[:,inds[0]]

present_columns_agg = present_columns.groupby('pid').agg([np.min, np.max, np.mean, np.std, lambda x: x.median() - x.mean()])
present_columns_agg = present_columns_agg.drop(present_columns_agg.columns[[0,1,3,4]], axis=1)

missing_columns = df_train_features.iloc[:,np.append(0,inds_2)]

missing_columns_agg = missing_columns.groupby('pid').agg([np.min, np.max, np.mean])

missing_columns_severe = df_train_features.iloc[:,np.append(0,inds_3)]

missing_columns_agg_severe = missing_columns_severe.groupby('pid').agg(np.mean)


print(present_columns_agg.shape)
print(missing_columns_agg.shape)
print(missing_columns_agg_severe.shape)

[ 0  1  6 10 20 21 24 27 31 34]
[ 7 15 17 23 30 35]
[ 2  3  4  5  8  9 11 12 13 14 16 18 19 22 25 26 28 29 32 33]
(18995, 41)
(18995, 18)
(18995, 20)


In [108]:
df_train_agg_features = pd.merge(present_columns_agg, missing_columns_agg, on="pid")
df_train_agg_features = pd.merge(df_train_agg_features, missing_columns_agg_severe, on = "pid")
print(df_train_agg_features.shape)
print(df_train_agg_features.columns)

(18995, 79)
Index([            ('Age', 'mean'),            ('Temp', 'amin'),
                  ('Temp', 'amax'),            ('Temp', 'mean'),
                   ('Temp', 'std'),      ('Temp', '<lambda_0>'),
                 ('RRate', 'amin'),           ('RRate', 'amax'),
                 ('RRate', 'mean'),            ('RRate', 'std'),
           ('RRate', '<lambda_0>'),         ('Glucose', 'amin'),
               ('Glucose', 'amax'),         ('Glucose', 'mean'),
                ('Glucose', 'std'),   ('Glucose', '<lambda_0>'),
                  ('ABPm', 'amin'),            ('ABPm', 'amax'),
                  ('ABPm', 'mean'),             ('ABPm', 'std'),
            ('ABPm', '<lambda_0>'),            ('ABPd', 'amin'),
                  ('ABPd', 'amax'),            ('ABPd', 'mean'),
                   ('ABPd', 'std'),      ('ABPd', '<lambda_0>'),
                  ('SpO2', 'amin'),            ('SpO2', 'amax'),
                  ('SpO2', 'mean'),             ('SpO2', 'std'),
            (

In [None]:
# df_train_agg_features = df_train_features.groupby('pid').agg([np.min, np.max, np.mean np.std])
# df_train_agg_features = df_train_agg_features.iloc[:,5:]
# # Removing ETCo2 mean and max since it has so many NA
# df_train_agg_features = df_train_agg_features.drop(df_train_agg_features.columns[[2,3]],  axis = 1)
# print(df_train_agg_features.columns)
# df_train_agg_features.columns
# print(int(df_train_agg_features.shape[1]))
# print(int(df_train_agg_features.shape[1]/3))

# # how much missing data? 
# print("number of missing values:")
# print(df_train_agg_features.isnull().sum(axis=0))

# na_percent_max = int(0.8 * df_train_agg_features.shape[0])
# tmp = pd.DataFrame(df_train_agg_features)
# for i in range(1, (int(df_train_agg_features.shape[1]/3))):
#     na_count = df_train_agg_features.iloc[:,i].isna().sum()
#     print(df_train_agg_features.columns[i])
#     print(na_count)
    
#     if(na_count > na_percent_max):
#         print("should be removed")


In [112]:
# impute missing data points
#imp = SimpleImputer(strategy="mean")
imputer = KNNImputer(n_neighbors = 10)
#imputer = IterativeImputer(random_state=0, verbose = 2, max_iter = 30)
df_train_agg_imputed_features = imputer.fit_transform(df_train_agg_features)
#print(df_train_agg_imputed_features)

In [113]:
# scale the data
min_max_scaler = preprocessing.StandardScaler()
# standard_scalar = preprocessing.StandardScaler()

data_train_scaled = min_max_scaler.fit_transform(df_train_agg_imputed_features)

In [None]:
# REARRANGE THE LABELS, TO MATCH THE REARRANGED FEATURES
df_train_labels_sorted = df_train_labels.sort_values(by = 'pid')
print(df_train_labels_sorted[['pid']])
print(df_train_labels[['pid']])
print(df_train_agg_features)

In [None]:
# Visualizing the training data after imputing and aggregating

plt.figure(figsize=(16, 16))
ax = sns.boxplot(data = pd.DataFrame(data_train_scaled))
ax.set_xticklabels(
    list(df_train_features),
    rotation=90,
    horizontalalignment='right'
);

In [None]:
# What is the correlation between the 
pd.DataFrame(data_train_scaled).corrwith(other = pd.DataFrame(df_train_agg_imputed_features), method = "spearman").transpose()

### PCA plot 

In [None]:
pca = PCA(n_components=2)

principalComponents = pca.fit_transform(data_train_scaled)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])

finalDf = pd.concat([principalDf, df_train_labels[[df_train_labels.columns[11]]]], axis = 1)

fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA for label i', fontsize = 20)
targets = [0, 1]
colors = ['r', 'g', 'b']
for target, color in zip(targets,colors):
    indicesToKeep = finalDf[df_train_labels.columns[11]] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
               , finalDf.loc[indicesToKeep, 'principal component 2']
               , c = color
               , s = 50)
ax.legend(targets)
ax.grid()

### Test Data pre-processing

In [None]:
# data inspection: 
#############################################
# range of the provided data?
print(df_test_features.agg([min, max]))

# how much missing data? 
print("number of missing values:")
print(df_test_features.isnull().sum(axis=0))

In [117]:
# We should use the same indices as before
print(inds[0])
print(inds_2[0])
print(inds_3[0])

present_columns = df_test_features.iloc[:,inds[0]]

present_columns_agg = present_columns.groupby('pid').agg([np.min, np.max, np.mean, np.std, lambda x: x.median() - x.mean()])
present_columns_agg = present_columns_agg.drop(present_columns_agg.columns[[0,1,3,4]], axis=1)

missing_columns = df_test_features.iloc[:,np.append(0,inds_2)]

missing_columns_agg = missing_columns.groupby('pid').agg([np.min, np.max, np.mean])

missing_columns_severe = df_test_features.iloc[:,np.append(0,inds_3)]

missing_columns_agg_severe = missing_columns_severe.groupby('pid').agg(np.mean)

[ 0  1  6 10 20 21 24 27 31 34]
[ 7 15 17 23 30 35]
[ 2  3  4  5  8  9 11 12 13 14 16 18 19 22 25 26 28 29 32 33]


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [118]:
df_test_agg_features = pd.merge(present_columns_agg, missing_columns_agg, on="pid")
df_test_agg_features = pd.merge(df_test_agg_features, missing_columns_agg_severe, on = "pid")
print(df_test_agg_features.shape)
print(df_test_agg_features.columns)

(12664, 79)
Index([            ('Age', 'mean'),            ('Temp', 'amin'),
                  ('Temp', 'amax'),            ('Temp', 'mean'),
                   ('Temp', 'std'),      ('Temp', '<lambda_0>'),
                 ('RRate', 'amin'),           ('RRate', 'amax'),
                 ('RRate', 'mean'),            ('RRate', 'std'),
           ('RRate', '<lambda_0>'),         ('Glucose', 'amin'),
               ('Glucose', 'amax'),         ('Glucose', 'mean'),
                ('Glucose', 'std'),   ('Glucose', '<lambda_0>'),
                  ('ABPm', 'amin'),            ('ABPm', 'amax'),
                  ('ABPm', 'mean'),             ('ABPm', 'std'),
            ('ABPm', '<lambda_0>'),            ('ABPd', 'amin'),
                  ('ABPd', 'amax'),            ('ABPd', 'mean'),
                   ('ABPd', 'std'),      ('ABPd', '<lambda_0>'),
                  ('SpO2', 'amin'),            ('SpO2', 'amax'),
                  ('SpO2', 'mean'),             ('SpO2', 'std'),
            (

In [None]:
# # # aggregate data for each pid
# # df_test_aggregate_features = df_test_features.groupby('pid').agg('median')

# df_test_agg_features = df_test_features.groupby('pid').agg([np.min, np.max, np.mean])

# df_test_agg_features = df_test_agg_features.iloc[:,5:]
# # Removing ETCo2 mean and max since it has so many NA
# df_test_agg_features = df_test_agg_features.drop(df_test_agg_features.columns[[2,3]],  axis = 1)

In [119]:
# impute missing data points
# should we impute it with the same imputer that we've used for train?

imputer = KNNImputer(n_neighbors= 10)
#imputer = IterativeImputer(random_state=0, verbose = 1)
df_test_agg_imputed_features = imputer.fit_transform(df_test_agg_features)

In [120]:
# scale test data
min_max_scaler = preprocessing.StandardScaler()
data_test_scaled = min_max_scaler.fit_transform(df_test_agg_imputed_features)

In [None]:
# pd.DataFrame(data_train_scaled).to_csv("./Results/4stats_iterarive_dat_train_scaled.csv")
# pd.DataFrame(data_test_scaled).to_csv("./Results/4stats_iterative_dat_test_scaled.csv")

## Fit a model & Predict

### predict with support vector machine classification and use probabilities

In [133]:
i = 1
transformer =  GenericUnivariateSelect(score_func= mutual_info_classif, mode ='k_best', param=40)
train_features = pd.DataFrame(transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,i]))
print("For feature ", df_train_labels.columns[i])
print(df_train_agg_features.columns[transformer.get_support(indices = True)])
test_features = pd.DataFrame(transformer.transform(data_test_scaled))

values_1 = train_features.loc[df_train_labels[df_train_labels.columns[i]] == 1]
values_0 = train_features.loc[df_train_labels[df_train_labels.columns[i]] == 0]
values_0 = resample(values_0, replace = False, n_samples = values_1.shape[0])

train_features = pd.concat([values_0, values_1])

For feature  LABEL_BaseExcess
Index([           ('Temp', 'mean'),             ('Temp', 'std'),
            ('Temp', '<lambda_0>'),           ('RRate', 'mean'),
                  ('RRate', 'std'),     ('RRate', '<lambda_0>'),
                  ('ABPm', 'mean'),      ('ABPm', '<lambda_0>'),
                  ('ABPd', 'amax'),            ('ABPd', 'mean'),
            ('ABPd', '<lambda_0>'),            ('SpO2', 'mean'),
                   ('SpO2', 'std'),      ('SpO2', '<lambda_0>'),
             ('Heartrate', 'mean'), ('Heartrate', '<lambda_0>'),
                  ('ABPs', 'mean'),      ('ABPs', '<lambda_0>'),
                   ('Hgb', 'mean'),           ('PaCO2', 'amin'),
                 ('PaCO2', 'amax'),           ('PaCO2', 'mean'),
                  ('FiO2', 'amin'),            ('FiO2', 'amax'),
                  ('FiO2', 'mean'),             ('Hct', 'amin'),
                   ('Hct', 'amax'),             ('Hct', 'mean'),
                    ('pH', 'amin'),              ('pH', 'ama

KeyError: 'pid'

In [157]:

# print(values_0)
# print(values_1)
# print(values_0.shape)
# print(values_1.shape)
# print(train_features)

    
print(np.repeat([0,1], values_0.shape[0]))

[0 0 0 ... 1 1 1]


In [None]:
# first for the labels that have an output [0,1]
test_pids = list(set(df_test_features.pid))
columns_1 = [test_pids]

from sklearn.utils import resample

for i in range(1, 12):
    
    # feature selection
    transformer =  GenericUnivariateSelect(score_func= mutual_info_classif, mode ='k_best', param=40)
    train_features = pd.DataFrame(transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,i]))
    print("For feature ", df_train_labels.columns[i])
    print(df_train_agg_features.columns[transformer.get_support(indices = True)])
    test_features = pd.DataFrame(transformer.transform(data_test_scaled))

#     values_1 = train_features.loc[df_train_labels[df_train_labels.columns[i]] == 1]
#     values_0 = train_features.loc[df_train_labels[df_train_labels.columns[i]] == 0]
#     values_0 = resample(values_0, replace = False, n_samples = values_1.shape[0])

#     train_features = pd.concat([values_0, values_1])
    
#     labels = np.repeat([0,1], values_0.shape[0])
    
    #clf = BaggingClassifier(SVC(kernel = 'poly', degree = 5, class_weight = 'balanced', verbose = True, C = 10))
    clf_w = SVC(kernel = 'rbf', class_weight = 'balanced', verbose = 2)
    
    parameters = {'C':(0.1, 1, 5, 10)}
    clf = model_selection.GridSearchCV(estimator= clf_w, param_grid = parameters, cv = 5,
                                        refit = True, scoring = 'roc_auc', verbose = 2,
                                       n_jobs=6, return_train_score = True)
    clf.fit(train_features, df_train_labels.iloc[:,i])
#     clf.fit(train_features, labels)
    
#     print(clf.cv_results_)
    print(clf.best_params_)
    print(clf.best_score_)
    # compute probabilites as opposed to predictions
    #dual_coefficients = clf.dual_coef_    # do we have to normalize with norm of this vector ?
    
    distance_hyperplane = clf.decision_function(test_features)
    probability = np.empty(len(distance_hyperplane))
    for j in range(0, len(probability)):
        if distance_hyperplane[j] < 0:
            probability[j] = 1 - 1/(1 + math.exp(distance_hyperplane[j]))
        else:
            probability[j] = 1/(1 + math.exp(-distance_hyperplane[j]))
    columns_1.append(probability)

    
    distance_hyperplace_train = clf.decision_function(train_features)
    probability = np.empty(len(distance_hyperplace_train))
    for j in range(0, len(probability)):
        if distance_hyperplace_train[j] < 0:
            probability[j] = 1 - 1/(1 + math.exp(distance_hyperplace_train[j]))
        else:
            probability[j] = 1/(1 + math.exp(-distance_hyperplace_train[j]))
    
    tmp = roc_auc_score(y_score= probability, y_true= df_train_labels.iloc[:,i])
    print("ROC AUC for feature", list(df_train_labels)[i] , " : ", tmp)


In [None]:
# labels that have a real value
columns_2 = []
# from sklearn.kernel_ridge import KernelRidge

for i in range(12, 16):
    # feature selection
    transformer =  GenericUnivariateSelect(score_func= mutual_info_regression, mode ='k_best', param = 50)
    train_features = transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,i])
    print(df_train_agg_features.columns[transformer.get_support(indices = True)])
    test_features = transformer.transform(data_test_scaled)
    
    clf_w = SVR(kernel = 'rbf', gamma = 'scale', cache_size = 6000)
# #     clf_w = NuSVR(nu=0.5, kernel = 'linear')
    parameters = {'C':(0.1, 1,10)}
    clf = model_selection.GridSearchCV(estimator= clf_w, param_grid = parameters, cv = 5,
                                       refit = True, scoring = 'r2', verbose = 2, n_jobs=6)
#     clf = KernelRidge(kernel = 'poly', degree = 5)
#     parameters = {'alpha':(0.1,1,10,30)}
#     clf = model_selection.GridSearchCV(estimator= clf, param_grid = parameters, cv = 3,
#                                       refit = True, scoring = 'r2', verbose = 2, n_jobs=6)
    clf.fit(train_features, df_train_labels.iloc[:,i])
    
    print(clf.cv_results_)
    print(clf.best_params_)
    print(clf.best_score_)

    pred_train = clf.predict(train_features)
    tmp = r2_score(y_pred= pred_train, y_true=df_train_labels.iloc[:,i])
    print("R2 for feature", list(df_train_labels)[i] , " : ", tmp)
    
    pred = clf.predict(test_features)
    columns_2.append(pred)
    

In [None]:
columns_final = columns_1 + columns_2

### predict with Support vector regression and then compute sigmoid function

In [None]:
# first for the labels that have an output [0,1]

# columns_1 = [test_pids]

# for i in range(1,12):
    
#     clf = SVR(kernel = 'poly', degree = 3, max_iter = 10000)
#     clf.fit(data_train_scaled, df_train_labels.iloc[:,i])
#     pred = clf.predict(data_test_scaled)
#     prob = np.empty(len(pred))
#     for j in range(0, len(pred)):
#         prob[j] = 1 / (1 + math.exp(-pred[j]))
#     columns_1.append(prob)
    
#     pred_train = clf.predict(data_train_scaled)
#     prob_train = np.empty(len(pred_train))
#     for j in range(0, len(pred_train)):
#         prob_train[j] = 1 / (1 + math.exp(-pred_train[j]))    
#     tmp = roc_auc_score(y_score= prob_train, y_true= df_train_labels.iloc[:,i])
#     print("ROC AUC for feature", list(df_train_labels)[i] , " : ", tmp)


In [None]:
# #labels that have a real value

# columns_2 = []

# for i in range(12, 16):
    
#     # feature selection
#     transformer =  GenericUnivariateSelect(score_func= mutual_info_regression, mode ='k_best', param=20)
#     train_features = transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,i])
#     print(list(data_train_scaled)[transformer.get_support()])
#     test_features = transformer.transform(data_test_scaled)
    

#     clf_w = LinearSVR()
#     parameters = {'C':(0.1,1,10,30,60,100)}
#     clf = model_selection.GridSearchCV(estimator= clf_w, param_grid = parameters, cv = 2,
#                                        refit = True, scoring = 'r2', verbose = 1, n_jobs=6)
#     clf.fit(train_features, df_train_labels.iloc[:,i])
    
#     print(clf.cv_results_)
#     pred = clf.predict(test_features)
#     columns_2.append(pred)
    
#     pred_train = clf.predict(train_features)
#     tmp = r2_score(y_pred= pred_train, y_true=df_train_labels.iloc[:,i])
#     print("R2 for feature", list(df_train_labels)[i] , " : ", tmp)

In [None]:
transformer =  GenericUnivariateSelect(score_func= mutual_info_regression, mode ='k_best', param=20)
train_features = transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,11])
test_features = transformer.transform(data_test_scaled)

In [None]:
df_train_agg_features.columns[transformer.get_support(indices = True)]

In [None]:
columns_final = columns_1 + columns_2

### Random forest

In [None]:
# Random forest Classifier
columns_1 = [test_pids]
for i in range(1, 12):
    clf = RandomForestClassifier(min_samples_leaf=2, class_weight='balanced', oob_score=False, bootstrap=False)
    clf.fit(data_train_scaled, df_train_labels.iloc[:,i])
    print(clf.oob_score)
    # compute probabilites as opposed to predictions
    probability = clf.apply(data_test_scaled)
    probs = [i[1] for i in probability] 
    columns_1.append(probs)
    
    
    probability = clf.predict_proba(data_train_scaled)

    probs = [i[1] for i in probability]            
    tmp = roc_auc_score(y_score= probs, y_true= df_train_labels.iloc[:,i])
    print("ROC AUC for feature", list(df_train_labels)[i] , " : ", tmp)

# Compute the kernel and use SGD Classifier and Regressor

In [122]:
# first for the labels that have an output [0,1]
test_pids = list(set(df_test_features.pid))
columns_1 = [test_pids]

# from sklearn.kernel_ridge import KernelRidge
from sklearn.kernel_approximation import Nystroem
from sklearn import linear_model

from sklearn.utils import resample
# train_data = 

for i in range(1, 12):
   
    # feature selection
    transformer =  GenericUnivariateSelect(score_func= mutual_info_classif, mode ='k_best', param=70)
    train_features = transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,i])
    print("For feature ", df_train_labels.columns[i])
#     print(df_train_agg_features.columns[transformer.get_support(indices = True)])
    test_features = transformer.transform(data_test_scaled)

    
    feature_map_nystroem = Nystroem(kernel = 'rbf',
                                 random_state=1,
                                 n_components=300)
    train_transformed = feature_map_nystroem.fit_transform(train_features)
    test_transformed = feature_map_nystroem.transform(test_features)
    
    clf_w = linear_model.SGDClassifier(max_iter=100000, tol=1e-4, penalty = "l2", 
                                       loss = "epsilon_insensitive", class_weight='balanced')
    # checked before
    #parameters = {'alpha':(0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 20, 30)}
    parameters = {'alpha':(0.1, 1, 5, 10)}
    
    clf = model_selection.GridSearchCV(estimator= clf_w, param_grid = parameters, cv = 10,
                                        refit = True, scoring = 'roc_auc', verbose = 1,
                                       n_jobs=6, return_train_score = True)
    clf.fit(train_features, df_train_labels.iloc[:,i])
    
#     print(clf.cv_results_)
    print(clf.best_params_)
    print(clf.best_score_)
    # compute probabilites as opposed to predictions
    #dual_coefficients = clf.dual_coef_    # do we have to normalize with norm of this vector ?
    
    distance_hyperplane = clf.decision_function(test_features)
    probability = np.empty(len(distance_hyperplane))
    for j in range(0, len(probability)):
        if distance_hyperplane[j] < 0:
            probability[j] = 1 - 1/(1 + math.exp(distance_hyperplane[j]))
        else:
            probability[j] = 1/(1 + math.exp(-distance_hyperplane[j]))
    columns_1.append(probability)

    
    distance_hyperplace_train = clf.decision_function(train_features)
    probability = np.empty(len(distance_hyperplace_train))
    for j in range(0, len(probability)):
        if distance_hyperplace_train[j] < 0:
            probability[j] = 1 - 1/(1 + math.exp(distance_hyperplace_train[j]))
        else:
            probability[j] = 1/(1 + math.exp(-distance_hyperplace_train[j]))
    
    tmp = roc_auc_score(y_score= probability, y_true= df_train_labels.iloc[:,i])
    print("ROC AUC for feature", list(df_train_labels)[i] , " : ", tmp)
    

For feature  LABEL_BaseExcess
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:    2.6s finished


{'alpha': 0.1}
0.7692051103610277
ROC AUC for feature LABEL_BaseExcess  :  0.7726743298225831
For feature  LABEL_Fibrinogen
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 out of  40 | elapsed:    3.1s remaining:    1.2s
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:    9.8s finished


{'alpha': 0.1}
0.707961517252645
ROC AUC for feature LABEL_Fibrinogen  :  0.7192926561929119
For feature  LABEL_AST
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 out of  40 | elapsed:    1.4s remaining:    0.5s
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:    2.4s finished


{'alpha': 0.1}
0.6693434608965193
ROC AUC for feature LABEL_AST  :  0.6742440588675493
For feature  LABEL_Alkalinephos
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:    1.5s finished


{'alpha': 0.1}
0.6730674571262701
ROC AUC for feature LABEL_Alkalinephos  :  0.6786453946637129
For feature  LABEL_Bilirubin_total
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:    2.1s finished


{'alpha': 0.1}
0.6710610503604066
ROC AUC for feature LABEL_Bilirubin_total  :  0.6758456059979748
For feature  LABEL_Lactate
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 out of  40 | elapsed:    1.0s remaining:    0.4s
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:    1.3s finished


{'alpha': 0.1}
0.7042422493797398
ROC AUC for feature LABEL_Lactate  :  0.7088582646637026
For feature  LABEL_TroponinI
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 out of  40 | elapsed:    1.2s remaining:    0.5s
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:    1.3s finished


{'alpha': 0.1}
0.7028333520611143
ROC AUC for feature LABEL_TroponinI  :  0.7122935703374531
For feature  LABEL_SaO2
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:    1.9s finished


{'alpha': 0.1}
0.7350746791107732
ROC AUC for feature LABEL_SaO2  :  0.7387356911226971
For feature  LABEL_Bilirubin_direct
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 out of  40 | elapsed:    3.2s remaining:    1.2s
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:   12.6s finished


{'alpha': 0.1}
0.6871079796595855
ROC AUC for feature LABEL_Bilirubin_direct  :  0.7121238506135195
For feature  LABEL_EtCO2
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 out of  40 | elapsed:    0.9s remaining:    0.3s
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:    4.7s finished


{'alpha': 0.1}
0.79298038633303
ROC AUC for feature LABEL_EtCO2  :  0.7995151662585706
For feature  LABEL_Sepsis
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 out of  40 | elapsed:    1.0s remaining:    0.4s


{'alpha': 0.1}
0.6316632039027443
ROC AUC for feature LABEL_Sepsis  :  0.6585471011993339


[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:    4.6s finished


In [123]:
# labels that have a real value
columns_2 = []

for i in range(12, 16):
    # feature selection
#     transformer =  GenericUnivariateSelect(score_func= mutual_info_regression, mode ='k_best', param = 80)
#     train_features = transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,i])
# #     print(df_train_agg_features.columns[transformer.get_support(indices = True)])
#     test_features = transformer.transform(data_test_scaled)

    train_features = data_train_scaled
    test_features = data_test_scaled
    
    feature_map_nystroem = Nystroem(kernel = 'rbf',
                                 random_state=1,
                                 n_components=300)
    train_transformed = feature_map_nystroem.fit_transform(train_features)
    test_transformed = feature_map_nystroem.transform(test_features)
    
    clf_w = linear_model.SGDRegressor(max_iter=100000, tol=1e-4,
                                     loss = 'epsilon_insensitive', penalty = 'l2')
    parameters = {'alpha':(0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 20, 30)}
    clf = model_selection.GridSearchCV(estimator= clf_w, param_grid = parameters, cv = 10,
                                       refit = True, scoring = 'r2', verbose = 1, n_jobs=6)
#     clf = KernelRidge(kernel = 'poly', degree = 5)
#     parameters = {'alpha':(0.1,1,10,30)}
#     clf = model_selection.GridSearchCV(estimator= clf, param_grid = parameters, cv = 3,
#                                       refit = True, scoring = 'r2', verbose = 2, n_jobs=6)
    clf.fit(train_features, df_train_labels.iloc[:,i])
    
#     print(clf.cv_results_)
    print(clf.best_params_)
    print(clf.best_score_)

    pred_train = clf.predict(train_features)
    tmp = r2_score(y_pred= pred_train, y_true=df_train_labels.iloc[:,i])
    print("R2 for feature", list(df_train_labels)[i] , " : ", tmp)
    
    pred = clf.predict(test_features)
    columns_2.append(pred)    

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  64 tasks      | elapsed:    1.6s
[Parallel(n_jobs=6)]: Done  79 out of  90 | elapsed:    1.9s remaining:    0.3s
[Parallel(n_jobs=6)]: Done  90 out of  90 | elapsed:    2.0s finished


{'alpha': 0.001}
0.38525838303356136
R2 for feature LABEL_RRate  :  0.3932819813363555
Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    3.0s
[Parallel(n_jobs=6)]: Done  90 out of  90 | elapsed:    4.6s finished


{'alpha': 0.001}
0.5914401166053944
R2 for feature LABEL_ABPm  :  0.6013605375070599
Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    1.7s
[Parallel(n_jobs=6)]: Done  90 out of  90 | elapsed:    2.9s finished


{'alpha': 0.1}
0.36153348125006585
R2 for feature LABEL_SpO2  :  0.3767696134465238
Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    2.5s


{'alpha': 0.01}
0.6173782261689954
R2 for feature LABEL_Heartrate  :  0.6195705679916895


[Parallel(n_jobs=6)]: Done  90 out of  90 | elapsed:    4.2s finished


In [124]:
columns_final = columns_1 + columns_2

# XGBoosting

The benefit of XGBoost is that it accepts missing values.

## Save predictions

In [125]:
print(np.shape(columns_final))
result = pd.DataFrame(columns_final).transpose()
result.columns = list(df_train_labels)
result.to_csv('./Results/prediction.csv.zip', index=False, float_format='%.3f', compression='zip')

(16, 12664)


In [None]:
result.to_csv('./Results/prediction.csv', index=False, float_format='%.3f')