# Support vector machines

In [None]:
# import libraries

import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

from matplotlib.backends.backend_pdf import PdfPages
from sklearn.decomposition import PCA

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.feature_selection import GenericUnivariateSelect, mutual_info_classif, mutual_info_regression, f_regression

from sklearn import preprocessing
from sklearn.svm import LinearSVR
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.metrics.pairwise import pairwise_kernels

from sklearn import model_selection

from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer

## Data pre-processing

In [None]:
# load training data

# load data from csv file
df_train_features = pd.read_csv ('train_features.csv')
df_train_labels = pd.read_csv('train_labels.csv')

# Load test data
df_test_features = pd.read_csv ('test_features.csv')

### Sorting labels

In [None]:
df_train_labels = df_train_labels.sort_values(by = 'pid')
df_train_features = df_train_features.sort_values(by = 'pid')

 ### Histogram of the output labels 

We should check for class imbalance.

In [None]:
df_train_labels.hist()

# with PdfPages("./Results/Labels_histogram.pdf") as export_pdf:
#     for i in list(df_train_labels)[1:]:
#         df_train_labels.hist(column = i, bins = 100)
#         export_pdf.savefig()

One can see the class imbalance problem here. Other observations:
  * Heartrate, RRate, ABPm,  distribution is similar to a normal distribution
  * SpO2 is like a censored normal distribution. 
  * For all of the other features, class imbalance is an obvious problem.

A basic strategy that could be used here: Upsample both classes! Do the upsampling efficiently, not just replicating the datapoints

### Train Data pre-processing

In [None]:
# data inspection: 
#############################################
# range of the provided data?
print(df_train_features.agg([min, max]))

# Boxplotting the data
# fig2, ax2 = plt.subplots()
# ax2.set_title('BUN')
# ax2.boxplot(df_train_features.iloc[:,5], notch=True)

plt.figure(figsize=(16, 16))
ax = sns.boxplot(data = df_train_features.iloc[:,1:])
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=90,
    horizontalalignment='right'
);

# with PdfPages("./Results/Train_columns_boxplot.pdf") as export_pdf:
#     for i in list(df_train_labels)[1:]:
#         df_train_labels.hist(column = i, bins = 100)
#         export_pdf.savefig()

In [None]:
# calculate the correlation matrix
corr = df_train_features.corr()

# plot the heatmap
plt.figure(figsize=(16, 16))
ax = sns.heatmap(corr, 
        xticklabels=corr.columns,
        yticklabels=corr.columns, 
        vmin=-1, vmax=1, center=0, 
           cmap=sns.diverging_palette(20, 220, n=200))
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

### Visualizing pattern of missing values

In [None]:
# how much missing data? 
print("Percentage of missing values:")
print(df_train_features.isnull().sum(axis=0) / len(df_train_features))

msno.matrix(df_train_features)

# Plotting the correlation between the missing values
msno.heatmap(df_train_features)

### Train data pre-processing

In [None]:
df_train_agg_features = df_train_features.groupby('pid').agg([np.min, np.max, np.mean])
df_train_agg_features = df_train_agg_features.iloc[:,5:]
# Removing ETCo2 mean and max since it has so many NA
df_train_agg_features = df_train_agg_features.drop(df_train_agg_features.columns[[2,3]],  axis = 1)
print(df_train_agg_features.columns)
df_train_agg_features.columns
print(int(df_train_agg_features.shape[1]))
print(int(df_train_agg_features.shape[1]/3))

# how much missing data? 
print("number of missing values:")
print(df_train_agg_features.isnull().sum(axis=0))

na_percent_max = int(0.8 * df_train_agg_features.shape[0])
tmp = pd.DataFrame(df_train_agg_features)
for i in range(1, (int(df_train_agg_features.shape[1]/3))):
    na_count = df_train_agg_features.iloc[:,i].isna().sum()
    print(df_train_agg_features.columns[i])
    print(na_count)
    
    if(na_count > na_percent_max):
        print("should be removed")


In [None]:
# impute missing data points
#imp = SimpleImputer(strategy="mean")
imputer = KNNImputer(n_neighbors = 10)
#imputer = IterativeImputer(random_state=0, verbose = 2, max_iter = 30)
df_train_agg_imputed_features = imputer.fit_transform(df_train_agg_features)
#print(df_train_agg_imputed_features)

In [None]:
# scale the data
min_max_scaler = preprocessing.StandardScaler()
# standard_scalar = preprocessing.StandardScaler()

data_train_scaled = min_max_scaler.fit_transform(df_train_agg_imputed_features)

In [None]:
# REARRANGE THE LABELS, TO MATCH THE REARRANGED FEATURES
df_train_labels_sorted = df_train_labels.sort_values(by = 'pid')
print(df_train_labels_sorted[['pid']])
print(df_train_labels[['pid']])
print(df_train_agg_features)

In [None]:
# Visualizing the training data after imputing and aggregating

plt.figure(figsize=(16, 16))
ax = sns.boxplot(data = pd.DataFrame(data_train_scaled))
ax.set_xticklabels(
    list(df_train_features),
    rotation=90,
    horizontalalignment='right'
);

In [None]:
# What is the correlation between the 
pd.DataFrame(data_train_scaled).corrwith(other = pd.DataFrame(df_train_agg_imputed_features), method = "spearman").transpose()

### PCA plot 

In [None]:
pca = PCA(n_components=2)

principalComponents = pca.fit_transform(data_train_scaled)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])

finalDf = pd.concat([principalDf, df_train_labels[[df_train_labels.columns[11]]]], axis = 1)

fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA for label i', fontsize = 20)
targets = [0, 1]
colors = ['r', 'g', 'b']
for target, color in zip(targets,colors):
    indicesToKeep = finalDf[df_train_labels.columns[11]] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
               , finalDf.loc[indicesToKeep, 'principal component 2']
               , c = color
               , s = 50)
ax.legend(targets)
ax.grid()

### Test Data pre-processing

In [None]:
# data inspection: 
#############################################
# range of the provided data?
print(df_test_features.agg([min, max]))

# how much missing data? 
print("number of missing values:")
print(df_test_features.isnull().sum(axis=0))

In [None]:
# # aggregate data for each pid
# df_test_aggregate_features = df_test_features.groupby('pid').agg('median')

df_test_agg_features = df_test_features.groupby('pid').agg([np.min, np.max, np.mean])

df_test_agg_features = df_test_agg_features.iloc[:,5:]
# Removing ETCo2 mean and max since it has so many NA
df_test_agg_features = df_test_agg_features.drop(df_test_agg_features.columns[[2,3]],  axis = 1)

In [None]:
# impute missing data points
# should we impute it with the same imputer that we've used for train?

imputer = KNNImputer(n_neighbors= 10)
#imputer = IterativeImputer(random_state=0, verbose = 1)
df_test_agg_imputed_features = imputer.fit_transform(df_test_agg_features)

In [None]:
# scale test data
min_max_scaler = preprocessing.StandardScaler()
data_test_scaled = min_max_scaler.fit_transform(df_test_agg_imputed_features)

In [None]:
# pd.DataFrame(data_train_scaled).to_csv("./Results/4stats_iterarive_dat_train_scaled.csv")
# pd.DataFrame(data_test_scaled).to_csv("./Results/4stats_iterative_dat_test_scaled.csv")

## Fit a model & Predict

### predict with support vector machine classification and use probabilities

In [29]:
# first for the labels that have an output [0,1]
test_pids = list(set(df_test_features.pid))
columns_1 = [test_pids]

for i in range(1, 12):
   
    # feature selection
    transformer =  GenericUnivariateSelect(score_func= mutual_info_classif, mode ='k_best', param=40)
    train_features = transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,i])
    print("For feature ", df_train_labels.columns[i])
    print(df_train_agg_features.columns[transformer.get_support(indices = True)])
    test_features = transformer.transform(data_test_scaled)

    
    #clf = BaggingClassifier(SVC(kernel = 'poly', degree = 5, class_weight = 'balanced', verbose = True, C = 10))
    clf_w = SVC(kernel = 'poly', degree = 3, class_weight = 'balanced', verbose = 2)
    
    parameters = {'C':(0.1, 1, 5, 10)}
    clf = model_selection.GridSearchCV(estimator= clf_w, param_grid = parameters, cv = 5,
                                        refit = True, scoring = 'roc_auc', verbose = 2,
                                       n_jobs=6, return_train_score = True)
    clf.fit(train_features, df_train_labels.iloc[:,i])
    
    print(clf.cv_results_)
    print(clf.best_params_)
    print(clf.best_score_)
    # compute probabilites as opposed to predictions
    #dual_coefficients = clf.dual_coef_    # do we have to normalize with norm of this vector ?
    
    distance_hyperplane = clf.decision_function(test_features)
    probability = np.empty(len(distance_hyperplane))
    for j in range(0, len(probability)):
        if distance_hyperplane[j] < 0:
            probability[j] = 1 - 1/(1 + math.exp(distance_hyperplane[j]))
        else:
            probability[j] = 1/(1 + math.exp(-distance_hyperplane[j]))
    columns_1.append(probability)

    
    distance_hyperplace_train = clf.decision_function(train_features)
    probability = np.empty(len(distance_hyperplace_train))
    for j in range(0, len(probability)):
        if distance_hyperplace_train[j] < 0:
            probability[j] = 1 - 1/(1 + math.exp(distance_hyperplace_train[j]))
        else:
            probability[j] = 1/(1 + math.exp(-distance_hyperplace_train[j]))
    
    tmp = roc_auc_score(y_score= probability, y_true= df_train_labels.iloc[:,i])
    print("ROC AUC for feature", list(df_train_labels)[i] , " : ", tmp)
    

For feature  LABEL_BaseExcess
MultiIndex([(       'PTT', 'amin'),
            (       'PTT', 'amax'),
            (       'PTT', 'mean'),
            (   'Lactate', 'amin'),
            (   'Lactate', 'amax'),
            (   'Lactate', 'mean'),
            (      'Temp', 'mean'),
            (       'Hgb', 'mean'),
            (      'HCO3', 'amin'),
            (      'HCO3', 'amax'),
            (      'HCO3', 'mean'),
            ('BaseExcess', 'amin'),
            ('BaseExcess', 'amax'),
            ('BaseExcess', 'mean'),
            (     'RRate', 'mean'),
            ('Creatinine', 'amin'),
            ('Creatinine', 'amax'),
            ('Creatinine', 'mean'),
            (     'PaCO2', 'amin'),
            (     'PaCO2', 'amax'),
            (     'PaCO2', 'mean'),
            (      'FiO2', 'amin'),
            (      'FiO2', 'amax'),
            (      'FiO2', 'mean'),
            (      'SaO2', 'amin'),
            (      'SaO2', 'amax'),
            (      'SaO2', 'mean')

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:  2.1min remaining:    0.0s


KeyboardInterrupt: 

In [16]:
# labels that have a real value
columns_2 = []
# from sklearn.kernel_ridge import KernelRidge

for i in range(12, 16):
    # feature selection
    transformer =  GenericUnivariateSelect(score_func= mutual_info_regression, mode ='k_best', param = 50)
    train_features = transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,i])
    print(df_train_agg_features.columns[transformer.get_support(indices = True)])
    test_features = transformer.transform(data_test_scaled)
    
    clf_w = SVR(kernel = 'rbf', gamma = 'scale', cache_size = 6000)
# #     clf_w = NuSVR(nu=0.5, kernel = 'linear')
    parameters = {'C':(0.1, 1,10)}
    clf = model_selection.GridSearchCV(estimator= clf_w, param_grid = parameters, cv = 5,
                                       refit = True, scoring = 'r2', verbose = 2, n_jobs=6)
#     clf = KernelRidge(kernel = 'poly', degree = 5)
#     parameters = {'alpha':(0.1,1,10,30)}
#     clf = model_selection.GridSearchCV(estimator= clf, param_grid = parameters, cv = 3,
#                                       refit = True, scoring = 'r2', verbose = 2, n_jobs=6)
    clf.fit(train_features, df_train_labels.iloc[:,i])
    
    print(clf.cv_results_)
    print(clf.best_params_)
    print(clf.best_score_)

    pred_train = clf.predict(train_features)
    tmp = r2_score(y_pred= pred_train, y_true=df_train_labels.iloc[:,i])
    print("R2 for feature", list(df_train_labels)[i] , " : ", tmp)
    
    pred = clf.predict(test_features)
    columns_2.append(pred)
    

{'mean_fit_time': array([22.40885172, 22.84809146, 21.25310478]), 'std_fit_time': array([0.06816745, 0.13452873, 3.31278865]), 'mean_score_time': array([4.55382714, 4.49709716, 3.69804029]), 'std_score_time': array([0.01893634, 0.04048154, 0.52262203]), 'param_C': masked_array(data=[0.1, 1, 10],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 0.1}, {'C': 1}, {'C': 10}], 'split0_test_score': array([0.52470667, 0.57949584, 0.60073112]), 'split1_test_score': array([0.51697736, 0.57453686, 0.60189326]), 'split2_test_score': array([0.53313793, 0.58945065, 0.60719138]), 'split3_test_score': array([0.52378756, 0.58424314, 0.61198652]), 'split4_test_score': array([0.51733216, 0.56972221, 0.58543835]), 'mean_test_score': array([0.52318833, 0.57948974, 0.60144813]), 'std_test_score': array([0.00590813, 0.00695342, 0.0089562 ]), 'rank_test_score': array([3, 2, 1], dtype=int32)}
{'C': 10}
0.6014481277020215
R2 for feature LABEL_Heartrate 

In [17]:
columns_final = columns_1 + columns_2

### predict with Support vector regression and then compute sigmoid function

In [None]:
# first for the labels that have an output [0,1]

# columns_1 = [test_pids]

# for i in range(1,12):
    
#     clf = SVR(kernel = 'poly', degree = 3, max_iter = 10000)
#     clf.fit(data_train_scaled, df_train_labels.iloc[:,i])
#     pred = clf.predict(data_test_scaled)
#     prob = np.empty(len(pred))
#     for j in range(0, len(pred)):
#         prob[j] = 1 / (1 + math.exp(-pred[j]))
#     columns_1.append(prob)
    
#     pred_train = clf.predict(data_train_scaled)
#     prob_train = np.empty(len(pred_train))
#     for j in range(0, len(pred_train)):
#         prob_train[j] = 1 / (1 + math.exp(-pred_train[j]))    
#     tmp = roc_auc_score(y_score= prob_train, y_true= df_train_labels.iloc[:,i])
#     print("ROC AUC for feature", list(df_train_labels)[i] , " : ", tmp)


In [None]:
# #labels that have a real value

# columns_2 = []

# for i in range(12, 16):
    
#     # feature selection
#     transformer =  GenericUnivariateSelect(score_func= mutual_info_regression, mode ='k_best', param=20)
#     train_features = transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,i])
#     print(list(data_train_scaled)[transformer.get_support()])
#     test_features = transformer.transform(data_test_scaled)
    

#     clf_w = LinearSVR()
#     parameters = {'C':(0.1,1,10,30,60,100)}
#     clf = model_selection.GridSearchCV(estimator= clf_w, param_grid = parameters, cv = 2,
#                                        refit = True, scoring = 'r2', verbose = 1, n_jobs=6)
#     clf.fit(train_features, df_train_labels.iloc[:,i])
    
#     print(clf.cv_results_)
#     pred = clf.predict(test_features)
#     columns_2.append(pred)
    
#     pred_train = clf.predict(train_features)
#     tmp = r2_score(y_pred= pred_train, y_true=df_train_labels.iloc[:,i])
#     print("R2 for feature", list(df_train_labels)[i] , " : ", tmp)

In [None]:
transformer =  GenericUnivariateSelect(score_func= mutual_info_regression, mode ='k_best', param=20)
train_features = transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,11])
test_features = transformer.transform(data_test_scaled)

In [None]:
df_train_agg_features.columns[transformer.get_support(indices = True)]

In [27]:
columns_final = columns_1 + columns_2

### Random forest

In [None]:
# Random forest Classifier
columns_1 = [test_pids]
for i in range(1, 12):
    clf = RandomForestClassifier(min_samples_leaf=2, class_weight='balanced', oob_score=False, bootstrap=False)
    clf.fit(data_train_scaled, df_train_labels.iloc[:,i])
    print(clf.oob_score)
    # compute probabilites as opposed to predictions
    probability = clf.apply(data_test_scaled)
    probs = [i[1] for i in probability] 
    columns_1.append(probs)
    
    
    probability = clf.predict_proba(data_train_scaled)

    probs = [i[1] for i in probability]            
    tmp = roc_auc_score(y_score= probs, y_true= df_train_labels.iloc[:,i])
    print("ROC AUC for feature", list(df_train_labels)[i] , " : ", tmp)

### Compute the kernel and use SGD Regressor

In [None]:
# first for the labels that have an output [0,1]
test_pids = list(set(df_test_features.pid))
columns_1 = [test_pids]

# from sklearn.kernel_ridge import KernelRidge
from sklearn.kernel_approximation import Nystroem
from sklearn import linear_model

for i in range(1, 12):
   
    # feature selection
    transformer =  GenericUnivariateSelect(score_func= mutual_info_classif, mode ='k_best', param=40)
    train_features = transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,i])
    print("For feature ", df_train_labels.columns[i])
    print(df_train_agg_features.columns[transformer.get_support(indices = True)])
    test_features = transformer.transform(data_test_scaled)

    
    feature_map_nystroem = Nystroem(kernel = 'rbf',
                                 random_state=1,
                                 n_components=100)
    train_transformed = feature_map_nystroem.fit_transform(train_features)
    test_transformed = feature_map_nystroem.transform(test_features)
    
    clf_w = linear_model.SGDClassifier(max_iter=100000, tol=1e-4,
                                     loss = 'epsilon_insensitive', penalty = 'l2')
    
    parameters = {'alpha':(0.01, 0.1, 1, 10, 100)}
    clf = model_selection.GridSearchCV(estimator= clf_w, param_grid = parameters, cv = 5,
                                        refit = True, scoring = 'roc_auc', verbose = 2,
                                       n_jobs=6, return_train_score = True)
    clf.fit(train_features, df_train_labels.iloc[:,i])
    
    print(clf.cv_results_)
    print(clf.best_params_)
    print(clf.best_score_)
    # compute probabilites as opposed to predictions
    #dual_coefficients = clf.dual_coef_    # do we have to normalize with norm of this vector ?
    
    distance_hyperplane = clf.decision_function(test_features)
    probability = np.empty(len(distance_hyperplane))
    for j in range(0, len(probability)):
        if distance_hyperplane[j] < 0:
            probability[j] = 1 - 1/(1 + math.exp(distance_hyperplane[j]))
        else:
            probability[j] = 1/(1 + math.exp(-distance_hyperplane[j]))
    columns_1.append(probability)

    
    distance_hyperplace_train = clf.decision_function(train_features)
    probability = np.empty(len(distance_hyperplace_train))
    for j in range(0, len(probability)):
        if distance_hyperplace_train[j] < 0:
            probability[j] = 1 - 1/(1 + math.exp(distance_hyperplace_train[j]))
        else:
            probability[j] = 1/(1 + math.exp(-distance_hyperplace_train[j]))
    
    tmp = roc_auc_score(y_score= probability, y_true= df_train_labels.iloc[:,i])
    print("ROC AUC for feature", list(df_train_labels)[i] , " : ", tmp)
    

For feature  LABEL_BaseExcess
MultiIndex([(       'PTT', 'amin'),
            (       'PTT', 'amax'),
            (       'PTT', 'mean'),
            (   'Lactate', 'amin'),
            (   'Lactate', 'amax'),
            (   'Lactate', 'mean'),
            (      'Temp', 'mean'),
            (      'HCO3', 'amin'),
            (      'HCO3', 'amax'),
            (      'HCO3', 'mean'),
            ('BaseExcess', 'amin'),
            ('BaseExcess', 'amax'),
            ('BaseExcess', 'mean'),
            (     'RRate', 'mean'),
            ('Creatinine', 'amin'),
            ('Creatinine', 'amax'),
            ('Creatinine', 'mean'),
            (     'PaCO2', 'amin'),
            (     'PaCO2', 'amax'),
            (     'PaCO2', 'mean'),
            (      'FiO2', 'amin'),
            (      'FiO2', 'amax'),
            (      'FiO2', 'mean'),
            (      'SaO2', 'amin'),
            (      'SaO2', 'amax'),
            (      'SaO2', 'mean'),
            (      'ABPm', 'mean')

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  14 out of  25 | elapsed:    0.3s remaining:    0.3s
[Parallel(n_jobs=6)]: Done  25 out of  25 | elapsed:    0.9s finished


{'mean_fit_time': array([0.11057463, 0.07374015, 0.03751969, 0.25193596, 0.09882545]), 'std_fit_time': array([0.0212206 , 0.01043428, 0.00273282, 0.26721602, 0.01219469]), 'mean_score_time': array([0.00421042, 0.00273509, 0.00318069, 0.00328465, 0.00259871]), 'std_score_time': array([0.00108714, 0.00036793, 0.00073961, 0.00147451, 0.00041033]), 'param_alpha': masked_array(data=[0.01, 0.1, 1, 10, 100],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}, {'alpha': 10}, {'alpha': 100}], 'split0_test_score': array([0.70357478, 0.70297928, 0.7140497 , 0.67020546, 0.66995745]), 'split1_test_score': array([0.70395719, 0.70355653, 0.71282644, 0.71324616, 0.66275019]), 'split2_test_score': array([0.71137806, 0.71463771, 0.72318679, 0.72180407, 0.68508624]), 'split3_test_score': array([0.70147662, 0.71040553, 0.7170685 , 0.66053014, 0.66038612]), 'split4_test_score': array([0.6856454 , 

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  14 out of  25 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=6)]: Done  25 out of  25 | elapsed:    1.9s finished


{'mean_fit_time': array([0.10159826, 0.04708314, 0.03299627, 0.34768395, 0.2233634 ]), 'std_fit_time': array([0.00504395, 0.00193413, 0.00361055, 0.64637971, 0.02210923]), 'mean_score_time': array([0.00320454, 0.00316663, 0.00269694, 0.00245242, 0.00243545]), 'std_score_time': array([0.00061056, 0.00057083, 0.00034025, 0.00041285, 0.00013181]), 'param_alpha': masked_array(data=[0.01, 0.1, 1, 10, 100],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}, {'alpha': 10}, {'alpha': 100}], 'split0_test_score': array([0.59279422, 0.55817704, 0.60573113, 0.68418889, 0.67857955]), 'split1_test_score': array([0.65899403, 0.64241363, 0.6085901 , 0.69960013, 0.70004263]), 'split2_test_score': array([0.62161024, 0.58402955, 0.60961718, 0.62044615, 0.67995372]), 'split3_test_score': array([0.63193683, 0.63661044, 0.64354423, 0.70318171, 0.70337657]), 'split4_test_score': array([0.58730971, 

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  14 out of  25 | elapsed:    0.4s remaining:    0.3s
[Parallel(n_jobs=6)]: Done  25 out of  25 | elapsed:    1.2s finished


{'mean_fit_time': array([0.11955657, 0.07269711, 0.0478869 , 0.35077491, 0.134904  ]), 'std_fit_time': array([0.01193395, 0.01162845, 0.00601202, 0.31876596, 0.01772405]), 'mean_score_time': array([0.00303698, 0.00428457, 0.00374098, 0.00277114, 0.00318027]), 'std_score_time': array([0.00012098, 0.00110027, 0.00051542, 0.00080789, 0.00062888]), 'param_alpha': masked_array(data=[0.01, 0.1, 1, 10, 100],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}, {'alpha': 10}, {'alpha': 100}], 'split0_test_score': array([0.62039148, 0.63040864, 0.62641737, 0.66171876, 0.66108772]), 'split1_test_score': array([0.63890781, 0.63177583, 0.62912548, 0.6243949 , 0.6490193 ]), 'split2_test_score': array([0.6328397 , 0.61266386, 0.61157566, 0.61849973, 0.64648791]), 'split3_test_score': array([0.59760248, 0.61252132, 0.61047303, 0.6127566 , 0.63396438]), 'split4_test_score': array([0.650174  , 

In [26]:
# labels that have a real value
columns_2 = []

for i in range(12, 16):
    # feature selection
    transformer =  GenericUnivariateSelect(score_func= mutual_info_regression, mode ='k_best', param = 30)
    train_features = transformer.fit_transform(data_train_scaled, df_train_labels.iloc[:,i])
    print(df_train_agg_features.columns[transformer.get_support(indices = True)])
    test_features = transformer.transform(data_test_scaled)
    
    feature_map_nystroem = Nystroem(kernel = 'rbf', degree = 5,
                                 random_state=1,
                                 n_components=100)
    train_transformed = feature_map_nystroem.fit_transform(train_features)
    test_transformed = feature_map_nystroem.transform(test_features)
    
    clf_w = linear_model.SGDRegressor(max_iter=100000, tol=1e-4,
                                     loss = 'epsilon_insensitive', penalty = 'l2',
                                     validation_fraction = 0.2, l1_ratio= 0.3)
# #     clf_w = NuSVR(nu=0.5, kernel = 'linear')
    parameters = {'alpha':(0.01, 0.1, 1,10, 100, 1000, 10000)}
    clf = model_selection.GridSearchCV(estimator= clf_w, param_grid = parameters, cv = 10,
                                       refit = True, scoring = 'r2', verbose = 2, n_jobs=6)
#     clf = KernelRidge(kernel = 'poly', degree = 5)
#     parameters = {'alpha':(0.1,1,10,30)}
#     clf = model_selection.GridSearchCV(estimator= clf, param_grid = parameters, cv = 3,
#                                       refit = True, scoring = 'r2', verbose = 2, n_jobs=6)
    clf.fit(train_features, df_train_labels.iloc[:,i])
    
    print(clf.cv_results_)
    print(clf.best_params_)
    print(clf.best_score_)

    pred_train = clf.predict(train_features)
    tmp = r2_score(y_pred= pred_train, y_true=df_train_labels.iloc[:,i])
    print("R2 for feature", list(df_train_labels)[i] , " : ", tmp)
    
    pred = clf.predict(test_features)
    columns_2.append(pred)    

MultiIndex([(     'Lactate', 'amin'),
            (        'Temp', 'amin'),
            (        'Temp', 'amax'),
            (        'HCO3', 'amin'),
            (  'BaseExcess', 'amin'),
            (       'RRate', 'amin'),
            (       'RRate', 'amax'),
            (       'RRate', 'mean'),
            (  'Fibrinogen', 'amin'),
            (  'Fibrinogen', 'amax'),
            (  'Fibrinogen', 'mean'),
            (       'PaCO2', 'amin'),
            (        'FiO2', 'amin'),
            (        'FiO2', 'amax'),
            (        'FiO2', 'mean'),
            (   'Magnesium', 'amin'),
            (   'Magnesium', 'mean'),
            (     'Calcium', 'amin'),
            ('Alkalinephos', 'mean'),
            (        'SpO2', 'amin'),
            (        'SpO2', 'mean'),
            (    'Chloride', 'amin'),
            (    'Chloride', 'amax'),
            (   'Heartrate', 'amin'),
            (   'Heartrate', 'amax'),
            (   'Heartrate', 'mean'),
            

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  50 tasks      | elapsed:    0.6s
[Parallel(n_jobs=6)]: Done  70 out of  70 | elapsed:    0.6s finished


{'mean_fit_time': array([0.03164723, 0.03072698, 0.05869274, 0.05405605, 0.04510288,
       0.0474793 , 0.03472314]), 'std_fit_time': array([0.01118847, 0.01482904, 0.0116606 , 0.01708059, 0.01279342,
       0.01319988, 0.00326497]), 'mean_score_time': array([0.0007097 , 0.00060625, 0.00066772, 0.00063264, 0.00067141,
       0.00070527, 0.00071063]), 'std_score_time': array([8.76955293e-05, 9.03128067e-05, 7.94591717e-05, 7.54557137e-05,
       1.07327584e-04, 9.23153813e-05, 8.62040083e-05]), 'param_alpha': masked_array(data=[0.01, 0.1, 1, 10, 100, 1000, 10000],
             mask=[False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}, {'alpha': 10}, {'alpha': 100}, {'alpha': 1000}, {'alpha': 10000}], 'split0_test_score': array([-1.53790813e+13, -1.95033267e+09,  3.87315755e-01,  2.06086662e-01,
        5.65430994e-02,  3.80195060e-04,  4.60773469e-04]), 'split1_test_score': array([-1

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  46 tasks      | elapsed:    0.8s
[Parallel(n_jobs=6)]: Done  70 out of  70 | elapsed:    1.0s finished


{'mean_fit_time': array([0.1030911 , 0.1020493 , 0.10913539, 0.06844926, 0.06103232,
       0.05136783, 0.03195622]), 'std_fit_time': array([0.02177962, 0.02955477, 0.03738074, 0.01775049, 0.02123151,
       0.01740126, 0.00352024]), 'mean_score_time': array([0.0008517 , 0.00084929, 0.00074978, 0.00070148, 0.00073855,
       0.00071838, 0.00060842]), 'std_score_time': array([1.54247241e-04, 3.40508657e-04, 8.81228611e-05, 8.16952177e-05,
       7.62604886e-05, 7.05438192e-05, 9.74766685e-05]), 'param_alpha': masked_array(data=[0.01, 0.1, 1, 10, 100, 1000, 10000],
             mask=[False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}, {'alpha': 10}, {'alpha': 100}, {'alpha': 1000}, {'alpha': 10000}], 'split0_test_score': array([0.58864045, 0.58286013, 0.55660719, 0.46298718, 0.07109002,
       0.0122632 , 0.00217022]), 'split1_test_score': array([ 0.59146645,  0.5779377 ,  0.56991747

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  53 tasks      | elapsed:    0.5s
[Parallel(n_jobs=6)]: Done  59 out of  70 | elapsed:    0.6s remaining:    0.1s
[Parallel(n_jobs=6)]: Done  70 out of  70 | elapsed:    0.7s finished


{'mean_fit_time': array([0.03350186, 0.03334141, 0.05026207, 0.04324548, 0.05405951,
       0.04324682, 0.04234033]), 'std_fit_time': array([0.01678748, 0.01071565, 0.01365156, 0.01457523, 0.01845607,
       0.00885837, 0.00784867]), 'mean_score_time': array([0.00068893, 0.00064762, 0.0006433 , 0.00068066, 0.00064712,
       0.00076618, 0.00077517]), 'std_score_time': array([1.09856503e-04, 1.01461718e-04, 9.32171399e-05, 8.44263350e-05,
       9.60353351e-05, 2.20714554e-04, 1.76662602e-04]), 'param_alpha': masked_array(data=[0.01, 0.1, 1, 10, 100, 1000, 10000],
             mask=[False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}, {'alpha': 10}, {'alpha': 100}, {'alpha': 1000}, {'alpha': 10000}], 'split0_test_score': array([-2.99789261e+11, -1.59770784e+07,  3.36041704e-01,  1.26520633e-01,
        6.31243596e-03,  9.76317922e-04, -1.20749931e-03]), 'split1_test_score': array([-1

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  46 tasks      | elapsed:    0.7s
[Parallel(n_jobs=6)]: Done  59 out of  70 | elapsed:    0.9s remaining:    0.2s
[Parallel(n_jobs=6)]: Done  70 out of  70 | elapsed:    0.9s finished


{'mean_fit_time': array([0.10075295, 0.07536445, 0.08998957, 0.05955398, 0.04288311,
       0.04519734, 0.03126996]), 'std_fit_time': array([0.02017355, 0.01893636, 0.03052114, 0.01748754, 0.00618942,
       0.01140093, 0.00336305]), 'mean_score_time': array([0.00097251, 0.00076177, 0.00089819, 0.00095522, 0.0006634 ,
       0.00062163, 0.00059776]), 'std_score_time': array([3.21707157e-04, 2.95204074e-04, 1.99000950e-04, 6.13631049e-04,
       7.33118522e-05, 6.97374996e-05, 1.08064115e-04]), 'param_alpha': masked_array(data=[0.01, 0.1, 1, 10, 100, 1000, 10000],
             mask=[False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}, {'alpha': 10}, {'alpha': 100}, {'alpha': 1000}, {'alpha': 10000}], 'split0_test_score': array([0.6255956 , 0.61474629, 0.59188106, 0.39540125, 0.06063833,
       0.00776192, 0.02893701]), 'split1_test_score': array([0.61630002, 0.60649943, 0.58182793, 0

## Save predictions

In [28]:
print(np.shape(columns_final))
result = pd.DataFrame(columns_final).transpose()
result.columns = list(df_train_labels)
result.to_csv('./Results/prediction.csv.zip', index=False, float_format='%.3f', compression='zip')

(16, 12664)


In [None]:
result.to_csv('./Results/prediction.csv', index=False, float_format='%.3f')