# Support vector machines

In [56]:
# import libraries

import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

from matplotlib.backends.backend_pdf import PdfPages
from sklearn.decomposition import PCA

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.svm import LinearSVR
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.metrics.pairwise import pairwise_kernels

from sklearn import model_selection

from sklearn.impute import KNNImputer

## Data pre-processing

In [2]:
# Read the three input tables from the working directory, in this order:
# training features, training labels, and the test features to predict on.
df_train_features, df_train_labels, df_test_features = map(
    pd.read_csv,
    ('train_features.csv', 'train_labels.csv', 'test_features.csv'),
)

 ### Histogram of the output labels 

We should check for class imbalance.

In [None]:
# Draw one histogram per numeric column of the label table to inspect how
# the target values are distributed.
df_train_labels.hist()

# Disabled: export a 100-bin histogram of every label column except the
# first one into a multi-page PDF.
# with PdfPages("./Results/Labels_histogram.pdf") as export_pdf:
#     for i in list(df_train_labels)[1:]:
#         df_train_labels.hist(column = i, bins = 100)
#         export_pdf.savefig()

The class imbalance problem is clearly visible here. Other observations:
  * The distributions of Heartrate, RRate and ABPm are close to a normal distribution.
  * SpO2 looks like a censored normal distribution.
  * For all of the other labels, class imbalance is an obvious problem.

A basic strategy that could be used here is to upsample both classes. The upsampling should be done carefully, not by simply replicating the data points.

### Train Data pre-processing

In [None]:
# Data inspection
#############################################
# Smallest and largest value of every column of the raw training table.
# The reductions are requested by name: passing the Python builtins `min` /
# `max` to `agg` is deprecated in recent pandas, and the builtins themselves
# do not skip NaN the way the pandas reductions do.
print(df_train_features.agg(['min', 'max']))

# Disabled: notched boxplot of a single column (column index 5, titled 'BUN').
# fig2, ax2 = plt.subplots()
# ax2.set_title('BUN')
# ax2.boxplot(df_train_features.iloc[:,5], notch=True)

# One box per column of the raw training table, skipping the first column.
plt.figure(figsize=(16, 16))
ax = sns.boxplot(data = df_train_features.iloc[:,1:])
# Rotate the column names so they stay readable; the trailing `;` keeps the
# returned tick-label list out of the cell output.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=90,
    horizontalalignment='right'
);

# Disabled: PDF export. NOTE(review): the loop below plots histograms of the
# label table, although the file name refers to the train-column boxplots.
# with PdfPages("./Results/Train_columns_boxplot.pdf") as export_pdf:
#     for i in list(df_train_labels)[1:]:
#         df_train_labels.hist(column = i, bins = 100)
#         export_pdf.savefig()

In [None]:
# Pairwise correlation between all columns of the raw training table.
corr = df_train_features.corr()

# Show the matrix as a square heatmap on a fixed [-1, 1] colour scale that is
# centred at zero, using a 200-step diverging palette.
plt.figure(figsize=(16, 16))
ax = sns.heatmap(
    corr,
    cmap=sns.diverging_palette(20, 220, n=200),
    vmin=-1,
    vmax=1,
    center=0,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)

# Tilt the x tick labels; the trailing `;` suppresses the returned list.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');

### Visualizing pattern of missing values

In [None]:
# Share of missing entries per column of the raw training table
# (mean of the boolean NaN mask == number of NaNs / number of rows).
print("Percentage of missing values:")
print(df_train_features.isna().mean(axis=0))

# Nullity matrix of the table, followed by a heatmap of how the missingness
# of one column correlates with the missingness of another.
msno.matrix(df_train_features)
msno.heatmap(df_train_features)

### Train data pre-processing

In [3]:
# Patient-by-patient imputation of the training time series.
#
# Within each patient (`pid`), every missing value of a variable is replaced
# by that patient's own mean of the variable. Variables a patient never had
# measured stay NaN and are handled after aggregation.

# Measurement columns: everything after the first two columns.
variables = df_train_features.columns[2:]

# Patient ids in order of first appearance; the aggregated feature matrix
# built later uses this row order.
train_pids = df_train_features['pid'].unique()

# One vectorised pass replaces the former per-(patient, variable) loop, which
# built a SimpleImputer and re-scanned the whole frame for every combination
# and relied on every patient having exactly 12 rows.
# `transform('mean')` yields, for every row, the NaN-skipping mean of that
# row's patient; an all-NaN group has a NaN mean, so nothing is filled there.
patient_means = df_train_features.groupby('pid')[variables].transform('mean')
df_train_features[variables] = df_train_features[variables].fillna(patient_means)


In [4]:
# Aggregate the time series: collapse each patient's rows into one row holding
# the mean of every variable. The result is a
# (number of patients) x (number of variables) float matrix.
#
# A single groupby replaces the former nested loops, which re-filtered the
# whole frame once per patient and assumed exactly 12 rows per patient.
# The mean skips NaN, so an entry is NaN only when the patient has no value at
# all for that variable. `reindex(train_pids)` puts the rows in the same order
# as `train_pids` (groupby itself orders the groups by sorted pid).
data_array = (
    df_train_features
    .groupby('pid')[variables]
    .mean()
    .reindex(train_pids)
    .to_numpy(dtype=float)
)

In [5]:
# Fill the entries that are still NaN after aggregation with a KNNImputer
# that uses 5 neighbours. The imputer is fitted on the aggregated training
# matrix and then applied to that same matrix.
# Alternative that was considered: SimpleImputer(strategy="mean")
imputer = KNNImputer(n_neighbors=5)
imputer.fit(data_array)
df_train_agg_imputed_features = imputer.transform(data_array)

In [6]:
# Standardise the imputed training matrix column by column.
# NOTE: despite its name, `min_max_scaler` holds a StandardScaler.
min_max_scaler = preprocessing.StandardScaler()
min_max_scaler.fit(df_train_agg_imputed_features)
data_train_scaled = min_max_scaler.transform(df_train_agg_imputed_features)

In [7]:
# Order the label rows by ascending patient id, with the stated intent of
# matching the row order of the rearranged feature matrix.
# NOTE(review): the model cells visible in this notebook read `df_train_labels`,
# not this sorted copy - confirm which row order the feature matrix really has.
df_train_labels_sorted = df_train_labels.sort_values('pid')

In [None]:
# Visualise the training data after imputation, aggregation and scaling:
# one box per column of `data_train_scaled`.

plt.figure(figsize=(16, 16))
ax = sns.boxplot(data = pd.DataFrame(data_train_scaled))
# Label the boxes with the variable names. The scaled matrix has one column
# per entry of `variables`; using all columns of `df_train_features` here
# would add the two leading non-variable columns, giving more labels than
# boxes and shifting every name by two positions.
ax.set_xticklabels(
    list(variables),
    rotation=90,
    horizontalalignment='right'
);

In [None]:
# Column-by-column Spearman correlation between the scaled training matrix
# and the imputed training matrix it was computed from.
(
    pd.DataFrame(data_train_scaled)
    .corrwith(pd.DataFrame(df_train_agg_imputed_features), method="spearman")
    .T
)

### PCA plot 

In [None]:
# Project the scaled training matrix onto its first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(data_train_scaled)
principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

# Put the LABEL_BaseExcess column next to the two components; the concat
# joins the two frames on their row index.
finalDf = pd.concat([principalDf, df_train_labels[['LABEL_BaseExcess']]], axis=1)

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_title('2 component PCA for LABEL_BaseExcess', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

# One scatter series per label value. zip() stops after the two targets, so
# the third colour is never used.
targets = [0, 1]
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf['LABEL_BaseExcess'].eq(target)
    class_points = finalDf.loc[indicesToKeep]
    ax.scatter(
        class_points['principal component 1'],
        class_points['principal component 2'],
        c=color,
        s=50,
    )
ax.legend(targets)
ax.grid()

### Test Data pre-processing

In [None]:
# Data inspection
#############################################
# Smallest and largest value of every column of the raw test table.
# The reductions are requested by name: passing the Python builtins `min` /
# `max` to `agg` is deprecated in recent pandas, and the builtins themselves
# do not skip NaN the way the pandas reductions do.
print(df_test_features.agg(['min', 'max']))

# Absolute number of missing entries per column.
print("number of missing values:")
print(df_test_features.isnull().sum(axis=0))

In [8]:
# Disabled earlier approach: per-pid median aggregation of the test features.
# df_test_aggregate_features = df_test_features.groupby('pid').agg('median')
# print(df_test_aggregate_features)

# Distinct patient ids of the test set (de-duplicated through a set).
test_pids = list(set(df_test_features['pid']))

In [9]:
# Patient-by-patient imputation and aggregation of the test time series.
#
# Step 1: within each patient (`pid`), replace every missing value of a
#         variable by that patient's own mean of the variable.
# Step 2: collapse each patient's rows into one row of per-variable means.

# Measurement columns of the test table: everything after the first two columns.
variables_test = df_test_features.columns[2:]

# Distinct test patient ids; the rows of `data_array` follow this order.
test_pids = list(set(df_test_features.pid))

# Step 1 - one vectorised pass instead of a SimpleImputer per
# (patient, variable), which also relied on every patient having exactly 12
# rows. An all-NaN group has a NaN mean, so such entries stay NaN.
patient_means = df_test_features.groupby('pid')[variables_test].transform('mean')
df_test_features[variables_test] = df_test_features[variables_test].fillna(patient_means)

# Step 2 - aggregate over `variables_test`. The former loop iterated over the
# training column list (`variables`) while the array was sized by
# `variables_test`. `reindex(test_pids)` puts the rows in the order of
# `test_pids` (groupby itself orders the groups by sorted pid).
data_array = (
    df_test_features
    .groupby('pid')[variables_test]
    .mean()
    .reindex(test_pids)
    .to_numpy(dtype=float)
)

In [12]:
# Disabled: dropping the Time column from the per-pid aggregate frame.
# # remove time from data frame 
# df_test_agg_features = df_test_aggregate_features.drop(['Time'], axis = 1)
# Print the test feature table (pandas abbreviates the output for large frames).
print(df_test_features)

         pid  Time   Age  EtCO2    PTT   BUN  Lactate   Temp    Hgb  HCO3  \
0          0     1  39.0    NaN  41.35  18.5      NaN  36.00   9.65  14.5   
1          0     2  39.0    NaN  44.20  17.0      NaN  36.00  10.20  13.0   
2          0     3  39.0    NaN  41.35  18.5      NaN  36.00   9.65  14.5   
3          0     4  39.0    NaN  41.35  18.5      NaN  36.00   9.65  14.5   
4          0     5  39.0    NaN  41.35  18.5      NaN  36.00   9.65  14.5   
...      ...   ...   ...    ...    ...   ...      ...    ...    ...   ...   
151963  9997     8  57.0    NaN  27.30  20.0      NaN  37.75   9.90  20.0   
151964  9997     9  57.0    NaN  27.30  20.0      NaN  37.75   9.90  20.0   
151965  9997    10  57.0    NaN  27.30  20.0      NaN  37.75   9.90  20.0   
151966  9997    11  57.0    NaN  27.30  20.0      NaN  37.00   9.90  20.0   
151967  9997    12  57.0    NaN  27.30  20.0      NaN  37.75   9.90  20.0   

        ...  Alkalinephos   SpO2  Bilirubin_direct  Chloride    Hct  \
0   

In [13]:
# Impute the entries of the aggregated test matrix that are still NaN.
#
# The test data is transformed with `imputer`, the KNNImputer that was fitted
# on the aggregated training matrix earlier in the notebook. Fitting a second
# imputer on the test set would fill the gaps from test-set neighbours and
# could treat columns differently from the training run, so the two feature
# matrices would no longer be comparable.
df_test_agg_imputed_features = imputer.transform(data_array)

In [14]:
# Scale the test data with the scaler fitted on the training data
# (`min_max_scaler`, which holds a StandardScaler). Re-fitting on the test set
# would standardise it with its own mean and variance, placing train and test
# in different feature spaces for the models trained below.
data_test_scaled = min_max_scaler.transform(df_test_agg_imputed_features)

In [50]:
# Persist the scaled train and test matrices as CSV files (with the default
# integer row index and integer column headers).
for scaled_matrix, csv_path in (
    (data_train_scaled, "./Results/dat_train_scaled.csv"),
    (data_test_scaled, "./Results/dat_test_scaled.csv"),
):
    pd.DataFrame(scaled_matrix).to_csv(csv_path)

## Fit a model & Predict

### predict with support vector machine classification and use probabilities

In [88]:
# Random forest classifier: one model per label column 1..11 of the label table.
# `columns_1` collects the output columns, starting with the test patient ids.
columns_1 = [test_pids]
for i in range(1, 12):
    # Out-of-bag scoring is unavailable because bootstrap=False, so no OOB
    # diagnostic is printed (the former print only echoed the `oob_score`
    # constructor argument).
    clf = RandomForestClassifier(min_samples_leaf=2, class_weight='balanced', oob_score=False, bootstrap=False)
    clf.fit(data_train_scaled, df_train_labels.iloc[:,i])

    # Probabilities for the test set. `predict_proba` returns one column per
    # class; `apply`, used here before, returns the leaf index reached in each
    # tree, which is not a probability.
    # Assumes the label is coded 0/1 so that column 1 is the positive class.
    probs = clf.predict_proba(data_test_scaled)[:, 1]
    columns_1.append(probs)

    # ROC AUC on the training data itself: this measures how well the forest
    # fits the data it was trained on, not how well it generalises.
    probability = clf.predict_proba(data_train_scaled)
    tmp = roc_auc_score(y_score= probability[:, 1], y_true= df_train_labels.iloc[:,i])
    print("ROC AUC for feature", list(df_train_labels)[i] , " : ", tmp)

False
ROC AUC for feature LABEL_BaseExcess  :  1.0
False
ROC AUC for feature LABEL_Fibrinogen  :  1.0
False
ROC AUC for feature LABEL_AST  :  1.0
False
ROC AUC for feature LABEL_Alkalinephos  :  1.0
False
ROC AUC for feature LABEL_Bilirubin_total  :  1.0
False
ROC AUC for feature LABEL_Lactate  :  0.9999999999999999
False
ROC AUC for feature LABEL_TroponinI  :  0.9999999999999999
False
ROC AUC for feature LABEL_SaO2  :  1.0
False
ROC AUC for feature LABEL_Bilirubin_direct  :  1.0
False
ROC AUC for feature LABEL_EtCO2  :  1.0
False
ROC AUC for feature LABEL_Sepsis  :  1.0


In [None]:
# Labels with an output in [0, 1]: bagged polynomial-kernel SVCs, one model
# per label column 1..11. Scores are turned into pseudo-probabilities by
# passing the decision-function value through a logistic sigmoid.

def _logistic(distance):
    """Return the logistic sigmoid of a single decision-function value.

    Both branches evaluate 1 / (1 + e^(-distance)); they are split by sign so
    that `math.exp` is only ever called with a non-positive argument.
    """
    if distance < 0:
        return 1 - 1/(1 + math.exp(distance))
    return 1/(1 + math.exp(-distance))


columns_1 = [test_pids]

for i in range(1, 12):
    clf = BaggingClassifier(SVC(kernel = 'poly', degree = 5, class_weight = 'balanced', verbose = True, C = 10))
    # Disabled: grid search over C
    #   parameters = {'C':np.linspace(0.1,10,20)}
    #   clf = model_selection.GridSearchCV(estimator= clf_w, param_grid = parameters, cv = 4,
    #                                      refit = True, scoring = 'roc_auc', verbose = 1, n_jobs=6)
    #   print(clf.best_params_)
    clf.fit(data_train_scaled, df_train_labels.iloc[:,i])

    # Open question kept from the original cell: do the scores have to be
    # normalised with the norm of clf.dual_coef_?

    # Test set: decision-function value -> sigmoid -> output column.
    distance_hyperplane = clf.decision_function(data_test_scaled)
    probability = np.array([_logistic(d) for d in distance_hyperplane], dtype=float)
    columns_1.append(probability)

    # Training set: the same transformation, scored with ROC AUC against the
    # labels the model was fitted on.
    distance_hyperplace_train = clf.decision_function(data_train_scaled)
    probability = np.array([_logistic(d) for d in distance_hyperplace_train], dtype=float)

    tmp = roc_auc_score(y_score= probability, y_true= df_train_labels.iloc[:,i])
    print("ROC AUC for feature", list(df_train_labels)[i] , " : ", tmp)
    

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [27]:
# Real-valued labels (label columns 12..15): cubic-kernel SVR, with C chosen
# by a 5-fold grid search scored on R2 and refitted on all training rows.
columns_2 = []

for i in range(12, 16):
    target = df_train_labels.iloc[:, i]

    clf_w = SVR(kernel='poly', degree=3)
    parameters = {'C': np.linspace(1, 10, 10)}
    clf = model_selection.GridSearchCV(
        estimator=clf_w,
        param_grid=parameters,
        scoring='r2',
        cv=5,
        refit=True,
        n_jobs=6,
        verbose=1,
    )
    clf.fit(data_train_scaled, target)
    print(clf.cv_results_)

    # R2 of the refitted model on the data it was trained on.
    pred_train = clf.predict(data_train_scaled)
    tmp = r2_score(y_true=target, y_pred=pred_train)
    print("R2 for feature", list(df_train_labels)[i] , " : ", tmp)

    # Predictions for the test set become one output column.
    pred = clf.predict(data_test_scaled)
    columns_2.append(pred)
    

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  8.3min
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed: 12.8min finished


{'mean_fit_time': array([ 24.26356025,  37.08510661,  50.63805652,  59.79719958,
        68.22096133,  84.09716425, 100.10133696, 120.14321575,
       141.78595657, 141.88823218]), 'std_fit_time': array([ 0.78193777,  1.58172054,  2.87223471,  3.70161724,  1.75013687,
        5.86425543,  3.22495575,  6.68448067,  5.83171327, 20.06454897]), 'mean_score_time': array([2.76414261, 2.7054904 , 2.60875549, 2.51900043, 2.50029249,
       2.4874114 , 2.61314158, 2.56653919, 2.57755361, 1.94303102]), 'std_score_time': array([0.02598785, 0.01555165, 0.03177854, 0.05287047, 0.02575073,
       0.02144058, 0.12908522, 0.02626385, 0.03907306, 0.27650337]), 'param_C': masked_array(data=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 1.0}, {'C': 2.0}, {'C': 3.0}, {'C': 4.0}, {'C': 5.0}, {'C': 6.0}, {'C': 7.0}, {'C': 8.0}, 

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  3.8min
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed:  5.4min finished


{'mean_fit_time': array([17.71627531, 20.6306324 , 24.16375551, 28.05695519, 32.93605757,
       36.10372167, 40.67268195, 44.08757458, 48.91899505, 47.35820279]), 'std_fit_time': array([0.32516692, 0.26787686, 0.89159049, 0.45229754, 0.54111283,
       1.09467786, 2.37227955, 1.26473053, 1.05429997, 6.68549009]), 'mean_score_time': array([2.60946536, 2.65970502, 2.69947052, 2.68894782, 2.66633039,
       2.64484501, 2.66010704, 2.62202168, 2.62365475, 2.05812993]), 'std_score_time': array([0.01281616, 0.04342416, 0.03741824, 0.03961654, 0.05163414,
       0.03262146, 0.02338802, 0.02707709, 0.02410384, 0.29204509]), 'param_C': masked_array(data=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 1.0}, {'C': 2.0}, {'C': 3.0}, {'C': 4.0}, {'C': 5.0}, {'C': 6.0}, {'C': 7.0}, {'C': 8.0}, {'C': 9.0}, {'C': 10.0}], '

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed: 13.8min
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed: 23.1min finished


{'mean_fit_time': array([ 29.33529305,  50.15217581,  68.32146988,  95.81132073,
       127.0476346 , 155.00124097, 206.52213483, 234.10829382,
       290.46561227, 287.18926678]), 'std_fit_time': array([ 1.3624228 ,  2.75373346,  4.44484461,  7.87358326,  8.3224808 ,
       17.60029208, 35.36992762, 26.67028525, 37.59911886, 29.5342148 ]), 'mean_score_time': array([2.51843181, 2.50076056, 2.49138608, 2.53562341, 2.5059135 ,
       2.51753979, 2.51636314, 2.52095699, 2.55071263, 1.91032915]), 'std_score_time': array([0.02486152, 0.04319786, 0.02573233, 0.02679131, 0.03306611,
       0.00859763, 0.01757959, 0.02285168, 0.0576419 , 0.26239243]), 'param_C': masked_array(data=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 1.0}, {'C': 2.0}, {'C': 3.0}, {'C': 4.0}, {'C': 5.0}, {'C': 6.0}, {'C': 7.0}, {'C': 8.0}, 

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  3.5min
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed:  4.8min finished


{'mean_fit_time': array([16.81290073, 19.68092899, 22.55827308, 26.13143687, 29.55735402,
       33.25632582, 34.88318014, 39.28785138, 42.3173522 , 40.2092742 ]), 'std_fit_time': array([0.53556888, 0.25729745, 0.94174783, 0.95243922, 0.57781273,
       2.97172492, 1.54584047, 2.2654442 , 2.99741734, 5.59257032]), 'mean_score_time': array([2.63369579, 2.65311422, 2.71309643, 2.66118712, 2.73305945,
       2.6698616 , 2.63144407, 2.63539357, 2.65155482, 1.98331952]), 'std_score_time': array([0.03272501, 0.0554388 , 0.04434087, 0.05863746, 0.07289693,
       0.03234807, 0.01612869, 0.02253627, 0.04394569, 0.25927067]), 'param_C': masked_array(data=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 1.0}, {'C': 2.0}, {'C': 3.0}, {'C': 4.0}, {'C': 5.0}, {'C': 6.0}, {'C': 7.0}, {'C': 8.0}, {'C': 9.0}, {'C': 10.0}], '

In [28]:
# Output columns in final order: pid and classification columns first,
# regression columns after them.
columns_final = [*columns_1, *columns_2]

### predict with Support vector regression and then compute sigmoid function

In [None]:
# first for the labels that have an output [0,1]

# columns_1 = [test_pids]

# for i in range(1,12):
    
#     clf = SVR(kernel = 'poly', degree = 3, max_iter = 10000)
#     clf.fit(data_train_scaled, df_train_labels.iloc[:,i])
#     pred = clf.predict(data_test_scaled)
#     prob = np.empty(len(pred))
#     for j in range(0, len(pred)):
#         prob[j] = 1 / (1 + math.exp(-pred[j]))
#     columns_1.append(prob)
    
#     pred_train = clf.predict(data_train_scaled)
#     prob_train = np.empty(len(pred_train))
#     for j in range(0, len(pred_train)):
#         prob_train[j] = 1 / (1 + math.exp(-pred_train[j]))    
#     tmp = roc_auc_score(y_score= prob_train, y_true= df_train_labels.iloc[:,i])
#     print("ROC AUC for feature", list(df_train_labels)[i] , " : ", tmp)


In [31]:
# Real-valued labels (label columns 12..15): LinearSVR, with C chosen by a
# 5-fold grid search scored on R2 and refitted on all training rows.

columns_2 = []

for i in range(12, 16):
    target = df_train_labels.iloc[:, i]

    clf_w = LinearSVR()
    parameters = {'C': np.linspace(0.1, 10, 20)}
    clf = model_selection.GridSearchCV(
        estimator=clf_w,
        param_grid=parameters,
        scoring='r2',
        cv=5,
        refit=True,
        n_jobs=6,
        verbose=1,
    )

    clf.fit(data_train_scaled, target)
    print(clf.cv_results_)

    # Predictions for the test set become one output column.
    pred = clf.predict(data_test_scaled)
    columns_2.append(pred)

    # R2 of the refitted model on the data it was trained on.
    pred_train = clf.predict(data_train_scaled)
    tmp = r2_score(y_true=target, y_pred=pred_train)
    print("R2 for feature", list(df_train_labels)[i] , " : ", tmp)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   21.3s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:  1.1min finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


{'mean_fit_time': array([0.37665977, 1.84511938, 2.86973948, 3.4093596 , 3.73474226,
       3.86177664, 3.82795773, 3.86649203, 4.12736535, 4.14848909,
       4.05751448, 4.14853668, 4.22228336, 4.19334617, 4.31581178,
       4.339077  , 4.24260187, 4.19198799, 4.1739429 , 3.61826892]), 'std_fit_time': array([0.0301936 , 0.17417734, 0.1671176 , 0.09573891, 0.13961228,
       0.03360373, 0.03236656, 0.04460098, 0.09393808, 0.03945186,
       0.05634337, 0.03362465, 0.028355  , 0.04193226, 0.04294149,
       0.0659562 , 0.03528951, 0.03726532, 0.03116721, 0.30399726]), 'mean_score_time': array([0.00148292, 0.00127683, 0.00117359, 0.00118756, 0.0012589 ,
       0.00117841, 0.00114655, 0.00125933, 0.00123529, 0.00113654,
       0.00119953, 0.00126786, 0.00099845, 0.00114436, 0.00136724,
       0.00132933, 0.001373  , 0.00125742, 0.00119443, 0.00080848]), 'std_score_time': array([2.17440013e-04, 1.30272278e-04, 1.93977250e-04, 2.64423596e-04,
       2.00888920e-04, 2.07395365e-04, 2.3624887

[Parallel(n_jobs=6)]: Done  50 tasks      | elapsed:   19.1s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   49.9s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


{'mean_fit_time': array([0.14033079, 0.76479621, 1.34748406, 1.83143563, 2.17888765,
       2.42992315, 2.78290434, 3.10851498, 3.13098316, 3.39342022,
       3.45551186, 3.55833025, 3.68782778, 3.71560326, 3.77993011,
       3.69346962, 3.90296822, 3.97981229, 3.8629849 , 3.52852173]), 'std_fit_time': array([0.0136319 , 0.09668082, 0.10955783, 0.10771298, 0.19673337,
       0.11704928, 0.11397408, 0.09703625, 0.12530069, 0.18489637,
       0.13984047, 0.19602886, 0.12019974, 0.13370317, 0.14944065,
       0.10712289, 0.17877291, 0.15506792, 0.11604887, 0.29402882]), 'mean_score_time': array([0.00104618, 0.00140123, 0.001087  , 0.00121737, 0.00115862,
       0.00129442, 0.00124364, 0.0013597 , 0.00113182, 0.00134597,
       0.00129452, 0.00117755, 0.00130978, 0.00124478, 0.00145187,
       0.00120525, 0.00149841, 0.00116839, 0.0011085 , 0.00085721]), 'std_score_time': array([2.20926791e-04, 2.87246617e-04, 2.46704753e-04, 4.19097438e-04,
       1.69461150e-04, 2.21574504e-04, 3.0394878

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   23.4s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:  1.1min finished


{'mean_fit_time': array([0.70487409, 3.01477675, 3.55682173, 3.71108637, 3.82644238,
       3.96093736, 4.06440988, 4.16327481, 4.1999311 , 4.3748807 ,
       4.43286967, 4.32352052, 4.43522792, 4.50657101, 4.65875335,
       4.63944921, 4.64045367, 4.60776439, 4.77660542, 4.09745331]), 'std_fit_time': array([0.08916813, 0.24264078, 0.16550176, 0.10907231, 0.06963476,
       0.07505626, 0.03917425, 0.04177427, 0.02236836, 0.07406129,
       0.07014707, 0.03787632, 0.08255509, 0.0513171 , 0.07843107,
       0.04800747, 0.06153179, 0.04507189, 0.03342084, 0.23454754]), 'mean_score_time': array([0.0011838 , 0.00125375, 0.00121236, 0.00094385, 0.0011343 ,
       0.00117741, 0.00117011, 0.00113602, 0.00124197, 0.00105262,
       0.00113435, 0.00118966, 0.00138984, 0.0012198 , 0.00113382,
       0.0011395 , 0.00116291, 0.0013875 , 0.00130858, 0.00091629]), 'std_score_time': array([1.78717740e-04, 1.85774043e-04, 2.47979988e-04, 9.14088748e-05,
       2.13854826e-04, 2.33592620e-04, 2.3436866

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  50 tasks      | elapsed:   18.9s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   49.9s finished


{'mean_fit_time': array([0.14238529, 0.8435739 , 1.40691385, 1.91267419, 2.07772284,
       2.34503956, 2.58634086, 2.82495112, 3.09933977, 3.33921504,
       3.41598296, 3.510497  , 3.78512964, 3.70053749, 3.76686277,
       3.90678439, 3.88004794, 3.70462565, 4.00099239, 3.63537078]), 'std_fit_time': array([0.03954871, 0.24611757, 0.37274413, 0.48070649, 0.24093776,
       0.27298437, 0.2041579 , 0.15408963, 0.19853701, 0.15694872,
       0.22033353, 0.13023352, 0.18470883, 0.17382976, 0.10250573,
       0.12075567, 0.17414585, 0.10281227, 0.13496137, 0.44413943]), 'mean_score_time': array([0.00110798, 0.00110674, 0.00111575, 0.0011076 , 0.00114546,
       0.00123849, 0.00123682, 0.00120792, 0.00123782, 0.00132041,
       0.00124679, 0.00183454, 0.00125823, 0.0012773 , 0.00121498,
       0.00120158, 0.00132957, 0.00105333, 0.00145855, 0.00082092]), 'std_score_time': array([2.20028052e-04, 2.83314273e-04, 1.86492128e-04, 1.91320569e-04,
       1.79006003e-04, 8.04860444e-05, 1.5544478



In [89]:
# Output columns in final order: pid and classification columns first,
# regression columns after them.
columns_final = [*columns_1, *columns_2]

## Save predictions

In [90]:
# Sanity check: (number of output columns, number of test patients).
print(np.shape(columns_final))

# Each collected entry is one output column, so build the frame row-wise and
# flip it; the header is taken from the label table's column names.
result = pd.DataFrame(columns_final).T
result.columns = df_train_labels.columns.tolist()

# Write the zipped submission file with three decimals and no index column.
result.to_csv(
    './Results/prediction.csv.zip',
    index=False,
    float_format='%.3f',
    compression='zip',
)

(16, 12664)


In [None]:
# Also write the predictions as an uncompressed CSV (three decimals, no index column).
result.to_csv('./Results/prediction.csv', index=False, float_format='%.3f')