# Support vector machines

In [None]:
# import libraries

import numpy as np
import math
import pandas as pd

from matplotlib.backends.backend_pdf import PdfPages

from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.svm import LinearSVR
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import pairwise_kernels

## Data pre-processing

In [None]:
# Read the raw CSV inputs from the working directory: training features,
# training labels and test features (in that order).
df_train_features, df_train_labels, df_test_features = (
    pd.read_csv(csv_name)
    for csv_name in ('train_features.csv', 'train_labels.csv', 'test_features.csv')
)

### Histogram of the output labels

We should check for class imbalance.

In [None]:
# Overview figure: one histogram panel for every numeric column of the labels frame.
df_train_labels.hist()

# Additionally write a fine-grained (100-bin) histogram of every column except
# the first one to a multi-page PDF, one page per column.
with PdfPages("./Results/Labels_histogram.pdf") as export_pdf:
    for label_name in df_train_labels.columns[1:]:
        df_train_labels.hist(column=label_name, bins=100)
        # savefig() without arguments stores the currently active figure,
        # i.e. the one just created by hist().
        export_pdf.savefig()

The histograms reveal a class-imbalance problem. Other observations:
  * The distributions of Heartrate, RRate and ABPm are close to a normal distribution.
  * SpO2 looks like a censored normal distribution.
  * For all of the other labels, class imbalance is an obvious problem.

A basic strategy that could be used here: upsample both classes! The upsampling should be done efficiently, not just by replicating the data points.

### Train data inspection

In [None]:
# Data inspection (training features)
# -----------------------------------
# Value range of every column. The aggregations are given by name: passing the
# Python builtins `min`/`max` to `agg` is deprecated in recent pandas releases
# and triggers a FutureWarning.
print(df_train_features.agg(['min', 'max']))

# Number of missing entries in every column.
print("number of missing values:")
print(df_train_features.isnull().sum(axis=0))

### Train data pre-processing

In [None]:
# Collapse all rows of a patient into a single row by summing every column per pid.
# groupby orders the result by pid, so the label rows have to be brought into
# the same order before a model is fitted.
# NOTE(review): a groupby sum skips NaN and yields 0 for a group without any
# valid value (min_count defaults to 0), so the later imputation step may have
# nothing left to fill — confirm this is intended.
df_train_aggregate_features = df_train_features.groupby('pid').sum()

In [None]:
# The summed 'Time' column is not used as a feature; drop it.
df_train_agg_features = df_train_aggregate_features.drop(columns='Time')

In [None]:
# Learn per-column means on the aggregated training features and use them to
# fill missing entries. The fitted imputer is kept so that the same training
# means can be applied to the test set.
imp = SimpleImputer(strategy="mean").fit(df_train_agg_features)
df_train_agg_imputed_features = imp.transform(df_train_agg_features)

In [None]:
# Rescale every feature with a min-max scaler fitted on the training data.
# The fitted scaler is kept for transforming the test data later on.
min_max_scaler = preprocessing.MinMaxScaler().fit(df_train_agg_imputed_features)
data_train_scaled = min_max_scaler.transform(df_train_agg_imputed_features)

In [None]:
# The aggregated features are ordered by pid (a side effect of groupby), so
# order the label rows by pid as well to keep features and targets aligned.
df_train_labels_sorted = df_train_labels.sort_values('pid')

### Test data inspection

In [None]:
# Data inspection (test features)
# -------------------------------
# Value range of every column. The aggregations are given by name: passing the
# Python builtins `min`/`max` to `agg` is deprecated in recent pandas releases
# and triggers a FutureWarning.
print(df_test_features.agg(['min', 'max']))

# Number of missing entries in every column.
print("number of missing values:")
print(df_test_features.isnull().sum(axis=0))

### Test data pre-processing

In [None]:
# Collapse all rows of a test patient into a single row by summing every
# column per pid; groupby orders the result by pid.
df_test_aggregate_features = df_test_features.groupby('pid').agg('sum')

# Collect the test pids in exactly the row order of the aggregated frame.
# Every prediction array produced later follows this order, so the pid column
# of the submission must too. A set has no guaranteed iteration order and
# could silently pair predictions with the wrong patient.
test_pids = df_test_aggregate_features.index.tolist()

In [None]:
# The summed 'Time' column is not used as a feature; drop it, as for the training set.
df_test_agg_features = df_test_aggregate_features.drop(columns='Time')

In [None]:
# Fill missing entries of the aggregated test features with the imputer that
# was fitted on the training set (no re-fitting on test data).
df_test_agg_imputed_features = imp.transform(X=df_test_agg_features)

In [None]:
# Apply the min-max scaling learned on the training set to the imputed test features.
data_test_scaled = min_max_scaler.transform(X=df_test_agg_imputed_features)

## Fit a model & Predict

### predict with support vector machine classification and use probabilities

In [75]:
# Binary labels (label columns 1-11): fit one class-balanced SVC per label and
# squash its decision value into a score in (0, 1).
# The first output column holds the test pids.
columns_1 = [test_pids]

for label_idx in range(1, 12):
    clf = SVC(class_weight = 'balanced', verbose = True)
    clf.fit(data_train_scaled, df_train_labels_sorted.iloc[:, label_idx])

    # Use the sigmoid of the signed decision value instead of a hard 0/1
    # prediction. This is a monotone transform of the margin, not a calibrated
    # probability. The vectorized form avoids the per-element Python loop
    # (which reused the outer loop index) and cannot raise OverflowError the
    # way math.exp does for extreme decision values.
    distance_hyperplane = clf.decision_function(data_test_scaled)
    probability = 1.0 / (1.0 + np.exp(-distance_hyperplane))
    columns_1.append(probability)
    

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [79]:
# Real-valued labels (label columns 12-15): fit one SVR per label.
columns_2 = []

for label_idx in range(12, 16):
    clf = SVR()
    # The targets must come from the pid-sorted label frame: the training
    # features are in pid order (groupby), so the unsorted labels would pair
    # features with the targets of a different patient.
    clf.fit(data_train_scaled, df_train_labels_sorted.iloc[:, label_idx])
    pred = clf.predict(data_test_scaled)
    columns_2.append(pred)

In [76]:
# Final column list: pid and classification scores first, then the regression predictions.
columns_final = [*columns_1, *columns_2]


### predict with Support vector regression and then compute sigmoid function

In [None]:
# Binary labels (label columns 1-11): fit a linear SVR per label and squash the
# regression output through a sigmoid to obtain a score in (0, 1).
# The first output column holds the test pids.
# NOTE(review): this variant trains on the imputed but unscaled features,
# unlike the SVC variant — confirm that this is intended.
columns = [test_pids]

for label_idx in range(1, 12):
    clf = LinearSVR(fit_intercept = True, max_iter = 1000)
    # Targets come from the pid-sorted label frame so that they line up with
    # the pid-ordered (groupby) training features.
    clf.fit(df_train_agg_imputed_features, df_train_labels_sorted.iloc[:, label_idx])
    pred = clf.predict(df_test_agg_imputed_features)
    # Vectorized sigmoid; the previous element-wise loop reused the outer loop
    # index and could raise OverflowError in math.exp for extreme predictions.
    prob = 1.0 / (1.0 + np.exp(-pred))
    columns.append(prob)

In [None]:
# Real-valued labels (label columns 12-15): fit one linear SVR per label and
# append its predictions to the output columns.
for label_idx in range(12, 16):
    clf = LinearSVR()
    # Targets come from the pid-sorted label frame so that they line up with
    # the pid-ordered (groupby) training features.
    clf.fit(df_train_agg_imputed_features, df_train_labels_sorted.iloc[:, label_idx])
    pred = clf.predict(df_test_agg_imputed_features)
    columns.append(pred)

## Save predictions

In [77]:
# Sanity check: (number of output columns, number of test patients).
print(np.shape(columns_final))

# Each entry of columns_final is one output column; build the frame row-wise
# and flip it so that every patient becomes a row, then reuse the header of
# the training labels.
result = pd.DataFrame(columns_final).T
result.columns = df_train_labels.columns.tolist()

# Write the zipped submission file with three decimals per value.
result.to_csv(
    './Results/prediction.csv.zip',
    index=False,
    float_format='%.3f',
    compression='zip',
)

(16, 12664)


In [78]:
# Also store an uncompressed copy of the predictions (same formatting as the zipped file).
result.to_csv(
    './Results/prediction.csv',
    index=False,
    float_format='%.3f',
)