In [2]:
import utils
import pandas as pd

In [3]:
# Load the 'train' data through the project helper. The result is indexed and
# iterated below as a sequence of DataFrames that carry a 'subject' column
# (presumably one recording per element; verify against utils.read_data).
df_list = utils.read_data('train')

## Split entrenamiento / validación

In [5]:
from sklearn.model_selection import train_test_split
import numpy as np

In [6]:
# Split the subject ids 1..30 into train (80%) and validation (20%) groups.
# The seed is fixed so the same subjects are held out on every run.
train_sub, val_sub = train_test_split(
    np.arange(1, 31),
    train_size=0.8,
    random_state=42,
)

## Gráficos

In [None]:
# Use the first DataFrame of the list as the example for the plots below
# and preview its first rows.
subj = df_list[0]
subj.head()

### Amplitud vs Tiempo

In [None]:

# 'channel1' of the example recording (section: amplitude vs. time).
utils.plot_channel(subj, 'channel1')

In [None]:
# 'channel3' of the example recording (section: amplitude vs. time).
utils.plot_channel(subj, 'channel3')

In [None]:
# 'channel5' of the example recording (section: amplitude vs. time).
utils.plot_channel(subj, 'channel5')

### Amplitud vs Frecuencia

In [None]:
# 'channel1' of the example recording (section: amplitude vs. frequency).
utils.plot_freq(subj, 'channel1')

In [None]:
# 'channel3' of the example recording (section: amplitude vs. frequency).
utils.plot_freq(subj, 'channel3')

In [None]:
# 'channel5' of the example recording (section: amplitude vs. frequency).
utils.plot_freq(subj, 'channel5')

## Correlaciones

In [None]:
# Collect the recordings that belong to validation subjects and stack them
# into a single frame for the correlation analysis.
val_list = []
for df in df_list:
    subjects = df['subject'].unique()
    # `subjects in val_sub` would be an array-in-array test, which only behaves
    # as intended when the frame holds exactly one subject id. Test membership
    # element-wise instead and require every id in the frame to be a
    # validation subject (empty frames are skipped).
    if subjects.size > 0 and np.isin(subjects, val_sub).all():
        val_list.append(df)

df_val = pd.concat(val_list, ignore_index=True)

In [None]:
# Columns that are metadata/labels rather than signal channels.
non_features = ['class', 'subject', 'capture', 'time']
corr_methods = ['pearson', 'kendall', 'spearman']


def short_corr(d, m):
    """Return the absolute correlation matrix of `d` using method `m`.

    The columns listed in `non_features` are dropped before correlating.
    """
    return d.drop(non_features, axis=1).corr(method=m).abs()


# Drop the classes that are not of interest (0 and 1).
reduced_data = df_val[~df_val['class'].isin([0, 1])]

In [None]:
# One absolute correlation matrix per correlation method.
reduced_corr = {method: short_corr(reduced_data, method) for method in corr_methods}

In [None]:
# Print the five most correlated channel pairs for each correlation method.
for key in reduced_corr:
    corr = reduced_corr[key]
    # Keep only the strictly lower triangle so each pair is listed once and
    # the diagonal (self-correlation) is excluded.
    lower_mask = np.tril(np.ones(corr.shape), -1).astype(bool)
    top = corr.where(lower_mask).stack().sort_values(ascending=False)
    print(f'Canales más correlacionados según: {key}')
    print(top[:5])

In [None]:
# mutual_info_regression is still imported in case other cells rely on the name.
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

# 'class' is a discrete label, so the classification estimator is the right
# one: mutual_info_regression treats the target as a continuous variable.
# The estimator is randomized, so the seed is fixed to make the scores
# reproducible across runs.
mi = mutual_info_classif(
    reduced_data.drop(non_features, axis=1),
    reduced_data['class'],
    random_state=42,
)

In [None]:
# Report the mutual-information score of each feature column, numbered from 1
# in the order of the columns passed to the estimator.
print('Mutual information')
for idx, score in enumerate(mi, start=1):
    print(f'Canal{idx} : {round(score, 4)}')

## Features

In [7]:
def data_range(x):
    """Return the peak-to-peak range of `x` (largest value minus smallest).

    `x` must expose `.max()` and `.min()` (e.g. a NumPy array or pandas Series).
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

def rms(x):
    """Return the root mean square of `x`: sqrt(sum(x**2) / len(x)).

    Args:
        x: NumPy array, pandas Series or plain sequence of numbers.

    Returns:
        The RMS value as a float.
    """
    # Promote to float64 before squaring: squaring narrow integer dtypes
    # (e.g. int8/int16 samples) wraps around silently and corrupts the result.
    if hasattr(x, 'astype'):
        values = x.astype(np.float64)
    else:
        values = np.asarray(x, dtype=np.float64)

    # `.sum()` is kept (rather than the builtin, which the original shadowed
    # with a local named `sum`) so pandas inputs keep their own summation rules.
    mean_square = (values * values).sum() / len(values)
    return np.sqrt(mean_square)

# Zero crossing rate
def zcr(x):
    """Return the zero-crossing rate of `x`.

    A crossing is a pair of consecutive samples with strictly opposite signs;
    pairs that contain an exact zero are not counted. The count is divided by
    the number of samples, len(x).

    Args:
        x: 1-D array-like of numbers.

    Returns:
        Number of sign changes divided by len(x).
    """
    x = np.array(x)
    n = len(x)
    # Compare signs instead of multiplying raw samples: the raw product can
    # overflow for integer dtypes or underflow to zero for tiny floats, which
    # silently hides real crossings.
    signs = np.sign(x)
    zc = ((signs[:-1] * signs[1:]) < 0).sum()
    return zc/n

def mcr(x):
    """Return the mean-crossing rate of `x`.

    The signal is centered on its mean and the result is the zero-crossing
    rate (`zcr`) of the centered signal.
    """
    samples = np.asarray(x)
    centered = samples - samples.mean()
    return zcr(centered)

# Waveform length
def wl(data):
    """Return the waveform length of `data`.

    This is the sum of the absolute differences between consecutive samples,
    taken along axis 0 (so a 2-D input yields one value per column).
    """
    steps = np.diff(data, axis=0)
    return np.abs(steps).sum(axis=0)

# Baseline feature set handed to the wrapper's compute_features: three local
# callables plus 'mad', given by name.
# NOTE(review): 'mad' is presumably resolved as a pandas aggregation name by the
# consumer; Series.mad was removed in pandas 2.0 - confirm the pinned version.
features = [wl, 'mad', mcr, rms]

## Grids

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [9]:
# Preliminary (coarse) hyper-parameter grids, one per model family.

# SVC with a linear kernel: sweep the regularization strength C.
param_pregrid_linear = [
    dict(C=[0.0001, 0.001, 0.1, 1, 10], kernel=['linear']),
]

# Random forest: vary only the number of trees.
param_pregrid_forest = [
    dict(n_estimators=[150, 250], max_depth=[None], criterion=['entropy']),
]

# k-nearest neighbours: neighbourhood size and search structure.
param_pregrid_knn = [
    dict(n_neighbors=[5, 10, 15], algorithm=['ball_tree', 'kd_tree']),
]

In [10]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV


In [11]:
# Keyword arguments shared by every GridSearchCV built for the pre-grids.
common_pregrid = dict(
    scoring='balanced_accuracy',
    refit=True,
    verbose=1,
)

In [20]:
# Factories keyed by model name. Each one takes the cross-validation splitter
# (the `cv` argument of GridSearchCV) and returns an unfitted grid search, so a
# fresh search can be built for every experiment.
pregrid_dict = {
    'linear': lambda x: GridSearchCV(
        estimator=svm.SVC(),
        param_grid=param_pregrid_linear,
        cv=x,
        **common_pregrid,
    ),
    # The forest is the only randomized estimator here; seed it (same seed as
    # the subject split) so repeated runs give the same model and scores.
    'forest': lambda x: GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_pregrid_forest,
        cv=x,
        **common_pregrid,
    ),
    'knn': lambda x: GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=param_pregrid_knn,
        cv=x,
        **common_pregrid,
    ),
}

## Simplificación de entrenamiento

In [21]:
def train_with_wrapper(df_list, train_sub, val_sub, features, grid_dict):
    """Run the whole preparation + training pipeline for one experiment.

    Args:
        df_list: DataFrames handed to `utils.TrainWrapper`.
        train_sub: subject ids used for training.
        val_sub: subject ids used for validation.
        features: feature functions/names passed to `compute_features`.
        grid_dict: mapping name -> factory; each factory is called with the
            wrapper's `cv` attribute and must return the search to train.

    Returns:
        Tuple `(trained, wrapper)`: the result of `utils.multitrain` and the
        prepared wrapper (which exposes the train/validation data).
    """
    wrapper = utils.TrainWrapper(df_list)
    wrapper.split(train_sub, val_sub)    # train/validation split
    wrapper.make_windows()               # build the windows
    wrapper.compute_features(features)
    wrapper.make_test_folds()            # creates x_tv, y_tv and cv

    # Bind every search to the cross-validation folds of this wrapper.
    searches = {name: grid_dict[name](wrapper.cv) for name in grid_dict}

    return utils.multitrain(searches, wrapper.x_tv, wrapper.y_tv), wrapper

In [22]:
# Positional arguments shared by the train_with_wrapper calls below; they are
# unpacked after the DataFrame list (train subjects, validation subjects,
# features, grid factories).
preparams = [train_sub, val_sub, features, pregrid_dict]

## Prueba 1 

In [15]:
# Column names of the eight signal channels: 'channel1' ... 'channel8'.
channels = [f'channel{i}' for i in range(1, 8 + 1)]

In [16]:
# Experiment 1 input: every recording passed through utils.substract_mean
# for all channel columns.
df_list1 = [utils.substract_mean(recording, channels) for recording in df_list]

In [23]:
# Experiment 1: run the shared pipeline on the output of utils.substract_mean
# with the baseline features and the preliminary grids.
test1_trained, test1_wrapper = train_with_wrapper(df_list1, *preparams)

Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits


In [24]:
# Show, for each model, the second element of its multitrain result
# (the selected hyper-parameters, per the recorded output).
for name in test1_trained:
    chosen = test1_trained[name][1]
    print(f'{name}: {chosen}')

linear: {'C': 0.1, 'kernel': 'linear'}
forest: {'criterion': 'entropy', 'max_depth': None, 'n_estimators': 150}
knn: {'algorithm': 'ball_tree', 'n_neighbors': 10}


In [25]:
import sklearn.metrics as metrics

# Accuracy of every trained model as a (train, validation) pair.
# NOTE(review): the recorded results show every model scoring higher on
# validation than on training, with the forest at 1.0 on both. If
# utils.multitrain fits GridSearchCV (refit=True) on wrapper.x_tv and x_tv
# contains the validation windows, the refit model has already seen x_val -
# confirm before trusting the validation numbers.
pre_accuracies = {}
for name in test1_trained:
    model = test1_trained[name][0]

    val_pred = model.predict(test1_wrapper.x_val)
    train_pred = model.predict(test1_wrapper.x_train)

    train_acc = metrics.accuracy_score(test1_wrapper.y_train, train_pred)
    val_acc = metrics.accuracy_score(test1_wrapper.y_val, val_pred)
    pre_accuracies[name] = (train_acc, val_acc)

In [26]:
# Display the (train accuracy, validation accuracy) pair of each model.
pre_accuracies

{'forest': (1.0, 1.0),
 'knn': (0.9667097608274079, 0.9805194805194806),
 'linear': (0.8416289592760181, 0.9264069264069265)}

## Prueba 1.5 (Elegir características extra)

In [107]:
# Candidate features, each tried on top of the baseline set: two are given by
# name and one is the local `data_range` callable.
extra_features = ['kurtosis', 'skew', data_range]

In [109]:
# Try each candidate feature on top of the baseline set and record the
# (train, validation) accuracy of every model.
pre_accsf = {}
for f in extra_features:
    # Key the results by a readable name. Using the callable itself as the key
    # prints as '<function data_range at 0x...>', which changes on every run.
    label = f if isinstance(f, str) else f.__name__

    nparams = [train_sub, val_sub, features + [f], pregrid_dict]
    testf_trained, testf = train_with_wrapper(df_list1, *nparams)

    pre_accsf[label] = {}
    for x in testf_trained:
        cl = testf_trained[x][0]

        pre_predictv = cl.predict(testf.x_val)
        pre_predict = cl.predict(testf.x_train)
        pre_accsf[label][x] = (metrics.accuracy_score(testf.y_train, pre_predict),
                               metrics.accuracy_score(testf.y_val, pre_predictv))

Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits


In [111]:
# Baseline (experiment 1) accuracies again, for comparison with the
# extra-feature results printed below.
pre_accuracies

{'forest': (1.0, 1.0),
 'knn': (0.9667097608274079, 0.9805194805194806),
 'linear': (0.8416289592760181, 0.9264069264069265)}

In [110]:
# One entry per extra feature: its label, then the per-model accuracies.
for key, scores in pre_accsf.items():
    print(key)
    print(scores)

kurtosis
{'linear': (0.8445378151260504, 0.9296536796536796), 'forest': (1.0, 1.0), 'knn': (0.967032967032967, 0.9826839826839827)}
skew
{'linear': (0.8506787330316742, 0.9491341991341992), 'forest': (1.0, 1.0), 'knn': (0.9547511312217195, 0.9675324675324676)}
<function data_range at 0x7f873827d830>
{'linear': (0.8587588881706528, 0.9502164502164502), 'forest': (1.0, 1.0), 'knn': (0.9776987718164188, 0.987012987012987)}


In [113]:
# Train with two extra features at once: baseline + data_range + 'kurtosis'.
nparams = [train_sub, val_sub, features + [data_range, 'kurtosis'], pregrid_dict]
testf_trained, testf = train_with_wrapper(df_list1, *nparams)

Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits


In [114]:
pre_accsf['skew + range'] = {}

In [115]:
for x in testf_trained:
    cl = testf_trained[x][0]

    pre_predictv = cl.predict(testf.x_val)
    pre_predict = cl.predict(testf.x_train)
    pre_accsf['skew + range'][x] = (metrics.accuracy_score(testf.y_train, pre_predict),
                        metrics.accuracy_score(testf.y_val, pre_predictv))

In [116]:
# All extra-feature results so far, including the combined run.
for key, scores in pre_accsf.items():
    print(key)
    print(scores)

kurtosis
{'linear': (0.8445378151260504, 0.9296536796536796), 'forest': (1.0, 1.0), 'knn': (0.967032967032967, 0.9826839826839827)}
skew
{'linear': (0.8506787330316742, 0.9491341991341992), 'forest': (1.0, 1.0), 'knn': (0.9547511312217195, 0.9675324675324676)}
<function data_range at 0x7f873827d830>
{'linear': (0.8587588881706528, 0.9502164502164502), 'forest': (1.0, 1.0), 'knn': (0.9776987718164188, 0.987012987012987)}
skew + range
{'linear': (0.8616677440206852, 0.9534632034632035), 'forest': (1.0, 1.0), 'knn': (0.9776987718164188, 0.9891774891774892)}


## Características definitivas

In [27]:
# Final feature set: baseline features plus data_range and 'kurtosis'.
# A new list is built instead of using `features += ...`, and only missing
# entries are added. The in-place version appended the extras again on every
# re-run of this cell and also modified the list object already stored in the
# earlier `preparams`.
final_extras = [data_range, 'kurtosis']
features = features + [extra for extra in final_extras if extra not in features]
preparams = [train_sub, val_sub, features, pregrid_dict]

## Prueba 2 (Sin canal 2 o 3 + P1)

In [118]:
# Experiment 2: drop one candidate channel at a time and retrain.
dropped_candidates = ['channel2', 'channel3']
pre_accs2 = {candidate: {} for candidate in dropped_candidates}

for ch in dropped_candidates:
    df_list2 = [recording.drop([ch], axis=1) for recording in df_list1]
    test2_trained, test2_wrapper = train_with_wrapper(df_list2, *preparams)

    for name in test2_trained:
        model = test2_trained[name][0]

        val_pred = model.predict(test2_wrapper.x_val)
        train_pred = model.predict(test2_wrapper.x_train)

        # (train accuracy, validation accuracy) without channel `ch`.
        train_acc = metrics.accuracy_score(test2_wrapper.y_train, train_pred)
        val_acc = metrics.accuracy_score(test2_wrapper.y_val, val_pred)
        pre_accs2[ch][name] = (train_acc, val_acc)

Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits


In [119]:
# Per dropped channel: its name, then the per-model accuracies.
for key, scores in pre_accs2.items():
    print(key)
    print(scores)

channel2
{'linear': (0.8422753716871364, 0.9404761904761905), 'forest': (1.0, 1.0), 'knn': (0.97349709114415, 0.9848484848484849)}
channel3
{'linear': (0.8584356819650937, 0.9502164502164502), 'forest': (1.0, 1.0), 'knn': (0.9747899159663865, 0.9880952380952381)}


## Prueba 3 (Filtros + P2)

In [120]:
# Channel columns that remain once 'channel3' is removed.
channels_p3 = [name for name in channels if name != 'channel3']

In [122]:
# Mean-subtracted recordings without the 'channel3' column.
df_list3 = [recording.drop(columns=['channel3']) for recording in df_list1]

In [128]:
# Filter settings to compare: two low-pass and two high-pass cutoffs.
# Each entry is later expanded into keyword arguments of utils.filtrar_df.
filters = [
    {'cutoffs': cutoff, 'btype': kind}
    for cutoff, kind in [(350, 'low'), (400, 'low'), (1, 'high'), (12, 'high')]
]

In [129]:
pre_accs3 = {}
for f in filters:
  f['channels'] = channels_p3
  df_listf = [utils.filtrar_df(df_list3[i], **f) for i in range(len(df_list3))]
  test3_trained, test3_wrapper = train_with_wrapper(df_listf, *preparams)

  key = f"{f['cutoffs']}-{f['btype']}"
  pre_accs3[key] = {}

  for x in test3_trained:
      cl = test3_trained[x][0]
      pre_predictv = cl.predict(test3_wrapper.x_val)
      pre_predict = cl.predict(test3_wrapper.x_train)

      pre_accs3[key][x] = (metrics.accuracy_score(test3_wrapper.y_train, pre_predict),
                           metrics.accuracy_score(test3_wrapper.y_val, pre_predictv))

Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits


In [141]:
# Per filter configuration ("<cutoff>-<type>"): the per-model accuracies.
for key, scores in pre_accs3.items():
    print(key)
    print(scores)

350-low
{'linear': (0.8487394957983193, 0.946969696969697), 'forest': (1.0, 1.0), 'knn': (0.9615384615384616, 0.9805194805194806)}
400-low
{'linear': (0.8484162895927602, 0.9458874458874459), 'forest': (1.0, 1.0), 'knn': (0.9621848739495799, 0.9794372294372294)}
1-high
{'linear': (0.8574660633484162, 0.9512987012987013), 'forest': (1.0, 1.0), 'knn': (0.9738202973497091, 0.9913419913419913)}
12-high
{'linear': (0.8561732385261797, 0.9361471861471862), 'forest': (1.0, 1.0), 'knn': (0.964770523594053, 0.9816017316017316)}


## Prueba 4 (Ventanas)

In [40]:
# Windowing configurations to compare, as (window_size, step) pairs.
win_step = [
    (800, 250),
    (800, 150),
    (800, 350),
    (750, 250),
    (950, 250),
]

In [41]:
# Experiment 4 setup: recordings without 'channel3', split once into
# train/validation subjects; the windowing is redone for each configuration.
df_listv = [recording.drop(columns=['channel3']) for recording in df_list1]

wrapper = utils.TrainWrapper(df_listv)
wrapper.split(train_sub, val_sub)

# Accumulators: accuracies per windowing, and the searches currently in use.
ws_results, train_dict = {}, {}

# Train and evaluate once per (window_size, step) configuration.
for ws in win_step:
    wrapper.make_windows(*ws)
    wrapper.compute_features(features)
    wrapper.make_test_folds()

    # Rebuild every grid search against the folds of the current windowing.
    for name in pregrid_dict:
        train_dict[name] = pregrid_dict[name](wrapper.cv)

    trained = utils.multitrain(train_dict, wrapper.x_tv, wrapper.y_tv)
    key = str(ws)
    ws_results[key] = {}

    for name in trained:
        model = trained[name][0]

        val_pred = model.predict(wrapper.x_val)
        train_pred = model.predict(wrapper.x_train)

        # (train accuracy, validation accuracy) for this windowing.
        train_acc = metrics.accuracy_score(wrapper.y_train, train_pred)
        val_acc = metrics.accuracy_score(wrapper.y_val, val_pred)
        ws_results[key][name] = (train_acc, val_acc)


Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits


In [44]:
# Per windowing configuration: the per-model accuracies.
for key, scores in ws_results.items():
    print(key)
    print(scores)

(800, 250)
{'linear': (0.8584356819650937, 0.9502164502164502), 'forest': (1.0, 1.0), 'knn': (0.9747899159663865, 0.9880952380952381)}
(800, 150)
{'linear': (0.868155762008302, 0.9518469656992085), 'forest': (1.0, 1.0), 'knn': (0.9901166238387032, 0.996042216358839)}
(800, 350)
{'linear': (0.8477970627503337, 0.9372197309417041), 'forest': (1.0, 1.0), 'knn': (0.9439252336448598, 0.9701046337817638)}
(750, 250)
{'linear': (0.8589580686149937, 0.951063829787234), 'forest': (1.0, 1.0), 'knn': (0.9723634053367217, 0.9893617021276596)}
(950, 250)
{'linear': (0.8605686879068174, 0.9452679589509693), 'forest': (1.0, 1.0), 'knn': (0.9938335046248715, 0.9965792474344356)}


In [45]:
# Extra configuration: window size 950 with step 150.
wrapper.make_windows(950, 150)
wrapper.compute_features(features)
wrapper.make_test_folds()

# Rebuild every grid search against the folds of this windowing.
for name in pregrid_dict:
    train_dict[name] = pregrid_dict[name](wrapper.cv)

trained = utils.multitrain(train_dict, wrapper.x_tv, wrapper.y_tv)
key = '(950, 150)'
ws_results[key] = {}

for name in trained:
    model = trained[name][0]

    val_pred = model.predict(wrapper.x_val)
    train_pred = model.predict(wrapper.x_train)

    # (train accuracy, validation accuracy) for this windowing.
    train_acc = metrics.accuracy_score(wrapper.y_train, train_pred)
    val_acc = metrics.accuracy_score(wrapper.y_val, val_pred)
    ws_results[key][name] = (train_acc, val_acc)

Fitting 1 folds for each of 5 candidates, totalling 5 fits
Fitting 1 folds for each of 2 candidates, totalling 2 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits


In [76]:
# All windowing results, including the extra (950, 150) run.
for key, scores in ws_results.items():
    print(key)
    print(scores)

(800, 250)
{'linear': (0.8584356819650937, 0.9502164502164502), 'forest': (1.0, 1.0), 'knn': (0.9747899159663865, 0.9880952380952381)}
(800, 150)
{'linear': (0.868155762008302, 0.9518469656992085), 'forest': (1.0, 1.0), 'knn': (0.9901166238387032, 0.996042216358839)}
(800, 350)
{'linear': (0.8477970627503337, 0.9372197309417041), 'forest': (1.0, 1.0), 'knn': (0.9439252336448598, 0.9701046337817638)}
(750, 250)
{'linear': (0.8589580686149937, 0.951063829787234), 'forest': (1.0, 1.0), 'knn': (0.9723634053367217, 0.9893617021276596)}
(950, 250)
{'linear': (0.8605686879068174, 0.9452679589509693), 'forest': (1.0, 1.0), 'knn': (0.9938335046248715, 0.9965792474344356)}
(950, 150)
{'linear': (0.8681618109411025, 0.9508310249307479), 'forest': (1.0, 1.0), 'knn': (0.9907776147558164, 0.9965373961218836)}
