In [2]:
import torch.nn as nn
import torch
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import pickle
import os

import bct
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

data_path = 'AMIGOS/preprocessed/'

big5_params = ['Extroversion', 'Agreeableness', 'Conscientiousness', 
               'Emotional Stability', 'Creativity (openness)']

In [23]:
for name in [x for x in os.listdir(data_path) if '_dwt_' in x and 'func_data_emb.pkl' in x]:
    with open(data_path + name, 'rb') as f:
        df = pickle.load(f)
        
# df.Video_valence = (df.Video_category.str[0] == 'H').astype(int)
# df.Video_arousal = (df.Video_category.str[2] == 'H').astype(int)

df.drop(['EEG', 'dwt', 'Video_category'], axis=1, inplace=True)

In [48]:
df = df[((df.Video_arousal <= 4) | (df.Video_arousal >= 6.3)) &
  ((df.Video_valence <= 4) | (df.Video_valence >= 6.3))].reset_index(drop=True)

In [49]:
df.Video_valence = (df.Video_valence > df.Video_valence.median()).astype(int)
df.Video_arousal = (df.Video_arousal > df.Video_arousal.median()).astype(int)

In [51]:
results_table_2 = []

In [52]:
for target in ['Video_valence', 'Video_arousal']:#big5_params:
    print(target)
    
    if target in big5_params:
        categ_col_names = ['Video_arousal', 'Video_valence', 'Video_neutral', 
                           'Video_disgust', 'Video_happiness', 'Video_surprise',
                            'Video_anger', 'Video_fear', 'Video_sadness']
        X_video_info = df[['Video_arousal', 'Video_valence', 
                           'Video_dominance', 'Video_liking', 'Video_familiarity', 
                           'Video_neutral', 'Video_disgust', 'Video_happiness', 
                           'Video_surprise', 'Video_anger', 'Video_fear', 'Video_sadness']]
        X_video_info[categ_col_names] = X_video_info[categ_col_names].astype(int)
        y = (df[target] > df[target].median()).astype(int)
    else:
        categ_col_names = ['Video_neutral', 'Video_disgust', 'Video_happiness', 'Video_surprise',
                            'Video_anger', 'Video_fear', 'Video_sadness']
        X_video_info = df[['Video_dominance', 'Video_familiarity', 
                           'Video_neutral', 'Video_disgust', 'Video_happiness', 
                           'Video_surprise', 'Video_anger', 'Video_fear', 'Video_sadness']]
        X_video_info[categ_col_names] = X_video_info[categ_col_names].astype(int)
        y = df[target]
        
    X_spectral_features = df[[x for x in df.columns if 'dwt_emb_' in x]]
    
    features_lines = {}
    func_features_dict = {}
        
    for fp in ['im_coh_2', 'im_coh_3', 'pearson_corr_2', 'pearson_corr_3']:
        print(fp)
        if fp not in features_lines.keys():
            features_lines[fp] = []
            for matrix in df[fp]:
                matrix_min = np.apply_along_axis(min, 0, matrix)
                matrix_sum = np.apply_along_axis(sum, 0, matrix)
                matrix_mean = np.apply_along_axis(np.mean, 0, matrix)
                func_features = np.concatenate([matrix_sum[np.triu_indices(14, k = 1)], 
                                            matrix_mean[np.triu_indices(14, k = 1)],
                                        bct.centrality.betweenness_wei(bct.weight_conversion(matrix_min, wcm='lengths')),
                                        bct.centrality.betweenness_wei(bct.weight_conversion(matrix_mean, wcm='lengths'))])
                features_lines[fp].append(func_features)
        
    for func_params in [['im_coh_2'], ['im_coh_3'], ['pearson_corr_2'], ['pearson_corr_3'],
                       ['im_coh_2', 'pearson_corr_2'], ['pearson_corr_3', 'im_coh_3']]:
        X_func_features = pd.DataFrame(np.hstack([np.vstack(features_lines[fp]) for fp in func_params]))
        X_func_features.columns = ['func_{}'.format(i) for i in range(X_func_features.shape[1])]
        
        ## case 1
#         X = pd.concat([X_video_info, X_spectral_features, X_func_features], axis=1)
#         categ_col_ids = [X.columns.tolist().index(ccn) for ccn in categ_col_names]
        
#         metrics_prec = []
#         metrics_recall = []
#         metrics_f1 = []
#         metrics_acc = []
#         for iter_ in range(5):
#             X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
            
#             clf = CatBoostClassifier()
#             clf.fit(X_train, y_train, use_best_model=True, cat_features=categ_col_ids,
#                     eval_set=(X_test, y_test), verbose=0)
            
#             metrics_prec.append(precision_score(y_test, clf.predict(X_test)))
#             metrics_recall.append(recall_score(y_test, clf.predict(X_test)))
#             metrics_f1.append(f1_score(y_test, clf.predict(X_test)))
#             metrics_acc.append(accuracy_score(y_test, clf.predict(X_test)))
            
#         results_table_2.append((target, 'info_spectr_func', ' '.join(func_params), 
#                               np.mean(metrics_prec), np.mean(metrics_recall), np.mean(metrics_f1), np.mean(metrics_acc)))
#         print('case 1 finished')
        
        ## case 2
        X = pd.concat([X_spectral_features, X_func_features], axis=1)
        
        metrics_prec = []
        metrics_recall = []
        metrics_f1 = []
        metrics_acc = []
        for iter_ in range(5):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
            
            clf = CatBoostClassifier()
            clf.fit(X_train, y_train, use_best_model=True,
                    eval_set=(X_test, y_test), verbose=0)
            
            metrics_prec.append(precision_score(y_test, clf.predict(X_test)))
            metrics_recall.append(recall_score(y_test, clf.predict(X_test)))
            metrics_f1.append(f1_score(y_test, clf.predict(X_test)))
            metrics_acc.append(accuracy_score(y_test, clf.predict(X_test)))
            
        results_table_2.append((target, 'spectr_func', ' '.join(func_params), 
                              np.mean(metrics_prec), np.mean(metrics_recall), np.mean(metrics_f1), np.mean(metrics_acc)))
        print('case 2 finished')
        
        ## case 3
#         X = pd.concat([X_video_info, X_func_features], axis=1)
#         categ_col_ids = [X.columns.tolist().index(ccn) for ccn in categ_col_names]
        
#         metrics_prec = []
#         metrics_recall = []
#         metrics_f1 = []
#         metrics_acc = []
#         for iter_ in range(5):
#             X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
            
#             clf = CatBoostClassifier()
#             clf.fit(X_train, y_train, use_best_model=True, cat_features=categ_col_ids,
#                     eval_set=(X_test, y_test), verbose=0)
            
#             metrics_prec.append(precision_score(y_test, clf.predict(X_test)))
#             metrics_recall.append(recall_score(y_test, clf.predict(X_test)))
#             metrics_f1.append(f1_score(y_test, clf.predict(X_test)))
#             metrics_acc.append(accuracy_score(y_test, clf.predict(X_test)))
            
#         results_table_2.append((target, 'info_func', ' '.join(func_params), 
#                               np.mean(metrics_prec), np.mean(metrics_recall), np.mean(metrics_f1), np.mean(metrics_acc)))
#         print('case 3 finished')
        
        ## case 4
#         X = pd.concat([X_video_info, X_spectral_features], axis=1)
#         categ_col_ids = [X.columns.tolist().index(ccn) for ccn in categ_col_names]
        
#         metrics_prec = []
#         metrics_recall = []
#         metrics_f1 = []
#         metrics_acc = []
#         for iter_ in range(5):
#             X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
            
#             clf = CatBoostClassifier()
#             clf.fit(X_train, y_train, use_best_model=True, cat_features=categ_col_ids,
#                     eval_set=(X_test, y_test), verbose=0)
            
#             metrics_prec.append(precision_score(y_test, clf.predict(X_test)))
#             metrics_recall.append(recall_score(y_test, clf.predict(X_test)))
#             metrics_f1.append(f1_score(y_test, clf.predict(X_test)))
#             metrics_acc.append(accuracy_score(y_test, clf.predict(X_test)))
            
#         results_table_2.append((target, 'info_spectr', '', 
#                               np.mean(metrics_prec), np.mean(metrics_recall), np.mean(metrics_f1), np.mean(metrics_acc)))
#         print('case 4 finished')
        
        ## case 5
        X = pd.concat([X_spectral_features], axis=1)
        
        metrics_prec = []
        metrics_recall = []
        metrics_f1 = []
        metrics_acc = []
        for iter_ in range(5):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
            
            clf = CatBoostClassifier()
            clf.fit(X_train, y_train, use_best_model=True,
                    eval_set=(X_test, y_test), verbose=0)
            
            metrics_prec.append(precision_score(y_test, clf.predict(X_test)))
            metrics_recall.append(recall_score(y_test, clf.predict(X_test)))
            metrics_f1.append(f1_score(y_test, clf.predict(X_test)))
            metrics_acc.append(accuracy_score(y_test, clf.predict(X_test)))
            
        results_table_2.append((target, 'spectr', '', 
                              np.mean(metrics_prec), np.mean(metrics_recall), np.mean(metrics_f1), np.mean(metrics_acc)))
        print('case 5 finished')
        
        ## case 6
        X = pd.concat([X_func_features], axis=1)
        
        metrics_prec = []
        metrics_recall = []
        metrics_f1 = []
        metrics_acc = []
        for iter_ in range(5):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
            
            clf = CatBoostClassifier()
            clf.fit(X_train, y_train, use_best_model=True, 
                    eval_set=(X_test, y_test), verbose=0)
            
            metrics_prec.append(precision_score(y_test, clf.predict(X_test)))
            metrics_recall.append(recall_score(y_test, clf.predict(X_test)))
            metrics_f1.append(f1_score(y_test, clf.predict(X_test)))
            metrics_acc.append(accuracy_score(y_test, clf.predict(X_test)))
            
        results_table_2.append((target, 'func', ' '.join(func_params), 
                              np.mean(metrics_prec), np.mean(metrics_recall), np.mean(metrics_f1), np.mean(metrics_acc)))
        print('case 6 finished')
        
    print('-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=')
    print(target, 'finished')
    print('-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=')
    print(results_table_2[-1])
    
    with open('results_val_ar_new_target.pkl', 'wb') as f:
        pickle.dump(results_table_2, f)

Video_valence
im_coh_2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_video_info[categ_col_names] = X_video_info[categ_col_names].astype(int)


im_coh_3
pearson_corr_2
pearson_corr_3
case 2 finished
case 5 finished
case 6 finished
case 2 finished
case 5 finished
case 6 finished
case 2 finished
case 5 finished
case 6 finished
case 2 finished
case 5 finished
case 6 finished
case 2 finished
case 5 finished
case 6 finished
case 2 finished
case 5 finished
case 6 finished
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Video_valence finished
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
('Video_valence', 'func', 'pearson_corr_3 im_coh_3', 0.6282385657266245, 0.6372691227529937, 0.6272028564271599, 0.6175438596491228)
Video_arousal
im_coh_2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_video_info[categ_col_names] = X_video_info[categ_col_names].astype(int)


im_coh_3
pearson_corr_2
pearson_corr_3
case 2 finished
case 5 finished
case 6 finished
case 2 finished
case 5 finished
case 6 finished
case 2 finished
case 5 finished
case 6 finished
case 2 finished
case 5 finished
case 6 finished
case 2 finished
case 5 finished
case 6 finished
case 2 finished
case 5 finished
case 6 finished
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Video_arousal finished
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
('Video_arousal', 'func', 'pearson_corr_3 im_coh_3', 0.6978260869565217, 0.5662432487794, 0.6219320874057715, 0.6421052631578947)


In [53]:
res = pd.DataFrame(results_table_2, columns=['target', 'features', 'func_type', 
                                           'precision', 'recall', 'f1', 'accuracy'])

In [55]:
res.func_type = res.func_type.str.replace('im_coh', 'ImCoh').str.replace('pearson_corr', 'Pearson').str.replace('_2 ', '_').str.replace('_3 ', '_').replace('ImCoh_Pearson_2', 'Pearson_ImCoh_2')

## Different validation for big5 on best models

In [5]:
results_table = []

In [6]:
categ_col_names = ['Video_arousal', 'Video_valence', 'Video_neutral', 
                   'Video_disgust', 'Video_happiness', 'Video_surprise',
                    'Video_anger', 'Video_fear', 'Video_sadness']
X_video_info = df[['Video_arousal', 'Video_valence', 
                   'Video_dominance', 'Video_liking', 'Video_familiarity', 
                   'Video_neutral', 'Video_disgust', 'Video_happiness', 
                   'Video_surprise', 'Video_anger', 'Video_fear', 'Video_sadness']]
X_video_info[categ_col_names] = X_video_info[categ_col_names].astype(int)
    
X_spectral_features = df[[x for x in df.columns if 'dwt_emb_' in x]]

features_lines = {}
func_features_dict = {}
    
for fp in ['im_coh_2', 'im_coh_3', 'pearson_corr_2', 'pearson_corr_3']:
    print(fp)
    if fp not in features_lines.keys():
        features_lines[fp] = []
        for matrix in df[fp]:
            matrix_min = np.apply_along_axis(min, 0, matrix)
            matrix_sum = np.apply_along_axis(sum, 0, matrix)
            matrix_mean = np.apply_along_axis(np.mean, 0, matrix)
            func_features = np.concatenate([matrix_sum[np.triu_indices(14, k = 1)], 
                                        matrix_mean[np.triu_indices(14, k = 1)],
                                    bct.centrality.betweenness_wei(bct.weight_conversion(matrix_min, wcm='lengths')),
                                    bct.centrality.betweenness_wei(bct.weight_conversion(matrix_mean, wcm='lengths'))])
            features_lines[fp].append(func_features)

im_coh_2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_video_info[categ_col_names] = X_video_info[categ_col_names].astype(int)


im_coh_3
pearson_corr_2
pearson_corr_3


In [7]:
for model_params in [('info_spectr_func', ['im_coh_2']),
                    ('info_spectr_func', ['im_coh_3']),
                    ('info_spectr_func', ['im_coh_2','pearson_corr_2']),
                    ('info_spectr_func', ['pearson_corr_3','im_coh_3']),
                    ('spectr_func', ['im_coh_2']),
                    ('spectr_func', ['im_coh_3']),
                    ('spectr_func', ['pearson_corr_3']),
                    ('func', ['im_coh_2']),
                    ('func', ['im_coh_3']),
                    ('func', ['pearson_corr_2']),
                    ('func', ['im_coh_2','pearson_corr_2']),
                    ('info_func', ['im_coh_2']),
                    ('info_func', ['im_coh_3']),
                    ('info_func', ['im_coh_2','pearson_corr_2']),
                    ('info_func', ['pearson_corr_3','im_coh_3'])]:
#     if model_params[0] == 'info_func' and len(model_params[1]) == 1:
#         if model_params[1][0] in ['pearson_corr_2', 'pearson_corr_3']:
#             continue
            
#     if model_params[0] == 'info_spectr_func' and len(model_params[1]) == 1:
#         if model_params[1][0] in ['pearson_corr_2', 'pearson_corr_3']:
#             continue
            
#     if model_params[0] == 'spectr_func' and len(model_params[1]) == 1:
#         if model_params[1][0] in ['pearson_corr_2']:
#             continue
            
#     if model_params[0] == 'func' and len(model_params[1]) == 1:
#         if model_params[1][0] in ['pearson_corr_3']:
#             continue
            
#     if model_params[0] == 'func' and len(model_params[1]) == 2:
#         if model_params[1][0] == 'pearson_corr_3' and model_params[1][1] == 'im_coh_3':
#             continue
            
    print(model_params[0])
    
    if 'info' in model_params[0]:
        X = X_video_info
        categ_col_ids = [X.columns.tolist().index(ccn) for ccn in categ_col_names]
    else:
        X = None
        
    if 'spectr' in model_params[0]:
        X = pd.concat([X, X_spectral_features], axis=1)
        
    if 'func' in model_params[0]:
        X_func_features = pd.DataFrame(np.hstack([np.vstack(features_lines[fp]) for fp in model_params[1]]))
        X_func_features.columns = ['func_{}'.format(i) for i in range(X_func_features.shape[1])]
        X = pd.concat([X, X_func_features], axis=1)
    
    for target in big5_params:
        print(target)

        y_pred_scores = []
        y_true = []
        
        for test_user in df.UserID.unique():
            print(test_user)
            X_train = X[X.index.isin(df[df.UserID != test_user].index)]
            X_test = X[X.index.isin(df[df.UserID == test_user].index)]
            y_train = (df[df.UserID != test_user][target] > df[df.UserID != test_user][target].median()).astype(int)
            y_test = (df[df.UserID == test_user][target] > df[df.UserID != test_user][target].median()).astype(int)
            
            clf = CatBoostClassifier()
            if 'info' in model_params[0]:
                clf.fit(X_train, y_train, use_best_model=True, cat_features=categ_col_ids,
                            eval_set=(X_test, y_test), verbose=0)
            else:
                clf.fit(X_train, y_train, use_best_model=True,
                            eval_set=(X_test, y_test), verbose=0)
                
            y_pred_scores.append(clf.predict(X_test).sum() / y_test.shape[0])
            y_true.append(y_test.unique()[0])
            
        metrics_prec = precision_score(y_true, [int(x > 0.5) for x in y_pred_scores])
        metrics_recall = recall_score(y_true, [int(x > 0.5) for x in y_pred_scores])
        metrics_f1 = f1_score(y_true, [int(x > 0.5) for x in y_pred_scores])
        metrics_acc = accuracy_score(y_true, [int(x > 0.5) for x in y_pred_scores])
            
        results_table.append((target, model_params[0], ' '.join(model_params[1]), 
                              metrics_prec, metrics_recall, 
                              metrics_f1, metrics_acc))
        print('==')
        with open('results_big5_w_acc_next_step.pkl', 'wb') as f:
            pickle.dump(results_table, f)

info_spectr_func
Extroversion
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Agreeableness
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Conscientiousness
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Emotional Stability
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Creativity (openness)
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
info_spectr_func
Extroversion
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Agreeableness
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Conscientiousness
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
2

  _warn_prf(average, modifier, msg_start, len(result))


==
spectr_func
Extroversion
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Agreeableness
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Conscientiousness
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Emotional Stability
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Creativity (openness)
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40


  _warn_prf(average, modifier, msg_start, len(result))


==
spectr_func
Extroversion
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Agreeableness
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Conscientiousness
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Emotional Stability
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Creativity (openness)
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
func
Extroversion
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Agreeableness
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
==
Conscientiousness
1
2
3
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
