# IterSHAP on features extracted from DEAP dataset
The features used in this notebook are extracted from the 'data_preprocessed_matlab' files of the DEAP dataset. The input files needed to run this notebook can be created by running the MATLAB script 'feature_extraction.m'.

Data source: https://www.eecs.qmul.ac.uk/mmv/datasets/deap/

Matlab script origin: https://github.com/Daisybiubiubiu/EEG-Emotion-Recognition/blob/master/CWT/cwt_process.m

## Installation & Imports

In [None]:
# Uncomment the lines below to install the needed packages.
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install torch
# %pip install scikit-learn
# %pip install itershap
# %pip install tqdm

In [None]:
import pandas as pd
import numpy as np
import time

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from itershap import IterSHAP
from tqdm import tqdm

## Data preprocessing

### Load all the extracted data files and concatenate into one 2D DataFrame

In [None]:
# Layout of the extracted feature files: one file per (participant, video)
# pair, of which the first `frameNum` columns are read.
frameNum = 60
channels = 32
participants = 32
videos = 40

# Names of the 14 features stored for every channel (7 time-domain followed by
# 7 frequency-domain entries).
col_names_per_channel = ['time_mean', 'time_median', 'time_std', 'time_iqr', 'time_max', 'time_min', 'time_unknown', 
'freq_mean', 'freq_median', 'freq_std', 'freq_iqr', 'freq_max', 'freq_min', 'freq_unknown']

# One column per (channel, feature) pair, channels numbered from 1.
new_cols = ['channel_'+str(i+1)+'_'+col_name for i in range(channels) for col_name in col_names_per_channel]

# load data
# The same leading columns are read from every file, so build the list once.
cols = list(range(frameNum))

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop copies the accumulated frame on every iteration
# (quadratic in the number of files) and needs an empty seed DataFrame.
frames = []
for i in range(1, participants + 1):
    for j in range(1, videos + 1):
        filename = './new_data/participant%dvideo%d.txt'%(i,j)
        df = pd.read_csv(filename, header=None, usecols=cols, delimiter=',')
        # After transposing, each row is one frame and each column one
        # (channel, feature) value; assigning new_cols requires the file to
        # hold exactly len(new_cols) rows.
        df = df.transpose()
        df.columns = new_cols
        frames.append(df)

# Rows keep their per-file index (0..frameNum-1 repeated for every file), in
# participant-major, video-minor order.
dfs = pd.concat(frames)

print('dataLoaded:')
print(dfs.shape)

### Load the labels and binarize them (rating < 5 → 0, rating ≥ 5 → 1)

In [None]:
# load label
# The first four columns of the label file, in file order.
cols = ['valence', 'arousal', 'dominance', 'liking']
label_df = pd.read_csv(
    './label.txt',
    header=None,
    delimiter=',',
    usecols=list(range(4)),
)
print(label_df.shape)
label_df.columns = cols

# Binarize every rating around the threshold 5. Both masks are taken from the
# raw ratings up front; entries set to 0 can never satisfy ">= 5", so the
# result matches applying the two assignments one after the other.
is_low = label_df < 5
is_high = label_df >= 5
label_df[is_low] = 0
label_df[is_high] = 1


#### Repeat each arousal label 60 times (once per frame) to match the number of rows in the feature DataFrame

In [None]:
# arousal
# arousal
# Each feature file contributes `frameNum` consecutive rows to the feature
# DataFrame, so every label is repeated `frameNum` times. Using the constant
# instead of a literal 60 keeps labels and features aligned if it changes.
# NOTE(review): assumes the rows of label.txt follow the same
# participant/video order as the feature files — confirm.
label = label_df['arousal'].astype(int).values
label = np.repeat(label, frameNum)
print(label.shape)

In [None]:
# Feature matrix and target vector used by the experiments.
X, y = dfs, label
# Keep handles on the complete data set; later cells draw subsets from these.
X_orig, y_orig = X, y
for part in (X, y):
    print(part.shape)

## Experiment setup

In [None]:
# Fractions of the full data set that are made available for feature
# selection and training in each experiment.
PERC_OPTIONS = [1.00, 0.50, 0.25, 0.10, 0.05, 0.025, 0.01, 0.005, 0.0025, 0.001]
nr_runs_per_ex = 5

# One entry is appended per (run, fraction) combination, in loop order.
nr_ft_selected = []
accuracies = []
durations = []

for _run in tqdm(range(nr_runs_per_ex), desc="Nr. of iterations", position=0):
    for PERC in tqdm(PERC_OPTIONS, desc="Percentage options", position=1, leave=False):
        start_time = time.time()
        if PERC < 1.00:
            # Sample PERC of the data; everything not sampled is the test set.
            X_fit, X_test, y_fit, y_test = train_test_split(
                X_orig, y_orig, train_size=PERC, random_state=20)
            # The classifier is trained on 90% of the sample; the remaining
            # 10% is not used by the classifier.
            X_train, _, y_train, _ = train_test_split(
                X_fit, y_fit, test_size=0.1, random_state=1)
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X_orig, y_orig, test_size=0.1, random_state=1)
            # Fit the selector on the training split only. Previously this
            # branch fell through to whatever X/y were bound at the time: the
            # full data set (including X_test) on the first run, and the stale
            # 0.1% sample of the previous run's last option afterwards.
            X_fit, y_fit = X_train, y_train

        # Feature selection: fit on the selection data, then reduce both the
        # training and the test features to the selected columns.
        itershap_fs = IterSHAP()
        itershap_fs.fit(X_fit, y_fit)
        X_train = itershap_fs.transform(X_train)
        X_test = itershap_fs.transform(X_test)
        nr_ft_selected.append(X_train.shape[1])

        # Train and evaluate a classifier on the selected features.
        clf = RandomForestClassifier()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # accuracy_score expects (y_true, y_pred).
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        end_time = time.time()
        # Duration covers splitting, feature selection, training and scoring.
        durations.append(end_time-start_time)
    # Progress report: all accuracies collected so far, after every run.
    print(accuracies)