In [None]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

# Initial work

In [None]:
# Read the EEG recordings from the CSV file in the working directory.
eeg = pd.read_csv(filepath_or_buffer="EEG_data.csv")
# Trailing bare expression so the notebook renders the loaded frame.
eeg

In [None]:
# Sanity check: count missing (NaN) and infinite entries over the whole frame.
# np.isnan / np.isinf are applied to the DataFrame directly, so this
# presumably relies on every column being numeric -- TODO confirm.
nan_mask = np.isnan(eeg)
nans = nan_mask.to_numpy().sum()
print("Number of nan values: ", nans)

inf_mask = np.isinf(eeg)
infs = inf_mask.to_numpy().sum()
print("Number of inf values: ", infs)

# Data processing

## Distributions

In [None]:
# Normalised histograms of the two label columns, one figure per column.
# Two bins split at 0.5 separate the 0 and 1 values.
label_bins = [0, 0.5, 1]
label_plots = (
    ("predefinedlabel", "Predefined labels"),
    ("user-definedlabeln", "User defined labels"),
)

for label_col, label_title in label_plots:
    plt.hist(eeg[label_col], bins=label_bins, density=True, edgecolor='k')
    plt.title(label_title)
    plt.show()

### Subject level

In [None]:
# Normalised histograms of the two score columns, one figure each.
# The column name doubles as the plot title; "Mediation" is spelled exactly
# as it is used to index the frame.
for score_col in ("Attention", "Mediation"):
    fig, ax = plt.subplots()
    ax.hist(eeg[score_col], bins='auto', density=True)
    ax.set_title(score_col)
    ax.set_xlabel(f"{score_col} scores")
    plt.show()

In [None]:
# Pooled Delta histogram over all rows. Its automatically chosen bin edges are
# kept in `binz` so every per-subject plot below uses the same binning.
fig, ax = plt.subplots()
_, binz, _ = ax.hist(eeg["Delta"], bins="auto", density=True)
ax.set_title("Delta distribution")
plt.show()

# One normalised histogram per subject, for SubjectID values 0.0 through 9.0.
for subj_id in map(float, range(10)):
    is_subject = eeg["SubjectID"] == subj_id
    fig, ax = plt.subplots()
    ax.hist(eeg.loc[is_subject, "Delta"], bins=binz, density=True)
    ax.set_title(f"Delta distribution for Subject {subj_id}")
    plt.show()

In [None]:
# Pooled Theta histogram over all rows. Its automatically chosen bin edges are
# kept in `binz` so every per-subject plot below uses the same binning.
fig, ax = plt.subplots()
_, binz, _ = ax.hist(eeg["Theta"], bins="auto", density=True)
ax.set_title("Theta distribution")
plt.show()

# One normalised histogram per subject, for SubjectID values 0.0 through 9.0.
for subj_id in map(float, range(10)):
    is_subject = eeg["SubjectID"] == subj_id
    fig, ax = plt.subplots()
    ax.hist(eeg.loc[is_subject, "Theta"], bins=binz, density=True)
    ax.set_title(f"Theta distribution for Subject {subj_id}")
    plt.show()

In [None]:
# Pooled Alpha1 histogram over all rows. Its automatically chosen bin edges are
# kept in `binz` so every per-subject plot below uses the same binning.
fig, ax = plt.subplots()
_, binz, _ = ax.hist(eeg["Alpha1"], bins="auto", density=True)
ax.set_title("Alpha1 distribution")
plt.show()

# One normalised histogram per subject, for SubjectID values 0.0 through 9.0.
for subj_id in map(float, range(10)):
    is_subject = eeg["SubjectID"] == subj_id
    fig, ax = plt.subplots()
    ax.hist(eeg.loc[is_subject, "Alpha1"], bins=binz, density=True)
    ax.set_title(f"Alpha1 distribution for Subject {subj_id}")
    plt.show()

In [None]:
# Pooled Alpha2 histogram over all rows. Its automatically chosen bin edges are
# kept in `binz` so every per-subject plot below uses the same binning.
fig, ax = plt.subplots()
_, binz, _ = ax.hist(eeg["Alpha2"], bins="auto", density=True)
ax.set_title("Alpha2 distribution")
plt.show()

# One normalised histogram per subject, for SubjectID values 0.0 through 9.0.
for subj_id in map(float, range(10)):
    is_subject = eeg["SubjectID"] == subj_id
    fig, ax = plt.subplots()
    ax.hist(eeg.loc[is_subject, "Alpha2"], bins=binz, density=True)
    ax.set_title(f"Alpha2 distribution for Subject {subj_id}")
    plt.show()

In [None]:
# Pooled Raw histogram over all rows. Its automatically chosen bin edges are
# kept in `binz` so every per-subject plot below uses the same binning.
fig, ax = plt.subplots()
_, binz, _ = ax.hist(eeg["Raw"], bins="auto", density=True)
ax.set_title("Raw distribution")
plt.show()

# One normalised histogram per subject, for SubjectID values 0.0 through 9.0.
for subj_id in map(float, range(10)):
    is_subject = eeg["SubjectID"] == subj_id
    fig, ax = plt.subplots()
    ax.hist(eeg.loc[is_subject, "Raw"], bins=binz, density=True)
    ax.set_title(f"Raw distribution for Subject {subj_id}")
    plt.show()

In [None]:
# Compare the variance of Raw over all rows with the variance for subjects 2 and 6.
raw_signal = eeg["Raw"]
print("Variance of Raw Feature", np.var(raw_signal))

for subject in (2, 6):
    subject_rows = eeg["SubjectID"] == float(subject)
    print(f"Variance of Raw for subject {subject}", np.var(raw_signal[subject_rows]))

### Video level

In [None]:
# Pooled Raw histogram over all rows. Its automatically chosen bin edges are
# kept in `binz` so every per-video plot below uses the same binning.
fig, ax = plt.subplots()
_, binz, _ = ax.hist(eeg["Raw"], bins="auto", density=True)
ax.set_title("Raw distribution")
plt.show()

# One normalised histogram per video, for VideoID values 0.0 through 9.0.
for video_id in map(float, range(10)):
    is_video = eeg["VideoID"] == video_id
    fig, ax = plt.subplots()
    ax.hist(eeg.loc[is_video, "Raw"], bins=binz, density=True)
    ax.set_title(f"Raw distribution for Video {video_id}")
    plt.show()

The findings show that subject 2's Raw values are clustered around 0, while subject 6's are widely spread.\
There are many slight differences between the per-video distributions, which points to some relationship with the labels, but not one strong enough to draw immediate conclusions.

## Correlation

In [None]:
# Correlate every column except the first two
# (presumably the SubjectID / VideoID identifiers -- TODO confirm column order).
df = eeg.iloc[:, 2:]
corrmap = df.corr()

# Correlation coefficients lie in [-1, 1]. A sequential colormap with
# auto-scaled limits would draw strong negative correlations as if they were
# weak, so use a diverging map centred on 0 with fixed limits instead.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corrmap, cmap="RdBu_r", vmin=-1, vmax=1, center=0, square=True, ax=ax)
ax.set_title("Feature correlation matrix")
plt.show()

It makes sense that the more similar two frequency bands are, the more strongly they are correlated.\
The only attributes that have a strong relationship with the labels are Delta-Beta1.