In [61]:
# data science
import pandas as pd
import numpy as np

# machine learning
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn import preprocessing
from sklearn import tree

# viz
import graphviz


# 0. Read csv and clean data

In [63]:
df = pd.read_csv("./EEG_data.csv")
df

Unnamed: 0,SubjectID,VideoID,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2,predefinedlabel,user-definedlabeln
0,0.0,0.0,56.0,43.0,278.0,301963.0,90612.0,33735.0,23991.0,27946.0,45097.0,33228.0,8293.0,0.0,0.0
1,0.0,0.0,40.0,35.0,-50.0,73787.0,28083.0,1439.0,2240.0,2746.0,3687.0,5293.0,2740.0,0.0,0.0
2,0.0,0.0,47.0,48.0,101.0,758353.0,383745.0,201999.0,62107.0,36293.0,130536.0,57243.0,25354.0,0.0,0.0
3,0.0,0.0,47.0,57.0,-5.0,2012240.0,129350.0,61236.0,17084.0,11488.0,62462.0,49960.0,33932.0,0.0,0.0
4,0.0,0.0,44.0,53.0,-8.0,1005145.0,354328.0,37102.0,88881.0,45307.0,99603.0,44790.0,29749.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12806,9.0,9.0,64.0,38.0,-39.0,127574.0,9951.0,709.0,21732.0,3872.0,39728.0,2598.0,960.0,1.0,0.0
12807,9.0,9.0,61.0,35.0,-275.0,323061.0,797464.0,153171.0,145805.0,39829.0,571280.0,36574.0,10010.0,1.0,0.0
12808,9.0,9.0,60.0,29.0,-426.0,680989.0,154296.0,40068.0,39122.0,10966.0,26975.0,20427.0,2024.0,1.0,0.0
12809,9.0,9.0,60.0,29.0,-84.0,366269.0,27346.0,11444.0,9932.0,1939.0,3283.0,12323.0,1764.0,1.0,0.0


In [74]:
print(df['predefinedlabel']. value_counts())
for i in df['predefinedlabel'].value_counts():
    print(i/sum(df['predefinedlabel']. value_counts()))

0.0    6662
1.0    6149
Name: predefinedlabel, dtype: int64
0.5200218562173132
0.4799781437826868


In [75]:
print(df['user-definedlabeln']. value_counts())
for i in df['user-definedlabeln'].value_counts():
    print(i/sum(df['user-definedlabeln']. value_counts()))

1.0    6567
0.0    6244
Name: user-definedlabeln, dtype: int64
0.5126063539146046
0.48739364608539537


### Cleaning

In [53]:
df = df.drop(["SubjectID", "VideoID", "user-definedlabeln"], axis=1)
df = df[df["predefinedlabel"].isin([0, 1])]
for col in df.columns:
    df[col] = (df[col] - df[col].mean()) / df[col].std()    
df


Unnamed: 0,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2,predefinedlabel
0,0.634309,-0.184616,0.355280,-0.476492,-0.317205,-0.105609,-0.157636,0.094520,0.087935,0.045542,-0.169916,-0.960689
1,-0.056747,-0.537724,-0.193288,-0.834345,-0.573330,-0.551497,-0.530633,-0.562078,-0.435804,-0.304405,-0.324015,-0.960689
2,0.245590,0.036076,0.059254,0.239276,0.883498,2.217491,0.495997,0.312004,1.168538,0.346382,0.303537,-0.960689
3,0.245590,0.433323,-0.118027,2.205776,-0.158530,0.274077,-0.276081,-0.334301,0.307561,0.255147,0.541582,-0.960689
4,0.116017,0.256769,-0.123044,0.626325,0.763003,-0.059123,0.955131,0.546868,0.777308,0.190381,0.425501,-0.960689
...,...,...,...,...,...,...,...,...,...,...,...,...
12806,0.979837,-0.405308,-0.174891,-0.749990,-0.647600,-0.561575,-0.196374,-0.532739,0.020030,-0.338166,-0.373412,1.040838
12807,0.850264,-0.537724,-0.569592,-0.443403,2.578133,1.543358,1.931293,0.404136,6.742915,0.087458,-0.122268,1.040838
12808,0.807073,-0.802554,-0.822133,0.117944,-0.056348,-0.018174,0.101838,-0.347902,-0.141266,-0.114819,-0.343885,1.040838
12809,0.807073,-0.802554,-0.250151,-0.375639,-0.576349,-0.413365,-0.398727,-0.583105,-0.440914,-0.216339,-0.351100,1.040838


### Preprocessing

In [54]:
# Train : Test = 8:2
n_objects = df.shape[0]
n_train = int(n_objects * 0.8)
df_train = df.iloc[:n_train]
df_test = df.iloc[n_train:]
df_train

Unnamed: 0,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2,predefinedlabel
0,0.634309,-0.184616,0.355280,-0.476492,-0.317205,-0.105609,-0.157636,0.094520,0.087935,0.045542,-0.169916,-0.960689
1,-0.056747,-0.537724,-0.193288,-0.834345,-0.573330,-0.551497,-0.530633,-0.562078,-0.435804,-0.304405,-0.324015,-0.960689
2,0.245590,0.036076,0.059254,0.239276,0.883498,2.217491,0.495997,0.312004,1.168538,0.346382,0.303537,-0.960689
3,0.245590,0.433323,-0.118027,2.205776,-0.158530,0.274077,-0.276081,-0.334301,0.307561,0.255147,0.541582,-0.960689
4,0.116017,0.256769,-0.123044,0.626325,0.763003,-0.059123,0.955131,0.546868,0.777308,0.190381,0.425501,-0.960689
...,...,...,...,...,...,...,...,...,...,...,...,...
10243,1.454939,0.036076,-0.042766,-0.942350,-0.669977,-0.490639,-0.534989,-0.321899,-0.242738,-0.212744,-0.058276,1.040838
10244,1.670894,0.256769,-0.168201,-0.938198,-0.623433,-0.516581,-0.458815,-0.295609,-0.299159,-0.289836,-0.242873,1.040838
10245,1.800467,-0.184616,-0.151476,-0.938197,-0.621205,-0.495070,-0.482463,-0.165227,-0.288181,-0.212230,-0.222781,1.040838
10246,1.670894,-0.272893,-0.094612,-0.938799,-0.626255,-0.533369,-0.490608,-0.376224,-0.149841,-0.326942,-0.214761,1.040838


In [55]:
def encode(df_to_encode):
    le = preprocessing.LabelEncoder()

    df_encoded = df_to_encode.copy()
    df_encoded["Attention"] = le.fit_transform(df_encoded["Attention"])
    df_encoded["Mediation"] = le.fit_transform(df_encoded["Mediation"])
    df_encoded["Delta"] = le.fit_transform(df_encoded["Delta"])
    df_encoded["Theta"] = le.fit_transform(df_encoded["Theta"])
    df_encoded["Alpha1"] = le.fit_transform(df_encoded["Alpha1"])
    df_encoded["Alpha2"] = le.fit_transform(df_encoded["Alpha2"])
    df_encoded["Beta1"] = le.fit_transform(df_encoded["Beta1"])
    df_encoded["Beta2"] = le.fit_transform(df_encoded["Beta2"])
    df_encoded["Gamma1"] = le.fit_transform(df_encoded["Gamma1"])
    df_encoded["Gamma2"] = le.fit_transform(df_encoded["Gamma2"])
    df_encoded["Raw"] = le.fit_transform(df_encoded["Raw"])
    df_encoded["predefinedlabel"] = le.fit_transform(df_encoded["predefinedlabel"])

    x_encoded = df_encoded[["Attention", "Mediation", "Raw", "Delta", "Theta", "Alpha1", "Alpha2", "Beta1", "Beta2", "Gamma1", "Gamma2"]]
    y_encoded = df_encoded["predefinedlabel"]
    
    return (x_encoded, y_encoded)

# 1. KNN

In [56]:
x_train, y_train = encode(df_train)
x_test, y_test = encode(df_test)

clf = KNeighborsClassifier()        # n_neighbors=5
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("Predicted:", y_pred)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Predicted: [1 1 0 ... 0 1 0]
Accuracy: 0.5009754194303551


# 2. Naive Bayes

In [57]:
clf = CategoricalNB()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("Predicted:", y_pred)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

y_pred_bayes = y_pred

Predicted: [0 1 0 ... 0 1 0]
Accuracy: 0.4795161919625439


# 3. Decision Tree

In [60]:
# ID3
# "entropy" for the information gain
clf = tree.DecisionTreeClassifier(max_depth=9)
clf.fit(x_train, y_train)
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=["Attention", "Mediation", "Raw", "Delta", "Theta", "Alpha1", "Alpha2", "Beta1", "Beta2", "Gamma1", "Gamma2"],
                                class_names="Confusion",
                                filled=True, rounded=True,
                                special_characters=True)  
# graph = graphviz.Source(dot_data)
# graph
y_pred = clf.predict(x_test)
print("Predicted:", y_pred)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Predicted: [0 0 0 ... 0 0 0]
Accuracy: 0.5505267264923918
