In [233]:
from sklearn.model_selection import train_test_split
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns; sns.set(style="ticks", color_codes=True)
import matplotlib.pyplot as plt


from sklearn.linear_model import LogisticRegression # to apply the Logistic regression
from sklearn.model_selection import train_test_split # to split the data into two parts
from sklearn.cross_validation import KFold # use for cross validation
from sklearn.model_selection import GridSearchCV# for tuning parameter
from sklearn.ensemble import RandomForestClassifier # for random forest classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm # for Support Vector Machine
from sklearn import metrics # for the check the error and accuracy of the model


In [207]:
# About this Dataset

# Description

# We collected EEG signal data from 10 college students while they watched MOOC video clips. 
# We extracted online education videos that are assumed not to be confusing for college students, such as 
# videos of the introduction of basic algebra or geometry. We also prepare videos that are expected to confuse
# a typical college student if a student is not familiar with the video topics like Quantum Mechanics, and Stem 
# Cell Research. We prepared 20 videos, 10 in each category. Each video was about 2 minutes long. We chopped the
# two-minute clip in the middle of a topic to make the videos more confusing. The students wore a single-channel 
# wireless MindSet that measured activity over the frontal lobe. The MindSet measures the voltage between an 
# electrode resting on the forehead and two electrodes (one ground and one reference) each in contact with an ear.
# After each session, the student rated his/her confusion level on a scale of 1-7, where one corresponded to the 
# least confusing and seven corresponded to the most confusing. These labels if further normalized into labels of 
# whether the students are confused or not. This label is offered as self-labelled confusion in addition to our 
# predefined label of confusion.  

In [208]:
import tensorflow as tf

In [209]:
eeg_dataset = pd.read_csv('/Users/arslanaliawan/Desktop/scripts/EEG/EEG_data.csv')
eeg_dataset.head()

Unnamed: 0,SubjectID,VideoID,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2,predefinedlabel,user-definedlabeln
0,0.0,0.0,56.0,43.0,278.0,301963.0,90612.0,33735.0,23991.0,27946.0,45097.0,33228.0,8293.0,0.0,0.0
1,0.0,0.0,40.0,35.0,-50.0,73787.0,28083.0,1439.0,2240.0,2746.0,3687.0,5293.0,2740.0,0.0,0.0
2,0.0,0.0,47.0,48.0,101.0,758353.0,383745.0,201999.0,62107.0,36293.0,130536.0,57243.0,25354.0,0.0,0.0
3,0.0,0.0,47.0,57.0,-5.0,2012240.0,129350.0,61236.0,17084.0,11488.0,62462.0,49960.0,33932.0,0.0,0.0
4,0.0,0.0,44.0,53.0,-8.0,1005145.0,354328.0,37102.0,88881.0,45307.0,99603.0,44790.0,29749.0,0.0,0.0


In [210]:
demo_info = pd.read_csv("/Users/arslanaliawan/Desktop/scripts/EEG/demographic_info.csv")

In [211]:
demo_info.head(10)

Unnamed: 0,subject ID,age,ethnicity,gender
0,0,25,Han Chinese,M
1,1,24,Han Chinese,M
2,2,31,English,M
3,3,28,Han Chinese,F
4,4,24,Bengali,M
5,5,24,Han Chinese,M
6,6,24,Han Chinese,M
7,7,25,Han Chinese,M
8,8,25,Han Chinese,M
9,9,24,Han Chinese,F


In [212]:
print(eeg_dataset.columns.values)

['SubjectID' 'VideoID' 'Attention' 'Mediation' 'Raw' 'Delta' 'Theta'
 'Alpha1' 'Alpha2' 'Beta1' 'Beta2' 'Gamma1' 'Gamma2' 'predefinedlabel'
 'user-definedlabeln']


In [213]:
eeg_dataset.shape

(12811, 15)

In [214]:
print(demo_info.columns.values)

['subject ID' ' age' ' ethnicity' ' gender']


In [215]:
demo_info.shape

(10, 4)

In [216]:
for col in list(demo_info.columns):
    if demo_info[col].dtype == 'object':
        dums=pd.get_dummies(demo_info[col])
        demo_info = pd.concat([dums,demo_info], axis=1, join='outer')
        demo_info = demo_info.drop(col, 1)

In [217]:
demo_info.head()

Unnamed: 0,F,M,Bengali,English,Han Chinese,subject ID,age
0,0,1,0,0,1,0,25
1,0,1,0,0,1,1,24
2,0,1,0,1,0,2,31
3,1,0,0,0,1,3,28
4,0,1,1,0,0,4,24


In [218]:
demo_info.columns.values

array(['F', 'M', 'Bengali', 'English', 'Han Chinese', 'subject ID',
       ' age'], dtype=object)

In [219]:
demo_info.rename(columns={'subject ID':'SubjectID'},inplace=True)

In [220]:
merged_data = eeg_dataset.merge(demo_info,on='SubjectID')

In [221]:
merged_data.head()

Unnamed: 0,SubjectID,VideoID,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,...,Gamma1,Gamma2,predefinedlabel,user-definedlabeln,F,M,Bengali,English,Han Chinese,age
0,0,0.0,56.0,43.0,278.0,301963.0,90612.0,33735.0,23991.0,27946.0,...,33228.0,8293.0,0.0,0.0,0,1,0,0,1,25
1,0,0.0,40.0,35.0,-50.0,73787.0,28083.0,1439.0,2240.0,2746.0,...,5293.0,2740.0,0.0,0.0,0,1,0,0,1,25
2,0,0.0,47.0,48.0,101.0,758353.0,383745.0,201999.0,62107.0,36293.0,...,57243.0,25354.0,0.0,0.0,0,1,0,0,1,25
3,0,0.0,47.0,57.0,-5.0,2012240.0,129350.0,61236.0,17084.0,11488.0,...,49960.0,33932.0,0.0,0.0,0,1,0,0,1,25
4,0,0.0,44.0,53.0,-8.0,1005145.0,354328.0,37102.0,88881.0,45307.0,...,44790.0,29749.0,0.0,0.0,0,1,0,0,1,25


In [222]:
merged_data.rename(columns={'user-definedlabeln':'userdefinedlabeln'},inplace=True)

In [223]:
merged_data.shape

(12811, 21)

In [228]:
prediction_var = ['SubjectID', 'VideoID', 'Mediation', 'Raw', 'Delta', 'Theta', 'Alpha1', 'Alpha2','Beta1', 'Beta2', 'Gamma1' ,'Gamma2', 'predefinedlabel','F', 'M', 'Bengali', 'English', 'Han Chinese',' age']

In [247]:
train, test = train_test_split(merged_data,test_size = 0.5)
print(train.shape)
print(test.shape)

(6405, 21)
(6406, 21)


In [248]:
train_X = train[prediction_var]
train_y = train.userdefinedlabeln

In [249]:
test_X = test[prediction_var]
test_y = test.userdefinedlabeln

In [250]:

# Random Forest
model=RandomForestClassifier(n_estimators=100)
model.fit(train_X,train_y)
prediction=model.predict(test_X)
metrics.accuracy_score(prediction,test_y)

0.9372463315641586

In [251]:
#Now try with SVM 

model= svm.SVC()
model.fit(train_X,train_y)
prediction=model.predict(test_X)

metrics.accuracy_score(prediction,test_y)

0.5112394630034343

In [252]:
def model(model,merged_data,prediction,outcome):
    kf = KFold(merged_data.shape[0],n_folds=10)

In [253]:
def classification_model(model,merged_data,prediction_input,output):
    model.fit(merged_data[prediction_input],merged_data[output]) 
    predictions = model.predict(merged_data[prediction_input])
    accuracy = metrics.accuracy_score(predictions,merged_data[output])
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))
 
    
    kf = KFold(merged_data.shape[0], n_folds=5)

    error = []
    for train, test in kf:

        train_X = (merged_data[prediction_input].iloc[train,:])
        train_y = merged_data[output].iloc[train]
        model.fit(train_X, train_y)
    
        
        test_X=merged_data[prediction_input].iloc[test,:]
        test_y=merged_data[output].iloc[test]
        error.append(model.score(test_X,test_y))
        
        print("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error)))

In [254]:

model = DecisionTreeClassifier()
outcome_var= "userdefinedlabeln"
classification_model(model,merged_data,prediction_var,outcome_var)



Accuracy : 100.000%
Cross-Validation Score : 55.833%
Cross-Validation Score : 50.009%
Cross-Validation Score : 55.327%
Cross-Validation Score : 53.566%
Cross-Validation Score : 52.564%


In [255]:
model = svm.SVC()
classification_model(model,merged_data,prediction_var,outcome_var)

Accuracy : 100.000%
Cross-Validation Score : 45.611%
Cross-Validation Score : 47.532%
Cross-Validation Score : 44.738%
Cross-Validation Score : 44.629%
Cross-Validation Score : 44.641%


In [256]:
model = RandomForestClassifier()
classification_model(model,merged_data,prediction_var,outcome_var)

Accuracy : 99.797%
Cross-Validation Score : 58.096%
Cross-Validation Score : 53.014%
Cross-Validation Score : 55.340%
Cross-Validation Score : 52.639%
Cross-Validation Score : 50.472%


In [257]:
model = KNeighborsClassifier()
classification_model(model,merged_data,prediction_var,outcome_var)

Accuracy : 71.400%
Cross-Validation Score : 52.282%
Cross-Validation Score : 51.590%
Cross-Validation Score : 52.140%
Cross-Validation Score : 51.966%
Cross-Validation Score : 52.166%


In [258]:
model = LogisticRegression()
classification_model(model,merged_data,prediction_var,outcome_var)

Accuracy : 56.990%
Cross-Validation Score : 48.537%
Cross-Validation Score : 51.337%
Cross-Validation Score : 53.298%
Cross-Validation Score : 54.874%
Cross-Validation Score : 53.642%


In [259]:
model = GradientBoostingClassifier()
classification_model(model,merged_data,prediction_var,outcome_var)

Accuracy : 90.586%
Cross-Validation Score : 57.043%
Cross-Validation Score : 53.072%
Cross-Validation Score : 55.340%
Cross-Validation Score : 53.644%
Cross-Validation Score : 51.291%
