# EEG Eye State detection

## Author: Ángel López Manríquez

### Objectives:
### What is done in the Notebook:

In [58]:

import arff

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


from IPython.display import display, HTML, Markdown

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  # Support vector classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from matplotlib.colors import ListedColormap


## Reading the dataset

In [50]:


filename = 'EEG Eye State.arff'

# Parse the ARFF file. The file is opened in text mode (no mode='rb') and the
# context manager guarantees the handle is closed as soon as parsing finishes,
# even if arff.load raises.
with open(filename) as file:
    dataset = arff.load(file)

# Quick inspection of the parsed structure.
print('type: ', type(dataset))
print('attributes: ', dataset.keys())

print('description: ', dataset['description'], '\n')
print('relation: ', dataset['relation'], '\n')
print('attributes: ', dataset['attributes'], '\n')
print('data (first 5 samples): ', '\n')

# Each entry of dataset['attributes'] is a (name, type) pair; the names become
# the column headers of the preview table rendered as the cell output.
pd.DataFrame(dataset['data'][:5], columns=[x[0] for x in dataset['attributes']])

type:  <class 'dict'>
attributes:  dict_keys(['description', 'relation', 'attributes', 'data'])
description:   

relation:  EEG_DATA 

attributes:  [('AF3', 'NUMERIC'), ('F7', 'NUMERIC'), ('F3', 'NUMERIC'), ('FC5', 'NUMERIC'), ('T7', 'NUMERIC'), ('P7', 'NUMERIC'), ('O1', 'NUMERIC'), ('O2', 'NUMERIC'), ('P8', 'NUMERIC'), ('T8', 'NUMERIC'), ('FC6', 'NUMERIC'), ('F4', 'NUMERIC'), ('F8', 'NUMERIC'), ('AF4', 'NUMERIC'), ('eyeDetection', ['0', '1'])] 

data (first 5 samples):  



Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,eyeDetection
0,4329.23,4009.23,4289.23,4148.21,4350.26,4586.15,4096.92,4641.03,4222.05,4238.46,4211.28,4280.51,4635.9,4393.85,0
1,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.1,0
2,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,0
3,4328.72,4011.79,4296.41,4155.9,4343.59,4582.56,4097.44,4630.77,4217.44,4235.38,4210.77,4287.69,4632.31,4396.41,0
4,4326.15,4011.79,4292.31,4151.28,4347.69,4586.67,4095.9,4627.69,4210.77,4244.1,4212.82,4288.21,4632.82,4398.46,0


## Preprocessing the data

In [3]:
# The class label (last column of every row) is parsed as a str. np.array
# would turn the WHOLE array into strings if a single str were present, so the
# label is converted to int, in place, before the array is built.
data = dataset['data']
m, n = len(data), len(data[0])  # number of samples, number of columns
label_idx = n - 1
for row in data:
    row[label_idx] = int(row[label_idx])

# Alternative that performs the conversion in one step:
# data = np.array(dataset['data']).astype('float64')
# data

In [37]:
# Turn the list of rows into a 2-D array, then separate the features
# (every column but the last) from the target (last column).
data = np.array(data)
X, y = data[:, :-1], data[:, -1]

# Feature standardisation is currently disabled:
# sc = StandardScaler()
# X = sc.fit_transform(X)

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Testing the classifiers
We'll determine which is the most suitable model for our dataset.

In [90]:
def display_markdown(txt):
    '''Render the string `txt` as Markdown in the cell output.'''
    rendered = Markdown(txt)
    display(rendered)

def classifier_summary(clf, name, X_train, X_test, y_train, y_test, latex=False):
    '''Fit a classifier and report how it performs on the test split.

    clf: classifier exposing fit(X, y) and predict(X)
    name: title shown as a Markdown heading above the report
    X_train, y_train: samples and labels used for fitting
    X_test, y_test: samples and labels used for evaluation
    latex: when True, additionally print the confusion table as LaTeX
    '''
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)

    # Heading first, then the accuracy as plain text.
    display_markdown(f'## {name}')
    score = accuracy_score(y_test, predicted)
    print(score)

    # Cross-tabulate true labels (rows) against predicted labels (columns).
    confusion = pd.crosstab(y_test, predicted)
    if latex:
        print(confusion.to_latex())
    display(confusion)
    
    

In [91]:
class MyLogisticRegression(object):
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    eta : float
        Learning rate applied to every weight/bias update.
    n_iter : int
        Number of passes over the whole training set.
    random_state : int
        Seed of the generator used to draw the initial weights and bias.

    Attributes
    ----------
    w : ndarray of shape (n_features,) or None
        Feature weights; None until `fit` is called.
    b : ndarray of shape (1,) or None
        Bias term; None until `fit` is called.
    cost : list
        Cross-entropy cost recorded at every iteration of the last `fit`.
    """

    def __init__(self, eta=0.01, n_iter=1000, random_state=0):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state
        self.rgen = np.random.RandomState(self.random_state)
        self.w = None
        self.b = None

    def net_input(self, X):
        """Return the linear combination X·w + b for each sample in X."""
        return np.dot(X, self.w) + self.b

    def activation(self, z):
        """Logistic sigmoid of z.

        z is clipped to [-25, 25] so that np.exp cannot overflow and the
        result never reaches exactly 0 or 1 (which keeps the logs in the
        cost finite).
        """
        return 1. / (1. + np.exp(-np.clip(z, -25, 25)))

    def fit(self, X, y):
        """Learn the weights and bias from the training data.

        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) holding 0/1 labels

        Returns self so that calls can be chained.
        """
        # Accept plain lists as well as arrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        self.w = self.rgen.normal(loc=0, scale=0.01, size=X.shape[1])
        self.b = self.rgen.normal(loc=0, scale=0.01, size=1)
        self.cost = []

        for _ in range(self.n_iter):
            output = self.activation(self.net_input(X))
            # Difference between the true label and the predicted probability.
            errors = y - output

            self.w += self.eta * X.T.dot(errors)
            # The bias must be accumulated like the weights. Plain assignment
            # here would discard everything learned in previous iterations.
            self.b += self.eta * errors.sum()

            # Cross-entropy cost of the probabilities computed before this update.
            cost = (-y.dot(np.log(output)) - ((1 - y).dot(np.log(1 - output))))
            self.cost.append(cost)

        return self

    def predict(self, X):
        """Return a boolean array: True where the predicted probability exceeds 0.5."""
        return self.predict_proba(X) > 0.5

    def predict_proba(self, X):
        """Return, for each sample in X, the estimated probability of the positive class."""
        return self.activation(self.net_input(X))

In [92]:

# Linear models: scikit-learn's implementation and the hand-written one.
lg = LogisticRegression(random_state=0)
mlr = MyLogisticRegression(random_state=0)

# Tree-based models.
dtc = DecisionTreeClassifier(random_state=0)
rfc = RandomForestClassifier(random_state=0)

# Remaining classifiers.
svc = SVC(random_state=0)
gnb = GaussianNB()
knc = KNeighborsClassifier(n_neighbors=3)

# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier

# (svc,'Support vector classifier'),


In [93]:
# Classifiers to compare, each paired with the heading shown above its report.
benchmarks = [
    (lg, 'Logistic "Regression"'),
    (gnb, 'Naive bayes (Gaussian)'),
    (knc, 'KNeighbors classifier'),
    (dtc, 'Decision Tree Classifier'),
    (rfc, 'Random forest classifier'),
]

# Fit and evaluate every classifier on the same train/test split.
for clf, name in benchmarks:
    classifier_summary(clf, name, X_train, X_test, y_train, y_test)




## Logistic "Regression"

0.6401869158878505


col_0,0.0,1.0
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1253,372
1.0,706,665


## Naive bayes (Gaussian)

0.5100133511348465


col_0,0.0,1.0
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,514,1111
1.0,357,1014


## KNeighbors classifier

0.9722963951935915


col_0,0.0,1.0
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1593,32
1.0,51,1320


## Decision Tree Classifier

0.8140854472630173


col_0,0.0,1.0
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1360,265
1.0,292,1079




## Random forest classifier

0.8945260347129506


col_0,0.0,1.0
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1545,80
1.0,236,1135
