In [1]:
# IMPORT LIBRARIES
from scipy.io import arff
import pandas as pd

# DOWNLOAD THE DATASET AND PERFORM CLEANING
data = arff.loadarff('EEG Eye State.arff')
df = pd.DataFrame(data[0])
df['eyeDetection'] = df['eyeDetection'].replace({b'0': 0, b'1': 1})
df

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,eyeDetection
0,4329.23,4009.23,4289.23,4148.21,4350.26,4586.15,4096.92,4641.03,4222.05,4238.46,4211.28,4280.51,4635.90,4393.85,0
1,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.10,0
2,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,0
3,4328.72,4011.79,4296.41,4155.90,4343.59,4582.56,4097.44,4630.77,4217.44,4235.38,4210.77,4287.69,4632.31,4396.41,0
4,4326.15,4011.79,4292.31,4151.28,4347.69,4586.67,4095.90,4627.69,4210.77,4244.10,4212.82,4288.21,4632.82,4398.46,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14975,4281.03,3990.26,4245.64,4116.92,4333.85,4614.36,4074.87,4625.64,4203.08,4221.54,4171.28,4269.23,4593.33,4340.51,1
14976,4276.92,3991.79,4245.13,4110.77,4332.82,4615.38,4073.33,4621.54,4194.36,4217.44,4162.56,4259.49,4590.26,4333.33,1
14977,4277.44,3990.77,4246.67,4113.85,4333.33,4615.38,4072.82,4623.59,4193.33,4212.82,4160.51,4257.95,4591.79,4339.49,1
14978,4284.62,3991.79,4251.28,4122.05,4334.36,4616.41,4080.51,4628.72,4200.00,4220.00,4165.64,4267.18,4596.41,4350.77,1


In [2]:
# CHECK IF DATASET IS BALANCED
df['eyeDetection'].value_counts()

0    8257
1    6723
Name: eyeDetection, dtype: int64

In [3]:
# CHECK IF DATATYPES ARE CORRECT
df.dtypes

AF3             float64
F7              float64
F3              float64
FC5             float64
T7              float64
P7              float64
O1              float64
O2              float64
P8              float64
T8              float64
FC6             float64
F4              float64
F8              float64
AF4             float64
eyeDetection      int64
dtype: object

In [4]:
# SPLIT DATASET INTO TRAINING AND TEST SETS
from sklearn.model_selection import train_test_split
X = df.drop('eyeDetection', axis=1)
y = df.eyeDetection
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
training = X_train
training['eyeDetection'] = y_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [5]:
print(y.value_counts())

0    8257
1    6723
Name: eyeDetection, dtype: int64


In [6]:
# CHECK THAT DATASETS ARE BALANCED
print(y_train.value_counts())
print(y_test.value_counts())

0    5780
1    4706
Name: eyeDetection, dtype: int64
0    2477
1    2017
Name: eyeDetection, dtype: int64


In [7]:
# UPSAMPLE THE MINORITY LABEL
from sklearn.utils import resample
df_majority = training[training.eyeDetection==0]
df_minority = training[training.eyeDetection==1]
# df_majority.shape[0]
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=df_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.eyeDetection.value_counts()

1    5780
0    5780
Name: eyeDetection, dtype: int64

In [8]:
# SET X AND Y
X = df_upsampled.drop('eyeDetection', axis=1)
y = df_upsampled.eyeDetection

In [None]:
# BASELINE MODEL
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression().fit(X_train, y_train)
# predictions = model.predict(X_test)

In [9]:
# IMPROVED MODEL
from sklearn.ensemble import RandomForestClassifier
fitted_model = RandomForestClassifier(class_weight='balanced').fit(X, y)
predictions = fitted_model.predict(X_test)

In [10]:
# MODEL OUTPUT
predictions

array([1, 0, 0, ..., 1, 1, 1])

In [11]:
# 1ST PERFORMANCE METRIC
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))

0.9216733422340899


In [14]:
# FEATURE SELECTION
from sklearn.feature_selection import RFE
forest = RandomForestClassifier(class_weight='balanced')
rfe = RFE(forest)
rfe = rfe.fit(X_train.drop('eyeDetection', axis=1), y_train.values.ravel())
print(X_train.drop('eyeDetection', axis=1).columns)
print(rfe.support_)
print(rfe.ranking_)

Index(['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2', 'P8', 'T8', 'FC6',
       'F4', 'F8', 'AF4'],
      dtype='object')
[ True  True False False False  True  True False False False  True False
  True  True]
[1 1 7 3 6 1 1 5 8 4 1 2 1 1]


In [15]:
X_finaltrain = training[['AF3', 'F7', 'P7', 'O1', 'FC6', 'F8', 'AF4']]
y_finaltrain = training.eyeDetection
X_finaltest = X_test[['AF3', 'F7', 'P7', 'O1', 'FC6', 'F8', 'AF4']]
y_finaltest = y_test

final_model = RandomForestClassifier(class_weight='balanced').fit(X_finaltrain, y_finaltrain)
predictions = final_model.predict(X_finaltest)

In [16]:
print(accuracy_score(y_finaltest, predictions))

0.8909657320872274


In [None]:
---------------------------------------------------------------------------------------------------------

In [None]:
# RESAMPLING (doesn't work because library is not installed)
from imblearn.over_sampling import SMOTE
smote = SMOTE()#random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)#, random_state=0)
columns = X_train.columns
new_data_X, new_data_y = smote.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=new_data_X, columns=columns)
os_data_y= pd.DataFrame(data=new_data_y, columns=['y'])
# we can Check the numbers of our data
print("length of oversampled data is ", len(os_data_X))
print("Number of no subscription in oversampled data", len(os_data_y[os_data_y['y']==0]))
print("Number of subscription", len(os_data_y[os_data_y['y']==1]))
print("Proportion of no subscription data in oversampled data is ", len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("Proportion of subscription data in oversampled data is ", len(os_data_y[os_data_y['y']==1])/len(os_data_X))