Script 2. Classification using raw data as features.

In [1]:
import pandas as pd


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report

from target_to_binary import is_seizure

In [2]:
# Load the raw recordings. The path literal is passed inline, so there is
# no temporary path variable to delete afterwards.
data = pd.read_csv('../dat/Epileptic_Seizure_Recognition.csv')
print(data.shape)  # (11500, 180)

# Discard the 1st column (the unnamed one), selected by position.
data = data.drop(columns=data.columns[:1])

(11500, 180)


In [3]:
# Split the column labels: the last one ("y") is the target, every other
# column is a feature.
features, target = list(data)[:-1], list(data)[-1]
print(len(features))  # 178

# Binarise the label with the imported `is_seizure` helper: all classes >1
# (healthy) are grouped together into the new class 0.
data[target] = data[target].apply(is_seizure)

data.head()

178


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,y
0,135,190,229,223,192,125,55,-9,-33,-38,...,-17,-15,-31,-77,-103,-127,-116,-83,-51,0
1,386,382,356,331,320,315,307,272,244,232,...,164,150,146,152,157,156,154,143,129,1
2,-32,-39,-47,-37,-32,-36,-57,-73,-85,-94,...,57,64,48,19,-12,-30,-35,-35,-36,0
3,-105,-101,-96,-92,-89,-95,-102,-100,-87,-79,...,-82,-81,-80,-77,-85,-77,-72,-69,-65,0
4,-9,-65,-98,-102,-78,-48,-16,0,-21,-59,...,4,2,-12,-32,-41,-65,-83,-89,-73,0


Train-test split

In [4]:
# Hold out 20% of the rows for testing. The split is stratified on the
# target and uses a fixed random_state so that it is reproducible.
data_train, data_test = train_test_split(
    data, test_size=0.2, stratify=data[target], random_state=42
)

print(data_train.shape)  # (9200, 179)
print(data_test.shape)   # (2300, 179)

# Testing. How many rows with y==1 in the test set?
df_seizures_test = data_test.loc[data_test[target] == 1]
print(df_seizures_test.shape)  # (460, 179) , where 460==2300/5

# A bare `.head()` in the middle of a cell is never displayed, so the split
# is verified with explicit assertions instead: no rows are lost, and the
# share of class 1 is (almost) the same in the train and test parts.
assert len(data_train) + len(data_test) == len(data)
share_train = (data_train[target] == 1).mean()
share_test = len(df_seizures_test) / len(data_test)
assert abs(share_train - share_test) < 0.01, "class-1 share differs between train and test"

# Remove the helpers used only for this check.
del df_seizures_test, share_train, share_test

(9200, 179)
(2300, 179)
(460, 179)


Train a Random-Forest classifier

In [5]:
# Fit a random forest on the raw samples. The tree depth is capped in order
# to limit overfitting; random_state is fixed so the fit is repeatable.
# Runtime noted by the author: 2.6s
clf = RandomForestClassifier(
    n_estimators=50,
    max_depth=15,
    random_state=1,
)
clf.fit(X=data_train[features], y=data_train[target])

Check its performance on the training set

In [6]:
# Predict on the same rows the model was fitted on and print the
# per-class precision / recall / f1 summary.
preds_train = clf.predict(data_train[features])
print(
    classification_report(y_true=data_train[target], y_pred=preds_train)
)

# f1-score for class 1 on train data : 0.98

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      7360
           1       1.00      0.96      0.98      1840

    accuracy                           0.99      9200
   macro avg       0.99      0.98      0.99      9200
weighted avg       0.99      0.99      0.99      9200



Check its performance on the test set

In [7]:
# Predict on the held-out rows and print the per-class
# precision / recall / f1 summary.
preds_test = clf.predict(data_test[features])
print(
    classification_report(y_true=data_test[target], y_pred=preds_test)
)

# f1-score for class 1 on test data: 0.91

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1840
           1       0.97      0.86      0.91       460

    accuracy                           0.97      2300
   macro avg       0.97      0.92      0.94      2300
weighted avg       0.97      0.97      0.96      2300



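The classification performance using raw data is (surprisingly) quite high: the f1-score for the seizure class (class 1) is 0.91 on the test set. This may come from the classes being visually well separated. Note, however, that the recall for class 1 drops from 0.96 on the training set to 0.86 on the test set, so the model still overfits somewhat and misses part of the seizures.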