In [101]:
import pandas as pd

In [102]:
#Dataset details: https://archive.ics.uci.edu/ml/datasets/Epileptic+Seizure+Recognition

In [103]:
d = pd.read_csv('initialData.csv')

#Removing the useless patient IDs 
d.pop('Unnamed: 0')

0        X21.V1.791
1        X15.V1.924
2           X8.V1.1
3         X16.V1.60
4         X20.V1.54
            ...    
11495    X22.V1.114
11496    X19.V1.354
11497      X8.V1.28
11498    X10.V1.932
11499    X16.V1.210
Name: Unnamed: 0, Length: 11500, dtype: object

In [104]:
#y=1 means patient is having seizure. Any other number means that they're not having a seizure (see the dataset description for more details)
#So to make this a binary problem I'll turn every non-1 value into a 0 

d['class'] = d.y==1  #note that d.y==1 is a elementwise boolean comparison 
d['class'] = d['class'].astype(int) #converts the booleans made in the line above to ints 

#Removing the original y
#Note that .pop('y') is the same as .drop('y', axis=1,inplace=True) but is more concise :)
d.pop('y')
print()




In [105]:
#dataset now looks like: 
d

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,class
0,135,190,229,223,192,125,55,-9,-33,-38,...,-17,-15,-31,-77,-103,-127,-116,-83,-51,0
1,386,382,356,331,320,315,307,272,244,232,...,164,150,146,152,157,156,154,143,129,1
2,-32,-39,-47,-37,-32,-36,-57,-73,-85,-94,...,57,64,48,19,-12,-30,-35,-35,-36,0
3,-105,-101,-96,-92,-89,-95,-102,-100,-87,-79,...,-82,-81,-80,-77,-85,-77,-72,-69,-65,0
4,-9,-65,-98,-102,-78,-48,-16,0,-21,-59,...,4,2,-12,-32,-41,-65,-83,-89,-73,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11495,-22,-22,-23,-26,-36,-42,-45,-42,-45,-49,...,15,16,12,5,-1,-18,-37,-47,-48,0
11496,-47,-11,28,77,141,211,246,240,193,136,...,-65,-33,-7,14,27,48,77,117,170,1
11497,14,6,-13,-16,10,26,27,-9,4,14,...,-65,-48,-61,-62,-67,-30,-2,-1,-8,0
11498,-40,-25,-9,-12,-2,12,7,19,22,29,...,121,135,148,143,116,86,68,59,55,0


In [106]:
#Usually I'd do d.describe(), but doesn't make much sense here since features are not interpretable (by us, at least) 

In [107]:
#But one thing we can look at is the number of positive examples: 
sum(x for x in d['class']) #works because x is either 1 or 0 

2300

In [108]:
#So there are  2300 1s (and thus there are 9200 0s)
#which means 20% (since 2300/11500=0.2) of the dataset is positive
#Maybe we can use SMOTE as we did in assignment 2 to balance the data?

In [109]:
y = d['class'] #sets y to be class column 
X = d.iloc[:,0:(d.shape[1]-1)] #sets X to be dataset with class column removed 

In [110]:
#double checking that X does not have the label column (leaving the labels as a feature is a common mistake): 
X

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X169,X170,X171,X172,X173,X174,X175,X176,X177,X178
0,135,190,229,223,192,125,55,-9,-33,-38,...,8,-17,-15,-31,-77,-103,-127,-116,-83,-51
1,386,382,356,331,320,315,307,272,244,232,...,168,164,150,146,152,157,156,154,143,129
2,-32,-39,-47,-37,-32,-36,-57,-73,-85,-94,...,29,57,64,48,19,-12,-30,-35,-35,-36
3,-105,-101,-96,-92,-89,-95,-102,-100,-87,-79,...,-80,-82,-81,-80,-77,-85,-77,-72,-69,-65
4,-9,-65,-98,-102,-78,-48,-16,0,-21,-59,...,10,4,2,-12,-32,-41,-65,-83,-89,-73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11495,-22,-22,-23,-26,-36,-42,-45,-42,-45,-49,...,20,15,16,12,5,-1,-18,-37,-47,-48
11496,-47,-11,28,77,141,211,246,240,193,136,...,-94,-65,-33,-7,14,27,48,77,117,170
11497,14,6,-13,-16,10,26,27,-9,4,14,...,-42,-65,-48,-61,-62,-67,-30,-2,-1,-8
11498,-40,-25,-9,-12,-2,12,7,19,22,29,...,114,121,135,148,143,116,86,68,59,55


-------------------

In [111]:
#WARNING: MAKING X AND Y SMALLER FOR NOW 
#We'll want to use the full dataset when reporting final numbers
#I'd be nice if we could do 5 fold CV, but that will take a very very long time if we're using the full dataset

In [112]:
X = X.iloc[0:2000,:]

In [113]:
y = y.iloc[0:2000]

In [114]:
print(X.shape)
print(y.shape)

(2000, 178)
(2000,)


------------

In [115]:
#Train test split 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 

In [116]:
##SVM 
from sklearn import svm
modelSVM = svm.SVC(gamma=0.001, C=100.) 
modelSVM.fit(X_train, y_train)
print(classification_report(y_test,modelSVM.predict(X_test)))

              precision    recall  f1-score   support

           0       0.81      1.00      0.89       323
           1       0.00      0.00      0.00        77

    accuracy                           0.81       400
   macro avg       0.40      0.50      0.45       400
weighted avg       0.65      0.81      0.72       400



  _warn_prf(average, modifier, msg_start, len(result))


In [117]:
#80% might seem  pretty bad considering the fact that we only have two classes 
#and one of them makes up 80% of the dataset...
#However, the precision and recall and f-score aren't 0, which means 
#the model is still learning at least.
#Still, everything is quite low. 

In [118]:
#I proceed to try some more simple non-deep learning models: 

In [119]:
##Logistic regression: 
from sklearn import linear_model
modelLR = linear_model.LogisticRegression(C=1e5, max_iter=1000)    
modelLR.fit(X_train, y_train)
print(classification_report(y_test, modelLR.predict(X_test)))

              precision    recall  f1-score   support

           0       0.84      0.96      0.89       323
           1       0.55      0.21      0.30        77

    accuracy                           0.81       400
   macro avg       0.69      0.58      0.60       400
weighted avg       0.78      0.81      0.78       400



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [120]:
##Random forests
from sklearn.ensemble import RandomForestClassifier
modelRF=RandomForestClassifier(n_estimators=1000)   
modelRF.fit(X_train, y_train)
print(classification_report(y_test, modelRF.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       323
           1       0.90      0.90      0.90        77

    accuracy                           0.96       400
   macro avg       0.94      0.94      0.94       400
weighted avg       0.96      0.96      0.96       400



In [121]:
##Neural network: 
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

scaler = StandardScaler()
scaler.fit(X_train) 
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test) 

modelNeuralNet = MLPClassifier(hidden_layer_sizes=(5)) 
modelNeuralNet.fit(X_train, y_train)
print(classification_report(y_test,modelNeuralNet.predict(X_test)))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96       323
           1       0.90      0.74      0.81        77

    accuracy                           0.94       400
   macro avg       0.92      0.86      0.89       400
weighted avg       0.93      0.94      0.93       400





In [122]:
#Definitely room for improvement on all 4 baselines.
#I tried out some other models. kNN performs similarly to SVM and logistic regression, 
#and adaboost and GBMs perform similarly to the random forest. 
#But to keep our presentation from getting cluttered, let's just stick to these
#4 baseline models 