In [1]:
## import librraies

from sklearn.linear_model import LinearRegression, Lasso, SGDClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import set_config
set_config(display="diagram")
import sklearn.cluster as cluster
import warnings 
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

In [2]:
# Getting data

df = pd.read_csv('/Users/apekshasridhar/Downloads/EEG.machinelearing_data_BRMH.csv')
df.head()
df.shape

(945, 1149)

In [3]:
# Cleaning data

df.columns[df.isna().any()].tolist()

['education', 'IQ', 'Unnamed: 122']

In [4]:
# dropping rows with NaNs
df.shape
df.drop('Unnamed: 122', axis = 1, inplace = True)
df.dropna(inplace = True)
df.head()

Unnamed: 0,no.,sex,age,eeg.date,education,IQ,main.disorder,specific.disorder,AB.A.delta.a.FP1,AB.A.delta.b.FP2,...,COH.F.gamma.o.Pz.p.P4,COH.F.gamma.o.Pz.q.T6,COH.F.gamma.o.Pz.r.O1,COH.F.gamma.o.Pz.s.O2,COH.F.gamma.p.P4.q.T6,COH.F.gamma.p.P4.r.O1,COH.F.gamma.p.P4.s.O2,COH.F.gamma.q.T6.r.O1,COH.F.gamma.q.T6.s.O2,COH.F.gamma.r.O1.s.O2
1,2,M,37.0,2012.9.6,6.0,120.0,Addictive disorder,Alcohol use disorder,13.425118,11.002916,...,45.595619,17.510824,26.777368,28.201062,57.108861,32.375401,60.351749,13.900981,57.831848,43.463261
2,3,M,32.0,2012.9.10,16.0,113.0,Addictive disorder,Alcohol use disorder,29.94178,27.544684,...,99.475453,70.654171,39.131547,69.920996,71.063644,38.534505,69.908764,27.180532,64.803155,31.485799
3,4,M,35.0,2012.10.8,18.0,126.0,Addictive disorder,Alcohol use disorder,21.496226,21.846832,...,59.986561,63.822201,36.478254,47.117006,84.658376,24.724096,50.299349,35.319695,79.822944,41.141873
4,5,M,36.0,2012.10.18,16.0,112.0,Addictive disorder,Alcohol use disorder,37.775667,33.607679,...,61.46272,59.166097,51.465531,58.635415,80.685608,62.138436,75.888749,61.003944,87.455509,70.531662
5,6,F,24.0,2012.11.21,14.0,105.0,Addictive disorder,Alcohol use disorder,13.482096,14.095855,...,92.841723,82.302355,83.938567,88.213886,90.972026,77.443894,89.545596,72.57953,89.462863,86.127823


In [5]:
df['main.disorder'].value_counts()

Mood disorder                         262
Addictive disorder                    178
Trauma and stress related disorder    123
Schizophrenia                         117
Anxiety disorder                      106
Healthy control                        93
Obsessive compulsive disorder          40
Name: main.disorder, dtype: int64

In [6]:
# Creating column if Mood Disorder or not 

df['mood_disorder'] = df['main.disorder'] == 'Mood disorder'
df['mood_disorder'].astype(int)

1      0
2      0
3      0
4      0
5      0
      ..
940    0
941    0
942    0
943    0
944    0
Name: mood_disorder, Length: 919, dtype: int64

In [7]:
# Simple cross validation 
# Train test split

df.columns
X = df.drop(['mood_disorder', 'no.', 'sex', 'eeg.date', 'main.disorder', 'specific.disorder'], axis = 1)
y = df['mood_disorder']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 42)

In [8]:
# Grid search CV

#params_dict = {'alpha': [0.1, 1.0, 10.0]}
pipe = Pipeline([
    ('selector', SequentialFeatureSelector(estimator = SGDClassifier(loss="hinge", penalty="l2", max_iter=5),
                                           scoring = 'neg_mean_squared_error',
                                           n_features_to_select = 5)),
    ('model', SGDClassifier(loss="hinge", penalty="l2", max_iter=5))
])


In [9]:
## Fit data 

pipe.fit(X,y)
pipe.named_steps['selector'].get_feature_names_out()
#pd.DataFrame(pipe.fit(X,y), columns = pipe.named_steps['selector'].get_feature_names_out())
# grid = GridSearchCV(estimator = SGDClassifier(loss="hinge", penalty="l2", max_iter=5), param_grid=params_dict)
# grid.fit(X_train, y_train)


array(['age', 'education', 'IQ', 'AB.A.delta.a.FP1', 'AB.A.delta.b.FP2'],
      dtype=object)

In [10]:
## MSE for train and test simple CV

train_preds = pipe.predict(X_train)
test_preds = pipe.predict(X_test)
train_mse = log_loss(y_train, train_preds)
test_mse = log_loss(y_test, test_preds)
print(train_mse)
print(test_mse)

9.507563642144932
10.636942005679014


In [11]:
# Best alpha for simple CV

params_dict = {'model__alpha': [0.01]}
grid = GridSearchCV(pipe, param_grid=params_dict)
#grid.fit(X_train, y_train)

In [12]:
grid.fit(X_train, y_train)

In [14]:
# train_preds = grid.predict(X_train)
# test_preds = grid.predict(X_test)
train_mse = log_loss(y_train, train_preds)
test_mse = log_loss(y_test, test_preds)
print(train_mse)
print(test_mse)

9.507571103396224
10.887243273692901


In [None]:
# k-fold k = 5

params_dict = {'model__alpha': [0.01, 1.0, 10.0]}
grid = GridSearchCV(pipe, param_grid=params_dict, cv = 5)
grid.fit(X_train, y_train)