In [2]:
# Mount your google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Install the required libraries
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Flatten, Input, Dropout, Reshape, AveragePooling1D, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import activations
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import StratifiedKFold

Dataset: 

---


We have used publicly available dataset of EEG signals from the Institute of Psychiatry and Neurology in Warsaw, Poland, which publicly provide a dataset of EEG signals . It consists of data of 14 paranoid SZ and 14 normal subjects. Fifteen minutes of EEG data was recorded for each subject with a sampling frequency of 250Hz using the standard 10-20 system. The electrodes used were Fp1, Fp2, F7, F3, Fz, F4, F8, T3, C3, Cz, C4, T4, T5, P3, Pz, P4, T6, O1, O2.

The code is given below. For different experiments, only you need to load a dataset and change the parameters of CNN according to the one given in paper.

In [5]:
def load_dataset(filename):
  """
  Loads the dataset from the specified file and returns the feature and target variables as separate arrays.
  
  Args:
  - filename (str): The name of the CSV file containing the dataset.
  
  Returns:
  - X (pandas DataFrame): The feature variables.
  - Y (pandas Series): The target variable.
  
  """

  data = pd.read_csv(filename)
  X=data.iloc[:,:-1]
  Y=data.iloc[:,-1]
  return X,Y

def dataPreprocessing(features, labels):
  
  """
  Normalizes the input data by subtracting the mean and dividing by the standard deviation.
  
  Args:
  - features (pandas DataFrame): The feature variables.
  - labels (pandas Series): The target variable.
  
  Returns:
  - normalized_data (numpy array): The normalized feature variables.
  
  """

  df = features.copy()
  for column in df.columns:
    df[column] = (df[column] - df[column].mean()) / df[column].std()
  normalized_data = np.array(df[:])
  return normalized_data

def CNN():
  
  """
  Defines and compiles a Convolutional Neural Network (CNN) model.
  
  Returns:
  - model (tensorflow.keras.models.Sequential): The compiled CNN model.
  
  """
  model = Sequential()
  model.add(Conv1D(8,kernel_size=7,strides=1,padding='same',activation='ReLU',input_shape=(5000,1)))
  model.add(MaxPooling1D(pool_size=4,strides=4))
  model.add(Conv1D(16,kernel_size=8,strides=1,padding='same',activation='ReLU'))
  model.add(MaxPooling1D(pool_size=4,strides=4))
  model.add(Flatten())
  model.add(Dense(50, activation='ReLU'))
  model.add(Dense(20, activation='ReLU'))
  model.add(Dense(2, activation='softmax'))
  
  # Compile model
  optimizer=Adam(learning_rate=0.001)
  model.compile(loss='mse', optimizer=optimizer, metrics=['accuracy'])
  return model


def train(X, Y, classifier):
  
  """
  Trains and evaluates the specified classifier on the input data using stratified k-fold cross-validation.
  
  Args:
  - X (numpy array): The feature variables.
  - Y (pandas Series): The target variable.
  - classifier (str): The name of the classifier to use. Valid options are: "CNN", "LR" (Logistic Regression),
   "RF" (Random Forest), "SVM" (Support Vector Machine), and "GB" (Gradient Boosting Machine).
  
  Returns:
  - None
  
  """

  #fix random seed for reproducibility
  seed = 7
  np.random.seed(seed)
  kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
  fold=1
  accuracy=[]
  for train, test in kfold.split(X, Y):
    print("Training on Fold %s ..."%fold)
    X_train = X[train].reshape( X[train].shape[0],X[train].shape[1],1)
    X_test = X[test].reshape(X[test].shape[0],X[test].shape[1],1)
    Y_train = to_categorical(Y[train])
    Y_test =  to_categorical(Y[test])
    model=CNN()
    
    if(classifier =='CNN'):
      model.fit(X_train, Y_train, epochs=100, batch_size=12, verbose=0)
       # evaluate the model
      scores = model.evaluate(X_test, Y_test, verbose=1)
      acc = scores[1]
    
    else:
      model = Model(inputs=model.inputs, outputs=model.layers[-4].output)
      Training_features = np.array(model.predict(X_train))
      Training_labels = np.argmax(Y_train,axis=1)

      Testing_Features = np.array(model.predict(X_test))
      Testing_labels = np.argmax(Y_test,axis=1)

      if (classifier == "LR"):
        clf = LogisticRegression(max_iter=1000, random_state=1)
      elif (classifier == "RF"):
        clf = RandomForestClassifier(random_state=1)
      elif (classifier == "SVM"):
        clf = SVC(kernel='poly', degree=3, C=50)
      elif (classifier == "GB"):
        clf=GradientBoostingClassifier(random_state=1)
      else:
        print("Invalid Classifier")
        return

      scores = clf.fit(Training_features, Training_labels)
      print("Completed.")
      acc = clf.score(Testing_Features,Testing_labels)

    print("Accuracy: ", acc*100)
    accuracy.append(acc*100)
    fold=fold+1

  print("Total Accuracy using %s: "%classifier, np.mean(accuracy),"+/-", np.std(accuracy))

def main():

  """

  entry point of the program. It prompts the user to choose a classifier and a dataset file, loads and preprocesses the data, 
  and trains the selected classifier using k-fold cross-validation.

  """
  
  print("Choose one from the following classifiers ")
  print("1) CNN for Convolutional Neural Network")
  print("2) LR for Logistic Regression")
  print("3) RF for Random Forest")
  print("4) SVM for Support vector Machine")
  print("5) GB for Gradient Boosting Machine")

  # Enter the path of the directory where your code is stored
  os.chdir('/content/drive/MyDrive/Schizophrenia_ResearchPaper/Individual_Channel')

  classifier=str(input("Enter the name of classifier: " ))
  filename=str(input("Enter the name of dataset file (EEG channel name): "))
  

  print("Dataset Loading...")
  X, Y = load_dataset("./Dataset/%s.csv"%filename)
  print("Dataset Loaded")

  X_pre = dataPreprocessing(X, Y)
  print("Preprocessing Completed")
  
  train(X_pre, Y, classifier)    



In this section, we will train convolutional neural network on each EEG channel separately to study the effect of each channel on the Schizophrenia disease.



Results:

---


The results for CNN and logistic Regression on Fp1 channel are given below. To find the accuracy for other channels or models, you can use the same main() function. Only Specify the name of dataset file and the model name in the input and the main() will train the model on the specified dataset and return the accuracy. The remaining results are given in the paper.



> Note:
Results may vary as on every run, CNN initialize weights randomly, therefore during the learning process these will get updated differently. In the paper, We have trained the CNN multiple times and those weights are stored on which maximum accuracy is achieved.

In [None]:
# Results of Fp1 on CNN
main()

Choose one from the following classifiers 
1) CNN for Convolutional Neural Network
2) LR for Logistic Regression
3) RF for Random Forest
4) SVM for Support vector Machine
5) GB for Gradient Boosting Machine
Dataset Loading...
Dataset Loaded
Preprocessing Completed
Training on Fold 1 ...
Accuracy:  75.0
Training on Fold 2 ...
Accuracy:  54.86111044883728
Training on Fold 3 ...
Accuracy:  70.83333134651184
Training on Fold 4 ...
Accuracy:  72.72727489471436
Training on Fold 5 ...
Accuracy:  65.73426723480225
Training on Fold 6 ...
Accuracy:  74.12587404251099
Training on Fold 7 ...
Accuracy:  71.32866978645325
Training on Fold 8 ...
Accuracy:  72.02796936035156
Training on Fold 9 ...
Accuracy:  69.2307710647583
Training on Fold 10 ...
Accuracy:  75.52447319030762
Total Accuracy using CNN:  70.13937413692474 +/- 5.782153991189042


In [None]:
# Results of Fp1 on Logistic Regression
main()

Choose one from the following classifiers 
1) CNN for Convolutional Neural Network
2) LR for Logistic Regression
3) RF for Random Forest
4) SVM for Support vector Machine
5) GB for Gradient Boosting Machine
Enter the name of classifier: LR
Enter the name of dataset file (EEG channel name): Fp1
Dataset Loading...
Dataset Loaded
Preprocessing Completed
Training on Fold 1 ...
Completed.
Accuracy:  68.05555555555556
Training on Fold 2 ...
Completed.
Accuracy:  58.333333333333336
Training on Fold 3 ...
Completed.
Accuracy:  65.97222222222221
Training on Fold 4 ...
Completed.
Accuracy:  66.43356643356644
Training on Fold 5 ...
Completed.
Accuracy:  62.93706293706294
Training on Fold 6 ...
Completed.
Accuracy:  60.83916083916085
Training on Fold 7 ...
Completed.
Accuracy:  66.43356643356644
Training on Fold 8 ...
Completed.
Accuracy:  57.34265734265735
Training on Fold 9 ...
Completed.
Accuracy:  69.23076923076923
Training on Fold 10 ...
Completed.
Accuracy:  66.43356643356644
Total Accuracy 