## **EEG Alcoholism**

### เริ่มแรกจากการ import **Library** และ load **Dataset**

In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found anywhere under ./Dataset
for dirname, _, filenames in os.walk('./Dataset'):
    for dataset_path in (os.path.join(dirname, name) for name in filenames):
        print(dataset_path)


In [None]:
%pip install mne

In [None]:
%pip install PyWavelets

In [8]:
# Base libraries
from multiprocessing import Pool

import os
import numpy as np
import mne
import pandas as pd
import random

from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.fftpack import fft, fftfreq, rfft, rfftfreq
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale
from sklearn.preprocessing import robust_scale
import mne
import matplotlib
from collections import defaultdict
from math import cos, sin, acos, radians, pi
from scipy.interpolate import griddata
from numpy import newaxis
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils


# Wavelets
import pywt

In [9]:
# Directories holding the per-trial CSV files of each dataset split
train_dir = './Dataset/SMNI_CMI_TRAIN'
test_dir = './Dataset/SMNI_CMI_TEST'

In [None]:
# Module-level CSV loader so that it can be handed to Pool.map
def read_csv(filename):
    """Load the CSV at *filename* and return it as a pandas DataFrame."""
    frame = pd.read_csv(filename)
    return frame


# Directory listings of both splits
files_train = os.listdir(train_dir)
files_test = os.listdir(test_dir)

# Keep only the entries whose extension is exactly "csv"
file_list_train = [os.path.join(train_dir, entry) for entry in files_train if entry.endswith('.csv')]
file_list_test = [os.path.join(test_dir, entry) for entry in files_test if entry.endswith('.csv')]

# Parse the files in parallel with a pool of 12 worker processes
with Pool(processes=12) as pool:
    df_list_train = pool.map(read_csv, file_list_train)
    df_list_test = pool.map(read_csv, file_list_test)

# One frame per split, then one frame holding everything
combined_df_train = pd.concat(df_list_train, ignore_index=True)
combined_df_test = pd.concat(df_list_test, ignore_index=True)
combined_df = pd.concat([combined_df_train, combined_df_test], ignore_index=True)

# 'subject identifier' is 'a' for the alcoholic group and 'c' for the control group
EEG_data = combined_df.loc[combined_df['subject identifier'].eq('a')]
EEG_data_control = combined_df.loc[combined_df['subject identifier'].eq('c')]

EEG_data.head()

## **Data Cleaning**

## Band Pass Filter<br>
ทำความสะอาดข้อมูลด้วยการทำ bandpass filter เพื่อคัดเอาข้อมูลช่วงที่ไม่ต้องการออก โดยจะคัดคลื่นความถี่เหลือแค่ในช่วง 1 Hz ถึง 30 Hz จากนั้นจะนำ filter นี้ไปใช้กับข้อมูลคนที่ติด alcohol (EEG_data) กับ คนที่ไม่ได้ติด alcohol (EEG_data_control)

In [None]:
from scipy.signal import butter, lfilter

In [None]:
# Sampling rate (Hz) passed to the band-pass filter below
FS = 256


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter *data* with a Butterworth filter.

    Parameters
    ----------
    data : array-like
        Signal samples to filter.
    lowcut, highcut : float
        Lower and upper cutoff frequencies, in the same unit as *fs*.
    fs : float
        Sampling rate of *data*.
    order : int, default 5
        Order passed to ``scipy.signal.butter``. Higher orders give sharper
        cutoffs but can distort the signal.

    Returns
    -------
    numpy.ndarray
        Output of ``scipy.signal.lfilter`` applied to *data*.
    """
    # Cutoffs are expressed as fractions of the Nyquist frequency (half the sampling rate)
    nyquist = 0.5 * fs
    band = [lowcut / nyquist, highcut / nyquist]
    numerator, denominator = butter(order, band, btype='band')
    return lfilter(numerator, denominator, data)

# Apply the band-pass filter to the sensor values of one group
def apply_filter(df):
    """Return a copy of *df* with a 'filtered_sensor_value' column added.

    The new column is the 'sensor value' column passed through
    ``butter_bandpass_filter`` with cutoffs of 1 Hz and 30 Hz at sampling
    rate ``FS``. The input frame itself is left untouched.
    """
    # Work on a copy: groupby.apply does not support functions that mutate
    # the group object they receive.
    df = df.copy()
    df['filtered_sensor_value'] = butter_bandpass_filter(df['sensor value'], 1, 30, FS)
    return df

# Filter each (subject, trial, sensor) series on its own, for the alcoholic
# frame (EEG_data) and the control frame (EEG_data_control)
EEG_data_filtered = (
    EEG_data
    .groupby(['name', 'trial number', 'sensor position'])
    .apply(apply_filter)
    .reset_index(drop=True)
)
EEG_data_control_filtered = (
    EEG_data_control
    .groupby(['name', 'trial number', 'sensor position'])
    .apply(apply_filter)
    .reset_index(drop=True)
)

plot graph เพื่อแสดงตัวอย่างหลังจากที่ทำการ filter แล้ว

In [None]:
# Plot the mean filtered signal of both groups for selected sensors
def plot_filtered_data(df_alcohol, df_control, sensors):
    """Draw one figure per sensor with one subplot per matching condition.

    Each subplot shows, for every group that has rows for that sensor and
    condition, the mean 'filtered_sensor_value' per 'time' value. The set of
    conditions is taken from *df_alcohol*.
    """
    conditions = list(df_alcohol['matching condition'].unique())
    groups = (
        (df_alcohol, 'Alcohol Group', 'blue'),
        (df_control, 'Control Group', 'orange'),
    )

    for sensor in sensors:
        plt.figure(figsize=(15, 5))

        for panel, condition in enumerate(conditions, start=1):
            plt.subplot(1, len(conditions), panel)

            anything_drawn = False
            for frame, group_label, group_color in groups:
                selected = frame[(frame['sensor position'] == sensor) &
                                 (frame['matching condition'] == condition)]
                if selected.empty:
                    continue
                anything_drawn = True
                selected.groupby('time')['filtered_sensor_value'].mean().plot(
                    label=group_label, color=group_color, linewidth=1.5)

            plt.title(f'Sensor {sensor} - Condition: {condition}')
            plt.xlabel('Time (s)')
            plt.ylabel('Filtered Sensor Value (µV)')

            # A legend is only added when at least one curve was drawn
            if anything_drawn:
                plt.legend()

        plt.tight_layout()
        plt.show()

# Sensors to visualise (replace with the sensor names of interest)
sensors_to_plot = [
    'AF1',
    'FP1',
    'CZ',
]
plot_filtered_data(
    EEG_data_filtered,
    EEG_data_control_filtered,
    sensors_to_plot,
)

การใช้ EEG ในการตรวจจับอาการติดสุราเกิดจากความสามารถในการบันทึกสัญญาณไฟฟ้าที่แสดงออกมาในรูปแบบที่ต่างกันระหว่างผู้ที่ดื่มแอลกอฮอล์และผู้ที่ไม่ดื่ม ซึ่งความถี่ของสัญญาณ EEG ถูกแบ่งออกเป็นกลุ่มต่างๆ คือ 
* **Delta (0.5 - 4 Hz):** เกี่ยวข้องกับการหลับลึกหรือช่วงที่ไม่ได้สติ
* **Theta (4 - 8 Hz):** เกี่ยวข้องกับอาการง่วงนอน, ความรู้สึกผ่อนคลาย และการหลับไม่สนิท รวมถึงการสร้างความจำและความคิดสร้างสรรค์อีกด้วย
* **Alpha (8 - 13 Hz):** เกี่ยวข้องกับความผ่อนคลายหรือช่วงที่รู้สึกสงบ มักพบเมื่อหลับตาและสมองอยู่ในสภาวะพักผ่อน
* **Beta (13 - 30 Hz):** เกี่ยวข้องกับการใช้ความคิด, การใช้สมาธิ, และการแก้ปัญหา มักเกิดขึ้นระหว่างการทำกิจกรรมและการเคลื่อนไหว

In [None]:
# From here on both working frames refer to the filtered data
EEG_data, EEG_data_control = EEG_data_filtered, EEG_data_control_filtered

ต่อไปจะทำการแปลง sensor position ให้อยู่ในรูปแบบของ 10/10 System

In [None]:
# Preview of the alcoholic-group frame before the sensor positions are renamed
EEG_data.head() # Before

In [None]:
# Standardize the sensor-position naming convention for both groups

# Sensor label found in the data -> label used from here on
SENSOR_POSITION_RENAMES = {
    'AF1': 'AF3',
    'AF2': 'AF4',
    'PO1': 'PO3',
    'PO2': 'PO4',
    'FP1': 'Fp1',
    'FP2': 'Fp2',
    'CPZ': 'CPz',
    'FZ': 'Fz',
    'CZ': 'Cz',
    'PZ': 'Pz',
    'FPZ': 'Fpz',
    'AFZ': 'AFz',
    'FCZ': 'FCz',
    'POZ': 'POz',
    'OZ': 'Oz',
}


def _standardize_eeg_frame(df):
    """Return a cleaned copy of *df*.

    Drops the 'Unnamed: 0' column when present, removes the trailing comma
    from the 'S2 nomatch,' matching condition, and renames sensor positions
    according to ``SENSOR_POSITION_RENAMES``.
    """
    # errors='ignore' keeps the cell re-runnable once the column is already gone;
    # drop() returns a new frame, so the assignments below do not touch the input.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    df.loc[df['matching condition'] == 'S2 nomatch,', 'matching condition'] = 'S2 nomatch'
    for old_name, new_name in SENSOR_POSITION_RENAMES.items():
        df.loc[df['sensor position'] == old_name, 'sensor position'] = new_name
    return df


# Alcoholic group
EEG_data = _standardize_eeg_frame(EEG_data)

# Control group
EEG_data_control = _standardize_eeg_frame(EEG_data_control)
EEG_data_control.head() # After

In [None]:
# Persist both cleaned frames as CSV files in the working directory
EEG_data.to_csv(path_or_buf='EEG_data.csv')
EEG_data_control.to_csv(path_or_buf='EEG_data_control.csv')

In [None]:
# Split each group by the value of its 'matching condition' column

# Alcoholic group
Alc_S1Obj = EEG_data.loc[EEG_data['matching condition'].eq('S1 obj')]
Alc_S2Match = EEG_data.loc[EEG_data['matching condition'].eq('S2 match')]
Alc_S2Nomatch = EEG_data.loc[EEG_data['matching condition'].eq('S2 nomatch')]

# Control group
Con_S1Obj = EEG_data_control.loc[EEG_data_control['matching condition'].eq('S1 obj')]
Con_S2Match = EEG_data_control.loc[EEG_data_control['matching condition'].eq('S2 match')]
Con_S2Nomatch = EEG_data_control.loc[EEG_data_control['matching condition'].eq('S2 nomatch')]


def trial_len_integrity_check(inputDF, expected_rows=16384):
    """Verify the row count of every trial and return the trial index.

    Parameters
    ----------
    inputDF : pandas.DataFrame
        Frame with at least the columns 'trial number' and 'name'.
    expected_rows : int, default 16384
        Number of rows every ('trial number', 'name') group must contain.

    Returns
    -------
    pandas.DataFrame
        One row per ('trial number', 'name') group, with exactly those two
        columns.

    Raises
    ------
    ValueError
        If at least one group does not contain *expected_rows* rows.
    """
    group_sizes = (
        inputDF.groupby(['trial number', 'name'])
        .size()
        .reset_index(name='count')
    )

    wrong_size = group_sizes[group_sizes['count'] != expected_rows]
    if not wrong_size.empty:
        # The exception must actually be raised; merely constructing it
        # would let corrupted trials pass silently.
        raise ValueError(
            f'Data integrity problem: {len(wrong_size)} (trial number, name) '
            f'group(s) do not contain {expected_rows} rows'
        )

    return group_sizes[['trial number', 'name']]


# Run the integrity check on every condition subset and keep the resulting
# (trial number, name) index frames

# Control group
Con_S1Obj_Index = trial_len_integrity_check(Con_S1Obj)
Con_S2Match_Index = trial_len_integrity_check(Con_S2Match)
Con_S2Nomatch_Index = trial_len_integrity_check(Con_S2Nomatch)

# Alcoholic group
Alc_S1Obj_Index = trial_len_integrity_check(Alc_S1Obj)
Alc_S2Match_Index = trial_len_integrity_check(Alc_S2Match)
Alc_S2Nomatch_Index = trial_len_integrity_check(Alc_S2Nomatch)

# Stack the per-condition indexes of each group with a fresh 0..n-1 row index
All_Alcs = pd.concat(
    [Alc_S1Obj_Index, Alc_S2Match_Index, Alc_S2Nomatch_Index],
    ignore_index=True,
)
All_Cons = pd.concat(
    [Con_S1Obj_Index, Con_S2Match_Index, Con_S2Nomatch_Index],
    ignore_index=True,
)

In [None]:
# Mean sensor value per (time, sensor position) for the alcoholic group.
# NOTE(review): this averages the raw 'sensor value' column, not
# 'filtered_sensor_value' - confirm that is intended.
EEG_data_agg = (
    EEG_data
    .groupby(['time', 'sensor position'])['sensor value']
    .mean()
    .reset_index()
)

# Wide layout: one row per time value, one column per sensor position;
# the 'X', 'Y' and 'nd' columns are discarded
EEG_data_pivoted = (
    EEG_data_agg
    .pivot(index='time', columns='sensor position', values='sensor value')
    .drop(columns=['X', 'Y', 'nd'])
)
EEG_data_pivoted

In [None]:
# Save the pivoted alcoholic-group table (the 'time' index is written as the first column)
EEG_data_pivoted.to_csv(path_or_buf='EEG_data_pivoted.csv')

In [None]:
# Mean sensor value per (time, sensor position) for the control group.
# NOTE(review): this averages the raw 'sensor value' column, not
# 'filtered_sensor_value' - confirm that is intended.
EEG_data_control_agg = (
    EEG_data_control
    .groupby(['time', 'sensor position'])['sensor value']
    .mean()
    .reset_index()
)

# Wide layout: one row per time value, one column per sensor position;
# the 'X', 'Y' and 'nd' columns are discarded
EEG_data_control_pivoted = (
    EEG_data_control_agg
    .pivot(index='time', columns='sensor position', values='sensor value')
    .drop(columns=['X', 'Y', 'nd'])
)
EEG_data_control_pivoted

In [None]:
# Save the pivoted control-group table (the 'time' index is written as the first column)
EEG_data_control_pivoted.to_csv(path_or_buf='EEG_data_control_pivoted.csv')

In [None]:
# Locations of the pivoted tables written earlier
file_path_alcoholic = 'EEG_data_pivoted.csv'
file_path_control = 'EEG_data_control_pivoted.csv'

# Read both tables back; 'time' comes back as an ordinary column
eeg_alcoholic, eeg_control = (
    pd.read_csv(csv_path) for csv_path in (file_path_alcoholic, file_path_control)
)

In [None]:
# Show the column labels of the alcoholic-group table that was read back from CSV
eeg_alcoholic.columns

Plot graph แสดงความแตกต่างระหว่างคนที่ติด กับคนที่ไม่ติด alcohol ในแต่ละ sensor position

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# One scatter figure per sensor column, overlaying both groups against time
sample_sp_column = eeg_alcoholic.columns
for column in sample_sp_column:
    # 'time' is the x axis and 'label' is the class target, not sensor readings
    if column in ('time', 'label'):
        continue

    plt.figure(figsize=(15, 5))
    plt.scatter(eeg_alcoholic['time'], eeg_alcoholic[column],
                label=f'Alcoholic {column}', color='blue', alpha=0.5)
    plt.scatter(eeg_control['time'], eeg_control[column],
                label=f'Control {column}', color='orange', alpha=0.5)
    plt.title(f'EEG Data - Alcoholic Group vs Control Group for {column}')
    plt.xlabel('Time')
    plt.ylabel('Sensor Value')
    # Legend is anchored below the axes
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)
    plt.tight_layout()
    plt.show()

## **Model Training and Fitting**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Class labels: 1 = alcoholic, 0 = control
eeg_alcoholic['label'] = 1
eeg_control['label'] = 0

# Stack both groups into a single table
eeg_combined = pd.concat([eeg_alcoholic, eeg_control], ignore_index=True)

# Target is the label; features are every remaining column except 'time'
y = eeg_combined['label']
X = eeg_combined.drop(columns=['time', 'label'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize using statistics learned from the training split only.
# Standardizing has a significant impact on accuracy.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 7-nearest-neighbours classifier on the scaled training data
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train_scaled, y_train)

# Predict the held-out split
y_pred = knn.predict(X_test_scaled)

## **Model Evaluating**

In [None]:
# Confusion matrix plus the classification report as text and as a table
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).T

# Heatmap of the confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# Tabular classification report
print("Classification Report:", report_df, sep="\n")

## **Classification Report Summary**

In [None]:
from sklearn.metrics import accuracy_score

# Classification report as a nested dict keyed by class label / average name
report_dict = classification_report(y_test, y_pred, output_dict=True)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)

# Metric names read from every report entry, in display order
_METRIC_KEYS = ('precision', 'recall', 'f1-score')

# Per-class metrics
precision_alcoholic, recall_alcoholic, f1_alcoholic = (
    report_dict['1'][key] for key in _METRIC_KEYS
)
precision_control, recall_control, f1_control = (
    report_dict['0'][key] for key in _METRIC_KEYS
)

# Macro and weighted averages
macro_avg_precision, macro_avg_recall, macro_avg_f1 = (
    report_dict['macro avg'][key] for key in _METRIC_KEYS
)
weighted_avg_precision, weighted_avg_recall, weighted_avg_f1 = (
    report_dict['weighted avg'][key] for key in _METRIC_KEYS
)

# Print the whole summary, one entry per line
print(
    "Overall Classification Report Summary:",
    f"Accuracy: {accuracy:.2f}",
    "\nClass 1 (Alcoholic):",
    f"  Precision: {precision_alcoholic:.2f}",
    f"  Recall: {recall_alcoholic:.2f}",
    f"  F1-Score: {f1_alcoholic:.2f}",
    "\nClass 0 (Control):",
    f"  Precision: {precision_control:.2f}",
    f"  Recall: {recall_control:.2f}",
    f"  F1-Score: {f1_control:.2f}",
    "\nMacro Average:",
    f"  Precision: {macro_avg_precision:.2f}",
    f"  Recall: {macro_avg_recall:.2f}",
    f"  F1-Score: {macro_avg_f1:.2f}",
    "\nWeighted Average:",
    f"  Precision: {weighted_avg_precision:.2f}",
    f"  Recall: {weighted_avg_recall:.2f}",
    f"  F1-Score: {weighted_avg_f1:.2f}",
    sep="\n",
)