## **DeceptionGame Dataset Analysis Part 1**

In [None]:
import os
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split

#### **Download Data**
This should take some time, the dataset is very large even preprocessed.

In [None]:
!wget -O preprocessed_dataset.zip "https://figshare.com/ndownloader/files/43502097"
!unzip preprocessed_dataset.zip

--2025-03-21 20:35:08--  https://figshare.com/ndownloader/files/43502097
Resolving figshare.com (figshare.com)... 52.17.159.36, 52.30.109.106, 52.49.42.6, ...
Connecting to figshare.com (figshare.com)|52.17.159.36|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/43502097/Preprocessed.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20250321/eu-west-1/s3/aws4_request&X-Amz-Date=20250321T203509Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=91127827496419c9fa2b68cdb19a04522e79fc3cf90fae7bf2e495736e2da1e9 [following]
--2025-03-21 20:35:09--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/43502097/Preprocessed.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20250321/eu-west-1/s3/aws4_request&X-Amz-Date=20250321T203509Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=91127827496419c9fa2b68cdb19a04522e79fc3cf90fae7bf2e495736e2da1e

#### **Understanding Data Structure**
Here, we perform some exploratory commands to get a better grasp of the shape and form of the data. We then load it in and extract the instructed truth and lie data.

In [None]:
# Load .mat file
mat_data = sio.loadmat('/content/Preprocessed/DecisionMaking/Player_sub01_Observer_sub02.mat') #if needed, replace with appropriate path

print(mat_data.keys())

dict_keys(['__header__', '__version__', '__globals__', 'observer', 'player'])


In [None]:
# check the data type of 'observer' and 'player', these should be numpy arrays
print(type(mat_data['observer']))
print(type(mat_data['player']))

# Extract observer and player data
observer_data = mat_data['observer']
player_data = mat_data['player']

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [None]:
type(player_data[0][0][1]) #this should also be a numpy array

numpy.ndarray

In [None]:
x = player_data[0][0][1]
x

array([[[-7.18764830e+00,  1.22919858e+00,  1.81678867e+00, ...,
         -3.17998007e-02,  7.14602321e-03,  4.38691902e+00],
        [-6.71852255e+00,  1.22490931e+00,  2.86546040e+00, ...,
          7.31438875e-01,  1.42970324e+00,  6.39665174e+00],
        [-5.53971100e+00, -1.23877919e+00,  7.66455460e+00, ...,
          8.11694801e-01, -1.58020723e+00,  4.86925459e+00],
        ...,
        [-3.65176725e+00, -9.83069777e-01,  7.05432653e+00, ...,
         -3.05836296e+00, -2.73539591e+00,  4.22301978e-01],
        [-4.28767014e+00, -6.27262235e-01,  3.57845688e+00, ...,
         -2.18507266e+00,  2.50094247e+00,  1.80136287e+00],
        [-1.57916006e-02,  1.03468634e-02,  3.39718556e+00, ...,
         -2.53226328e+00, -4.62264919e+00,  1.25445604e+00]],

       [[-5.54400253e+00, -1.55448639e+00, -3.51589417e+00, ...,
         -5.55847597e+00, -2.27181697e+00, -1.18284665e-01],
        [-6.30074787e+00, -5.78233778e-01,  5.55397391e-01, ...,
         -3.71414137e+00,  6.15843654e

In [None]:
data_dir = "/content/Preprocessed/DecisionMaking"
mat_files = [f for f in os.listdir(data_dir) if f.endswith(".mat")]


all_x, all_y = [], []


for file in mat_files:
    mat_data = sio.loadmat(os.path.join(data_dir, file))

    player_data = mat_data['player'][0, 0]

    # Identify Instructed Lie and Instructed Truth trials
    instructed_lie_trials = np.where(player_data['y'][1, :] == 1)[0]
    instructed_truth_trials = np.where(player_data['y'][2, :] == 1)[0]


    # Extract EEG data for selected trials and specific channel
    x_lie = player_data['x'][:, :, instructed_lie_trials]  # Shape: (timepoints, num of channels, trials)
    x_truth = player_data['x'][:, :, instructed_truth_trials]  # Shape: (timepoints,num of channels, trials)
    #min_trials = min(x_lie.shape[2], x_truth.shape[2])

    x = np.concatenate([x_lie, x_truth], axis=2)  # Combine along trials axis - 2
    y = np.concatenate([
        np.zeros(len(instructed_lie_trials)),
        np.ones(len(instructed_truth_trials))
    ])

    all_x.append(x)
    all_y.append(y)


In [None]:
X_data = np.concatenate(all_x, axis=2).transpose(2, 1, 0)  # Combine trials/ samples, channels, time
y_data = np.concatenate(all_y)  # Combine labels

### **Model Development**
#### **Random Forest**
We choose random forest as the first model because of its affinity for high-dimensional data and noise handling.

In [10]:
# Random Forest Model
import numpy as np
from scipy.stats import skew, kurtosis
from scipy.signal import welch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


def extract_features(data_X):
    """
    Extracts statistical and frequency domain features from EEG signals.
    """
    num_samples, num_channels, num_timesteps = data_X.shape
    features = []

    for sample in range(num_samples):
        sample_features = []
        for ch in range(num_channels):
            signal = data_X[sample, ch, :]

            # Statistical features
            sample_features.extend([
                np.mean(signal),
                np.std(signal),
                skew(signal),
                kurtosis(signal)
            ])

            # Frequency domain features using Welch’s method
            freqs, psd = welch(signal, fs=100, nperseg=100)  # Assuming EEG sampled at 256 Hz
            power_bands = [
                np.mean(psd[(freqs >= 1) & (freqs < 4)]),  # Delta
                np.mean(psd[(freqs >= 4) & (freqs < 8)]),  # Theta
                np.mean(psd[(freqs >= 8) & (freqs < 13)]),  # Alpha
                np.mean(psd[(freqs >= 13) & (freqs < 30)]),  # Beta
                np.mean(psd[(freqs >= 30) & (freqs < 50)])   # Gamma
            ]
            sample_features.extend(power_bands)

        features.append(sample_features)

    return np.array(features)

X_features = extract_features(X_data)
y_labels = y_data

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, random_state=42)

# Train Random Forest
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Random Forest Accuracy: {accuracy:.2f}")

Random Forest Accuracy: 0.57


In [11]:
# Count occurrences of 0 and 1
num_zeros = np.sum(y_data == 0)
num_ones = np.sum(y_data == 1)

print(f"Number of 0s: {num_zeros}")
print(f"Number of 1s: {num_ones}")

Number of 0s: 3047
Number of 1s: 2782


#### ERP With MLP Model Development

First, we average EEG to compute ERP, which will simplify analysis and suppress randomness. Then, we run the next model, MLP, which we chose because it can model complex relationships between ERP signals and class labels without requiring large datasets. We convert ERP features and labels into tensors, then split with an 80/20 split with batch size 8.

The architeture of the model is as follows:

Layer 1: 4 -> 64

ReLU

Layer 2: 64 -> 32

ReLU

Layer 3: 32 -> 1

Sigmoid

In [12]:
from scipy.signal import butter, filtfilt

def bandpass_filter(data, lowcut=1, highcut=30, fs=100, order=5):
    nyquist = 0.5 * fs
    low = lowcut / nyquist
    high = highcut / nyquist
    b, a = butter(order, [low, high], btype='band')
    return filtfilt(b, a, data, axis=0)  # Apply along time point axis

# Apply filtering
X_data_filtered = bandpass_filter(X_data)

In [13]:
X_truth = X_data_filtered[y_data == 1]
X_lie = X_data_filtered[y_data == 0]

# Compute mean ERP for each condition
erp_truth = np.mean(X_truth, axis=0)  # Shape: num_channels, timepoints
erp_lie = np.mean(X_lie, axis=0)

In [14]:
import numpy as np

def smooth_erp(erp_data, window_size=10):
    """
    Further smooth the erp_data. you guys can modify the window_size
    """
    num_chnl, num_times = erp_data.shape
    smoothed_erp = np.zeros_like(erp_data)

    for ch in range(num_chnl):
        smoothed_signal = np.convolve(erp_data[ch, :], np.ones(window_size)/window_size, mode='same')
        smoothed_erp[ch, :] = smoothed_signal
    return smoothed_erp  # Output shape: num_channels, timepoints


# Apply smoothing
erp_truth_smooth = smooth_erp(erp_truth)
erp_lie_smooth = smooth_erp(erp_lie)

In [15]:
fs = 100  # All data is 100 Hz

erp_windows = {
    "seg1": (15, 24),
    "seg2": (25, 35),
    "seg3": (35, 45),
    "seg4": (45, 60),
}

def extract_erp_features(erp_data):
    features = []
    for comp, (start, end) in erp_windows.items():
        mean_amplitude = np.mean(erp_data[:, start:end], axis=1)  # Mean per component
        features.append(mean_amplitude)
    return np.array(features).T  # Shape: (num_channels, num_components)

# Extract ERP features
features_truth = extract_erp_features(erp_truth_smooth)
features_lie = extract_erp_features(erp_lie_smooth)

# Concatenate
X_features = np.concatenate([features_truth, features_lie], axis=0)  # Shape: (num_samples, num_features)
y_labels = np.concatenate([np.ones(len(features_truth)), np.zeros(len(features_lie))])

In [16]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Convert your ERP features and labels into tensors
X_features_tensor = torch.tensor(X_features, dtype=torch.float32)  # (60, 4)
y_labels_tensor = torch.tensor(y_labels, dtype=torch.float32).unsqueeze(1)  # (60, 1)

# Train-test split
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = train_test_split(
    X_features_tensor, y_labels_tensor, test_size=0.2, random_state=42
)

# Create DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# Define MLP model
class MLPModel(nn.Module):
    def __init__(self, input_dim=4, hidden_dim=64):
        super(MLPModel, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

# Initialize
model = MLPModel()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")

# Evaluation
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        preds = model(xb).cpu()
        all_preds.append(preds)
        all_labels.append(yb)

y_pred = torch.cat(all_preds).numpy()
y_true = torch.cat(all_labels).numpy()
y_pred_labels = (y_pred > 0.5).astype(int)

print("Accuracy:", accuracy_score(y_true, y_pred_labels))
print(classification_report(y_true, y_pred_labels))

Epoch 1/50, Loss: 0.6976
Epoch 2/50, Loss: 0.6932
Epoch 3/50, Loss: 0.6925
Epoch 4/50, Loss: 0.6921
Epoch 5/50, Loss: 0.6915
Epoch 6/50, Loss: 0.6912
Epoch 7/50, Loss: 0.6913
Epoch 8/50, Loss: 0.6907
Epoch 9/50, Loss: 0.6905
Epoch 10/50, Loss: 0.6906
Epoch 11/50, Loss: 0.6906
Epoch 12/50, Loss: 0.6903
Epoch 13/50, Loss: 0.6901
Epoch 14/50, Loss: 0.6901
Epoch 15/50, Loss: 0.6899
Epoch 16/50, Loss: 0.6899
Epoch 17/50, Loss: 0.6898
Epoch 18/50, Loss: 0.6900
Epoch 19/50, Loss: 0.6899
Epoch 20/50, Loss: 0.6899
Epoch 21/50, Loss: 0.6898
Epoch 22/50, Loss: 0.6902
Epoch 23/50, Loss: 0.6902
Epoch 24/50, Loss: 0.6899
Epoch 25/50, Loss: 0.6898
Epoch 26/50, Loss: 0.6899
Epoch 27/50, Loss: 0.6898
Epoch 28/50, Loss: 0.6898
Epoch 29/50, Loss: 0.6899
Epoch 30/50, Loss: 0.6901
Epoch 31/50, Loss: 0.6896
Epoch 32/50, Loss: 0.6898
Epoch 33/50, Loss: 0.6899
Epoch 34/50, Loss: 0.6898
Epoch 35/50, Loss: 0.6898
Epoch 36/50, Loss: 0.6897
Epoch 37/50, Loss: 0.6897
Epoch 38/50, Loss: 0.6901
Epoch 39/50, Loss: 0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### **K-Fold Cross Validation**
We use a K-Fold Cross Validation to verify accuracy percentage between different subsets of training and test data.

In [17]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Prepare tensors
X_features_tensor = torch.tensor(X_features, dtype=torch.float32)  # Shape: (60, 4)
y_labels_tensor = torch.tensor(y_labels, dtype=torch.float32).unsqueeze(1)  # Shape: (60, 1)

# Define MLP model
class MLPModel(nn.Module):
    def __init__(self, input_dim=4, hidden_dim=64):
        super(MLPModel, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

# Settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.BCELoss()
num_epochs = 50
batch_size = 8
k_folds = 5

# Stratified K-Fold setup
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
y_np = y_labels_tensor.squeeze().numpy()

all_fold_accuracies = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X_features, y_np)):
    print(f"\n Fold {fold + 1}/{k_folds}")

    # Split the data
    X_train, X_test = X_features_tensor[train_idx], X_features_tensor[test_idx]
    y_train, y_test = y_labels_tensor[train_idx], y_labels_tensor[test_idx]

    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Initialize a fresh model
    model = MLPModel().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Train loop
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        # Uncomment to monitor training loss
        # print(f"Epoch {epoch+1}, Loss: {epoch_loss / len(train_loader):.4f}")

    # Evaluation
    model.eval()
    y_preds, y_trues = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            output = model(xb).cpu()
            y_preds.append(output)
            y_trues.append(yb)

    y_pred = torch.cat(y_preds).numpy()
    y_true = torch.cat(y_trues).numpy()
    y_pred_labels = (y_pred > 0.5).astype(int)

    acc = accuracy_score(y_true, y_pred_labels)
    all_fold_accuracies.append(acc)

    print("Fold Accuracy:", acc)
    print(classification_report(y_true, y_pred_labels, digits=4))

# Summary
print("\Average Accuracy over 5 folds:", np.mean(all_fold_accuracies))


 Fold 1/5
Fold Accuracy: 0.5
              precision    recall  f1-score   support

         0.0     0.5000    1.0000    0.6667         6
         1.0     0.0000    0.0000    0.0000         6

    accuracy                         0.5000        12
   macro avg     0.2500    0.5000    0.3333        12
weighted avg     0.2500    0.5000    0.3333        12


 Fold 2/5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold Accuracy: 0.6666666666666666
              precision    recall  f1-score   support

         0.0     0.7500    0.5000    0.6000         6
         1.0     0.6250    0.8333    0.7143         6

    accuracy                         0.6667        12
   macro avg     0.6875    0.6667    0.6571        12
weighted avg     0.6875    0.6667    0.6571        12


 Fold 3/5
Fold Accuracy: 0.5833333333333334
              precision    recall  f1-score   support

         0.0     1.0000    0.1667    0.2857         6
         1.0     0.5455    1.0000    0.7059         6

    accuracy                         0.5833        12
   macro avg     0.7727    0.5833    0.4958        12
weighted avg     0.7727    0.5833    0.4958        12


 Fold 4/5
Fold Accuracy: 0.5
              precision    recall  f1-score   support

         0.0     0.5000    1.0000    0.6667         6
         1.0     0.0000    0.0000    0.0000         6

    accuracy                         0.5000        12
   macro avg     0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold Accuracy: 0.3333333333333333
              precision    recall  f1-score   support

         0.0     0.3333    0.3333    0.3333         6
         1.0     0.3333    0.3333    0.3333         6

    accuracy                         0.3333        12
   macro avg     0.3333    0.3333    0.3333        12
weighted avg     0.3333    0.3333    0.3333        12

\Average Accuracy over 5 folds: 0.5166666666666667
