

# Important Libraries

In [None]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical


#  CONFIG 

In [None]:

# Root folder of the dataset; every sub-directory is treated as one emotion class.
DATASET_PATH = r"YOUR_DATASET_PATH"
# Sampling rate handed to librosa.load (sr=...) for every clip.
SAMPLE_RATE = 24414
# Number of MFCC coefficients requested per frame (n_mfcc=...).
N_MFCC = 13
# Number of MFCC frames each clip is zero-padded or truncated to.
MAX_LEN = 96
# Fraction of all samples held out as the test set (first split).
TEST_SIZE = 0.2
# Fraction of the remaining training samples used for validation (second split),
# i.e. 0.2 * 0.8 = 16% of the full dataset with the values above.
VAL_SIZE = 0.2
# random_state shared by both train_test_split calls.
SEED = 42

 # Feature Extraction

In [None]:

# Extract a fixed-size MFCC matrix (N_MFCC x MAX_LEN) from every .wav file.
# Each sub-directory of DATASET_PATH is one emotion class.
features = []
labels = []

# Sorted so the class -> index mapping is the same on every run.
emotion_folders = sorted(
    d for d in os.listdir(DATASET_PATH)
    if os.path.isdir(os.path.join(DATASET_PATH, d))
)
emotion_to_label = {emotion: idx for idx, emotion in enumerate(emotion_folders)}
print("Emotion to Label Mapping:", emotion_to_label)

for emotion in emotion_folders:
    emotion_dir = os.path.join(DATASET_PATH, emotion)
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore the seeded train/val/test splits) reproducible.
    for file in sorted(os.listdir(emotion_dir)):
        # Case-insensitive match so "clip.WAV" is not silently skipped.
        if not file.lower().endswith(".wav"):
            continue

        file_path = os.path.join(emotion_dir, file)  # full path needed for loading
        try:
            signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
            mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=N_MFCC)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error processing {file_path}: {e}")
            continue

        # Force every clip to exactly MAX_LEN frames: zero-pad short clips on
        # the right, truncate long ones.
        n_frames = mfccs.shape[1]
        if n_frames < MAX_LEN:
            pad_width = MAX_LEN - n_frames
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :MAX_LEN]

        features.append(mfccs)
        labels.append(emotion_to_label[emotion])

# Fail here with a clear message instead of an obscure error in a later cell.
if not features:
    raise RuntimeError(f"No usable .wav files found under {DATASET_PATH!r}")

# === Convert to numpy arrays ===
X = np.array(features)  # (num_clips, N_MFCC, MAX_LEN)
y = np.array(labels)    # (num_clips,) integer class ids

Emotion to Label Mapping: {'YAF_angry': 0, 'YAF_disgust': 1, 'YAF_fear': 2, 'YAF_happy': 3, 'YAF_neutral': 4, 'YAF_pleasant_surprised': 5, 'YAF_sad': 6}


# Encode labels and convert them to one-hot vectors

In [None]:

# Map the class ids to consecutive integers, then expand them to one-hot rows.
encoder = LabelEncoder()
encoder.fit(y)
y_encoded = encoder.transform(y)
y_cat = to_categorical(y_encoded)

#  Split Data 

In [None]:

# Stage 1: hold out the test set from the full dataset (stratified by class).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_cat,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=y_cat,
)
# Stage 2: carve the validation set out of what is left of the training data,
# so VAL_SIZE is a fraction of the remainder, not of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=VAL_SIZE,
    random_state=SEED,
    stratify=y_train,
)

#  Save as .npy 

In [None]:

# Persist every split so the training cell can run without re-extracting MFCCs.
os.makedirs("processed_npy", exist_ok=True)
for _out_path, _split_array in (
    ("processed_npy/X_train.npy", X_train),
    ("processed_npy/X_val.npy", X_val),
    ("processed_npy/X_test.npy", X_test),
    ("processed_npy/y_train.npy", y_train),
    ("processed_npy/y_val.npy", y_val),
    ("processed_npy/y_test.npy", y_test),
):
    np.save(_out_path, _split_array)

print("\n✅ All MFCC features and labels saved in 'processed_npy/' folder.")



✅ All MFCC features and labels saved in 'processed_npy/' folder.


# GPT-2 model: definition, training and evaluation

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModel

# ==== CONFIG ====
# Hyper-parameters consumed by GPT2EmotionModel.
config = {
    # Hugging Face identifier (or local path) of the backbone.
    "gpt_path": "gpt2",
    # Patch embedding: channel width, Conv1d kernel size and stride.
    "d_model": 128,
    "patch_len": 4,
    "stride": 2,
    "dropout": 0.1,
    # Width of the embeddings fed to the backbone.
    "llm_dim": 768,
    # Update this based on your number of emotion labels.
    "num_classes": 7,
}

# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ==== MODEL COMPONENTS ====
class PatchEmbedding(nn.Module):
    """Convert a multi-channel 1-D signal into a sequence of patch embeddings.

    A strided Conv1d cuts the time axis into overlapping patches; a second
    Conv1d (kernel 3, same padding) refines them. Each convolution is followed
    by BatchNorm1d and ReLU, and dropout is applied to the final sequence.

    Args:
        in_channels: number of input channels C.
        d_model: number of output channels D of both convolutions.
        patch_len: kernel size of the first (patching) convolution.
        stride: stride of the first convolution.
        dropout: dropout probability applied to the output.
    """

    def __init__(self, in_channels, d_model, patch_len, stride, dropout):
        super().__init__()
        patchify = [
            nn.Conv1d(in_channels, d_model, kernel_size=patch_len, stride=stride),
            nn.BatchNorm1d(d_model),
            nn.ReLU(),
        ]
        refine = [
            nn.Conv1d(d_model, d_model, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(d_model),
            nn.ReLU(),
        ]
        # Layer order (and hence the state_dict keys proj.0 ... proj.5) is fixed.
        self.proj = nn.Sequential(*patchify, *refine)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape (B, C, L) to patch embeddings of shape (B, L', D)."""
        channels_first = self.proj(x)                # (B, D, L')
        sequence_first = channels_first.permute(0, 2, 1)  # (B, L', D)
        return self.dropout(sequence_first)

class ReprogrammingLayer(nn.Module):
    """Two-layer MLP that lifts patch embeddings to the LLM embedding width.

    Args:
        d_model: width of the incoming patch embeddings.
        d_llm: width of the embeddings expected by the language model.
        dropout: dropout probability applied to the projected output.
    """

    def __init__(self, d_model, d_llm, dropout=0.1):
        super().__init__()
        hidden = nn.Linear(d_model, d_model)
        lift = nn.Linear(d_model, d_llm)
        # Kept as a Sequential so parameter names stay proj.0.* and proj.2.*.
        self.proj = nn.Sequential(hidden, nn.ReLU(), lift)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape (B, L', D) to shape (B, L', d_llm)."""
        lifted = self.proj(x)
        return self.dropout(lifted)

class GPT2EmotionModel(nn.Module):
    """Emotion classifier that runs patch-embedded features through a frozen LLM.

    Pipeline: PatchEmbedding -> ReprogrammingLayer -> frozen backbone (fed via
    ``inputs_embeds``) -> average over the sequence axis -> linear classifier.
    Only the patch embedding, reprogramming layer and classifier are trainable.

    Args:
        config: dict with the keys 'gpt_path', 'd_model', 'patch_len',
            'stride', 'dropout', 'llm_dim' and 'num_classes'.
        in_channels: number of input feature channels (default 13).

    Raises:
        ValueError: if ``config['llm_dim']`` differs from the hidden size
            reported by the loaded backbone.

    Note:
        The constructor does not move anything to a device; call
        ``.to(device)`` on the whole model so all sub-modules end up together.
    """

    def __init__(self, config, in_channels=13):
        super().__init__()
        # Not used by forward() (the model consumes embeddings, not text);
        # kept so existing code that reads ``model.tokenizer`` keeps working.
        self.tokenizer = AutoTokenizer.from_pretrained(config['gpt_path'])
        self.llm = AutoModel.from_pretrained(config['gpt_path'])

        # Catch a mismatched 'llm_dim' up front instead of a shape error
        # deep inside the backbone's first forward pass.
        hidden_size = getattr(self.llm.config, 'hidden_size', None)
        if hidden_size is not None and hidden_size != config['llm_dim']:
            raise ValueError(
                f"config['llm_dim']={config['llm_dim']} does not match the "
                f"hidden size {hidden_size} of '{config['gpt_path']}'"
            )

        # Freeze the backbone: gradients still flow *through* it to the
        # reprogramming layer, but its weights are never updated.
        for param in self.llm.parameters():
            param.requires_grad = False
        self.llm.eval()

        self.patch_embed = PatchEmbedding(in_channels, config['d_model'],
                                          config['patch_len'], config['stride'], config['dropout'])
        self.reprogram = ReprogrammingLayer(config['d_model'], config['llm_dim'], config['dropout'])
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Linear(config['llm_dim'], config['num_classes'])

    def train(self, mode=True):
        """Switch train/eval mode while keeping the frozen backbone in eval mode.

        Without this, ``model.train()`` would re-enable the backbone's dropout
        even though its weights are frozen.
        """
        super().train(mode)
        self.llm.eval()
        return self

    def forward(self, x):
        """Return class logits of shape (B, num_classes) for x of shape (B, C, L)."""
        x = self.patch_embed(x)  # (B, L', D)
        x = self.reprogram(x)    # (B, L', LLM_DIM)
        # No torch.no_grad() here: gradients must reach the layers above.
        out = self.llm(inputs_embeds=x).last_hidden_state  # (B, L', LLM_DIM)
        pooled = self.pool(out.transpose(1, 2)).squeeze(-1)  # (B, LLM_DIM)
        return self.classifier(pooled)

# ==== DATASET ====
class EmotionDataset(Dataset):
    """In-memory dataset of fixed-size feature matrices and their class ids.

    Args:
        X: array-like of shape (N, C, L), or (N, C, L, 1) with a trailing
            singleton channel axis that is dropped when a sample is fetched.
        y: labels, either one-hot / score rows of shape (N, num_classes)
            (reduced with argmax) or integer class ids of shape (N,).
        train: when True, Gaussian noise is added to every fetched sample.
        noise_std: standard deviation of that noise (default 0.01).

    Raises:
        ValueError: if X and y hold a different number of samples.
    """

    def __init__(self, X, y, train=True, noise_std=0.01):
        self.X = torch.tensor(X, dtype=torch.float32)
        # Only a genuine trailing channel axis is squeezed; a 3-D input whose
        # time axis happens to have length 1 must keep its shape.
        self._squeeze_last = self.X.dim() == 4 and self.X.size(-1) == 1

        labels = np.asarray(y)
        if labels.ndim > 1:
            labels = np.argmax(labels, axis=1)
        self.y = torch.tensor(labels, dtype=torch.long)

        if len(self.X) != len(self.y):
            raise ValueError(
                f"X has {len(self.X)} samples but y has {len(self.y)} labels"
            )

        self.train = train
        self.noise_std = noise_std

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return (features, label); features have shape (C, L)."""
        x = self.X[idx]
        if self._squeeze_last:
            x = x.squeeze(-1)
        if self.train:
            # Light noise augmentation, applied freshly on every access.
            x = x + torch.randn_like(x) * self.noise_std
        return x, self.y[idx]

# ==== LOAD DATA ====
# Each split is loaded from disk, wrapped in a dataset and given its own loader.

# --- training split: noise augmentation on, batches reshuffled every epoch ---
X_train = np.load("processed_npy/X_train.npy")
y_train = np.load("processed_npy/y_train.npy")
train_ds = EmotionDataset(X_train, y_train, train=True)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

# --- validation split: no augmentation, fixed order ---
X_val = np.load("processed_npy/X_val.npy")
y_val = np.load("processed_npy/y_val.npy")
val_ds = EmotionDataset(X_val, y_val, train=False)
val_loader = DataLoader(val_ds, batch_size=32)

# --- test split: no augmentation, fixed order ---
X_test = np.load("processed_npy/X_test.npy")
y_test = np.load("processed_npy/y_test.npy")
test_ds = EmotionDataset(X_test, y_test, train=False)
test_loader = DataLoader(test_ds, batch_size=32)

# ==== TRAINING ====
model = GPT2EmotionModel(config).to(DEVICE)

# The backbone is frozen, so only the remaining parameters need optimizer
# state and gradient clipping.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=1e-4, weight_decay=1e-2)
# Halve the learning rate every 15 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.5)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

for epoch in range(50):
    # ---- one pass over the training data ----
    model.train()
    train_loss, train_correct, total = 0.0, 0, 0
    # Named batch_x / batch_y so the loop does not overwrite the notebook-level
    # label array `y` created by the preprocessing cells.
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(DEVICE), batch_y.to(DEVICE)
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
        optimizer.step()
        # Weight by batch size so the epoch mean is per-sample, even when the
        # last batch is smaller.
        train_loss += loss.item() * batch_x.size(0)
        train_correct += (outputs.argmax(1) == batch_y).sum().item()
        total += batch_y.size(0)

    # ---- validation accuracy ----
    model.eval()
    val_correct = 0
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x, batch_y = batch_x.to(DEVICE), batch_y.to(DEVICE)
            outputs = model(batch_x)
            val_correct += (outputs.argmax(1) == batch_y).sum().item()

    # max(..., 1) avoids a ZeroDivisionError when a split is empty.
    n_train = max(total, 1)
    n_val = max(len(val_ds), 1)
    print(f"Epoch {epoch+1} | Train Loss: {train_loss/n_train:.4f} | Train Acc: {train_correct/n_train:.4f} | Val Acc: {val_correct/n_val:.4f}")
    scheduler.step()

# ==== FINAL EVALUATION ====
# Predict a class id for every test sample; test_loader is not shuffled, so
# predictions line up with the rows of y_test.
model.eval()
with torch.no_grad():
    batch_predictions = [
        model(x.to(DEVICE)).argmax(1).cpu().numpy() for x, _ in test_loader
    ]
y_pred = [pred for batch in batch_predictions for pred in batch]

# y_test is stored one-hot; recover the integer class ids.
y_true = np.argmax(y_test, axis=1)
print("\n==== Test Evaluation ====")
print("Accuracy:", accuracy_score(y_true, y_pred))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_true, y_pred, average='weighted'))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Epoch 1 | Train Acc: 0.1406 | Val Acc: 0.1384
Epoch 2 | Train Acc: 0.2321 | Val Acc: 0.3259
Epoch 3 | Train Acc: 0.2467 | Val Acc: 0.5714
Epoch 4 | Train Acc: 0.2958 | Val Acc: 0.4062
Epoch 5 | Train Acc: 0.5714 | Val Acc: 0.5938
Epoch 6 | Train Acc: 0.6116 | Val Acc: 0.7812
Epoch 7 | Train Acc: 0.8103 | Val Acc: 0.7946
Epoch 8 | Train Acc: 0.8527 | Val Acc: 0.8527
Epoch 9 | Train Acc: 0.9129 | Val Acc: 0.9330
Epoch 10 | Train Acc: 0.9520 | Val Acc: 0.9821
Epoch 11 | Train Acc: 0.9665 | Val Acc: 0.9821
Epoch 12 | Train Acc: 0.9777 | Val Acc: 0.9866
Epoch 13 | Train Acc: 0.9877 | Val Acc: 0.9955
Epoch 14 | Train Acc: 0.9944 | Val Acc: 0.9955
Epoch 15 | Train Acc: 0.9911 | Val Acc: 0.9955
Epoch 16 | Train Acc: 0.9933 | Val Acc: 0.9955
Epoch 17 | Train Acc: 0.9944 | Val Acc: 0.9955
Epoch 18 | Train Acc: 0.9955 | Val Acc: 0.9955
Epoch 19 | Train Acc: 0.9944 | Val Acc: 0.9955
Epoch 20 | Train Acc: 0.9967 | Val Acc: 0.9955
Epoch 21 | Train Acc: 0.9955 | Val Acc: 0.9955
Epoch 22 | Train Acc: 