In [None]:
!pip install librosa
!pip install transformers
!pip install soundfile
!pip install torch torchaudio


In [None]:
# Mount Google Drive at /content/drive; the dataset and every output folder
# used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


# Speech Pipeline

In [None]:
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
from transformers import BertTokenizer, BertModel

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

In [None]:
import os
from sklearn.model_selection import train_test_split

# Root folder of the recordings on the mounted Drive.
dataset_path = "/content/drive/MyDrive/TESS Toronto emotional speech set data"

# Emotion tag -> list of .wav paths.  The tag is the last "_"-separated token
# of the file name with the ".wav" suffix removed (e.g. "..._angry.wav").
emotion_files = {}

# os.listdir returns entries in arbitrary, filesystem-dependent order.  The
# seeded train/val/test split that consumes these lists depends on their
# order, so both directory levels are sorted to keep the split reproducible
# from one session to the next.
for folder in sorted(os.listdir(dataset_path)):

    folder_path = os.path.join(dataset_path, folder)

    # Only sub-directories are scanned; loose files at the top level are ignored.
    if not os.path.isdir(folder_path):
        continue

    for file in sorted(os.listdir(folder_path)):

        if not file.endswith(".wav"):
            continue

        # Lower-case the tag so case variants of one emotion share a bucket.
        emotion = file.split("_")[-1].replace(".wav", "").lower()

        emotion_files.setdefault(emotion, []).append(os.path.join(folder_path, file))

In [None]:
# Per-split containers; paths and integer labels are index-aligned.
train_paths = []
train_labels = []

val_paths = []
val_labels = []

test_paths = []
test_labels = []

# Emotion tag -> class index.  "ps" shares index 4 with "pleasant_surprise".
emotion_map = {
    "angry":0,
    "disgust":1,
    "fear":2,
    "happy":3,
    "pleasant_surprise":4,
    "ps":4,
    "sad":5,
    "neutral":6
}

# Fail before touching any list if a tag has no class index, instead of
# raising halfway through with the splits only partially filled.
unknown_emotions = sorted(set(emotion_files) - set(emotion_map))
if unknown_emotions:
    raise KeyError(f"Emotion tags without a class index: {unknown_emotions}")

# Each emotion is split on its own: 80 % train, then the remaining 20 % is
# halved into validation and test.  Emotions and file lists are sorted first
# so that the fixed random_state always yields the same split, regardless of
# the order in which the files were discovered.
for emotion in sorted(emotion_files):

    files = sorted(emotion_files[emotion])

    train_files, temp_files = train_test_split(
        files,
        test_size=0.2,
        random_state=42,
        shuffle=True
    )

    val_files, test_files = train_test_split(
        temp_files,
        test_size=0.5,
        random_state=42,
        shuffle=True
    )

    class_index = emotion_map[emotion]

    train_paths.extend(train_files)
    val_paths.extend(val_files)
    test_paths.extend(test_files)

    train_labels.extend([class_index] * len(train_files))
    val_labels.extend([class_index] * len(val_files))
    test_labels.extend([class_index] * len(test_files))

print("Train samples:", len(train_paths))
print("Validation samples:", len(val_paths))
print("Test samples:", len(test_paths))

In [None]:
# Root folder on Drive that holds every artefact produced by this notebook.
project_root = "/content/drive/MyDrive/New_Project_pipeline2"

# One folder per split for the saved HuBERT embeddings.
hubert_train_dir = os.path.join(project_root, "hubert_embeddings_train")
hubert_val_dir = os.path.join(project_root, "hubert_embeddings_val")
hubert_test_dir = os.path.join(project_root, "hubert_embeddings_test")

# Create the folders; existing ones are left untouched.
for _hubert_dir in (hubert_train_dir, hubert_val_dir, hubert_test_dir):
    os.makedirs(_hubert_dir, exist_ok=True)

print("New Project structure ready.")

In [None]:
import torch
import librosa
from transformers import HubertModel, Wav2Vec2FeatureExtractor

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Load the pretrained HuBERT base checkpoint together with the feature
# extractor published under the same model id, and move the model to `device`.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960").to(device)
# Evaluation mode: the model is only used to extract embeddings below.
hubert_model.eval()

In [None]:
def extract_hubert(audio_path):
    """Compute the HuBERT last-layer hidden states for one audio file.

    The audio is loaded with librosa at 16 kHz, run through the module-level
    ``feature_extractor`` and ``hubert_model`` without gradient tracking, and
    the leading batch dimension of ``last_hidden_state`` is squeezed away.

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        The resulting embedding tensor, moved to the CPU.
    """
    target_sr = 16000
    waveform, _ = librosa.load(audio_path, sr=target_sr)

    encoded = feature_extractor(
        waveform,
        sampling_rate=target_sr,
        return_tensors="pt",
        padding=True,
    )

    with torch.no_grad():
        hidden_states = hubert_model(encoded.input_values.to(device)).last_hidden_state

    return hidden_states.squeeze(0).cpu()

In [None]:
from tqdm import tqdm

# Training files whose embedding could not be computed or written.  Their
# .pt file may be missing from hubert_train_dir, so inspect this list before
# running anything that loads the embeddings back.
corrupted = []

for path in tqdm(train_paths):

    try:
        emb = extract_hubert(path)

        # Stored under the audio file's own name with ".wav" replaced by ".pt".
        file_name = path.split("/")[-1].replace(".wav", ".pt")
        torch.save(emb, os.path.join(hubert_train_dir, file_name))

    # `Exception` instead of a bare `except`, so a keyboard interrupt still
    # stops the loop; the reason for each skipped file is reported.
    except Exception as exc:
        corrupted.append(path)
        tqdm.write(f"Skipping {path}: {exc!r}")

print("Corrupted train files:", len(corrupted))

In [None]:
# Validation files whose embedding could not be computed or written.  Their
# .pt file may be missing from hubert_val_dir, so inspect this list before
# running anything that loads the embeddings back.
corrupted_val = []

for path in tqdm(val_paths):

    try:
        emb = extract_hubert(path)

        # Stored under the audio file's own name with ".wav" replaced by ".pt".
        file_name = path.split("/")[-1].replace(".wav", ".pt")
        torch.save(emb, os.path.join(hubert_val_dir, file_name))

    # `Exception` instead of a bare `except`, so a keyboard interrupt still
    # stops the loop; the reason for each skipped file is reported.
    except Exception as exc:
        corrupted_val.append(path)
        tqdm.write(f"Skipping {path}: {exc!r}")

print("Corrupted val files:", len(corrupted_val))

In [None]:
# Test files whose embedding could not be computed or written.  Their .pt
# file may be missing from hubert_test_dir, so inspect this list before
# running anything that loads the embeddings back.
corrupted_test = []

for path in tqdm(test_paths):

    try:
        emb = extract_hubert(path)

        # Stored under the audio file's own name with ".wav" replaced by ".pt".
        file_name = path.split("/")[-1].replace(".wav", ".pt")
        torch.save(emb, os.path.join(hubert_test_dir, file_name))

    # `Exception` instead of a bare `except`, so a keyboard interrupt still
    # stops the loop; the reason for each skipped file is reported.
    except Exception as exc:
        corrupted_test.append(path)
        tqdm.write(f"Skipping {path}: {exc!r}")

print("Corrupted test files:", len(corrupted_test))

In [None]:
def load_train_embedding(path):
    """Load the saved HuBERT embedding that belongs to a training audio file.

    Args:
        path: Original ``.wav`` path; only its last ``/``-separated component
            is used, with ``.wav`` replaced by ``.pt``.

    Returns:
        The object ``torch.load`` reads from ``hubert_train_dir``.
    """
    wav_name = path.rsplit("/", 1)[-1]
    return torch.load(os.path.join(hubert_train_dir, wav_name.replace(".wav", ".pt")))


def load_val_embedding(path):
    """Load the saved HuBERT embedding that belongs to a validation audio file.

    Args:
        path: Original ``.wav`` path; only its last ``/``-separated component
            is used, with ``.wav`` replaced by ``.pt``.

    Returns:
        The object ``torch.load`` reads from ``hubert_val_dir``.
    """
    wav_name = path.rsplit("/", 1)[-1]
    return torch.load(os.path.join(hubert_val_dir, wav_name.replace(".wav", ".pt")))


def load_test_embedding(path):
    """Load the saved HuBERT embedding that belongs to a test audio file.

    Args:
        path: Original ``.wav`` path; only its last ``/``-separated component
            is used, with ``.wav`` replaced by ``.pt``.

    Returns:
        The object ``torch.load`` reads from ``hubert_test_dir``.
    """
    wav_name = path.rsplit("/", 1)[-1]
    return torch.load(os.path.join(hubert_test_dir, wav_name.replace(".wav", ".pt")))

In [None]:
class EmotionBiLSTM(nn.Module):
    """Single-layer bidirectional LSTM classifier with mean pooling over time.

    Args:
        input_size: Feature size of every time step (default 768).
        hidden_size: Hidden size of each LSTM direction (default 128).
        num_classes: Number of output classes (default 7).

    The defaults reproduce the original fixed architecture, so state dicts
    saved from it load unchanged (attribute names ``lstm`` and ``fc`` are kept).
    """

    def __init__(self, input_size=768, hidden_size=128, num_classes=7):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )

        # Both directions are concatenated, hence 2 * hidden_size inputs.
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x, return_features=False):
        """Classify one sequence or a batch of equally long sequences.

        Args:
            x: Either a single sequence of shape (time, input_size) or a batch
                of shape (batch, time, input_size).
            return_features: When true, return the time-averaged LSTM output
                instead of the class logits.

        Returns:
            Logits of shape (batch, num_classes), or pooled features of shape
            (batch, 2 * hidden_size); ``batch`` is 1 for a single sequence.
        """
        # A single sequence gets a batch axis of size 1; input that already
        # carries a batch axis is passed through as is.
        if x.dim() == 2:
            x = x.unsqueeze(0)

        out, _ = self.lstm(x)
        # Average over the time axis.  NOTE: padded time steps in a batch
        # would be averaged in as well; callers must batch equal lengths.
        pooled = out.mean(dim=1)

        if return_features:
            return pooled

        output = self.fc(pooled)
        return output

In [None]:
# Seed torch so the weight initialisation and the per-epoch shuffling below
# are repeatable.
torch.manual_seed(42)

model = EmotionBiLSTM().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

epochs = 20
epoch_losses = []  # mean training loss per epoch
val_losses = []    # mean validation loss per epoch

for epoch in range(epochs):

    # TRAIN
    model.train()
    total_loss = 0.0

    # train_paths is built one emotion after another.  With one sample per
    # optimizer step, walking it in that order feeds the model long runs of a
    # single class, so the visiting order is reshuffled every epoch.
    for sample_idx in torch.randperm(len(train_paths)).tolist():

        path, label = train_paths[sample_idx], train_labels[sample_idx]

        features = load_train_embedding(path).to(device)
        label_tensor = torch.tensor([label], device=device)

        outputs = model(features)
        loss = criterion(outputs, label_tensor)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Per-sample average, so the number is comparable with the validation
    # loss even though the two sets differ in size.
    total_loss /= max(len(train_paths), 1)
    epoch_losses.append(total_loss)

    # VALIDATION
    model.eval()
    total_val_loss = 0.0

    with torch.no_grad():
        for path, label in zip(val_paths, val_labels):

            features = load_val_embedding(path).to(device)
            label_tensor = torch.tensor([label], device=device)

            outputs = model(features)
            loss = criterion(outputs, label_tensor)

            total_val_loss += loss.item()

    total_val_loss /= max(len(val_paths), 1)
    val_losses.append(total_val_loss)

    print(f"Epoch {epoch+1} | Train Loss: {total_loss:.4f} | Val Loss: {total_val_loss:.4f}")

In [None]:
# Persist the trained speech model's parameters (state_dict only) to Drive.
torch.save(model.state_dict(),
           "/content/drive/MyDrive/New_Project_pipeline2/final_hubert_bilstm.pt")

In [None]:
# Output folders on Drive for the pooled BiLSTM features, one per split.
speech_train_pooled_dir = "/content/drive/MyDrive/New_Project_pipeline2/speech_train_pooled"
speech_val_pooled_dir   = "/content/drive/MyDrive/New_Project_pipeline2/speech_val_pooled"
speech_test_pooled_dir  = "/content/drive/MyDrive/New_Project_pipeline2/speech_test_pooled"

for _pooled_dir in (speech_train_pooled_dir, speech_val_pooled_dir, speech_test_pooled_dir):
    os.makedirs(_pooled_dir, exist_ok=True)

model.eval()

# One job per split, processed in the order train, validation, test:
# (audio paths, loader for the saved HuBERT embedding, output folder).
_pooled_jobs = (
    (train_paths, load_train_embedding, speech_train_pooled_dir),
    (val_paths, load_val_embedding, speech_val_pooled_dir),
    (test_paths, load_test_embedding, speech_test_pooled_dir),
)

for _split_paths, _load_embedding, _out_dir in _pooled_jobs:

    for path in tqdm(_split_paths):

        features = _load_embedding(path).to(device)

        # Feature extraction only: no gradients are needed.
        with torch.no_grad():
            pooled = model(features, return_features=True)

        # Saved under the audio file's name with ".wav" replaced by ".pt".
        file_name = path.rsplit("/", 1)[-1].replace(".wav", ".pt")
        torch.save(pooled.cpu(), os.path.join(_out_dir, file_name))

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.manifold import TSNE

import inspect

# =========================
# SET RESULT DIRECTORY
# =========================
results_dir = "/content/drive/MyDrive/New_Project_pipeline2/Results"
os.makedirs(results_dir, exist_ok=True)

# Display names in class-index order (emotion_names[i] names class i).
emotion_names = [
    "angry",
    "disgust",
    "fear",
    "happy",
    "pleasant_surprise",
    "sad",
    "neutral"
]

# Explicit class ids keep the report rows and the confusion-matrix axes
# aligned with emotion_names even when a class never shows up in the test
# labels or in the predictions.
class_ids = list(range(len(emotion_names)))

# =========================
# EVALUATION
# =========================
model.eval()

all_preds = []
all_labels = []
temporal_features = []

for path, label in zip(test_paths, test_labels):

    features = load_test_embedding(path).to(device)

    with torch.no_grad():
        # Run the BiLSTM once and feed the pooled features to the model's
        # final layer, instead of a second full forward pass.
        pooled = model(features, return_features=True)
        outputs = model.fc(pooled)

    pred = torch.argmax(outputs, dim=1).item()

    all_preds.append(pred)
    all_labels.append(label)
    temporal_features.append(pooled.cpu().numpy())

temporal_features = np.vstack(temporal_features)

# =========================
# ACCURACY
# =========================
accuracy = accuracy_score(all_labels, all_preds)
print("Speech Accuracy:", accuracy)

# Save accuracy table
accuracy_df = pd.DataFrame({
    "Model": ["Speech (HuBERT + BiLSTM)"],
    "Accuracy": [accuracy]
})

accuracy_df.to_csv(os.path.join(results_dir, "speech_accuracy.csv"), index=False)

# =========================
# CLASSIFICATION REPORT
# =========================
report = classification_report(
    all_labels,
    all_preds,
    labels=class_ids,
    target_names=emotion_names
)

print(report)

with open(os.path.join(results_dir, "speech_classification_report.txt"), "w") as f:
    f.write(report)

# =========================
# CONFUSION MATRIX
# =========================
cm = confusion_matrix(all_labels, all_preds, labels=class_ids)

plt.figure(figsize=(8,6))
sns.heatmap(cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=emotion_names,
            yticklabels=emotion_names)

plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix - Speech Model")

plt.tight_layout()
plt.savefig(os.path.join(results_dir, "speech_confusion_matrix.png"))
plt.close()

# =============================
# Normalize embeddings first
# =============================
scaler = StandardScaler()
temporal_features_norm = scaler.fit_transform(temporal_features)

# =============================
# t-SNE (improved parameters)
# =============================
# Newer scikit-learn releases renamed TSNE's `n_iter` argument to `max_iter`;
# pick whichever name the installed version accepts.
tsne_iter_kw = "max_iter" if "max_iter" in inspect.signature(TSNE.__init__).parameters else "n_iter"

tsne = TSNE(
    n_components=2,
    perplexity=30,
    learning_rate=200,
    random_state=42,
    init="pca",
    **{tsne_iter_kw: 2000}
)

tsne_2d = tsne.fit_transform(temporal_features_norm)

# =============================
# Plot
# =============================
plt.figure(figsize=(10,8))

labels_array = np.asarray(all_labels)

for i, emo in enumerate(emotion_names):
    # Boolean mask selecting the test samples of class i.
    class_mask = labels_array == i
    plt.scatter(
        tsne_2d[class_mask, 0],
        tsne_2d[class_mask, 1],
        label=emo,
        s=45
    )

plt.title("Temporal Modelling Representation (BiLSTM)")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.legend()
plt.grid(alpha=0.2)

plt.tight_layout()
plt.savefig(os.path.join(results_dir, "speech_tsne.png"), dpi=300)
plt.show()

print("All results saved inside:", results_dir)

# Text Pipeline

In [None]:
def build_text_dataset(paths):
    """Build one carrier sentence per audio path.

    The spoken word is taken as the second ``_``-separated token of the file
    name (the last ``/``-separated component of the path).

    Args:
        paths: Iterable of audio file paths.

    Returns:
        A list with ``"say the word <word>"`` for every path, in input order.
    """
    return [
        "say the word {}".format(audio_path.rsplit("/", 1)[-1].split("_")[1])
        for audio_path in paths
    ]

# One sentence per audio file, index-aligned with the matching *_paths list.
train_texts = build_text_dataset(train_paths)
val_texts   = build_text_dataset(val_paths)
test_texts  = build_text_dataset(test_paths)

# Quick sanity check of the generated sentences.
print(train_texts[:5])

In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained uncased BERT base tokenizer and model; the model is
# moved to `device`.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)

# Evaluation mode: the model is only used to extract embeddings below.
bert_model.eval()

In [None]:
# Root folder on Drive that holds every artefact produced by this notebook.
project_root = "/content/drive/MyDrive/New_Project_pipeline2"

# One folder per split for the saved BERT embeddings.
text_train_dir = os.path.join(project_root, "bert_embeddings_train")
text_val_dir = os.path.join(project_root, "bert_embeddings_val")
text_test_dir = os.path.join(project_root, "bert_embeddings_test")

# Create the folders; existing ones are left untouched.
for _text_dir in (text_train_dir, text_val_dir, text_test_dir):
    os.makedirs(_text_dir, exist_ok=True)

In [None]:
from tqdm import tqdm

def extract_cls(text):
    """Return BERT's last-layer hidden state at the first token position.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    at most 16 tokens) and run through ``bert_model`` without gradient
    tracking.

    Args:
        text: The sentence to embed.

    Returns:
        ``last_hidden_state[:, 0, :]`` as a CPU tensor.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=16,
    ).to(device)

    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state

    # Position 0 of every sequence is kept; the sequence axis is dropped.
    return hidden_states[:, 0, :].cpu()


# ======================
# TRAIN / VALIDATION / TEST embeddings
# ======================
# Every sentence is embedded and stored in its split's folder under the name
# of the matching audio file, with ".wav" replaced by ".pt".  Splits are
# processed in the order train, validation, test.
_text_jobs = (
    (train_texts, train_paths, text_train_dir),
    (val_texts, val_paths, text_val_dir),
    (test_texts, test_paths, text_test_dir),
)

for _split_texts, _split_paths, _out_dir in _text_jobs:

    for text, path in tqdm(zip(_split_texts, _split_paths), total=len(_split_texts)):

        emb = extract_cls(text)

        file_name = path.rsplit("/", 1)[-1].replace(".wav", ".pt")
        torch.save(emb, os.path.join(_out_dir, file_name))

In [None]:
def load_text_train(path):
    """Load the saved BERT embedding that belongs to a training audio file.

    Args:
        path: Original ``.wav`` path; only its last ``/``-separated component
            is used, with ``.wav`` replaced by ``.pt``.

    Returns:
        The object ``torch.load`` reads from ``text_train_dir``.
    """
    wav_name = path.rsplit("/", 1)[-1]
    return torch.load(os.path.join(text_train_dir, wav_name.replace(".wav", ".pt")))


def load_text_val(path):
    """Load the saved BERT embedding that belongs to a validation audio file.

    Args:
        path: Original ``.wav`` path; only its last ``/``-separated component
            is used, with ``.wav`` replaced by ``.pt``.

    Returns:
        The object ``torch.load`` reads from ``text_val_dir``.
    """
    wav_name = path.rsplit("/", 1)[-1]
    return torch.load(os.path.join(text_val_dir, wav_name.replace(".wav", ".pt")))


def load_text_test(path):
    """Load the saved BERT embedding that belongs to a test audio file.

    Args:
        path: Original ``.wav`` path; only its last ``/``-separated component
            is used, with ``.wav`` replaced by ``.pt``.

    Returns:
        The object ``torch.load`` reads from ``text_test_dir``.
    """
    wav_name = path.rsplit("/", 1)[-1]
    return torch.load(os.path.join(text_test_dir, wav_name.replace(".wav", ".pt")))

In [None]:
import torch.nn as nn

class TextEmotionClassifier(nn.Module):
    """A single linear layer mapping an embedding to class logits.

    Args:
        input_dim: Size of the input embedding (default 768).
        num_classes: Number of output classes (default 7).
    """

    def __init__(self, input_dim=768, num_classes=7):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=num_classes)

    def forward(self, x):
        """Return the logits produced by the linear layer for ``x``."""
        logits = self.fc(x)
        return logits

In [None]:
# Seed torch so the weight initialisation and the per-epoch shuffling below
# are repeatable.
torch.manual_seed(42)

text_model = TextEmotionClassifier().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(text_model.parameters(), lr=2e-5)

epochs = 20
epoch_losses_text = []  # mean training loss per epoch
val_losses_text = []    # mean validation loss per epoch

for epoch in range(epochs):

    # ======================
    # TRAIN
    # ======================
    text_model.train()
    total_loss = 0.0

    # train_paths is built one emotion after another.  With one sample per
    # optimizer step, walking it in that order feeds the model long runs of a
    # single class, so the visiting order is reshuffled every epoch.
    for sample_idx in torch.randperm(len(train_paths)).tolist():

        path, label = train_paths[sample_idx], train_labels[sample_idx]

        emb = load_text_train(path).to(device)
        label_tensor = torch.tensor([label], device=device)

        preds = text_model(emb)
        loss = criterion(preds, label_tensor)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Per-sample average, so the number is comparable with the validation
    # loss even though the two sets differ in size.
    total_loss /= max(len(train_paths), 1)
    epoch_losses_text.append(total_loss)

    # ======================
    # VALIDATION
    # ======================
    text_model.eval()
    total_val_loss = 0.0

    with torch.no_grad():
        for path, label in zip(val_paths, val_labels):

            emb = load_text_val(path).to(device)
            label_tensor = torch.tensor([label], device=device)

            preds = text_model(emb)
            loss = criterion(preds, label_tensor)

            total_val_loss += loss.item()

    total_val_loss /= max(len(val_paths), 1)
    val_losses_text.append(total_val_loss)

    print(f"Epoch {epoch+1} | Train Loss: {total_loss:.4f} | Val Loss: {total_val_loss:.4f}")

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

import inspect

results_dir = "/content/drive/MyDrive/New_Project_pipeline2/Results"
os.makedirs(results_dir, exist_ok=True)

# Display names in class-index order (emotion_names[i] names class i).
emotion_names = [
    "angry",
    "disgust",
    "fear",
    "happy",
    "pleasant_surprise",
    "sad",
    "neutral"
]

# Explicit class ids keep the report rows and the confusion-matrix axes
# aligned with emotion_names even when a class never shows up in the test
# labels or in the predictions.
class_ids = list(range(len(emotion_names)))

text_model.eval()

all_preds = []
all_labels = []
context_features = []

for path, label in zip(test_paths, test_labels):

    emb = load_text_test(path).to(device)

    with torch.no_grad():
        outputs = text_model(emb)

    pred = torch.argmax(outputs, dim=1).item()

    all_preds.append(pred)
    all_labels.append(label)
    context_features.append(emb.cpu().numpy())

context_features = np.vstack(context_features)

# Accuracy
accuracy = accuracy_score(all_labels, all_preds)
print("Text Accuracy:", accuracy)

pd.DataFrame({
    "Model": ["Text (BERT CLS)"],
    "Accuracy": [accuracy]
}).to_csv(os.path.join(results_dir, "text_accuracy.csv"), index=False)

# Classification Report
report = classification_report(
    all_labels,
    all_preds,
    labels=class_ids,
    target_names=emotion_names
)
print(report)

with open(os.path.join(results_dir, "text_classification_report.txt"), "w") as f:
    f.write(report)

# Confusion Matrix
cm = confusion_matrix(all_labels, all_preds, labels=class_ids)

plt.figure(figsize=(8,6))
sns.heatmap(cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=emotion_names,
            yticklabels=emotion_names)

plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix - Text Model")
plt.tight_layout()
plt.savefig(os.path.join(results_dir, "text_confusion_matrix.png"))
plt.close()

# =============================
# Normalize CLS embeddings
# =============================
scaler = StandardScaler()
context_features_norm = scaler.fit_transform(context_features)

# =============================
# t-SNE with improved settings
# =============================
# Newer scikit-learn releases renamed TSNE's `n_iter` argument to `max_iter`;
# pick whichever name the installed version accepts.
tsne_iter_kw = "max_iter" if "max_iter" in inspect.signature(TSNE.__init__).parameters else "n_iter"

tsne = TSNE(
    n_components=2,
    perplexity=30,
    learning_rate=200,
    random_state=42,
    init="pca",
    **{tsne_iter_kw: 2000}
)

tsne_2d = tsne.fit_transform(context_features_norm)

# =============================
# Plot
# =============================
plt.figure(figsize=(10,8))

labels_array = np.asarray(all_labels)

for i, emo in enumerate(emotion_names):
    # Boolean mask selecting the test samples of class i.
    class_mask = labels_array == i
    plt.scatter(
        tsne_2d[class_mask, 0],
        tsne_2d[class_mask, 1],
        label=emo,
        s=45
    )

plt.title("Contextual Modelling Representation (BERT CLS)")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.legend()
plt.grid(alpha=0.2)

plt.tight_layout()
plt.savefig(os.path.join(results_dir, "text_tsne.png"), dpi=300)
plt.show()

print("Text pipeline results saved.")

# Fusion Pipeline

In [None]:
# Root folder on Drive that holds every artefact produced by this notebook.
project_root = "/content/drive/MyDrive/New_Project_pipeline2"

# Folders with the pooled speech features, one per split.
speech_train_pooled_dir = os.path.join(project_root, "speech_train_pooled")
speech_val_pooled_dir   = os.path.join(project_root, "speech_val_pooled")
speech_test_pooled_dir  = os.path.join(project_root, "speech_test_pooled")

# Folders with the BERT text embeddings, one per split.
text_train_dir = os.path.join(project_root, "bert_embeddings_train")
text_val_dir   = os.path.join(project_root, "bert_embeddings_val")
text_test_dir  = os.path.join(project_root, "bert_embeddings_test")

In [None]:
def _load_embedding_or_none(directory, path):
    """Load ``<audio name>.pt`` from ``directory``; return None if it is absent.

    The fusion loops skip a sample when either of its embeddings is None, so a
    missing file has to be reported as None rather than raised as an error.
    """
    file_name = path.split("/")[-1].replace(".wav", ".pt")
    full_path = os.path.join(directory, file_name)
    if not os.path.isfile(full_path):
        return None
    return torch.load(full_path)


def load_speech_train(path):
    """Pooled speech features of a training file, or None when not saved."""
    return _load_embedding_or_none(speech_train_pooled_dir, path)

def load_speech_val(path):
    """Pooled speech features of a validation file, or None when not saved."""
    return _load_embedding_or_none(speech_val_pooled_dir, path)

def load_speech_test(path):
    """Pooled speech features of a test file, or None when not saved."""
    return _load_embedding_or_none(speech_test_pooled_dir, path)


def load_text_train(path):
    """BERT embedding of a training file, or None when not saved."""
    return _load_embedding_or_none(text_train_dir, path)

def load_text_val(path):
    """BERT embedding of a validation file, or None when not saved."""
    return _load_embedding_or_none(text_val_dir, path)

def load_text_test(path):
    """BERT embedding of a test file, or None when not saved."""
    return _load_embedding_or_none(text_test_dir, path)

In [None]:
import torch.nn as nn

class FusionEmotionModel(nn.Module):
    """Linear classifier over the concatenation of a speech and a text embedding."""

    def __init__(self):
        super().__init__()

        # 1024 concatenated input features -> 7 class logits.
        self.fc = nn.Linear(1024, 7)

    def forward(self, speech_emb, text_emb, return_features=False):
        """Fuse both embeddings along dim 1 and classify the result.

        Args:
            speech_emb: Speech embedding tensor.
            text_emb: Text embedding tensor.
            return_features: When true, return the concatenated vector
                instead of the logits.

        Returns:
            The logits of ``self.fc``, or the fused tensor itself.
        """
        joint = torch.cat([speech_emb, text_emb], dim=1)
        return joint if return_features else self.fc(joint)

In [None]:
# Seed torch so the weight initialisation and the per-epoch shuffling below
# are repeatable.
torch.manual_seed(42)

fusion_model = FusionEmotionModel().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(fusion_model.parameters(), lr=3e-4)

epochs = 20
epoch_losses_fusion = []  # mean training loss per epoch
val_losses_fusion = []    # mean validation loss per epoch

for epoch in range(epochs):

    # ======================
    # TRAIN
    # ======================
    fusion_model.train()
    total_loss = 0.0
    train_seen = 0  # samples that actually contributed to total_loss

    # train_paths is built one emotion after another.  With one sample per
    # optimizer step, walking it in that order feeds the model long runs of a
    # single class, so the visiting order is reshuffled every epoch.
    for sample_idx in torch.randperm(len(train_paths)).tolist():

        path, label = train_paths[sample_idx], train_labels[sample_idx]

        speech_emb = load_speech_train(path)
        text_emb   = load_text_train(path)

        # Skip samples for which one of the two embeddings is unavailable.
        if speech_emb is None or text_emb is None:
            continue

        speech_emb = speech_emb.to(device)
        text_emb   = text_emb.to(device)

        label_tensor = torch.tensor([label], device=device)

        preds = fusion_model(speech_emb, text_emb)
        loss = criterion(preds, label_tensor)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_seen += 1

    # Per-sample average over the samples actually used, so the number is
    # comparable with the validation loss.
    total_loss /= max(train_seen, 1)
    epoch_losses_fusion.append(total_loss)

    # ======================
    # VALIDATION
    # ======================
    fusion_model.eval()
    total_val_loss = 0.0
    val_seen = 0

    with torch.no_grad():

        for path, label in zip(val_paths, val_labels):

            speech_emb = load_speech_val(path)
            text_emb   = load_text_val(path)

            if speech_emb is None or text_emb is None:
                continue

            speech_emb = speech_emb.to(device)
            text_emb   = text_emb.to(device)

            label_tensor = torch.tensor([label], device=device)

            preds = fusion_model(speech_emb, text_emb)
            loss = criterion(preds, label_tensor)

            total_val_loss += loss.item()
            val_seen += 1

    total_val_loss /= max(val_seen, 1)
    val_losses_fusion.append(total_val_loss)

    print(f"Epoch {epoch+1} | Train Loss: {total_loss:.4f} | Val Loss: {total_val_loss:.4f}")

In [None]:
from sklearn.metrics import accuracy_score, classification_report

fusion_model.eval()

# Predictions, true labels and fused input vectors of the evaluated test samples.
fusion_preds = []
fusion_labels = []
fusion_features = []

for path, label in zip(test_paths, test_labels):

    speech_emb, text_emb = load_speech_test(path), load_text_test(path)

    # Samples without both embeddings are left out of the evaluation.
    if speech_emb is None or text_emb is None:
        continue

    speech_emb = speech_emb.to(device)
    text_emb = text_emb.to(device)

    with torch.no_grad():
        outputs = fusion_model(speech_emb, text_emb)
        # Same concatenation the model classifies, kept for later inspection.
        fused_vector = fusion_model(speech_emb, text_emb, return_features=True)

    fusion_preds.append(torch.argmax(outputs, dim=1).item())
    fusion_labels.append(label)
    fusion_features.append(fused_vector.cpu().numpy())

fusion_features = np.vstack(fusion_features)

acc_fusion = accuracy_score(fusion_labels, fusion_preds)

print("Fusion Accuracy:", acc_fusion)
print("\nClassification Report:\n")
print(classification_report(fusion_labels, fusion_preds))

In [None]:
results_dir = "/content/drive/MyDrive/New_Project_pipeline2/Results"
os.makedirs(results_dir, exist_ok=True)

# Accuracy table: a one-row CSV with the fusion model's test accuracy.
fusion_accuracy_table = pd.DataFrame({
    "Model": ["Fusion (Speech + Text)"],
    "Accuracy": [acc_fusion]
})
fusion_accuracy_table.to_csv(os.path.join(results_dir, "fusion_accuracy.csv"), index=False)

# Classification report written as plain text.
fusion_report_text = classification_report(fusion_labels, fusion_preds)
with open(os.path.join(results_dir, "fusion_classification_report.txt"), "w") as f:
    f.write(fusion_report_text)

In [None]:
# Pass the class ids explicitly so the matrix always has one row and column
# per entry of emotion_names; otherwise a class absent from both labels and
# predictions shrinks the matrix and shifts the tick labels.
cm = confusion_matrix(
    fusion_labels,
    fusion_preds,
    labels=list(range(len(emotion_names)))
)

plt.figure(figsize=(8,6))
sns.heatmap(cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=emotion_names,
            yticklabels=emotion_names)

plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Fusion Confusion Matrix")

plt.tight_layout()
plt.savefig(os.path.join(results_dir, "fusion_confusion_matrix.png"))
plt.close()

In [None]:
fusion_model.eval()

# Raw logits and true labels of every test sample that has both embeddings.
fusion_logits = []
fusion_labels = []

for path, label in zip(test_paths, test_labels):

    speech_emb, text_emb = load_speech_test(path), load_text_test(path)

    if speech_emb is None or text_emb is None:
        continue

    speech_emb = speech_emb.to(device)
    text_emb = text_emb.to(device)

    with torch.no_grad():
        logits = fusion_model(speech_emb, text_emb)

    fusion_labels.append(label)
    fusion_logits.append(logits.cpu().numpy())

# Stack the per-sample rows into one 2-D array.
fusion_logits = np.vstack(fusion_logits)

In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

import inspect

# Standardise every logit dimension before the embedding.
scaler = StandardScaler()
logits_norm = scaler.fit_transform(fusion_logits)

# Newer scikit-learn releases renamed TSNE's `n_iter` argument to `max_iter`;
# pick whichever name the installed version accepts.
tsne_iter_kw = "max_iter" if "max_iter" in inspect.signature(TSNE.__init__).parameters else "n_iter"

tsne = TSNE(
    n_components=2,
    perplexity=30,
    random_state=42,
    **{tsne_iter_kw: 2000}
)

fusion_2d = tsne.fit_transform(logits_norm)

plt.figure(figsize=(10,8))

fusion_labels_array = np.asarray(fusion_labels)

for i, emo in enumerate(emotion_names):
    # Boolean mask selecting the test samples of class i.
    class_mask = fusion_labels_array == i
    plt.scatter(fusion_2d[class_mask, 0], fusion_2d[class_mask, 1], label=emo, s=50)

plt.title("Fusion Decision Space (Logits) t-SNE")
plt.legend()
plt.savefig(os.path.join(results_dir, "fusion_tsne.png"), dpi=300)
plt.show()