In [69]:
from dataset_loader import IEMOCAPLoader

import os

# Root folder that holds the IEMOCAP release and the pre-computed feature folders.
# Set the IEMOCAP_RESEARCH_ROOT environment variable to run the notebook on another
# machine; the default is the original author's location.
RESEARCH_ROOT = os.environ.get(
    "IEMOCAP_RESEARCH_ROOT",
    "C:\\Users\\aryan\\Documents\\Study\\Research",
)

# All inputs are derived from the single root so only one setting has to change.
TRANSCRIPT_DIR = os.path.join(
    RESEARCH_ROOT, "IEMOCAP_full_release", "Session1", "dialog", "transcriptions"
)
AUDIO_DIR = os.path.join(RESEARCH_ROOT, "Audio_Output")
VIDEO_DIR = os.path.join(RESEARCH_ROOT, "Video_Output")
LABEL_FILE = os.path.join(RESEARCH_ROOT, "scene_emotions.csv")

# Build the scene-level dataset; later cells index it like a list of dicts.
loader = IEMOCAPLoader(TRANSCRIPT_DIR, AUDIO_DIR, VIDEO_DIR, LABEL_FILE)
dataset = loader.get_dataset()

In [70]:
# Sanity check: report how many scenes were loaded and show the first entry.
print(f"Loaded {len(dataset)} scenes.")
# NOTE(review): this prints the whole first entry, including its full transcript,
# and presumably fails if the loader returned no scenes -- confirm that is acceptable.
print("Sample entry:", dataset[0])

Loaded 151 scenes.
Sample entry: {'scene_id': 'Ses01F_impro01', 'transcript': "Excuse me.\nDo you have your forms?\nYeah.\nLet me see them.\nIs there a problem?\nWho told you to get in this line?\nYou did.\nYou were standing at the beginning and you directed me.\nOkay. But I didn't tell you to get in this line if you are filling out this particular form.\nWell what's the problem?  Let me change it.\nThis form is a Z.X.four.\nYou can't--  This is not the line for Z.X.four.  If you're going to fill out the Z.X.four, you need to have a different form of ID.\nWhat?  I'm getting an ID.  This is why I'm here.  My wallet was stolen.\nNo. I need another set of ID to prove this is actually you.\nHow am I supposed to get an ID without an ID?  How does a person get an ID in the first place?\nI don't know.  But I need an ID to pass this form along.  I can't just send it along without an ID.\nI'm here to get an ID.\nNo.  I need another ID, a separate one.\nLike what?  Like a birth certificate?\nA b

In [71]:
from text_embedding import BERTFeatureExtractor

In [72]:
# Pull the raw transcript string out of every scene, keeping dataset order.
texts = [scene["transcript"] for scene in dataset]

# Run the project's BERT feature extractor over all transcripts in one call.
extractor = BERTFeatureExtractor()
text_embeddings = extractor.extract_features(texts)

# Report the shape of the extracted text features.
print("Extracted features shape:", text_embeddings.shape)

Extracted features shape: torch.Size([151, 125, 768])


In [73]:
# Per-scene locations of the pre-computed video and audio embedding files,
# in the same order as `dataset`.
video_embedding_paths = [scene["video_path"] for scene in dataset]
audio_embedding_paths = [scene["audio_path"] for scene in dataset]

# Accumulators that the loading cell fills with one array per scene.
video_embeddings, audio_embeddings = [], []

In [74]:
import os
import numpy as np
import torch

def load_embeddings(paths, modality):
    """Load one ``.npy`` embedding per path, keeping the order of ``paths``.

    Every problem (missing file or load error) is printed as it is found.
    If any path failed, a RuntimeError is raised after the whole list has been
    checked: skipping a file would shift every later embedding by one position
    and pair it with the wrong scene in the other modalities and labels.

    Args:
        paths: iterable of file paths, one per scene.
        modality: short lowercase name used in messages, e.g. "video".

    Returns:
        list of numpy arrays, one per path, in input order.

    Raises:
        RuntimeError: if at least one file was missing or unreadable.
    """
    loaded = []
    failed = []
    for path in paths:
        if not os.path.exists(path):
            print(f"{modality.capitalize()} embedding file not found: {path}")
            failed.append(path)
            continue
        try:
            loaded.append(np.load(path))
        except Exception as e:
            print(f"Error loading {modality} embedding from {path}: {e}")
            failed.append(path)

    if failed:
        raise RuntimeError(
            f"{len(failed)} {modality} embedding file(s) could not be loaded; "
            "refusing to continue with misaligned samples."
        )
    return loaded


# Rebinding (instead of appending to the lists from the previous cell) keeps this
# cell idempotent: re-running it cannot duplicate entries.
video_embeddings = load_embeddings(video_embedding_paths, "video")
print("\nLoaded video embeddings")

audio_embeddings = load_embeddings(audio_embedding_paths, "audio")
print("\nLoaded audio embeddings")


Loaded video embeddings

Loaded audio embeddings


In [75]:
# Report how many embeddings of each modality were loaded, one line per modality.
print(
    f"\nSuccessfully loaded {len(video_embeddings)} video embeddings.",
    f"Successfully loaded {len(audio_embeddings)} audio embeddings.",
    sep="\n",
)


Successfully loaded 151 video embeddings.
Successfully loaded 151 audio embeddings.


In [76]:
# Stack each modality's per-scene arrays along a new leading axis and wrap the
# result as a torch tensor. The names are rebound from list to tensor here.
video_embeddings, audio_embeddings = (
    torch.from_numpy(np.stack(per_scene_arrays, axis=0))
    for per_scene_arrays in (video_embeddings, audio_embeddings)
)

In [77]:
# Show the stacked shapes, video first and audio second, one per line.
print(video_embeddings.shape, audio_embeddings.shape, sep="\n")

torch.Size([151, 256, 768])
torch.Size([151, 128, 768])


In [78]:
import torch
from transformers import AutoTokenizer, AutoModel

In [79]:
# Load the pretrained bert-base-uncased tokenizer and encoder. The next cell uses
# them to look up the word-embedding rows of the [CLS] and [SEP] tokens.
# NOTE: the name `model` is rebound later by the cross-validation loop.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

In [80]:
# Look up the word-embedding rows for the [CLS] and [SEP] special tokens.
# The lookup runs under no_grad because these rows are used as constant
# separators: without it every tensor concatenated with them would track
# gradients back to the embedding table of this throwaway encoder.
with torch.no_grad():
    cls_embedding = model.embeddings.word_embeddings(torch.tensor([tokenizer.cls_token_id]))
    sep_embedding = model.embeddings.word_embeddings(torch.tensor([tokenizer.sep_token_id]))

In [81]:
# Each lookup used a single token id, so each result is expected to be one row.
print("Shape of the CLS embedding:", cls_embedding.shape)
print("Shape of the SEP embedding:", sep_embedding.shape)

Shape of the CLS embedding: torch.Size([1, 768])
Shape of the SEP embedding: torch.Size([1, 768])


In [82]:
def concatenate_modalities(cls_token, sep_token, video_tensor, audio_tensor, text_tensor):
    """Fuse the three modalities of every sample into one token sequence.

    Each sample becomes ``[CLS] video [SEP] audio [SEP] text`` along dim 0.

    Args:
        cls_token: tensor of shape (1, dim) placed at the start of every sequence.
        sep_token: tensor of shape (1, dim) placed between modalities.
        video_tensor: iterable of per-sample tensors of shape (v_len, dim).
        audio_tensor: iterable of per-sample tensors of shape (a_len, dim).
        text_tensor: iterable of per-sample tensors of shape (t_len, dim).

    Returns:
        list of tensors, one per sample, each of shape
        (1 + v_len + 1 + a_len + 1 + t_len, dim).

    Raises:
        ValueError: if the modalities do not contain the same number of samples.
            A plain ``zip`` would silently drop the surplus samples instead.
    """
    from itertools import zip_longest

    missing = object()  # sentinel marking a modality that ran out of samples
    fused = []
    triples = zip_longest(video_tensor, audio_tensor, text_tensor, fillvalue=missing)
    for index, (v, a, t) in enumerate(triples):
        if v is missing or a is missing or t is missing:
            raise ValueError(
                "video, audio and text must contain the same number of samples; "
                f"at least one modality has no sample at index {index}"
            )
        segments = [
            cls_token,  # (1, dim)
            v,          # (v_len, dim)
            sep_token,  # (1, dim)
            a,          # (a_len, dim)
            sep_token,  # (1, dim)
            t,          # (t_len, dim)
        ]
        fused.append(torch.cat(segments, dim=0))  # (total_len, dim)

    return fused

In [83]:
# Build one fused [CLS] video [SEP] audio [SEP] text sequence per scene.
fused_embeddings = concatenate_modalities(
    cls_token=cls_embedding,
    sep_token=sep_embedding,
    video_tensor=video_embeddings,
    audio_tensor=audio_embeddings,
    text_tensor=text_embeddings,
)

In [84]:
# Shape of the first fused sequence: its length is the video, audio and text
# lengths plus the three special-token rows added during concatenation.
print(fused_embeddings[0].shape)

torch.Size([512, 768])


In [85]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel
from torch.optim import AdamW
from transformers import BigBirdModel

In [86]:
# ----------------------------
# Dataset class
# ----------------------------
class EmbeddingDataset(Dataset):
    """Pairs pre-computed fused embeddings with their soft labels by index."""

    def __init__(self, embeddings, labels):
        """
        embeddings: torch.Tensor [num_samples, 512, 768]
        labels: torch.Tensor [num_samples]

        Raises:
            ValueError: if embeddings and labels have different sample counts.
                __len__ follows the labels, so a mismatch would otherwise either
                hide surplus embeddings or fail later with an index error.
        """
        if len(embeddings) != len(labels):
            raise ValueError(
                f"embeddings ({len(embeddings)}) and labels ({len(labels)}) "
                "must contain the same number of samples"
            )
        self.embeddings = embeddings
        self.labels = labels
        print(f"Dataset initialized with {len(self.embeddings)} samples, {len(self.labels)} labels")

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the (embedding, label) pair stored at position idx."""
        return self.embeddings[idx], self.labels[idx]

In [87]:
# ----------------------------
# Model class
# ----------------------------
class BertEmbeddingClassifier(nn.Module):
    """Pretrained encoder fed with ready-made embeddings, plus a linear head.

    The encoder receives embeddings directly (``inputs_embeds``) instead of token
    ids; the hidden state at position 0 is passed through
    LayerNorm -> Dropout -> Linear to produce class logits.

    Args:
        num_classes: number of output logits.
        model_name: checkpoint passed to ``AutoModel.from_pretrained``.
        dropout: dropout probability used in the classification head.
    """

    def __init__(self, num_classes=10, model_name="bert-base-uncased", dropout=0.2):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        # Size the head from the loaded encoder rather than a hard-coded 768 so
        # other checkpoints work unchanged.
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Sequential(
            nn.LayerNorm(hidden_size),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, inputs_embeds, attention_mask=None):
        """
        inputs_embeds: [batch_size, seq_len, hidden_size]
        attention_mask: optional [batch_size, seq_len] mask forwarded to the
            encoder; None keeps the encoder's default behaviour.

        Returns logits of shape [batch_size, num_classes].
        """
        outputs = self.bert(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # hidden state at position 0
        logits = self.classifier(cls_embedding)
        return logits

In [88]:
class BigBirdEmbeddingClassifier(nn.Module):
    """BigBird encoder fed with ready-made embeddings, plus a linear head.

    Mirrors ``BertEmbeddingClassifier``: the encoder receives ``inputs_embeds``
    and the hidden state at position 0 is passed through
    LayerNorm -> Dropout -> Linear to produce class logits.

    Args:
        num_classes: number of output logits.
        model_name: checkpoint passed to ``BigBirdModel.from_pretrained``.
        dropout: dropout probability used in the classification head.
    """

    def __init__(self, num_classes=10, model_name="google/bigbird-roberta-base", dropout=0.2):
        super().__init__()
        self.bigbird = BigBirdModel.from_pretrained(model_name)
        # Size the head from the loaded encoder rather than a hard-coded 768.
        hidden_size = self.bigbird.config.hidden_size
        self.classifier = nn.Sequential(
            nn.LayerNorm(hidden_size),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, inputs_embeds, attention_mask=None):
        """
        inputs_embeds: [batch_size, seq_len, hidden_size]
        attention_mask: optional [batch_size, seq_len] mask forwarded to the
            encoder; None keeps the encoder's default behaviour.

        Returns logits of shape [batch_size, num_classes].
        """
        outputs = self.bigbird(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # hidden state at position 0
        logits = self.classifier(cls_embedding)
        return logits

In [89]:
import torch.nn.functional as F

def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    The model's logits are converted to log-probabilities before being handed to
    ``criterion`` together with the batch labels.

    Returns:
        The mean of the per-batch loss values (sum divided by ``len(dataloader)``).
    """
    model.train()
    batch_losses = []
    for features, targets in dataloader:
        features, targets = features.to(device), targets.to(device)

        optimizer.zero_grad()
        # The criterion is given log-probabilities, not raw logits.
        loss = criterion(F.log_softmax(model(features), dim=1), targets)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
    return sum(batch_losses) / len(dataloader)

def evaluate(model, dataloader, device):
    """Collect hard predictions and hard ground-truth classes over ``dataloader``.

    Both the logits and the soft labels are reduced with argmax along dim 1.

    Returns:
        (all_labels, all_preds): two flat lists with one class index per sample.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for features, soft_targets in dataloader:
            logits = model(features.to(device))
            soft_targets = soft_targets.to(device)

            # Highest-scoring class on each side becomes the hard label.
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            expected.extend(soft_targets.argmax(dim=1).cpu().numpy())

    return expected, predicted

def evaluatedist(model, dataloader, device):
    """Compare predicted and ground-truth label distributions over ``dataloader``.

    Sums are accumulated over the whole loader and divided once at the end, so
    every sample has the same weight even when the last batch is smaller than
    the others (averaging per-batch means would over-weight that short batch).

    Args:
        model: module mapping a batch of embeddings to logits [batch, num_classes].
        dataloader: iterable of (embeddings, soft_labels) batches.
        device: device the batches are moved to.

    Returns:
        (avg_kl, avg_mse) as Python floats: KL divergence per sample and mean
        squared error per probability entry.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    model.eval()
    kl_sum = 0.0
    squared_error_sum = 0.0
    sample_count = 0
    element_count = 0

    with torch.no_grad():
        for batch_embeds, batch_labels in dataloader:
            batch_embeds = batch_embeds.to(device)
            batch_labels = batch_labels.to(device)  # [batch_size, num_classes]

            logits = model(batch_embeds)  # [batch_size, num_classes]
            pred_log_probs = F.log_softmax(logits, dim=1)
            pred_probs = torch.softmax(logits, dim=1)

            # KL divergence (predicted log probs vs ground-truth probs), summed
            # so it can be normalised by the total sample count afterwards.
            kl_sum += F.kl_div(pred_log_probs, batch_labels, reduction='sum').item()

            # Squared error between predicted and ground-truth probability vectors.
            squared_error_sum += F.mse_loss(pred_probs, batch_labels, reduction='sum').item()

            sample_count += batch_labels.size(0)
            element_count += batch_labels.numel()

    if sample_count == 0:
        raise ValueError("evaluatedist received a dataloader with no samples")

    avg_kl = kl_sum / sample_count
    avg_mse = squared_error_sum / element_count

    return avg_kl, avg_mse



In [90]:
import torch

# Collect the emotion names from every sample (first-seen order) instead of
# trusting that sample 0 carries the complete key set.
all_emotions = list(dict.fromkeys(
    emo for sample in dataset for emo in sample['emotion_counts']
))

# One row per sample, one column per emotion; an emotion missing from a sample
# counts as zero.
labels_list = [
    [sample['emotion_counts'].get(emo, 0) for emo in all_emotions]
    for sample in dataset
]
labels_tensor = torch.tensor(labels_list, dtype=torch.float)

# The KL-divergence and MSE code treats each row as a probability distribution,
# so normalise rows to sum to 1. Rows that already sum to 1 are unchanged and
# the clamp keeps an all-zero row from producing NaNs.
labels_tensor = labels_tensor / labels_tensor.sum(dim=1, keepdim=True).clamp_min(1e-12)
print("Labels tensor shape:", labels_tensor.shape)

# Stack the per-scene fused sequences into one tensor, cut off from autograd.
embeddings_tensor = torch.stack(fused_embeddings).detach()

# Embeddings and labels are paired by position, so their counts must agree.
if embeddings_tensor.shape[0] != labels_tensor.shape[0]:
    raise ValueError(
        f"{embeddings_tensor.shape[0]} fused embeddings but "
        f"{labels_tensor.shape[0]} label rows"
    )

Labels tensor shape: torch.Size([151, 10])


In [91]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report, accuracy_score

In [None]:
# ----------------------------
# Main
# ----------------------------
if __name__ == "__main__":
    # 5-fold stratified cross-validation: train a fresh classifier on each fold's
    # training split, then report distribution metrics (KL, MSE) and hard-label
    # metrics (weighted F1, accuracy) on the held-out split.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # One output logit per column of the soft-label matrix.
    num_classes = labels_tensor.shape[1]

    # StratifiedKFold needs a single class id per sample, so stratify on the
    # highest-scoring emotion of each soft label.
    hard_labels_for_split = torch.argmax(labels_tensor, dim=1).numpy()
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    all_accuracies = []
    all_f1_scores = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(embeddings_tensor, hard_labels_for_split)):
        print(f"\n--- Fold {fold+1} ---")

        # Seed torch per fold so head initialisation, dropout and batch shuffling
        # are repeatable, matching the fixed random_state of the splitter.
        torch.manual_seed(42 + fold)

        train_embeds = embeddings_tensor[train_idx]
        train_labels = labels_tensor[train_idx]
        test_embeds = embeddings_tensor[test_idx]
        test_labels = labels_tensor[test_idx]

        train_dataset = EmbeddingDataset(train_embeds, train_labels)
        test_dataset = EmbeddingDataset(test_embeds, test_labels)

        train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

        # A new model per fold so no weights leak between folds.
        model = BertEmbeddingClassifier(num_classes=num_classes).to(device)
        optimizer = AdamW(model.parameters(), lr=2e-5)
        criterion = nn.KLDivLoss(reduction="batchmean")  # for soft labels

        for epoch in range(20):  # You can adjust this
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        print(f"Fold {fold+1} final training loss: {train_loss:.4f}")

        all_labels, all_preds = evaluate(model, test_loader, device)

        avg_kl, avg_mse = evaluatedist(model, test_loader, device)
        print(f"KL Divergence: {avg_kl:.4f}, MSE: {avg_mse:.4f}")

        # F1 Score
        f1 = f1_score(all_labels, all_preds, average='weighted')
        print(f"Fold {fold+1} Weighted F1-score: {f1:.4f}")

        # Accuracy
        accuracy = accuracy_score(all_labels, all_preds)
        print(f"Fold {fold+1} Accuracy: {accuracy:.4f}")

        # Store metrics
        all_accuracies.append(accuracy)
        all_f1_scores.append(f1)

        # print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

    # Each summary line averages the list that matches its label; the F1 line
    # previously averaged the accuracies.
    print(f"\nAverage Weighted F1-score across folds: {np.mean(all_f1_scores):.4f}")
    print(f"Average Accuracy across folds: {np.mean(all_accuracies):.4f}")

Using device: cuda

--- Fold 1 ---
Dataset initialized with 120 samples, 120 labels
Dataset initialized with 31 samples, 31 labels




TypeError: unsupported format string passed to list.__format__