In [25]:
from dataset_loader import IEMOCAPLoader

TRANSCRIPT_DIR = "C:\\Users\\aryan\\Documents\\Study\\Research\\IEMOCAP_full_release\\Session1\\dialog\\transcriptions"
AUDIO_DIR = "C:\\Users\\aryan\\Documents\\Study\\Research\\Audio_Output"
VIDEO_DIR = "C:\\Users\\aryan\\Documents\\Study\\Research\\Video_Output"
LABEL_FILE = "C:\\Users\\aryan\\Documents\\Study\\Research\\scene_emotions.csv"

loader = IEMOCAPLoader(TRANSCRIPT_DIR, AUDIO_DIR, VIDEO_DIR, LABEL_FILE)
dataset = loader.get_dataset()

In [26]:
print(f"Loaded {len(dataset)} scenes.")
print("Sample entry:", dataset[0])

Loaded 151 scenes.
Sample entry: {'scene_id': 'Ses01F_impro01', 'transcript': "Excuse me.\nDo you have your forms?\nYeah.\nLet me see them.\nIs there a problem?\nWho told you to get in this line?\nYou did.\nYou were standing at the beginning and you directed me.\nOkay. But I didn't tell you to get in this line if you are filling out this particular form.\nWell what's the problem?  Let me change it.\nThis form is a Z.X.four.\nYou can't--  This is not the line for Z.X.four.  If you're going to fill out the Z.X.four, you need to have a different form of ID.\nWhat?  I'm getting an ID.  This is why I'm here.  My wallet was stolen.\nNo. I need another set of ID to prove this is actually you.\nHow am I supposed to get an ID without an ID?  How does a person get an ID in the first place?\nI don't know.  But I need an ID to pass this form along.  I can't just send it along without an ID.\nI'm here to get an ID.\nNo.  I need another ID, a separate one.\nLike what?  Like a birth certificate?\nA b

In [27]:
from text_embedding import BERTFeatureExtractor

In [28]:
# Extract just the transcript texts
texts = [item["transcript"] for item in dataset]

extractor = BERTFeatureExtractor()
text_embeddings = extractor.extract_features(texts)

print("Extracted features shape:", text_embeddings.shape)

Extracted features shape: torch.Size([151, 125, 768])


In [29]:
video_embedding_paths = [item["video_path"] for item in dataset]
audio_embedding_paths = [item["audio_path"] for item in dataset]
video_embeddings = []
audio_embeddings = []

In [30]:
import os
import numpy as np
import torch

for path in video_embedding_paths:
    if os.path.exists(path):
        try:
            embedding = np.load(path)
            video_embeddings.append(embedding)
            # print(f"Loaded video embedding from: {path} (Shape: {embedding.shape})")
        except Exception as e:
            print(f"Error loading video embedding from {path}: {e}")
    else:
        print(f"Video embedding file not found: {path}")
print("\nLoaded video embeddings")

for path in audio_embedding_paths:
    if os.path.exists(path):
        try:
            embedding = np.load(path)
            audio_embeddings.append(embedding)
            # print(f"Loaded audio embedding from: {path} (Shape: {embedding.shape})")
        except Exception as e:
            print(f"Error loading audio embedding from {path}: {e}")
    else:
        print(f"Audio embedding file not found: {path}")
print("\nLoaded audio embeddings")


Loaded video embeddings

Loaded audio embeddings


In [31]:
print(f"\nSuccessfully loaded {len(video_embeddings)} video embeddings.")
print(f"Successfully loaded {len(audio_embeddings)} audio embeddings.")


Successfully loaded 151 video embeddings.
Successfully loaded 151 audio embeddings.


In [32]:
video_embeddings = torch.from_numpy(np.stack(video_embeddings))
audio_embeddings = torch.from_numpy(np.stack(audio_embeddings))

In [33]:
print(video_embeddings.shape)
print(audio_embeddings.shape)

torch.Size([151, 256, 768])
torch.Size([151, 128, 768])


In [34]:
import torch
from transformers import AutoTokenizer, AutoModel

In [35]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

In [36]:
cls_embedding = model.embeddings.word_embeddings(torch.tensor([tokenizer.cls_token_id]))
sep_embedding = model.embeddings.word_embeddings(torch.tensor([tokenizer.sep_token_id]))

In [37]:
print("Shape of the CLS embedding:", cls_embedding.shape)
print("Shape of the SEP embedding:", sep_embedding.shape)

Shape of the CLS embedding: torch.Size([1, 768])
Shape of the SEP embedding: torch.Size([1, 768])


In [38]:
def concatenate_modalities(cls_token, sep_token, video_tensor, audio_tensor, text_tensor):
    fused = []
    for v, a, t in zip(video_tensor, audio_tensor, text_tensor):
        segments = [
            cls_token,  # (1, 768)
            v,          # (v_len, 768)
            sep_token,  # (1, 768)
            a,          # (a_len, 768)
            sep_token,  # (1, 768)
            t          # (t_len, 768)
        ]
        fused.append(torch.cat(segments, dim=0))  # (total_len, 768)

    return fused

In [39]:
fused_embeddings = concatenate_modalities(cls_embedding, sep_embedding, video_embeddings, audio_embeddings, text_embeddings)

In [40]:
print(fused_embeddings[0].shape)

torch.Size([512, 768])


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel
from torch.optim import AdamW

In [42]:
# ----------------------------
# Dataset class
# ----------------------------
class EmbeddingDataset(Dataset):
    def __init__(self, embeddings, labels):
        """
        embeddings: torch.Tensor [num_samples, 512, 768]
        labels: torch.Tensor [num_samples]
        """
        self.embeddings = embeddings
        self.labels = labels
        print(f"Dataset initialized with {len(self.embeddings)} samples, {len(self.labels)} labels")

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

In [43]:
# ----------------------------
# Model class
# ----------------------------
class BertEmbeddingClassifier(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.bert = AutoModel.from_pretrained("bert-base-uncased")
        self.classifier = nn.Sequential(
            nn.LayerNorm(768),
            nn.Dropout(0.2),
            nn.Linear(768, num_classes)
        )

    def forward(self, inputs_embeds):
        """
        inputs_embeds: [batch_size, 512, 768]
        """
        outputs = self.bert(inputs_embeds=inputs_embeds)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token embedding
        logits = self.classifier(cls_embedding)
        return logits

In [44]:
# ----------------------------
# Training and evaluation functions
# ----------------------------
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch_embeds, batch_labels in dataloader:
        batch_embeds = batch_embeds.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        logits = model(batch_embeds)
        loss = criterion(logits, batch_labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)


def evaluate(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_embeds, batch_labels in dataloader:
            batch_embeds = batch_embeds.to(device)
            batch_labels = batch_labels.to(device)
            logits = model(batch_embeds)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())
    return all_labels, all_preds

In [45]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder

# Load the label file
df = pd.read_csv("C:/Users/aryan/Documents/Study/Research/scene_emotions.csv")  # columns: scene_id, scene_emotion

# Print class distribution
print("Class distribution:\n", df['scene_emotion'].value_counts())

# Create label encoder to convert emotion strings to integer classes
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['scene_emotion'])

print("Classes:", label_encoder.classes_)
labels_tensor = torch.tensor(df['label'].values, dtype=torch.long)

embeddings_tensor = torch.stack(fused_embeddings).detach()

Class distribution:
 scene_emotion
Frustration      61
Excited          31
Neutral state    28
Sadness          17
Anger            10
Happiness         4
Name: count, dtype: int64
Classes: ['Anger' 'Excited' 'Frustration' 'Happiness' 'Neutral state' 'Sadness']


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report, accuracy_score

In [None]:
# ----------------------------
# Main
# ----------------------------
if __name__ == "__main__":
    # Example dummy data: replace with your actual data
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    num_classes = len(set(labels_tensor))
    num_samples = len(labels_tensor)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    all_accuracies = []
    all_f1_scores = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(embeddings_tensor, labels_tensor)):
        print(f"\n--- Fold {fold+1} ---")

        train_embeds = embeddings_tensor[train_idx]
        train_labels = labels_tensor[train_idx]
        test_embeds = embeddings_tensor[test_idx]
        test_labels = labels_tensor[test_idx]

        train_dataset = EmbeddingDataset(train_embeds, train_labels)
        test_dataset = EmbeddingDataset(test_embeds, test_labels)

        train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

        model = BertEmbeddingClassifier(num_classes=num_classes).to(device)
        optimizer = AdamW(model.parameters(), lr=2e-5)
        criterion = nn.CrossEntropyLoss()

        for epoch in range(20):  # You can adjust this
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)

        all_labels, all_preds = evaluate(model, test_loader, device)
        
        # F1 Score
        f1 = f1_score(all_labels, all_preds, average='weighted')
        print(f"Fold {fold+1} Weighted F1-score: {f1:.4f}")
        
        # Accuracy
        accuracy = accuracy_score(all_labels, all_preds)
        print(f"Fold {fold+1} Accuracy: {accuracy:.4f}")

        # Store metrics
        all_accuracies.append(accuracy)
        all_f1_scores.append(f1) 

        print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

    print(f"\nAverage Weighted F1-score across folds: {np.mean(all_accuracies):.4f}")

Using device: cuda

--- Fold 1 ---
Dataset initialized with 120 samples, 120 labels
Dataset initialized with 31 samples, 31 labels




Fold 1 Weighted F1-score: 0.4812
               precision    recall  f1-score   support

        Anger       0.50      0.50      0.50         2
      Excited       0.57      0.57      0.57         7
  Frustration       0.59      0.77      0.67        13
    Happiness       0.00      0.00      0.00         1
Neutral state       0.33      0.20      0.25         5
      Sadness       0.00      0.00      0.00         3

     accuracy                           0.52        31
    macro avg       0.33      0.34      0.33        31
 weighted avg       0.46      0.52      0.48        31


--- Fold 2 ---
Dataset initialized with 121 samples, 121 labels
Dataset initialized with 30 samples, 30 labels
Fold 2 Weighted F1-score: 0.6182
               precision    recall  f1-score   support

        Anger       1.00      0.50      0.67         2
      Excited       0.83      0.83      0.83         6
  Frustration       0.64      0.58      0.61        12
    Happiness       0.00      0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 3 Weighted F1-score: 0.5385
               precision    recall  f1-score   support

        Anger       0.00      0.00      0.00         2
      Excited       1.00      0.50      0.67         6
  Frustration       0.64      0.75      0.69        12
    Happiness       0.00      0.00      0.00         0
Neutral state       0.29      0.33      0.31         6
      Sadness       0.50      0.50      0.50         4

     accuracy                           0.53        30
    macro avg       0.40      0.35      0.36        30
 weighted avg       0.58      0.53      0.54        30


--- Fold 4 ---
Dataset initialized with 121 samples, 121 labels
Dataset initialized with 30 samples, 30 labels
Fold 4 Weighted F1-score: 0.5935
               precision    recall  f1-score   support

        Anger       1.00      0.50      0.67         2
      Excited       0.56      0.83      0.67         6
  Frustration       0.62      0.83      0.71        12
    Happiness       0.00      0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 5 Weighted F1-score: 0.4486
               precision    recall  f1-score   support

        Anger       0.00      0.00      0.00         2
      Excited       0.75      0.50      0.60         6
  Frustration       0.60      0.75      0.67        12
    Happiness       0.00      0.00      0.00         1
Neutral state       0.17      0.17      0.17         6
      Sadness       0.25      0.33      0.29         3

     accuracy                           0.47        30
    macro avg       0.29      0.29      0.29        30
 weighted avg       0.45      0.47      0.45        30


Average Weighted F1-score across folds: 0.5360


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
