In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn

In [None]:
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}



In [48]:
def generate_emotion_transcript(file_path):
    # Extract emotion code
    emotion_code = int(file_path.split("-")[2])
    emotion_map = {
        1: "neutral", 2: "calm", 3: "happy", 4: "sad",
        5: "angry", 6: "fearful", 7: "disgust", 8: "surprised"
    }
    emotion = emotion_map[emotion_code]
    
    # Optional: add more natural sentence stubs if needed
    emotion_text_map = {
        "neutral": "I feel neutral.",
        "calm": "I am calm and composed.",
        "happy": "I am very happy!",
        "sad": "I am feeling sad.",
        "angry": "I am really angry!",
        "fearful": "I am scared and anxious.",
        "disgust": "I feel disgusted.",
        "surprised": "I am so surprised!"
    }

    return emotion_text_map[emotion]


In [53]:
import os
import whisper
import pandas as pd
import torch
import warnings

# Suppress FP16 and CPU inference warnings
warnings.filterwarnings("ignore", message="Performing inference on CPU when CUDA is available")
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

# Load Whisper model
model = whisper.load_model("base", device="cpu").to(torch.float32)

# Directory of audio files
audio_root = "/mnt/e/Epoch_Spring_Camp/MegaTask/data/ravdess"
transcripts = []

# Config
file_count = 0
max_files = 100

# Emotion mappings
emotion_map = {
    1: "neutral", 2: "calm", 3: "happy", 4: "sad",
    5: "angry", 6: "fearful", 7: "disgust", 8: "surprised"
}
emotion_text_map = {
    "neutral": "I feel neutral.",
    "calm": "I am calm and composed.",
    "happy": "I am very happy!",
    "sad": "I am feeling sad.",
    "angry": "I am really angry!",
    "fearful": "I am scared and anxious.",
    "disgust": "I feel disgusted.",
    "surprised": "I am so surprised!"
}

# Traverse RAVDESS directory
for root, _, files in os.walk(audio_root):
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue

        try:
            file_path = os.path.join(root, file)
            parts = file.split("-")
            if len(parts) < 3:
                continue  # Skip bad filenames

            emotion_code = int(parts[2])
            emotion = emotion_map.get(emotion_code, "unknown")

            # Transcribe using Whisper
            result = model.transcribe(file_path)
            text = result["text"].strip()

            # Fallback to synthetic transcript if transcription is empty
            if len(text) < 5:
                text = emotion_text_map.get(emotion, "I have no words.")

            transcripts.append({
                "emotion": emotion,
                "transcript": text
            })
            file_count += 1

            # print(f"[{file_count:03d}] {file} → {emotion} → {text}")

            if file_count >= max_files:
                break

        except Exception as e:
            # print(f"Error processing {file}: {e}")
            continue

    if file_count >= max_files:
        break

# Save to CSV
df = pd.DataFrame(transcripts)
df.to_csv("ravdess_transcripts.csv", index=False)
print(f"\n Saved {file_count} transcripts to ravdess_transcripts.csv")


  checkpoint = torch.load(fp, map_location=device)



 Saved 100 transcripts to ravdess_transcripts.csv


In [56]:
df = pd.read_csv("ravdess_transcripts.csv")
df

Unnamed: 0,emotion,transcript
0,neutral,Kids are talking by the door.
1,neutral,Kids are talking by the door.
2,neutral,Dogs are sitting by the door.
3,neutral,Dogs are sitting by the door.
4,calm,Kids are talking by the door.
...,...,...
95,angry,Dogs are sitting by the door.
96,fearful,Kids are talking by the door.
97,fearful,Kids are talking by the door.
98,fearful,Dogs are sitting by the door.


In [59]:
import pandas as pd

# Load your existing CSV
df = pd.read_csv("ravdess_transcripts.csv")

# Map of additional emotion-based text
emotion_appendix = {
    "neutral": "This sounds neutral.",
    "calm": "The speaker is calm.",
    "happy": "The speaker expresses happiness.",
    "sad": "There is sadness in the voice.",
    "angry": "This speech is filled with anger.",
    "fearful": "Fear is evident in this sentence.",
    "disgust": "This is a voice of disgust.",
    "surprised": "The speaker is surprised."
}

# Modify transcripts
df["augmented_transcript"] = df.apply(
    lambda row: f"{row['transcript']} {emotion_appendix.get(row['emotion'], '')}", axis=1
)

# Save the updated CSV
df.to_csv("ravdess_transcripts.csv", index=False)
print("✅ Updated transcripts saved to ravdess_transcripts.csv")


✅ Updated transcripts saved to ravdess_transcripts.csv


Unnamed: 0,emotion,transcript,augmented_transcript
0,neutral,Kids are talking by the door.,Kids are talking by the door. This sounds neut...
1,neutral,Kids are talking by the door.,Kids are talking by the door. This sounds neut...
2,neutral,Dogs are sitting by the door.,Dogs are sitting by the door. This sounds neut...
3,neutral,Dogs are sitting by the door.,Dogs are sitting by the door. This sounds neut...
4,calm,Kids are talking by the door.,Kids are talking by the door. The speaker is c...
...,...,...,...
95,angry,Dogs are sitting by the door.,Dogs are sitting by the door. This speech is f...
96,fearful,Kids are talking by the door.,Kids are talking by the door. Fear is evident ...
97,fearful,Kids are talking by the door.,Kids are talking by the door. Fear is evident ...
98,fearful,Dogs are sitting by the door.,Dogs are sitting by the door. Fear is evident ...


In [76]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from collections import Counter

nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/adishesh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [73]:
# Tokenize and build vocabulary
def build_vocab(texts, min_freq=1):
    tokens = [word_tokenize(text.lower()) for text in texts]
    flat_tokens = [word for sublist in tokens for word in sublist]
    freq = Counter(flat_tokens)
    vocab = {word: i+2 for i, (word, count) in enumerate(freq.items()) if count >= min_freq}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

# Encode sentence
def encode_sentence(text, vocab, max_len=50):
    tokens = word_tokenize(text.lower())
    ids = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    return ids[:max_len] + [vocab['<PAD>']] * (max_len - len(ids))


In [74]:
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len=50):
        self.vocab = vocab
        self.max_len = max_len
        self.data = [encode_sentence(t, vocab, max_len) for t in texts]
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx]), torch.tensor(self.labels[idx])


In [69]:
df

Unnamed: 0,emotion,transcript,augmented_transcript,label
0,neutral,Kids are talking by the door.,Kids are talking by the door. This sounds neut...,5
1,neutral,Kids are talking by the door.,Kids are talking by the door. This sounds neut...,5
2,neutral,Dogs are sitting by the door.,Dogs are sitting by the door. This sounds neut...,5
3,neutral,Dogs are sitting by the door.,Dogs are sitting by the door. This sounds neut...,5
4,calm,Kids are talking by the door.,Kids are talking by the door. The speaker is c...,1
...,...,...,...,...
95,angry,Dogs are sitting by the door.,Dogs are sitting by the door. This speech is f...,0
96,fearful,Kids are talking by the door.,Kids are talking by the door. Fear is evident ...,3
97,fearful,Kids are talking by the door.,Kids are talking by the door. Fear is evident ...,3
98,fearful,Dogs are sitting by the door.,Dogs are sitting by the door. Fear is evident ...,3


In [75]:
# Encode labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['emotion'])

# Build vocab
vocab = build_vocab(df['augmented_transcript'])
max_len = 50

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['augmented_transcript'], df['label'], test_size=0.2, random_state=42)

# Create datasets and dataloaders
train_dataset = EmotionDataset(train_texts.tolist(), train_labels.tolist(), vocab, max_len)
test_dataset = EmotionDataset(test_texts.tolist(), test_labels.tolist(), vocab, max_len)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/adishesh/nltk_data'
    - '/home/adishesh/pytorch_env/nltk_data'
    - '/home/adishesh/pytorch_env/share/nltk_data'
    - '/home/adishesh/pytorch_env/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [66]:
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(SimpleRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, h_n = self.rnn(x)
        out = self.fc(h_n.squeeze(0))
        return out


In [67]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SimpleRNN(vocab_size=len(vocab), embed_dim=64, hidden_dim=128, output_dim=len(le.classes_))
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
