# Full Emotion Detection Pipeline

This pipeline takes raw video input, transcribes the detected audio, translates the transcribed sentences into English using a pretrained OPUS-MT model, and uses a CamemBERT model to classify the emotion expressed in each sentence.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel, MarianMTModel, MarianTokenizer, CamembertForSequenceClassification, CamembertTokenizer, pipeline, Trainer, TrainingArguments, get_scheduler, TFAutoModelForSeq2SeqLM
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.optim import AdamW
import torch
from tqdm import tqdm
import requests
import json
import assemblyai as aai
import re
import spacy
from textblob import TextBlob, Blobber
from textblob_fr import PatternAnalyzer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from datasets import Dataset as HFDataset, load_dataset
import evaluate
import nlpaug.augmenter.word as naw
from googletrans import Translator
import time
from textattack.augmentation import EasyDataAugmenter
from IPython.display import display

# Ensure NLTK resources are available (run this once if you haven't before)
try:
    # Probe the stopword corpus; the LookupError branch below downloads it when it is missing.
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
# NOTE(review): PatternAnalyzer is already imported unguarded in the import list at the top of
# this notebook, so a missing textblob_fr would fail there first and this guard looks
# unreachable — confirm before relying on the warning.
try:
    from textblob_fr import PatternAnalyzer
except ImportError:
    print("Warning: textblob_fr might not be installed correctly.")

In [None]:
# 2. Transcribe the extracted audio with AssemblyAI and save one sentence per row
import os

# Never commit API keys: read the key from the environment instead.
# (The key that used to be hard-coded here should be considered leaked and rotated.)
api_key = os.environ.get("ASSEMBLYAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the ASSEMBLYAI_API_KEY environment variable before running this cell.")
aai.settings.api_key = api_key

audio_file = "/Users/daria/Desktop/2024-25c-fai2-adsai-dariavladutu236578/Week_1/extracted mp3s/Le Schtroumpf Navigateur • Les Schtroumpfs.mp3"

# Initialize transcriber with French language config
config = aai.TranscriptionConfig(language_code="fr")
transcriber = aai.Transcriber()

# `transcribe` uploads the file, polls until the job finishes and returns the
# finished Transcript object, so no separate "wait for completion" call is needed.
transcript = transcriber.transcribe(audio_file, config=config)

# Raise instead of calling exit(): exit() would shut down the notebook kernel.
if transcript.status == aai.TranscriptStatus.error:
    raise RuntimeError(f"Transcription failed: {transcript.error}")

# Split the transcript after sentence-ending punctuation; one sentence per row
sentences = [{"Sentence": s.strip()} for s in re.split(r'(?<=[.!?]) +', transcript.text) if s.strip()]

# Convert to a DataFrame
df = pd.DataFrame(sentences)

# Specify output Excel file name
excel_filename = "Le_Schtroumpf_Navigateur_Les_Schtroumpfs.xlsx"

# Save to an Excel file
df.to_excel(excel_filename, index=False, engine="openpyxl")

print(f"Transcription saved as {excel_filename}")

In [None]:
# 3. Tokenize and calculate Word Error Rate (WER) for transcripts

# SpaCy's small French pipeline, used by the tokenizer below
nlp = spacy.load("fr_core_news_sm")

# One annotated spreadsheet per speech-to-text engine
file_paths = dict(
    assemblyAI=r"Week_1/STT_Assembly.xlsx",
    whisper=r"Week_1/STT_Whisper.xlsx",
)

# Function to tokenize text using SpaCy
def tokenize_french(text):
    """Lowercase *text*, run it through the SpaCy pipeline and return its word tokens.

    Punctuation and whitespace tokens are left out of the returned list.
    """
    kept = []
    for tok in nlp(text.lower()):
        if tok.is_punct or tok.is_space:
            continue
        kept.append(tok.text)
    return kept

# Function to calculate Word Error Rate (WER)
def calculate_wer(df, token_column, s_col='S', i_col='I', d_col='D'):
    """Return the word error rate ``(S + I + D) / N`` for an annotated transcript.

    Args:
        df: DataFrame holding per-row substitution, insertion and deletion counts
            plus a column of token lists.
        token_column: Name of the column whose cells are lists of tokens; ``N`` is
            the total number of tokens across all rows.
        s_col, i_col, d_col: Names of the substitution / insertion / deletion
            count columns.

    Returns:
        The error rate, or 0 when there are no tokens to divide by.

    Raises:
        ValueError: If any of the three count columns is absent; the message
            names the missing column(s).
    """
    missing = [col for col in (s_col, i_col, d_col) if col not in df.columns]
    if missing:
        raise ValueError(
            "Missing error-count column(s) in the file: " + ", ".join(map(str, missing))
        )

    total_errors = df[s_col].sum() + df[i_col].sum() + df[d_col].sum()
    # explode() yields one row per token; empty lists become NaN, which count() ignores
    total_tokens = df[token_column].explode().count()
    return total_errors / total_tokens if total_tokens > 0 else 0

# Function to count tokens and compute WER
def process_transcript(file_path, header_row=0):
    """Load an annotated transcript spreadsheet, tokenize it and compute its WER.

    Args:
        file_path: Path of the Excel file to read.
        header_row: Row index passed to ``pd.read_excel`` as the header.

    Returns:
        A tuple ``(df, total_tokens, wer_score)`` where ``df`` gained a ``tokens``
        column with one token list per row.

    Raises:
        ValueError: If no column name contains "sentence" or "transcription", or
            if the S/I/D count columns are missing (raised by ``calculate_wer``).
    """
    df = pd.read_excel(file_path, header=header_row)

    # Identify the transcription column; str() guards against non-string headers
    # (e.g. numeric or unnamed columns), which have no .lower().
    transcription_col = next(
        (
            col
            for col in df.columns
            if "sentence" in str(col).lower() or "transcription" in str(col).lower()
        ),
        None,
    )

    if transcription_col is None:
        raise ValueError(f"No transcription column found in {file_path}.")

    # Empty cells are filled with "" first: astype(str) alone would turn NaN into
    # the literal token "nan" and inflate the token count.
    df['tokens'] = df[transcription_col].fillna("").astype(str).apply(tokenize_french)
    total_tokens = df['tokens'].explode().count()

    # Calculate WER
    wer_score = calculate_wer(df, 'tokens')

    return df, total_tokens, wer_score

# Run the WER computation for every speech-to-text engine and keep the results by name
dfs, wer_scores = {}, {}

for model, path in file_paths.items():
    df, total_tokens, wer = process_transcript(path, header_row=0)
    dfs[model], wer_scores[model] = df, wer
    print(f"Total tokens in {model} transcript: {total_tokens}")
    print(f"WER for {model} transcript: {wer:.4f}")

## OUTPUT (recorded from an earlier run)
# Total tokens in assemblyAI transcript: 1263
# WER for assemblyAI transcript: 0.0507
# Total tokens in whisper transcript: 1434
# WER for whisper transcript: 0.0704

In [None]:
# 4. Preprocessing the data
# First source: the Emotion Detection dataset (utterances from the Friends show)
import pandas as pd
import json
import numpy as np

# Read the raw JSON training split
with open("emotion-detection-emotion-detection-1.0/json/emotion-detection-trn.json", "r", encoding="utf-8") as f:
    data = json.load(f)

dataf = pd.DataFrame(data)

# Flatten season -> episode -> scene -> utterance into one record per utterance
episodes_list = []

for _, row in dataf.iterrows():
    season_id = row["season_id"]
    raw_episodes = row['episodes']

    # Normalise the 'episodes' cell into something iterable
    if isinstance(raw_episodes, dict):
        episodes = [raw_episodes]
    elif isinstance(raw_episodes, str):
        try:
            episodes = json.loads(raw_episodes)
        except json.JSONDecodeError:
            print(f"Error decoding JSON for row {row.name}")
            continue
    else:
        episodes = []  # unexpected structure: nothing to process

    for episode in episodes:
        is_valid_episode = isinstance(episode, dict) and "episode_id" in episode and "scenes" in episode
        if not is_valid_episode:
            print(f"Unexpected episode structure: {episode}")
            continue

        episode_id = episode["episode_id"]
        for scene in episode["scenes"]:
            # Scenes without the expected keys are skipped silently
            if not (isinstance(scene, dict) and "scene_id" in scene and "utterances" in scene):
                continue

            scene_id = scene["scene_id"]
            for utterance in scene["utterances"]:
                if not (isinstance(utterance, dict) and "utterance_id" in utterance):
                    continue
                speakers = utterance["speakers"]
                episodes_list.append({
                    "season_id": season_id,
                    "episode_id": episode_id,
                    "scene_id": scene_id,
                    "utterance_id": utterance["utterance_id"],
                    "speaker": speakers[0] if speakers else None,
                    "transcript": utterance["transcript"],
                    "emotion": utterance["emotion"]
                })

df = pd.DataFrame(episodes_list)

# Work on a copy so preprocessing can be restarted from `df`
df_copy = df.copy()

# Only the text and its label are needed downstream
columns_to_keep = ["transcript", "emotion"]
df_copy = df_copy[columns_to_keep]

# Map the dataset's 7 labels ('Joyful' 'Neutral' 'Powerful' 'Mad' 'Sad' 'Scared' 'Peaceful')
# onto our label set ("happiness", "sadness", "anger", "surprise", "fear", "disgust", "neutral")
for source_label, target_label in (
    ("Joyful", "happiness"),
    ("Neutral", "neutral"),
    ("Powerful", "happiness"),
    ("Mad", "anger"),
    ("Sad", "sadness"),
    ("Scared", "fear"),
    ("Peaceful", "happiness"),
):
    df_copy.loc[:, "emotion"] = df_copy["emotion"].str.replace(source_label, target_label)

# googletrans client used by translate_function below
translator = Translator()

def translate_function(text):
    """Translate *text* to French with the module-level googletrans ``translator``.

    Best-effort: non-string or blank input, an empty translation, or any error
    raised along the way all result in the original *text* being returned.
    Progress and problems are reported with ``print``.
    """
    try:
        print(f"Translating: {text}")  # Trace every call
        is_usable = isinstance(text, str) and text.strip() != ""
        if not is_usable:
            print(f"Skipping invalid input: {text}")
            return text

        time.sleep(0.5)  # Throttle requests to avoid rate limits
        result = translator.translate(text, dest="fr")  # Expected to be a synchronous call
        french = result.text

        if french:
            return french

        print(f"Empty translation for: {text}")
        return text

    except Exception as e:
        print(f"Error translating: {text} - {e}")
        return text

# Translate the English utterances to French.
# The text column is still called "transcript" at this point; reading "Sentence"
# here (before any rename) raised a KeyError.
df_copy["Translated"] = df_copy["transcript"].apply(translate_function)

# Match the layout of our transcribed dataset in a single rename:
#   French translation -> "Sentence", original English text -> "Translated",
#   label -> "Emotion".
df_copy.rename(
    columns={"emotion": "Emotion", "transcript": "Translated", "Translated": "Sentence"},
    inplace=True,
)

# Put the "Sentence" column first
df2 = df_copy[['Sentence'] + [col for col in df_copy.columns if col != 'Sentence']]

# Adding synthetic data to balance classes with the lowest nr of instances (sadness, surprise, and disgust)
transcript_3 = "synthetic_emotion_dataset.csv"
df3 = pd.read_csv(transcript_3)

# Adding the Go Emotions dataset
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/sebdg/go_emotions_cleaned/" + splits["train"])

# Keep a real copy of the raw data in case we need to restart the preprocessing
df_copy = df.copy()

# dropna() returns a new frame, so the result has to be assigned
df = df.dropna()

# Remove the unnecessary columns
columns_to_keep = ["text", "labels", "labels_text"]
df = df[columns_to_keep]

# Split the comma-separated labels_text into one row per label
df = df.assign(labels_text=df["labels_text"].str.split(",")).explode("labels_text")
display(df.tail(10))

# Normalise label names (trim whitespace, lowercase)
df["labels_text"] = df["labels_text"].str.strip().str.lower()

# Map the 28 Go Emotions labels onto our 7 labels
# ("happiness", "sadness", "anger", "surprise", "fear", "disgust", "neutral").
# Labels that already belong to our set (anger, disgust, fear, sadness, surprise,
# neutral) are absent from the table and pass through unchanged.
#
# The lookup is an exact match on the whole label. The earlier chain of substring
# replacements rewrote "disapproval" to "dishappiness" (because "approval" was
# replaced first) and then patched that to "sadness"; it now maps to "disgust",
# as the mapping always stated.
GO_EMOTIONS_TO_TARGET = {
    "admiration": "happiness",
    "amusement": "happiness",
    "annoyance": "anger",
    "approval": "happiness",
    "caring": "happiness",
    "confusion": "surprise",
    "curiosity": "surprise",
    "desire": "happiness",
    "disappointment": "disgust",
    "disapproval": "disgust",
    "embarrassment": "sadness",
    "excitement": "happiness",
    "gratitude": "happiness",
    "grief": "sadness",
    "joy": "happiness",
    "love": "happiness",
    "nervousness": "fear",
    "optimism": "happiness",
    "pride": "happiness",
    "realization": "surprise",
    "relief": "happiness",
    "remorse": "sadness",
}
df["labels_text"] = df["labels_text"].map(
    lambda label: GO_EMOTIONS_TO_TARGET.get(label, label)
)

# The Go Emotions text is English; in our layout that is the "Translated" column
df = df.rename(columns={"text": "Translated"})

# Drop duplicates (a multi-label text keeps only its first label)
df = df.drop_duplicates(subset=["Translated"], keep="first")

# Translate
# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Let cuDNN pick the fastest kernels (non-deterministic)
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False

# Load the English -> French OPUS-MT model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)  # Move model to the selected device

# Places a one-element tensor on the device; it is not passed to the model
dummy_input = torch.tensor([[0]]).to(device)
 
# Function to translate sentences in batches
def translate(sentences, batch_size=8):  # Reduce batch size if stuck
    """Translate a list of sentences with the module-level Marian model.

    The list is processed in slices of *batch_size*; each slice is tokenized,
    moved to ``device``, run through ``model.generate`` without gradients and
    decoded back to text. Returns the translations in input order.
    """
    outputs = []
    batch_starts = range(0, len(sentences), batch_size)
    for start in tqdm(batch_starts, desc="Translating", unit="batch"):
        chunk = sentences[start:start + batch_size]

        # Tokenize the slice and move the tensors to the selected device
        encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True).to(device)

        # Inference only: no gradients needed
        with torch.no_grad():
            generated = model.generate(**encoded)

        # Turn each generated id sequence back into text
        for token_ids in generated:
            outputs.append(tokenizer.decode(token_ids, skip_special_tokens=True))

    return outputs
 
# Translate the English Go Emotions text to French; assign() returns a new frame,
# so nothing is written into a slice of an earlier frame
df = df.assign(Sentence=translate(df['Translated'].tolist()))

# Move sentence column first
df = df[['Sentence'] + [col for col in df.columns if col != 'Sentence']]
df = df.loc[:, ~df.columns.duplicated()]

# Drop the encoded labels column (once; a second drop raised a KeyError)
df = df.drop(columns=['labels'], errors='ignore')

# rename() returns a new frame: without the assignment the label column kept the
# name "labels_text" and never lined up with "Emotion" in the other datasets
df = df.rename(columns={'labels_text': 'Emotion'})

df4 = df

# Concatenate datasets
df = pd.concat([df2, df3, df4], ignore_index=True)
display(df)

# Adding synthetic data to balance the classes
# Initialize TextAttack augmenter (you can experiment with different ones)
augmenter = EasyDataAugmenter(pct_words_to_swap=0.2, transformations_per_example=2)

# Define a threshold: Augment only classes below this threshold
THRESHOLD = 9000

# Apply augmentation to underrepresented classes
augmented_sentences = []
augmented_labels = []
class_counts = df["Emotion"].value_counts()
for emotion, count in class_counts.items():
    if count >= THRESHOLD:
        continue
    subset = df[df["Emotion"] == emotion]
    for sentence in subset["Sentence"]:
        # The concatenated frame can contain missing or blank sentences; skip
        # them instead of handing a non-string to the augmenter.
        if not isinstance(sentence, str) or not sentence.strip():
            continue
        augmented = augmenter.augment(sentence)
        augmented_sentences.extend(augmented)
        augmented_labels.extend([emotion] * len(augmented))

# Create DataFrame for augmented data
augmented_df = pd.DataFrame({"Sentence": augmented_sentences, "Emotion": augmented_labels})

# Combine with original data and keep only the two columns used for modelling
df = pd.concat([df, augmented_df]).reset_index(drop=True)
df = df[['Sentence', 'Emotion']]

df1 = df
# Rebalance the combined dataset: cap large classes and rewrite small ones
tqdm.pandas()

# Initialize French contextual augmenter (CamemBERT).
# Fall back to the CPU when no GPU is present instead of failing on 'cuda'.
aug = naw.ContextualWordEmbsAug(
    model_path='camembert-base',
    action="substitute",
    device="cuda" if torch.cuda.is_available() else "cpu"
)

# Set desired target for class size
TARGET_SIZE = 5000

# Initialize a list to hold augmented sentences
augmented_sentences = []
augmented_labels = []

# Iterate over each class and augment or sample
for emotion, count in df1['Emotion'].value_counts().items():
    print(f"Processing {emotion} class with {count} samples...")

    # Subset the data for the current emotion
    subset = df1[df1["Emotion"] == emotion]

    # If the class is 'fear', leave it untouched
    if emotion == 'fear':
        augmented_sentences.extend(subset["Sentence"].tolist())
        augmented_labels.extend([emotion] * len(subset))

    # If the class size is greater than TARGET_SIZE, sample down to TARGET_SIZE
    elif count > TARGET_SIZE:
        print(f"Sampling {emotion} class down to {TARGET_SIZE} samples...")
        augmented_sentences.extend(subset.sample(n=TARGET_SIZE, replace=False)["Sentence"].tolist())
        augmented_labels.extend([emotion] * TARGET_SIZE)

    # Otherwise replace every sentence with one contextual rewrite.
    # NOTE(review): the original sentences are not kept here, so the class does
    # not grow — confirm whether originals were meant to be added as well.
    else:
        print(f"Augmenting {emotion} class...")
        augmented = subset["Sentence"].progress_apply(lambda x: aug.augment(x, n=2)[0]).tolist()

        # Collect augmented sentences and labels
        augmented_sentences.extend(augmented)
        augmented_labels.extend([emotion] * len(augmented))

# Create a DataFrame for the augmented data
augmented_df = pd.DataFrame({"Sentence": augmented_sentences, "Emotion": augmented_labels})

# Safety cap: no class may exceed TARGET_SIZE rows
df_final = augmented_df.groupby("Emotion").apply(
    lambda x: x.sample(n=TARGET_SIZE, replace=False) if len(x) > TARGET_SIZE else x
).reset_index(drop=True)

df1 = df_final

# Load pipeline output dataset
# NOTE(review): a file named 'group 12_url1.csv' is also read by the test-data
# preprocessing further down; if it is the same data, test rows leak into
# training — confirm.
df2 = pd.read_csv('test_data/group 12_url1.csv')

# Keep only relevant columns
df2.drop(columns=['Start Time', 'End Time', 'Sentence', 'Translation', 'Intensity'], inplace=True)

df2.rename(columns={'Corrected Sentence': 'Sentence'}, inplace=True)

# Rename emotions to fit with our labels. The pairs are applied in this order as
# literal substring replacements ("disapproval" must be handled before "approval").
PIPELINE_LABEL_MAP = (
    ("excitement", "happiness"),
    ("confusion", "surprise"),
    ("annoyance", "anger"),
    ("disapproval", "disgust"),
    ("pride", "happiness"),
    ("joy", "happiness"),
    ("disappointment", "sadness"),
    ("optimism", "happiness"),
    ("admiration", "happiness"),
    ("approval", "happiness"),
    ("nervousness", "fear"),
    ("realization", "surprise"),
    ("gratitude", "happiness"),
    ("caring", "happiness"),
    ("love", "happiness"),
    ("remorse", "sadness"),
    ("embarrassment", "fear"),
    ("grief", "sadness"),
)
for old_label, new_label in PIPELINE_LABEL_MAP:
    df2["Emotion"] = df2["Emotion"].str.replace(old_label, new_label, regex=False)

# Drop rows whose label was not mapped above (curiosity, desire, relief, amusement)
df2 = df2[~df2['Emotion'].isin(['curiosity', 'desire', 'relief','amusement'])]

tqdm.pandas()

# Initialize French contextual augmenter (CamemBERT).
# Fall back to the CPU when no GPU is present instead of failing on 'cuda'.
aug = naw.ContextualWordEmbsAug(
    model_path='camembert-base',
    action="substitute",
    device="cuda" if torch.cuda.is_available() else "cpu"
)

# Set desired target for class size
TARGET_SIZE = 5000

# Initialize a list to hold augmented sentences
augmented_sentences = []
augmented_labels = []

# Iterate over each class and augment
for emotion, count in df2['Emotion'].value_counts().items():
    print(f"Processing {emotion} class with {count} samples...")

    # Subset the data for the current emotion
    subset = df2[df2["Emotion"] == emotion]

    # If class size is less than the target, rewrite each sentence once
    if count < TARGET_SIZE:
        print(f"Augmenting {emotion} class...")
        augmented = subset["Sentence"].progress_apply(lambda x: aug.augment(x, n=2)[0]).tolist()

        # Collect augmented sentences and labels
        augmented_sentences.extend(augmented)
        augmented_labels.extend([emotion] * len(augmented))

    # If the class is already at or above the target size, just take the original samples
    else:
        augmented_sentences.extend(subset["Sentence"].tolist())
        augmented_labels.extend([emotion] * len(subset))

# Create a DataFrame for the augmented data
augmented_df = pd.DataFrame({"Sentence": augmented_sentences, "Emotion": augmented_labels})

# Bring every class to exactly TARGET_SIZE rows: sample with replacement when the
# class is too small, without replacement otherwise
df_final = augmented_df.groupby("Emotion").apply(
    lambda x: x.sample(n=TARGET_SIZE, replace=True) if len(x) < TARGET_SIZE else x.sample(n=TARGET_SIZE, replace=False)
).reset_index(drop=True)

# Check the new class distribution
print("After Augmentation:\n", df_final["Emotion"].value_counts())

# Final training frame: rebalanced corpus + pipeline rows + their augmented versions
df2 = pd.concat([df2, df_final], ignore_index=True)
df = pd.concat([df1,df2], ignore_index=True)

df.dropna(inplace=True)


####
# Test data preprocessing
# The test frames use their own variable names. Reusing `df` here replaced the
# training frame, so the modelling cell below ended up fitting on the test data.
test_group_12 = pd.read_csv('group 12_url1.csv')
test_group_10 = pd.read_csv('group 10_url1.csv')
test_groups = pd.concat([test_group_12, test_group_10], ignore_index=True)

# Drop and rename columns
test_groups.drop(columns=['Start Time', 'End Time', 'Sentence'], inplace=True)
test_groups.rename(columns={'Corrected Sentence': 'Sentence'}, inplace=True)

# Add another dataset
la_villa = pd.read_csv('la_villa_dataset.csv')

# Capitalize this dataset's own column names (previously taken from the other frame)
la_villa.columns = [col.capitalize() for col in la_villa.columns]

# Added synthetic dataset
reality_tv = pd.read_csv('french_reality_tv_dataset.csv')
reality_tv.rename(columns={'Sentence (French)':'Sentence', 'Translation (English)': 'Translation'},inplace=True)

# Concatenate
test_df = pd.concat([test_groups, la_villa, reality_tv])
test_df.dropna(inplace=True)
test_df.reset_index(drop=True, inplace=True)

# Rename emotions to fit with our labels. The pairs are applied in this order as
# literal substring replacements ("disapproval" must be handled before "approval").
TEST_LABEL_MAP = (
    ("excitement", "happiness"),
    ("confusion", "surprise"),
    ("annoyance", "anger"),
    ("disapproval", "disgust"),
    ("pride", "happiness"),
    ("joy", "happiness"),
    ("disappointment", "sadness"),
    ("optimism", "happiness"),
    ("admiration", "happiness"),
    ("approval", "happiness"),
    ("nervousness", "fear"),
    ("realization", "surprise"),
    ("gratitude", "happiness"),
    ("caring", "happiness"),
    ("love", "happiness"),
    ("remorse", "sadness"),
    ("embarrassment", "fear"),
    ("grief", "sadness"),
)
for old_label, new_label in TEST_LABEL_MAP:
    test_df["Emotion"] = test_df["Emotion"].str.replace(old_label, new_label, regex=False)

# Drop rows whose label was not mapped above (curiosity, desire, relief, amusement).
# .copy() makes df_test an independent frame that can safely receive new columns.
df_test = test_df[~test_df['Emotion'].isin(['curiosity', 'desire', 'relief','amusement'])].copy()

In [None]:
# 5. Processing data for modeling stage
# Learn the emotion vocabulary, then store each row's integer class id
label_encoder_camembert = LabelEncoder()
label_encoder_camembert.fit(df["Emotion"])
df["emotion_label"] = label_encoder_camembert.transform(df["Emotion"])

# CamemBERT's own tokenizer is used for the French sentences
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Tokenization helper shared by the training and test cells
def encode_sentences(sentences):
    """Tokenize a pandas Series of sentences with the module-level ``tokenizer``.

    Sequences are padded, truncated to at most 128 tokens and returned as
    PyTorch tensors in the tokenizer's dictionary-like output.
    """
    texts = sentences.tolist()
    tokenizer_options = dict(padding=True, truncation=True, max_length=128, return_tensors='pt')
    return tokenizer(texts, **tokenizer_options)

# Tokenize the sentences
encoded_inputs = encode_sentences(df['Sentence'])

# Extract input_ids and attention_mask separately
input_ids = encoded_inputs["input_ids"]
attention_mask = encoded_inputs["attention_mask"]

# Split ids, masks and labels together so the rows stay aligned (80/20, fixed seed)
X_train, X_val, attn_train, attn_val, y_train, y_val = train_test_split(
    input_ids, attention_mask, df['emotion_label'], test_size=0.2, random_state=42
)

# Take independent copies of the split tensors. torch.as_tensor passes tensors
# through unchanged (and converts arrays), avoiding the copy-construct warning
# that torch.tensor(<tensor>) emits.
train_inputs = torch.as_tensor(X_train).clone().detach()
val_inputs = torch.as_tensor(X_val).clone().detach()
train_attn = torch.as_tensor(attn_train).clone().detach()
val_attn = torch.as_tensor(attn_val).clone().detach()
# Class ids must be int64 for the classification loss, whatever the platform default
train_labels = torch.tensor(y_train.values, dtype=torch.long)
val_labels = torch.tensor(y_val.values, dtype=torch.long)

# Create DataLoader for batch processing
train_data = TensorDataset(train_inputs, train_attn, train_labels)
val_data = TensorDataset(val_inputs, val_attn, val_labels)

train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=16)

In [None]:
# 6. Model
# Single source of truth for the epoch budget (used by the scheduler, the loop
# and the progress-bar label, which previously disagreed: 10 vs 15).
NUM_EPOCHS = 15

# Load the pre-trained CamemBERT model for sequence classification.
# The class count comes from the label encoder fitted in the previous cell.
model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base', num_labels=len(label_encoder_camembert.classes_)
)

# Setup optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Move model to GPU when available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Early Stopping Parameters
patience = 3  # Number of epochs with no improvement before stopping
best_val_loss = float('inf')
epochs_no_improve = 0
best_state_dict = None  # CPU copy of the weights with the lowest validation loss

# Linear decay over the whole training budget
num_training_steps = len(train_dataloader) * NUM_EPOCHS
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Training loop
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}", unit="batch", leave=False)

    for batch in progress_bar:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous batch; without this they
        # accumulate and every step uses the sum of all earlier gradients.
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Apply gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update the weights, then advance the learning-rate schedule
        optimizer.step()
        lr_scheduler.step()

        # Update progress bar with the running mean loss
        progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1), refresh=True)

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    val_loss = 0

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")

    # Early stopping logic
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0  # Reset counter
        # Remember the best weights so the later, worse epochs can be discarded
        best_state_dict = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        epochs_no_improve += 1  # Increase counter

    if epochs_no_improve >= patience:
        print("Early stopping triggered. Stopping training.")
        break

# Restore the best checkpoint before saving, so both the saved model and the
# in-memory model correspond to the lowest validation loss.
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)

model.save_pretrained("./camembert_model_6")

In [None]:
# 7. Evaluate model
# Evaluate the model on the validation set
model.eval()
val_preds = []
val_labels_list = []

# Validation loop with tqdm progress bar
progress_bar = tqdm(val_dataloader, desc="Evaluating", unit="batch")
for batch in progress_bar:
    input_ids, attention_mask, labels = batch
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)

    val_preds.extend(preds.cpu().numpy())
    val_labels_list.extend(labels.cpu().numpy())

# Convert the predictions and true labels back to the original class labels
val_preds_str = label_encoder_camembert.inverse_transform(val_preds)
val_labels_str = label_encoder_camembert.inverse_transform(val_labels_list)

# Fixed class order for every per-class output below. Without it, a class that
# is absent from this split shifts the rows/columns and mislabels the axes.
class_names = list(label_encoder_camembert.classes_)

# Calculate accuracy
accuracy = accuracy_score(val_labels_str, val_preds_str)
print(f"Accuracy: {accuracy}")

# Calculate F1 Score (Macro Average)
f1 = f1_score(val_labels_str, val_preds_str, average='macro')
print(f"F1 Score (Macro Average): {f1}")

# Generate classification report (labels are already the class names)
report = classification_report(val_labels_str, val_preds_str, labels=class_names, zero_division=0)
print("Classification Report:")
print(report)

# Plot confusion matrix with rows/columns in the same order as the tick labels
cm = confusion_matrix(val_labels_str, val_preds_str, labels=class_names)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Testing on unseen data
# Work on an independent copy: df_test was produced by filtering another frame,
# and adding a column to such a slice triggers pandas' chained-assignment warning.
df_test = df_test.copy()

# Encode the test labels with the encoder fitted on the training data
# (transform raises ValueError if the test set contains an unseen label)
df_test["emotion_label"] = label_encoder_camembert.transform(df_test["Emotion"])

# Tokenize the test sentences with the same tokenizer used for training
encoded_test_inputs = encode_sentences(df_test['Sentence'])

# Extract input_ids and attention_mask for the test set
test_input_ids = encoded_test_inputs["input_ids"]
test_attention_mask = encoded_test_inputs["attention_mask"]

# These are already tensors: copy them with clone().detach() rather than
# re-wrapping them in torch.tensor(), which emits a copy-construct warning.
test_inputs = test_input_ids.clone().detach()
test_attn = test_attention_mask.clone().detach()
test_labels = torch.tensor(df_test['emotion_label'].values, dtype=torch.long)

# Create a DataLoader for batch processing on the test set
test_data = TensorDataset(test_inputs, test_attn, test_labels)
test_dataloader = DataLoader(test_data, batch_size=16)

# Reuse the model that is already in memory
model.eval()  # Set the model to evaluation mode

test_loss = 0
correct_predictions = 0
total_predictions = 0
predicted_labels_list = []  # To store all predictions
true_labels_list = []      # To store all true labels

with torch.no_grad():  # Disable gradient calculation to save memory
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        test_loss += loss.item()

        # Get predicted labels (class with the highest logit value)
        predictions = torch.argmax(logits, dim=-1)

        # Accumulate predictions and true labels
        predicted_labels_list.extend(predictions.cpu().numpy())
        true_labels_list.extend(labels.cpu().numpy())

        # Track the number of correct predictions
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

# Convert lists to numpy arrays for metric calculations
predicted_labels = np.array(predicted_labels_list)
true_labels = np.array(true_labels_list)

# Calculate the average test loss and accuracy
avg_test_loss = test_loss / len(test_dataloader)
accuracy = correct_predictions / total_predictions

print(f"Test Loss: {avg_test_loss:.4f}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Compute weighted Precision, Recall, F1 Score; classes that are never predicted
# score 0 instead of raising an undefined-metric warning
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, predicted_labels, average='weighted', zero_division=0
)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

In [None]:
# Machine translation (French -> English) of the transcribed sentences using a pretrained OPUS-MT model

# Load the transcribed dataset (tab-separated, read as latin1)
# NOTE(review): the result is later written back with sep='|' — confirm the differing separators are intended.
sentences_dataset = pd.read_csv('transcribed_data_assemblyAI.csv', sep='\t', engine='python', encoding='latin1')

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
 
# cuDNN settings: enable kernel auto-tuning and allow non-deterministic algorithms
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
 
# Load the French -> English Marian model and tokenizer.
# NOTE: this rebinds the global names `tokenizer` and `model`, replacing the CamemBERT
# tokenizer/classifier objects assigned to those names in the earlier cells.
model_name = "Helsinki-NLP/opus-mt-fr-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)  # Move model to the selected device
 
# Places a one-element tensor on the device; it is not passed to the model here
dummy_input = torch.tensor([[0]]).to(device)  
 
# Batched translation helper
def translate(sentences, batch_size=8):  # Reduce batch size if stuck
    """Return the translations of *sentences*, produced *batch_size* items at a time.

    Uses the module-level Marian ``tokenizer`` and ``model`` on ``device``; a
    tqdm progress bar reports one tick per batch.
    """
    results = []
    n_sentences = len(sentences)
    for offset in tqdm(range(0, n_sentences, batch_size), desc="Translating", unit="batch"):
        current = sentences[offset:offset + batch_size]

        # Encode the batch and place it on the selected device
        model_inputs = tokenizer(current, return_tensors="pt", padding=True, truncation=True)
        model_inputs = model_inputs.to(device)

        # Generate without tracking gradients
        with torch.no_grad():
            generated_ids = model.generate(**model_inputs)

        # Decode every generated sequence, dropping special tokens
        decoded = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]
        results.extend(decoded)

    return results
 
# Translate the transcribed sentences and store the result next to the originals
source_sentences = sentences_dataset['Sentence'].tolist()
sentences_dataset['Translation'] = translate(source_sentences)

# Preview the first rows
print(sentences_dataset.head())

# Persist the translated data as pipe-separated CSV and as an Excel workbook
sentences_dataset.to_csv('translated_data_assemblyAI.csv', sep='|', index=False)
sentences_dataset.to_excel('translated_data_assemblyAI.xlsx', index=False)

In [None]:
# 7. Prompt engineering model