In [2]:
import os
import ffmpeg
import whisper
import subprocess
import torch
import time
from pydub import AudioSegment
# Define paths
source_folder = "C:/Users/benjm/Downloads/episodes/"
target_folder = "C:/Users/benjm/Downloads/exports/"

CUDA is available. Using GPU.


In [None]:
if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
    device = torch.device("cuda")
else:
    print("CUDA is not available. Using CPU.")
    device = torch.device("cpu")

In [3]:
ffmpeg_dir = "C:/Users/Dudo/anaconda3/envs/new-env/Library/bin/ffmpeg.exe"
# Add ffmpeg directory to PATH
os.environ['PATH'] = ffmpeg_dir + os.pathsep + os.environ['PATH']

# Now we can check again to see if ffmpeg.exe is recognized in the PATH
ffmpeg_paths = [directory for directory in os.environ['PATH'].split(os.pathsep) if os.path.isfile(os.path.join(directory, 'ffmpeg.exe'))]

ffmpeg_executable = ffmpeg_paths[0] + '/ffmpeg.exe'

def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = int(seconds % 60)
    return f"{hours:02}:{minutes:02}:{seconds:02}"


## Dataset inconsistency

### Starting from the second episode, end times are incorrect. code below fixes the issue by adjusting the end time based on the start time of the next segment.\
### End of the last segment is filled in by placeholder

In [4]:
structure_df = structure_df.drop_duplicates(subset=['Episode name', 'Start Time (seconds)'])

# Now, let's adjust the end time for each segment from the second episode onwards.
# We'll do this by shifting the start times backwards and filling the last segment's end time with a default value.
# This default value can be a large number to ensure it covers the full length of the last segment, or it can be calculated from the data if available.
# For simplicity, I'll use a placeholder value (assuming the maximum length of an episode is known or estimated).

# Placeholder for the last segment's end time (this should be replaced with the correct value if known)
placeholder_end_time = structure_df['End Time (seconds)'].max()

# Create a new column for adjusted end times
structure_df['Adjusted End Time (seconds)'] = structure_df.groupby('Episode name')['Start Time (seconds)'].shift(-1).fillna(placeholder_end_time).astype(int)

# For the first episode, we retain the original end times
structure_df.loc[structure_df['Episode name'] == 1, 'Adjusted End Time (seconds)'] = structure_df.loc[structure_df['Episode name'] == 1, 'End Time (seconds)']

# Let's take a look at the DataFrame to confirm the changes
structure_df.head()

 1031 1078 1111 1178 1210 1256 1292 1353 1383 1508 1543 1579 1622 1652
 1674 1718 1750 1806 1852 1964 1981 2290 2323 2951 2975 3067 3233 3265
 3301 3385 3420 3452 3496 3663 3706 3750 3813 3915 3963 4038 4121 4151]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  structure_df.loc[structure_df['Episode name'] == 1, 'Adjusted End Time (seconds)'] = structure_df.loc[structure_df['Episode name'] == 1, 'End Time (seconds)']


Unnamed: 0,Instance name,Episode name,Act,Chapter,Segment,Segment title,Segment description,Start Time (seconds),End Time (seconds),SKO Start time,...,Madl (%) V 6+ jr Uitgesteld,Madl (%) V 6+ jr Uitzenddag,Locations,Props,Actors,Emotions,Game Types,Actions,News Types,Adjusted End Time (seconds)
0,Survivor,1,Aparte eilanden,Introductie,Uitleg programma,,,0,124,2022-08-29 19:51:00 +00:00,...,44.150341,25.847947,Eiland landingsproef,Seizoen 22 symbool,"Nicolette Kluijver, Rick Brandsteder, Dennis W...",,,Commentary,,124
67,Survivor,1,Aparte eilanden,Reality,Aankomst,,,124,350,2022-08-29 18:58:00 +00:00,...,35.325617,23.360835,Eiland landingsproef,,"Nicolette Kluijver, London Loy, Niek Roozen, D...","Joy, Surprise, Anticipation",,"Speculation, Diary, Commentary",,350
132,Survivor,1,Aparte eilanden,Introductie,Introductie expeditieleden,,,350,383,2022-08-29 20:03:00 +00:00,...,44.174871,29.460481,"Eiland landingsproef, Thuis",,"Iliass Ojja, Rick Brandsteder (VO)",,,"Biography, Diary, Commentary",,383
194,Survivor,1,Aparte eilanden,Introductie,Introductie expeditieleden,,,383,443,2022-08-29 18:59:00 +00:00,...,35.439873,22.985562,"Eiland landingsproef, Thuis",,"Rose Bertram, Rick Brandsteder (VO)",,,"Biography, Diary, Commentary",,443
255,Survivor,1,Aparte eilanden,Introductie,Introductie expeditieleden,,,443,504,2022-08-29 20:02:00 +00:00,...,44.667424,29.106176,"Eiland landingsproef, Thuis",,"Chatilla van Grinsven, Rick Brandsteder (VO)",,,"Biography, Diary, Commentary",,504


In [6]:
# Make sure the DataFrame is sorted by episode and start time to ensure correct segment order
structure_df.sort_values(by=['Episode name', 'Start Time (seconds)'], inplace=True)

# Reset index after sorting to get a sequential order for segment numbering
structure_df.reset_index(drop=True, inplace=True)

# Start processing from episode 9
start_from_episode = 6

# Filter the DataFrame to include only episodes from 9 onwards
filtered_structure_df = structure_df[structure_df['Episode name'] >= start_from_episode]
# Process each episode
for episode_number in filtered_structure_df['Episode name'].unique():
    input_video_path = os.path.join(source_folder, f"ER22_AFL{episode_number:02d}_MXF.mov")
    full_audio_path = os.path.join(target_folder, f"Episode_{episode_number:02d}_Full.aac")

    # Extract full audio only if it does not already exist
    if not os.path.exists(full_audio_path):
        subprocess.run([ffmpeg_executable, '-i', input_video_path, '-vn', '-acodec', 'aac', '-b:a', '128k', full_audio_path], check=True)

    # Iterate over the segments for the current episode
    episode_segments = filtered_structure_df[structure_df['Episode name'] == episode_number]
    for segment_index, (start_sec, end_sec) in enumerate(zip(episode_segments['Start Time (seconds)'], episode_segments['Adjusted End Time (seconds)']), start=1):
        start_time = format_time(start_sec)
        end_time = format_time(end_sec)
        output_audio_segment_path = os.path.join(target_folder, f"Episode_{episode_number:02d}_Segment_{segment_index:03d}.aac")

        # Extract segment from the full audio
        subprocess.run([ffmpeg_executable, '-i', full_audio_path, '-ss', start_time, '-to', end_time, '-acodec', 'aac', '-b:a', '128k', output_audio_segment_path], check=True)

  episode_segments = filtered_structure_df[structure_df['Episode name'] == episode_number]
  episode_segments = filtered_structure_df[structure_df['Episode name'] == episode_number]
  episode_segments = filtered_structure_df[structure_df['Episode name'] == episode_number]


CalledProcessError: Command '['c:\\Users\\Dudo\\anaconda3\\envs\\new-env\\Library\\bin/ffmpeg.exe', '-i', 'D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/client_data/Expeditie_Robinson_episodes\\ER22_AFL09_MXF.mov', '-vn', '-acodec', 'aac', '-b:a', '128k', 'D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/transcribed_episodes\\Episode_09_Full.aac']' returned non-zero exit status 1.

In [4]:
# Ensure the target folder exists
os.makedirs(target_folder, exist_ok=True)

# Load Whisper model
model = whisper.load_model("medium")  

In [5]:

# Define the path to the audio segments
audio_segments_path = 'D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_text'

# Iterate over audio segment files
for file in os.listdir(audio_segments_path):
    if file.endswith(".aac"):
        audio_path = os.path.join(audio_segments_path, file)
        print(f"Transcribing {audio_path}...")

        # Transcribe the audio file
        result = model.transcribe(audio_path, language="Dutch")

        # You can now save or process the transcription text
        transcription_text = result['text']
        

        # For example, save the transcription to a file
        with open(audio_path.replace('.aac', '.txt'), 'w', encoding='utf-8') as f:
            f.write(transcription_text)

Transcribing D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_text\Episode_08_Segment_001.aac...
Transcribing D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_text\Episode_08_Segment_002.aac...
Transcribing D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_text\Episode_08_Segment_003.aac...
Transcribing D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_text\Episode_08_Segment_004.aac...
Transcribing D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_text\Episode_08_Segment_005.aac...
Transcribing D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_text\Episode_08_Segment_006.aac...
Transcribing D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_text\Episode_08_Segment_007.aac...
Transcribing D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_text\Episode_08_Segment_008.aac...
Transcribing D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_text\Episode_08_Segment_009.aac...
Transcribing D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_text\Episode_08_Segment_010.aac...
Transcribi

In [9]:
from deep_translator import GoogleTranslator
def translate_text(text, source_lang='nl', target_lang='en'):
    try:
        translated_text = GoogleTranslator(source=source_lang, target=target_lang).translate(text)
        return translated_text
    except Exception as e:
        print(f"An error occurred during translation: {e}")
        return text  # Return the original text if translation fails

# Specify the folder containing the text files
text_files_path = 'D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_text'

# List all .aac files in the folder
file_paths = [f for f in os.listdir(text_files_path) if f.endswith('.txt')]

# Initialize an empty list to store data
data = []

# Process each file
for file_path in file_paths:
    # Extract episode and segment numbers from the filename
    parts = file_path.split('_')
    episode = parts[1]
    segment = parts[3].split('.')[0]

    # Read the file content
    with open(os.path.join(text_files_path, file_path), 'r', encoding='utf-8') as file:
        content = file.read()

    # Translate the content (simulated here)
    translated_content = translate_text(content)
    # Append the data to the list
    data.append([translated_content, segment, episode])
    print(f"Translated content for episode {episode}, segment {segment}")
    # Create a DataFrame
df_translated = pd.DataFrame(data, columns=['Translated Content', 'Segment', 'Episode'])

# Example of how to save the DataFrame to a CSV file
df_translated.to_csv('D:/BUAS/Year 2/Block 2C/Block 2C/Episodes/segments_translated/translated_first_half.csv', index=False)


Translated content for episode 01, segment 001
Translated content for episode 01, segment 002
Translated content for episode 01, segment 003
Translated content for episode 01, segment 004
Translated content for episode 01, segment 005
Translated content for episode 01, segment 006
Translated content for episode 01, segment 007
Translated content for episode 01, segment 008
Translated content for episode 01, segment 009
Translated content for episode 01, segment 010
Translated content for episode 01, segment 011
Translated content for episode 01, segment 012
Translated content for episode 01, segment 013
Translated content for episode 01, segment 014
Translated content for episode 01, segment 015
Translated content for episode 01, segment 016
Translated content for episode 01, segment 017
Translated content for episode 01, segment 018
Translated content for episode 01, segment 019
Translated content for episode 01, segment 020
Translated content for episode 01, segment 021
Translated co

In [None]:
from transformers import XLNetForSequenceClassification, XLNetTokenizer
import torch
# Path to your saved model
model_path = 'model_weights/model'
weights_path = 'model_weights/model_weights.pth'

# Load the trained model
model = XLNetForSequenceClassification.from_pretrained(model_path)

# Assuming you're using the same tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
# Load the weights into the model
model.load_state_dict(torch.load(weights_path))

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

In [None]:
sia = SentimentIntensityAnalyzer()

In [None]:
import pandas as pd
from collections import Counter
import re
import torch
import numpy as np
from transformers import XLNetTokenizer, XLNetForSequenceClassification

emotion_labels = {0: 'anger', 1: 'disgust', 2: 'fear', 3: 'happiness', 4: 'sadness', 5: 'surprise'}

In [None]:
def predict_emotion_maxarg(text):
    cleaned_text = re.sub(r'\b[A-Z]{2,}\b', '', text)
    sentences = re.split(r'(?<=[.!?]) +', cleaned_text)
    sentence_emotions = []

    for sentence in sentences:
        if sentence.strip() != '':
            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)

            with torch.no_grad():
                outputs = model(**inputs)

            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=-1)
            predicted_class_index = torch.argmax(probabilities, dim=-1).item()
            predicted_emotion = emotion_labels[predicted_class_index]

            sentence_emotions.append(predicted_emotion)

    if sentence_emotions:
        most_common_emotion = Counter(sentence_emotions).most_common(1)[0][0]
    else:
        most_common_emotion = "unknown"

    return most_common_emotion

In [None]:
def predict_emotion_texlen(text):
    cleaned_text = re.sub(r'\b[A-Z]{2,}\b', '', text)
    sentences = re.split(r'(?<=[.!?]) +', cleaned_text)
    total_length = sum(len(sentence) for sentence in sentences)
    segment_probabilities = np.zeros(len(emotion_labels))

    for sentence in sentences:
        if sentence.strip() != '':
            sentence_length_percentage = len(sentence) / total_length if total_length > 0 else 0
            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)

            with torch.no_grad():
                outputs = model(**inputs)

            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=-1).numpy()
            weighted_probabilities = probabilities * sentence_length_percentage
            segment_probabilities += weighted_probabilities.squeeze()

    dominant_emotion_index = np.argmax(segment_probabilities)
    dominant_emotion = emotion_labels[dominant_emotion_index]
    return dominant_emotion

In [None]:
def predict_emotion_texlen_neut(text):
    cleaned_text = re.sub(r'\b[A-Z]{2,}\b', '', text)
    sentences = sent_tokenize(cleaned_text)
    total_length = sum(len(sentence) for sentence in sentences)
    segment_probabilities = np.zeros(len(emotion_labels))
    neutral_sentences = 0

    for sentence in sentences:
        sentiment_score = sia.polarity_scores(sentence)
        if sentiment_score['neu'] > 0.75:  # Adjust the threshold as needed
            neutral_sentences += 1
            continue

        if sentence.strip() != '':
            sentence_length_percentage = len(sentence) / total_length if total_length > 0 else 0
            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)

            with torch.no_grad():
                outputs = model(**inputs)   

            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=-1).numpy()
            weighted_probabilities = probabilities * sentence_length_percentage
            segment_probabilities += weighted_probabilities.squeeze()

    # Check if all sentences were neutral, predict for the last one if so
    if np.sum(segment_probabilities) == 0 and neutral_sentences == len(sentences) and len(sentences) > 0:
        sentence = sentences[-1]
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1).numpy()
        segment_probabilities += probabilities.squeeze()

    dominant_emotion_index = np.argmax(segment_probabilities) if np.sum(segment_probabilities) != 0 else "unknown"
    dominant_emotion = emotion_labels.get(dominant_emotion_index, "unknown")
    return dominant_emotion

In [None]:
def predict_emotion_neut(text):
    cleaned_text = re.sub(r'\b[A-Z]{2,}\b', '', text)
    sentences = re.split(r'(?<=[.!?]) +', cleaned_text)
    segment_probabilities = np.zeros(len(emotion_labels))
    sentiment_filtered_sentences = 0

    for sentence in sentences:
        sentiment_score = sia.polarity_scores(sentence)
        # Skipping the sentence if it's neutral, using a threshold for neutrality
        if sentiment_score['neu'] > 0.50:
            sentiment_filtered_sentences += 1
            continue

        if sentence.strip() != '':
            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)

            with torch.no_grad():
                outputs = model(**inputs)

            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=-1).numpy()
            segment_probabilities += probabilities.squeeze()

    # If all sentences are filtered out due to neutrality, and it's not the only sentence, return "unknown"
    if len(sentences) > 1 and sentiment_filtered_sentences == len(sentences):
        return "unknown"

    dominant_emotion_index = np.argmax(segment_probabilities)
    dominant_emotion = emotion_labels.get(dominant_emotion_index, "unknown")
    return dominant_emotion


In [None]:
df = pd.concat([pd.read_csv('Data/translated_first_half.csv'), pd.read_csv('Data/translated_second_half.csv')])
df

In [None]:
df = pd.concat([pd.read_csv('Data/translated_first_half.csv'), pd.read_csv('Data/translated_second_half.csv')])

# Add a new column for predicted emotions
df['Emotion'] = df['Translated Content'].apply(predict_emotion_texlen_neut) # Change the function to the one you want to use

# Group by Segment and Episode to aggregate emotions
aggregated_emotions = df.groupby(['Episode', 'Segment'])['Emotion'].agg(lambda x: Counter(x).most_common(1)[0][0])

# Reset index to turn MultiIndex into columns again
aggregated_emotions = aggregated_emotions.reset_index(name='Most Common Emotion')

# Optionally, save the aggregated emotions DataFrame to a new Excel file
output_path = 'path/to/your/output_file.xlsx'
# Uncomment the following line to save to Excel
# aggregated_emotions.to_excel(output_path, index=False)

print("Process completed. The aggregated emotions have been saved to:", output_path)

In [None]:
aggregated_emotions.to_csv('Data/pipeline_emotions.csv', index=False)

In [None]:
val_df = pd.read_csv('Data/Robinson22_structure.csv')

In [None]:
val_df = pd.read_csv('Data/Robinson22_structure.csv')

# Keep only necessary columns and remove duplicates
val_df = val_df.drop_duplicates(subset=['Episode name', 'Start Time (seconds)'])
val_df = val_df[['Episode name', 'Emotions']]

# Initialize the counter and create a new column for the counting segments
val_df['Counted Segment'] = 0
counter = 1
current_episode = val_df.iloc[0]['Episode name']

# Iterate over the rows and assign the counting segment number
for idx, row in val_df.iterrows():
    if row['Episode name'] == current_episode:
        val_df.at[idx, 'Counted Segment'] = counter
        counter += 1
    else:
        current_episode = row['Episode name']
        counter = 1
        val_df.at[idx, 'Counted Segment'] = counter


In [None]:
complex_to_basic_emotion = {
    'Admiration': 'happiness',
    'Amusement': 'happiness',
    'Anger': 'anger',
    'Annoyance': 'anger',
    'Anticipation': 'surprise',
    'Anxiety': 'fear',
    'Approval': 'happiness',
    'Caring': 'happiness',
    'Confusion': 'surprise',
    'Curiosity': 'surprise',
    'Desire': 'happiness',
    'Disappointment': 'sadness',
    'Disapproval': 'disgust',
    'Disgust': 'disgust',
    'Excitement': 'happiness',
    'Fear': 'fear',
    'Grief': 'sadness',
    'Gratitude': 'happiness',
    'Hunger': 'sadness',  # "Hunger" doesn't typically map directly to a basic emotion but can be associated with a longing or lack, thus 'sadness'.
    'Joy': 'happiness',
    'Love': 'happiness',
    'Nervousness': 'fear',
    'Optimism': 'happiness',
    'Pride': 'happiness',
    'Realization': 'surprise',
    'Relief': 'happiness',
    'Remorse': 'sadness',
    'Sadness': 'sadness',
    'Shame': 'disgust',
    'Surprise': 'surprise'
}


def map_emotions(complex_emotions_string):
    # Convert the cell to a string and split on ','
    complex_emotions_list = complex_emotions_string.split(',')

    # Map each of the complex emotions to a basic one
    basic_emotions_list = [complex_to_basic_emotion.get(emotion.strip(), 'unknown') for emotion in complex_emotions_list]
    
    # Count the most frequent basic emotions
    emotion_counter = Counter(basic_emotions_list)
    
    # Remove 'unknown' from the counter if there are other emotions
    if len(emotion_counter) > 1 and 'unknown' in emotion_counter:
        del emotion_counter['unknown']

    # Return the most common emotion or 'unknown' if no emotions are found
    return emotion_counter.most_common(1)[0][0] if emotion_counter else 'unknown'

# Read in your data
# Assuming your DataFrame is named 'val_df' and has been loaded correctly.

# Apply the mapping function to the 'Emotions' column
val_df['Basic Emotion'] = val_df['Emotions'].astype(str).apply(map_emotions)


In [None]:
comb = val_df.merge(aggregated_emotions, left_on=['Episode name', 'Counted Segment'], right_on=['Episode', 'Segment'])

In [None]:
comb.drop(columns=['Episode name', 'Emotions', 'Counted Segment'], inplace=True)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(comb['Basic Emotion'], comb['Most Common Emotion'], labels=['happiness', 'sadness', 'anger', 'fear', 'surprise', 'disgust', 'unknown'])

# Convert confusion matrix to DataFrame for easier plotting
cm_df = pd.DataFrame(cm,
                     index=['happiness', 'sadness', 'anger', 'fear', 'surprise', 'disgust', 'unknown'], 
                     columns=['happiness', 'sadness', 'anger', 'fear', 'surprise', 'disgust', 'unknown'])

# Plot the heatmap
plt.figure(figsize=(10, 7))
sns.heatmap(cm_df, annot=True, fmt='g', cmap='Blues')
plt.title('Confusion Matrix of Emotions')
plt.ylabel('Actual Emotions')
plt.xlabel('Predicted Emotions')
plt.show()