In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from json_repair import repair_json
import matplotlib.pyplot as plt
import seaborn as sns
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

# Variant 1 extraction

In [None]:
# Root folder of the dataset: the tar archives, the extracted audio folder and
# the result CSVs of this notebook are all addressed relative to it.
data_dir = Path("laions_got_talent_enhanced_flash_annotations_and_long_captions")

In [None]:
# Load the reference table that maps each emotion label to its VAD statistics.
vad_scores_to_emotion = pd.read_csv("original_emotion_to_vad_scores.csv")
# Synthetic "Neutral" entry: zero means, and the median of the existing
# standard deviations for every dimension. The row is built by column name so
# that it does not depend on the column order of the CSV.
neutral_row = pd.DataFrame([{
    "emotion": "Neutral",
    "pleasure_mean": 0.0,
    "pleasure_std": vad_scores_to_emotion["pleasure_std"].quantile(0.5),
    "arousal_mean": 0.0,
    "arousal_std": vad_scores_to_emotion["arousal_std"].quantile(0.5),
    "dominance_mean": 0.0,
    "dominance_std": vad_scores_to_emotion["dominance_std"].quantile(0.5),
}]).reindex(columns=vad_scores_to_emotion.columns)
# The Neutral row is placed first (index 0); the rows read from the CSV follow
# with a fresh 0..n index.
vad_scores_to_emotion = pd.concat([neutral_row, vad_scores_to_emotion], ignore_index=True)

## Extract emotions

In [None]:
# Collect the names of all tar archives whose file name contains "and".
choosen_files = [
    archive.name
    for archive in data_dir.glob("*.tar")
    if "and" in archive.name
]

In [None]:
# Parse every archive name into its language prefix (text before the first "_")
# and the list of emotion phrases encoded after the "intense" marker.
desc = {"file": [], "language": [], "emotions": []}
for file in choosen_files:
    language = file.split("_")[0]
    description = "_".join(file.split("intense")[1:])
    if "vocalbursts" in description:
        description = description.split("vocalbursts")[0]
    # Names without the "vocalbursts" suffix would otherwise carry the archive
    # extension into the last phrase.
    if description.endswith(".tar"):
        description = description[: -len(".tar")]
    # Split on the standalone word "and" only. A plain substring split would
    # also cut inside words that merely contain "and" (e.g. "abandonment").
    phrases, current_words = [], []
    for word in description.split("_"):
        if word == "and":
            phrases.append(" ".join(current_words))
            current_words = []
        else:
            current_words.append(word)
    phrases.append(" ".join(current_words))
    emotions = [phrase.strip() for phrase in phrases if phrase.strip()]
    desc["file"].append(file)
    desc["language"].append(language)
    desc["emotions"].append(emotions)

In [None]:
# Keeps only the archives that are present in `answers` and drops the ".tar"
# extension from their names.
# NOTE(review): `answers` is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel — confirm where it should come from.
choosen_files = [file.replace(".tar","") for file in choosen_files if file in answers]

In [None]:
base_df = pd.DataFrame(desc)
# desc["emotions"] holds lists of phrases. Join each list into one
# comma-separated string: the annotation prompt expects a comma-separated list,
# and strings (unlike lists) are hashable, which the later `.unique()` call and
# the answer-dictionary lookup require. Values that are already strings are
# only stripped.
base_df["emotions"] = base_df["emotions"].apply(
    lambda phrases: phrases.strip() if isinstance(phrases, str) else ", ".join(phrases)
)
# Placeholder VAD columns; they are filled once the emotions are summarized.
vad_columns = ["pleasure_mean", "pleasure_std", "arousal_mean", "arousal_std", "dominance_mean", "dominance_std"]
base_df[vad_columns] = np.nan
base_df.head()

In [None]:
# Preview the reference table loaded earlier. It is deliberately not re-read
# from disk here: reloading the CSV would discard the "Neutral" row that was
# added right after the first load.
vad_scores_to_emotion.head()

## Set up the LLM

In [None]:
# Address of the Ollama server used by the Variant 1 model below.
base_url = "http://localhost:11434"

In [None]:
# LLM client used to summarize the emotion lists; the context window is set to 8192*4 tokens.
model = OllamaLLM(base_url=base_url, model="gemma3:27b", num_ctx=8192*4) # ~18Gb of VRAM

In [None]:
# Prompt layout: the system prompt first, then the text to annotate.
template = """{system_prompt}
    
{text}
"""
# Build the prompt -> model pipeline here so the annotation loop of this
# variant can call `chain.invoke` on a fresh kernel.
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | model

In [None]:
# One allowed label per line. Computed outside the f-string because a backslash
# inside an f-string expression is a SyntaxError on Python versions before 3.12.
allowed_labels = "\n".join(vad_scores_to_emotion["emotion"].tolist())
system_prompt = f"""You are an assistant that helps summarize emotional labels into a single, most representative term.
The user will provide a list of words or phrases, separated by commas. Your task is to carefully analyze the list and respond with one of the allowed labels that best captures the shared meaning or emotional essence of the entire group. Choose the most inclusive and central term.

The allowed labels you must use for answer are the following:
{allowed_labels}
Respond with a only a single word — no explanations or additional text."""

## Run annotation

In [None]:
# Cache of LLM answers keyed by the emotion text that was sent to the model.
answers_sum = {}

In [None]:
# Ask the model once per distinct emotion text; entries already present in
# answers_sum are skipped.
for emotion_text in base_df["emotions"].unique():
    if emotion_text not in answers_sum:
        request = {"system_prompt": system_prompt, "text": emotion_text}
        answers_sum[emotion_text] = chain.invoke(request)

## Apply answers to the dataframe

In [None]:
# Add the model answer as a new column. Asterisks (markdown emphasis) are
# removed before stripping, so whitespace that sat between an asterisk and the
# label does not survive in the final value.
base_df["summarized_emotion"] = (
    base_df["emotions"]
    .map(answers_sum)
    .apply(lambda answer: answer.replace("*", "").strip())
)

In [None]:
# Rows whose summarized_emotion does not appear in vad_scores_to_emotion["emotion"]
is_known_emotion = base_df["summarized_emotion"].isin(vad_scores_to_emotion["emotion"])
not_in_emotion_to_vad = base_df[~is_known_emotion]
print(not_in_emotion_to_vad["summarized_emotion"].unique())

## Retrieve VAD scores for summarized emotions

In [None]:
# Fill the six VAD columns of a row from vad_scores_to_emotion
def update_vad_scores(row):
    """Copy the reference VAD statistics of the row's summarized emotion into the row.

    The first row of ``vad_scores_to_emotion`` whose ``emotion`` equals
    ``row["summarized_emotion"]`` supplies the values. When no such row exists
    the input row is returned unchanged.
    """
    matches = vad_scores_to_emotion[vad_scores_to_emotion["emotion"] == row["summarized_emotion"]]
    if len(matches) > 0:
        reference = matches.iloc[0]
        for column in (
            "pleasure_mean",
            "pleasure_std",
            "arousal_mean",
            "arousal_std",
            "dominance_mean",
            "dominance_std",
        ):
            row[column] = reference[column]
    return row


In [None]:
# Run update_vad_scores on every row (axis=1) to fill in the VAD columns.
base_df = base_df.apply(update_vad_scores, axis=1)

In [None]:
# Persist the per-archive annotations as a semicolon-separated CSV inside data_dir.
base_df.to_csv(data_dir / "final_annotations_with_summarized_emotions.csv", sep=";", index=False)

## Applying scores to audio files in folders

In [None]:
# Folder that holds the extracted audio and the mp3 listing read below.
audio_dir = data_dir / "extracted_audio"

In [None]:
# mp3_files.txt lists every mp3 file found in the subfolders of audio_dir,
# one path per line; blank lines are ignored.
with open(audio_dir / "mp3_files.txt", mode="r") as listing:
    mp3_files = [entry.strip() for entry in listing if entry.strip()]

In [None]:
# One row per listed mp3 path.
audio_df = pd.DataFrame({"file": mp3_files})

In [None]:
# Everything before the last "/" of the path (empty string when there is no "/").
audio_df["parent_dir"] = audio_df["file"].apply(lambda path: path.rpartition("/")[0])

In [None]:
# Left-join the per-archive annotations onto every audio file by matching the
# file's parent directory against base_df["file"]. Both frames have a "file"
# column, so the result carries them as file_x (audio path) and file_y (archive name).
# NOTE(review): base_df["file"] is filled from archive names that still end in
# ".tar"; presumably the extracted directories carry the same name — verify,
# otherwise the join leaves every annotation column empty.
audio_df = audio_df.merge(base_df, left_on="parent_dir", right_on="file", how="left")

In [None]:
# Remove the join helper columns that are no longer needed
audio_df = audio_df.drop(["file_y", "parent_dir"], axis=1)

In [None]:
# Restore the plain "file" name and expose the summarized label as "verified_emotion".
audio_df = audio_df.rename(
    columns={
        "file_x": "file",
        "summarized_emotion": "verified_emotion",
    }
)

In [None]:
# Path of every audio file joined onto audio_dir, stored as a string.
audio_df["full_path"] = audio_df["file"].map(lambda relative_path: str(audio_dir / relative_path))

## Save result

In [None]:
# Write the per-file annotations as a semicolon-separated CSV into audio_dir.
audio_df.to_csv(audio_dir / "final_audio_annotations_with_summarized_emotion.csv", index=False, sep=";")

# Variant 2

In [None]:
import json

In [None]:
def cut_substring(text, start_str, end_str):
    """Return the part of ``text`` between ``start_str`` and the next ``end_str``.

    Args:
        text: String to search.
        start_str: Marker that precedes the wanted text; its first occurrence is used.
        end_str: Marker that follows the wanted text; searched for after ``start_str``.

    Returns:
        The text strictly between the two markers, or an empty string when
        either marker is missing.
    """
    start_index = text.find(start_str)
    if start_index == -1:
        return ""  # start marker not found
    # Skip the whole start marker, not just its first character.
    content_start = start_index + len(start_str)
    end_index = text.find(end_str, content_start)
    if end_index == -1:
        return ""  # end marker not found after the start marker
    return text[content_start:end_index]

In [None]:
# Closing-tag prefixes searched for in each annotation line. The extraction
# loop stores the matches for these markers, in this order, as valence,
# arousal and dominance.
end_markers = ["</Va", "</Ar", "</Su"]

## Run emotion extraction

In [None]:
# Collect, for every annotated audio file, the three description lines
# (valence, arousal, dominance) found in its JSON annotation.
jsons = {
    "file": [],  # Must be without extension
    "valence": [],
    "arousal": [],
    "dominance": []
}
for directory in (data_dir / "extracted_audio").glob("*/"):
    # Python versions before 3.11 ignore the trailing slash of the pattern and
    # also yield plain files, so filter explicitly.
    if not directory.is_dir():
        continue
    for file in directory.glob("*.json"):
        # JSON text is UTF-8; do not rely on the platform default encoding.
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
        if "annotation" not in data:
            continue
        annotation_lines = data["annotation"].splitlines()
        lines = []
        # Markers are scanned in their listed order so the collected lines end
        # up as [valence, arousal, dominance].
        for end_marker in end_markers:
            for line in annotation_lines:
                if end_marker in line:
                    line = cut_substring(line, "#", end_marker)
                    if line == "":
                        print(f"Empty line found in file {file.name} for end marker {end_marker}. Skipping.")
                        continue
                    lines.append(line.strip())
        # Exactly one usable line per marker is required; anything else is skipped.
        if len(lines) != 3:
            print(f"Expected 3 lines in {file.name}, got {len(lines)} lines {lines}")
            continue
        jsons["file"].append(file.parent / file.stem)  # Store file path without extension
        jsons["valence"].append(lines[0])
        jsons["arousal"].append(lines[1])
        jsons["dominance"].append(lines[2])
df = pd.DataFrame(jsons)
    

## Save result

In [None]:
# Write the extracted descriptions as semicolon-separated text.
# NOTE(review): the content is CSV although the file name ends in ".json" —
# confirm whether consumers depend on this name before renaming it.
df.to_csv(data_dir / "extracted_audio" /"vad_descriptions.json", index=False, sep=";")

## Set up LLM

In [None]:
# LLM client for the second annotation pass.
# NOTE(review): the host is hard-coded here and differs from the `base_url`
# defined earlier in the notebook — confirm this is intended.
model = OllamaLLM(
    base_url="http://172.22.52.107:11434",
    model="gemma3:27b",
    num_ctx=8192,
)

In [None]:
# Prompt layout: the system prompt first, then the text to annotate.
template = """{system_prompt}
    
{text}
"""

In [None]:
# Compile the template into a prompt and pipe it into the model; `chain` is
# what the annotation cells invoke.
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | model

In [None]:
# Instructions for the scoring pass: the model receives the dimension name and
# a free-text description and must answer with one of the listed scores.
system_prompt = """You are helpful AI assistant. You will be provided with short description of the emotion expressed. Your task is to determine the strength and direction of the speaker's emotion.
The allowed descriptions and scores are:
- Extremely negative: -1
- Very negative: -0.75
- Weak: -0.5
- Slightly negative: -0.25
- Neutral: 0
- Slightly positive: 0.25
- Strong: 0.5
- Very positive: 0.75
- Extremely positive: 1

All generalizations must be made from speaker's perspective, not the subject's.

The description will be provided in the following format:
Emotion: arousal OR valence OR dominance
Description: <description>

Please, return only the emotion score without any additional text or explanation."""

In [None]:
# Create the (still empty) columns that will receive the model's scores.
for score_column in (
    "short_description_valence",
    "short_description_arousal",
    "short_description_dominance",
):
    df[score_column] = None


In [None]:
# Distinct description texts per dimension, so each text is sent to the model only once.
valence_unique, arousal_unique, dominance_unique = (
    df[dimension].unique() for dimension in ("valence", "arousal", "dominance")
)

In [None]:
# Description text -> model answer, one dictionary per dimension.
swap_valence, swap_arousal, swap_dominance = {}, {}, {}

## Run annotation

In [None]:
# Score every distinct description, dimension by dimension, in batches of 32.
# The loop variables are named `description`/`answer` on purpose: reusing the
# name `desc` here would overwrite the notebook-level `desc` dictionary built
# in Variant 1.
annotation_targets = (
    (swap_valence, valence_unique, "valence"),
    (swap_arousal, arousal_unique, "arousal"),
    (swap_dominance, dominance_unique, "dominance"),
)
for swap_dict, uniq_vals, dict_emotion in annotation_targets:
    text = f"Emotion: {dict_emotion}\nDescription: "
    for i in range(0, len(uniq_vals), 32):
        batch = uniq_vals[i:i+32].tolist()
        responses = chain.batch(
            [
                {
                    "system_prompt": system_prompt,
                    "text": text + description
                } for description in batch
            ])
        # Pair every description with its own answer instead of indexing by position.
        for description, answer in zip(batch, responses):
            swap_dict[description] = answer.strip()

## Apply annotation result

In [None]:
# Replace each description by the model's answer; texts without an answer are kept as they are.
column_plan = (
    (swap_valence, "valence", "short_description_valence"),
    (swap_arousal, "arousal", "short_description_arousal"),
    (swap_dominance, "dominance", "short_description_dominance"),
)
for swap_dict, source_column, target_column in column_plan:
    df[target_column] = df[source_column].apply(lambda text: swap_dict.get(text, text))

In [None]:
# Frequency of each returned valence answer.
df["short_description_valence"].value_counts()

In [None]:
# Frequency of each returned arousal answer.
df["short_description_arousal"].value_counts()

In [None]:
# Frequency of each returned dominance answer.
df["short_description_dominance"].value_counts()

In [None]:
# Convert the textual answers to floats (raises if an answer is not numeric)
for score_column in (
    "short_description_arousal",
    "short_description_valence",
    "short_description_dominance",
):
    df[score_column] = df[score_column].astype(float)

In [None]:
# Export only the numeric scores: the free-text description columns are
# dropped and the score columns take over their names.
output_file = data_dir / "vad_descriptions.csv"
df_to_save = (
    df
    .drop(columns=["valence", "arousal", "dominance"])
    .rename(
        columns={
            "short_description_valence": "valence",
            "short_description_arousal": "arousal",
            "short_description_dominance": "dominance",
        }
    )
)
df_to_save.to_csv(output_file, index=False, sep=";")

## Resampling

In [None]:
# Read the CSV back to confirm it was written correctly. From here on `df`
# holds the numeric valence/arousal/dominance columns of that file instead of
# the free-text descriptions.
df = pd.read_csv(output_file, sep=";")

### Optional step - resample to new range

In [None]:
def resample_series(series, new_min=-0.7, new_max=0.7):
    """Linearly rescale ``series`` so its minimum/maximum become ``new_min``/``new_max``.

    Args:
        series: Numeric pandas Series.
        new_min: Value the smallest element is mapped to.
        new_max: Value the largest element is mapped to.

    Returns:
        A Series with the same index as ``series``. When all values are equal
        there is no range to stretch, so every non-missing element is mapped to
        the midpoint of the new range instead of dividing by zero.
    """
    old_min = series.min()
    old_max = series.max()
    old_span = old_max - old_min
    if old_span == 0:
        midpoint = (new_min + new_max) / 2
        # `series * 0` keeps the index and leaves missing values missing.
        return series * 0 + midpoint
    return ((series - old_min) / old_span) * (new_max - new_min) + new_min

In [None]:

# Stretch every score column onto the range spanned by the matching reference means.
for score_column, reference_column in (
    ("valence", "pleasure_mean"),
    ("arousal", "arousal_mean"),
    ("dominance", "dominance_mean"),
):
    reference_means = vad_scores_to_emotion[reference_column]
    df[score_column] = resample_series(
        df[score_column],
        new_min=reference_means.min(),
        new_max=reference_means.max(),
    )

## Apply resampling

In [None]:
def apply_resampling(value, stds_mean, stds_std, rng=None, max_std_attempts=10_000):
    """Draw a noisy (mean, std) pair around a discrete score.

    Args:
        value: Score to perturb. Exactly -1 and 1 are pulled in to -0.85 and
            0.85 before sampling; any other value is used as is.
        stds_mean: Centre of the normal distribution the std is drawn from.
        stds_std: Spread parameter of that distribution; a third of it is used
            as the scale.
        rng: Optional object with a ``normal(loc, scale)`` method (for example
            ``numpy.random.default_rng(seed)``) for reproducible draws. The
            global ``numpy.random`` state is used when omitted.
        max_std_attempts: Upper bound on the draws made to obtain a std inside
            [0.1, 0.5].

    Returns:
        Tuple ``(mean, std)`` with ``-1 < mean < 1`` and ``0.1 <= std <= 0.5``.

    Raises:
        ValueError: If no admissible std is found within ``max_std_attempts``
            draws, or no admissible mean is found after the spread has been
            narrowed by more than a factor of 10.
    """
    sampler = np.random if rng is None else rng
    if value == -1:
        mean = -0.85
    elif value == 1:
        mean = 0.85
    else:
        mean = value
    # Rejection-sample the std until it lies in [0.1, 0.5]. The attempt limit
    # prevents an endless loop when the parameters cannot reach that interval.
    std = sampler.normal(stds_mean, stds_std/3)
    attempts = 1
    while std < 0.1 or std > 0.5:
        if attempts >= max_std_attempts:
            raise ValueError(
                "Cannot find suitable std value in the range 0.1 <= std <= 0.5, "
                "stds_mean: {}, stds_std: {}".format(stds_mean, stds_std)
            )
        std = sampler.normal(stds_mean, stds_std/3)
        attempts += 1
    mean_gen = sampler.normal(mean, std)
    scaling_factor = 1.0
    # Mean should be in the range -1 < mean < 1; each retry narrows the spread
    # used for the draw (the returned std itself is not changed).
    while mean_gen <= -1 or mean_gen >= 1:
        scaling_factor += 0.1
        if scaling_factor > 10:
            raise ValueError("Cannot find suitable mean value in the range -1 < mean < 1, Mean: {}, Std: {}".format(mean, std))
        mean_gen = sampler.normal(mean, std/scaling_factor)
    return mean_gen, std

def resample_vad_scores(df, emotion_df):
    """Draw a resampled (mean, std) pair per row for valence, arousal and dominance.

    ``df`` supplies the ``valence``/``arousal``/``dominance`` scores and
    ``emotion_df`` the reference ``*_std`` columns. The result is a new
    DataFrame with one row per input row and the columns ``pleasure_mean``,
    ``pleasure_std``, ``arousal_mean``, ``arousal_std``, ``dominance_mean``
    and ``dominance_std``.
    """
    # NOTE(review): the mean of the reference std column is passed as both
    # `stds_mean` and `stds_std` — confirm that this is intended.
    dimension_pairs = (
        ("valence", "pleasure_std"),
        ("arousal", "arousal_std"),
        ("dominance", "dominance_std"),
    )
    sampled = []
    for score_column, std_column in dimension_pairs:
        std_center = emotion_df[std_column].mean()
        std_spread = emotion_df[std_column].mean()
        # Every element of the resulting Series is a (mean, std) tuple.
        sampled.append(df[score_column].apply(apply_resampling, args=(std_center, std_spread)))
    records = [
        {
            "pleasure_mean": pleasure,
            "pleasure_std": pleasure_std,
            "arousal_mean": arousal,
            "arousal_std": arousal_std,
            "dominance_mean": dominance,
            "dominance_std": dominance_std,
        }
        for (pleasure, pleasure_std), (arousal, arousal_std), (dominance, dominance_std) in zip(*sampled)
    ]
    return pd.DataFrame(records)

In [None]:
# Draw the resampled means/stds for every row of df. The draws use NumPy's
# global random state, so results differ between runs unless it is seeded.
resampled_df = resample_vad_scores(df, vad_scores_to_emotion)

## Plot distributions

In [None]:
# Side-by-side histograms of the resampled pleasure means and standard deviations
fig, (mean_ax, std_ax) = plt.subplots(1, 2, figsize=(12, 6))
sns.histplot(resampled_df["pleasure_mean"], kde=True, bins=30, ax=mean_ax)
mean_ax.set_title("Distribution of Pleasure Means")
sns.histplot(resampled_df["pleasure_std"], kde=True, bins=30, ax=std_ax)
std_ax.set_title("Distribution of Pleasure Stds")
fig.tight_layout()
plt.show()

In [None]:
# Side-by-side histograms of the resampled arousal means and standard deviations
fig, (mean_ax, std_ax) = plt.subplots(1, 2, figsize=(12, 6))
sns.histplot(resampled_df["arousal_mean"], kde=True, bins=30, ax=mean_ax)
mean_ax.set_title("Distribution of Arousal Means")
sns.histplot(resampled_df["arousal_std"], kde=True, bins=30, ax=std_ax)
std_ax.set_title("Distribution of Arousal Stds")
fig.tight_layout()
plt.show()

In [None]:
# Side-by-side histograms of the resampled dominance means and standard deviations.
# The std column is named "dominance_std" (lower case), matching the keys
# produced by resample_vad_scores.
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(resampled_df["dominance_mean"], kde=True, bins=30)
plt.title("Distribution of Dominance Means")
plt.subplot(1, 2, 2)
sns.histplot(resampled_df["dominance_std"], kde=True, bins=30)
plt.title("Distribution of Dominance Stds")
plt.tight_layout()
plt.show()

## Merge result

In [None]:
# Attach the resampled statistics column-wise. Columns left over from an
# earlier run of this cell are dropped first, so re-running it replaces them
# instead of adding duplicate column names.
df = pd.concat(
    [df.drop(columns=resampled_df.columns, errors="ignore"), resampled_df],
    axis=1,
)

In [None]:
def find_closest_emotion(v, a, d, df):
    """Return the emotion whose reference VAD means are nearest to ``(v, a, d)``.

    Args:
        v: Pleasure/valence coordinate.
        a: Arousal coordinate.
        d: Dominance coordinate.
        df: DataFrame with the columns ``pleasure_mean``, ``arousal_mean``,
            ``dominance_mean`` and ``emotion``.

    Returns:
        The ``emotion`` value of the row with the smallest Euclidean distance;
        rows whose distance is NaN are ignored.

    Raises:
        ValueError: If ``df`` is empty or every distance is NaN.
    """
    distance = np.sqrt(
        (v - df["pleasure_mean"]) ** 2
        + (a - df["arousal_mean"]) ** 2
        + (d - df["dominance_mean"]) ** 2
    )
    # Work with row positions throughout: an index label is only a valid
    # position when the frame happens to have a default 0..n-1 index.
    closest_position = int(np.nanargmin(distance.to_numpy()))
    return df["emotion"].iloc[closest_position]

In [None]:
# For every reference emotion, the Euclidean distance from each row of df to
# that emotion's (pleasure, arousal, dominance) means.
vad_mean_columns = ["pleasure_mean", "arousal_mean", "dominance_mean"]
sample_points = df[vad_mean_columns].values
reference_points = vad_scores_to_emotion[vad_mean_columns].values
stat = [
    np.sqrt(np.sum((sample_points - reference_point) ** 2, axis=1))
    for reference_point in reference_points
]

In [None]:
# Turn the per-emotion distance lists into an array with one row per sample
# and one column per emotion. Running this cell twice transposes it back.
stat = np.transpose(np.array(stat))

In [None]:
# Label every sample with the emotion at the smallest distance in its row.
nearest_indices = np.argmin(stat, axis=1)
emotion_labels = vad_scores_to_emotion["emotion"].values
verified_emotions = [emotion_labels[idx] for idx in nearest_indices]
df["verified_emotion"] = verified_emotions

In [None]:
# Plot distribution of verified emotions as a horizontal bar chart, most
# frequent label first. The data comes from `df`, the frame that holds the
# "verified_emotion" column at this point in the notebook.
plt.figure(figsize=(6, 12))
sns.countplot(data=df, y="verified_emotion", order=df["verified_emotion"].value_counts().index)
plt.title("Distribution of Verified Emotions")
plt.xlabel("Count")
plt.ylabel("Verified Emotion")
plt.tight_layout()
plt.show()

## Save result

In [None]:
# Keep only the columns that are exported and expose the file column as "full_path"
export_columns = [
    "file",
    "pleasure_mean",
    "pleasure_std",
    "arousal_mean",
    "arousal_std",
    "dominance_mean",
    "dominance_std",
    "verified_emotion",
]
df_to_save = df[export_columns].rename(columns={"file": "full_path"})

In [None]:
# Write the resampled annotations as a semicolon-separated CSV next to the extracted audio.
output_file = data_dir / "extracted_audio" /"vad_descriptions_resampled.csv"
df_to_save.to_csv(output_file, index=False, sep=";")