# Pre-Analysis of the audio files

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
from utils.format_helpers import analyze_audio_files, extract_id

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## Raw Audio Analysis (Meditation)

In [None]:
# Folders scanned for the meditation recordings and the file extensions to look for
directories = ["../data/OBE1", "../data/OBE2", "../data/Compassion"]
extensions = ["m4a", "wav", "mp4"]

# Run the audio analysis helper over these folders, order the rows
# alphabetically by experiment name, and save the table so that later cells
# can reload it from disk
df = analyze_audio_files(directories, extensions).sort_values("Experiment")
df.to_csv("outputs/audio_data.csv", index=False)
df

In [None]:
# Summed duration per experiment, followed by the grand total over all files
per_experiment_total = df.groupby('Experiment')['Duration_timedelta'].sum()
overall_total = df['Duration_timedelta'].sum()
print(per_experiment_total)
print(overall_total)

In [None]:
# Look-up table built from the interview overview: file name -> condition
df_condition = pd.read_csv('./outputs/overview_interviews.csv')
overview_file_names = df_condition['File Name']
overview_conditions = df_condition['Condition']
filename_to_condition = dict(zip(overview_file_names, overview_conditions))

# Reload the saved audio table and attach the condition of each file
df = pd.read_csv("outputs/audio_data.csv")
df['Condition'] = df['File Name'].map(filename_to_condition)
df

In [None]:
# Duplicate every row under a synthetic "All" experiment so the pooled
# distribution is drawn next to the per-experiment ones
df_all = df.copy()
df_all['Experiment'] = 'All'
df_combined = pd.concat([df_all, df])

# Derive the duration in minutes from the Duration_sec column
df_combined['Duration_min'] = df_combined['Duration_sec'] / 60

# Same colour for every experiment, a contrasting one for the "All" box
default_palette = sns.color_palette("deep")
palette = dict.fromkeys(df_combined['Experiment'].unique(), default_palette[0])
if 'All' in palette:
    palette['All'] = default_palette[3]

# Box plot with the individual recordings overlaid as points
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(
    data=df_combined,
    x='Experiment',
    y='Duration_min',
    hue='Experiment',
    palette=palette,
    ax=ax,
)
sns.stripplot(
    data=df_combined,
    x='Experiment',
    y='Duration_min',
    color='gray',
    alpha=0.7,
    edgecolor='k',
    linewidth=1,
    ax=ax,
)

# Title and axis labels
ax.set_title('Audio Duration by Experiment')
ax.set_xlabel('Experiments')
ax.set_ylabel('Duration (minutes)')

plt.tight_layout()
plt.show()

In [None]:
# One bar per participant and experiment, counting that participant's recordings
fig, ax = plt.subplots(figsize=(12, 4))
sns.countplot(data=df, x='Id', hue='Experiment', palette='Set2', ax=ax)

ax.set_title('Number of Interviews by Participant')
ax.set_xlabel('Participant ID')
ax.set_ylabel('Number of Interviews')
plt.xticks(rotation=45)
ax.legend(title='Experiment')

# Counts are whole numbers, so place a y tick on every integer up to the
# largest number of rows recorded for a single Id
max_interviews = int(df['Id'].value_counts().max())
ax.set_yticks(range(0, max_interviews + 1))

plt.tight_layout()
# To export the figure: plt.savefig('interviews_by_participant.png', dpi=600)
plt.show()

In [None]:
# Number of distinct file names within each experiment
files_per_experiment = df.groupby('Experiment')['File Name'].nunique()
df_count = files_per_experiment.reset_index()
df_count

In [None]:
# Total over all experiments. numeric_only=True skips the 'Experiment' label
# column, which a plain sum() would concatenate into one meaningless string.
df_count.sum(numeric_only=True)

In [None]:
# Number of distinct participant ids within each experiment
ids_per_experiment = df.groupby('Experiment')['Id'].nunique()
df_count = ids_per_experiment.reset_index()
df_count

## Get the order of the conditions for each experiment (Meditation)

Final version; it needs to be completed and verified manually.
- Done: the corrected files are currently located in the ``interviews_corrected`` folder.

**C** : Control ; **I** : Intervention ; **1** : Only one interview ; **0** : No interview (e.g. set-up)

In [None]:
# Start again from the audio table saved by the analysis cell
df = pd.read_csv("outputs/audio_data.csv")

# Compassion
conditions_compassion = pd.read_csv("../data/Compassion/Compassion_orders.csv")
conditions_compassion["Id"] = conditions_compassion["Number"].apply(extract_id)
# If the first word is control -> CI, otherwise -> IC
conditions_compassion["Order_Condition"] = conditions_compassion["order"].apply(lambda x: "CI" if x.split()[0] == "control" else "IC")

# OBE2
# The folder is spelled "../data" (lowercase) like in every other cell;
# "../Data" only resolves on case-insensitive file systems.
conditions_obe2 = pd.read_excel("../data/OBE2/Order_of_conditions.xlsx")
conditions_obe2["Id"] = conditions_obe2["Identification No."].apply(extract_id)
# CM -> CI, otherwise -> IC
conditions_obe2["Order_Condition"] = conditions_obe2["Order"].apply(lambda x: "CI" if x == "CM" else "IC")

# OBE1 (the ids are taken from "subjID" as-is, without extract_id)
conditions_obe1 = pd.read_csv("../data/OBE1/order_OBE1.csv")
conditions_obe1["Id"] = conditions_obe1["subjID"]
# CM -> CI, otherwise -> IC
conditions_obe1["Order_Condition"] = conditions_obe1["order"].apply(lambda x: "CI" if x == "CM" else "IC")

# Combine all conditions and attach the order to every audio file.
# The left merge keeps audio files without a known order (Order_Condition is NaN).
# NOTE(review): the merge key is "Id" alone - if the same Id occurs in more
# than one experiment, audio rows are duplicated; confirm that Ids are unique
# across experiments.
conditions = pd.concat([conditions_compassion, conditions_obe2, conditions_obe1], ignore_index=True)
df = pd.merge(df, conditions[["Id","Order_Condition"]], on="Id", how="left")

# Create the 'Condition' column and set initial values to None
df["Condition"] = None
# Sort so that, within one Id, the files appear in alphabetical order.
# NOTE(review): this cell reads the column as 'File_name' while earlier cells
# read 'File Name' from the same CSV - confirm which spelling
# analyze_audio_files actually writes.
df = df.sort_values(by=['Id', 'File_name'])

def assign_condition(row: "pd.Series", frame: "pd.DataFrame | None" = None):
    """Return the condition label for one audio file.

    Args:
        row: One row of the audio table. It must provide 'Id', 'Experiment',
            'Order_Condition' and 'Condition', and ``row.name`` must be the
            row's index label in ``frame``.
        frame: Table used to look up all files sharing the row's Id. It is
            expected to be ordered so that the files of one Id appear in
            their intended sequence. Defaults to the notebook-level ``df``.

    Returns:
        "1" when the Id has a single file. For OBE1/OBE2 Ids with exactly two
        files, the first or second letter of 'Order_Condition' depending on
        the file's position within the Id. In every other case (other
        experiments, other file counts, missing order) the row's current
        'Condition' value is returned unchanged.
    """
    if frame is None:
        frame = df  # notebook-level table prepared by the preceding cell

    # Get rows for the same Id in any experiment
    id_group = frame[frame['Id'] == row['Id']]

    # If there is only one file for this Id, set Condition to "1"
    if len(id_group) == 1:
        return "1"

    # Ids without a matching order entry carry NaN after the left merge;
    # indexing NaN would raise TypeError, so such rows stay unassigned.
    order = row['Order_Condition']
    has_order = isinstance(order, str) and len(order) >= 2

    # Assign 'Condition' based on the file's position within its Id for 'OBE1' and 'OBE2'
    if has_order and row['Experiment'] in ('OBE1', 'OBE2') and len(id_group) == 2:
        if row.name == id_group.index[0]:
            return order[0]
        if row.name == id_group.index[1]:
            return order[1]

    # For other cases, leave Condition as it is
    return row['Condition']

# Fill in 'Condition' row by row with assign_condition
df['Condition'] = df.apply(assign_condition, axis=1)

# Keep only the columns of the structured table and write it to disk
output_columns = ["Experiment", "File_name", "Id", "Duration", "Order_Condition", "Condition"]
df = df[output_columns]
df.to_csv("outputs/structured_data.csv", index=False)
df

## Raw Audio Analysis (Grief)

In [None]:
# Folders scanned for the grief recordings and the file extensions to look for
directories = ["../data/Grief/eng", "../data/Grief/fr"]
extensions = ["m4a", "wav", "mp4"]

# Run the audio analysis helper over these folders and save the resulting table
df = analyze_audio_files(directories, extensions)
df.to_csv("outputs/GRIEF_audio_data.csv", index=False)
df

In [None]:
# Summed duration per experiment, followed by the grand total over all files
per_experiment_total = df.groupby('Experiment')['Duration_timedelta'].sum()
overall_total = df['Duration_timedelta'].sum()
print(per_experiment_total)
print(overall_total)

In [None]:
# Number of distinct values in 'Id' (a missing Id, if any, counts as one value)
df['Id'].nunique(dropna=False)

In [None]:
# Total duration per participant and experiment, with the duration in minutes
# derived from the Duration_sec column
df_summarized = (
    df.groupby(['Id', 'Experiment'], as_index=False)['Duration_sec']
    .sum()
    .assign(Duration_min=lambda totals: totals['Duration_sec'] / 60)
)

# Same rows relabelled as "All" so the pooled distribution gets its own box
df_summarized_all = df_summarized.assign(Experiment='All')

# "All" first, then the individual experiments
df_combined = pd.concat([df_summarized_all, df_summarized])

# Box plot with the per-participant totals overlaid as points
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df_combined, x='Experiment', y='Duration_min', ax=ax)
sns.stripplot(
    data=df_combined,
    x='Experiment',
    y='Duration_min',
    alpha=0.7,
    edgecolor='k',
    linewidth=1,
    ax=ax,
)

# Title and axis labels
ax.set_title('Audio Duration by Experiment (Summed by Participant ID)')
ax.set_xlabel('Experiments')
ax.set_ylabel('Duration (minutes)')

plt.tight_layout()
plt.show()