<h3>Case study</h3>
Find two interesting recordings with at least two annotators and multiple annotations. Compare the temporal and textual annotations, and try to answer the following questions:

In [1]:
# INITIAL SETUP (NEW VERSION)
import os
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

# Import from our package
from src.config import config, DatasetType
from src.data.loaders import load_annotations, load_metadata

ModuleNotFoundError: No module named 'src'


<h5>1. Data Loading</h5>

In [None]:
# Load the exploration dataset through the project loaders.
try:
    # Annotation table with its embeddings, then the metadata table
    # (the two extra values returned by load_metadata are not needed here).
    annotations_df, annotations_emb = load_annotations()
    metadata_df, _, _ = load_metadata()

    # Length of each annotated region: offset minus onset.
    annotations_df['duration'] = annotations_df['offset'] - annotations_df['onset']

    print("Data loaded successfully!")
    print(f"Annotations: {len(annotations_df)} records")
    print(f"Metadata: {len(metadata_df)} records")

except FileNotFoundError:
    # Explain how to make the dataset available, then let the error propagate
    # so that the cells below do not run against missing data.
    help_lines = (
        "ERROR: Could not load dataset",
        "\nSOLUTION: Please either:",
        f"1. Place the dataset in: {config.get_path(DatasetType.EXPLORATION)}",
        "2. Or set custom path using:",
        "   config.set_path(DatasetType.EXPLORATION, Path('your/path'))",
        "3. Or set environment variable:",
        "   export MLPC_EXPLORATION_PATH='your/path'",
        "\nDownload from: https://cloud.cp.jku.at/index.php/s/YKJqiWjnQQAjiH5",
    )
    print("\n".join(help_lines))
    raise

<h5>(a) Identify similarities/differences between annotators</h5>

In [None]:
# First case-study recording: every annotation row, in table order.
print("Analysis for 185070.mp3:")
first_case_rows = annotations_df.loc[annotations_df['filename'] == '185070.mp3']
display(first_case_rows)

# Second case-study recording: rows ordered by annotator and then by onset,
# so each annotator's regions can be read one after another.
print("\nAnalysis for 637068.mp3:")
second_case_rows = annotations_df.loc[annotations_df['filename'] == '637068.mp3']
second_case_sorted = second_case_rows.sort_values(by=['annotator', 'onset'])
display(second_case_sorted)

<h5>(b) Compare annotations with metadata</h5>


In [None]:
# Show the metadata row(s) of both case-study recordings, one after the other.
for header, audio_file in (
    ("Metadata for 185070.mp3:", '185070.mp3'),
    ("\nMetadata for 637068.mp3:", '637068.mp3'),
):
    print(header)
    display(metadata_df.loc[metadata_df['filename'] == audio_file])

<h5>(c) Verify annotation guidelines compliance</h5>


<h6>Audio file: 185070.mp3</h6>
Both annotations were done according to the task description. However, we can observe a gap between seconds 6 and 9, which correspond to the onset and offset respectively; this indicates a change over time that neither annotation mentions.

<h6>Audio file: 637068.mp3</h6>The students who described the chosen regions with only one or two words do not fulfil the criteria for textual annotation (descriptor, temporal and context information), which makes their annotations unclear and ambiguous. The student who chose a wider region to describe several sounds at the same time violates the criteria for temporal annotation: in some regions the "cymbal" sound appears multiple times, with gaps of more than one second between occurrences.

<h4>2. Quantitative Analysis</h4>

In [None]:
# Count, for every file, the annotated regions on which different annotators agree.
# Annotations "agree" when they come from different annotators, share a common
# time interval, and their durations differ by at most `precision_threshold`.
precision_threshold = 0.1  # seconds; largest accepted difference between durations


def _regions_agree(rows, tolerance):
    """Tell whether annotation rows from different annotators mark the same region.

    Args:
        rows: sequence of annotation rows providing 'annotator', 'onset',
            'offset' and 'duration'.
        tolerance: largest accepted difference between any two durations.

    Returns:
        True when every pair of rows has a different annotator, all rows share
        a common time interval, and every pair of durations differs by at most
        `tolerance`; False otherwise.
    """
    pairs = list(combinations(rows, 2))
    if any(a['annotator'] == b['annotator'] for a, b in pairs):
        return False
    latest_onset = max(row['onset'] for row in rows)
    earliest_offset = min(row['offset'] for row in rows)
    # Written as "not (<)" so that NaN timestamps never count as an overlap.
    if not (latest_onset < earliest_offset):
        return False
    return all(abs(a['duration'] - b['duration']) <= tolerance for a, b in pairs)


overlap_per_file_with_2_annotators = {}
overlap_per_file_with_3_annotators = {}
count_per_zero_overlap = 0  # files (any number of annotators) without a single agreeing region
unique_annontator_per_file = []
total_number_files_with_2_annotators = 0
total_number_files_with_3_annotators = 0

for file, group in annotations_df.groupby('filename'):
    n_annotators = group['annotator'].nunique()
    unique_annontator_per_file.append(n_annotators)

    counter_for_2_annotators = 0
    counter_for_3_annotators = 0
    rows = [row for _, row in group.iterrows()]

    if n_annotators == 2:
        total_number_files_with_2_annotators += 1
        counter_for_2_annotators = sum(
            _regions_agree(pair, precision_threshold) for pair in combinations(rows, 2)
        )
    elif n_annotators == 3:
        total_number_files_with_3_annotators += 1
        counter_for_3_annotators = sum(
            _regions_agree(triple, precision_threshold) for triple in combinations(rows, 3)
        )

    # Every file gets an entry in both dictionaries (0 when the case does not apply).
    overlap_per_file_with_2_annotators[file] = counter_for_2_annotators
    overlap_per_file_with_3_annotators[file] = counter_for_3_annotators

    # Previously this tested a variable that was never incremented, so every
    # file was counted; now only files without any agreeing region are counted.
    if counter_for_2_annotators == 0 and counter_for_3_annotators == 0:
        count_per_zero_overlap += 1

num_of_files_with_overlaps_2_annotators = sum(1 for val in overlap_per_file_with_2_annotators.values() if val > 0)
num_of_files_with_overlaps_3_annotators = sum(1 for val in overlap_per_file_with_3_annotators.values() if val > 0)

# `default` keeps the cell from raising ValueError when there are no files at all.
file_max_overlap_2_annotators, num_overlap_max_2_annotators = max(
    overlap_per_file_with_2_annotators.items(), key=lambda x: x[1], default=(None, 0)
)
file_max_overlap_3_annotators, num_overlap_max_3_annotators = max(
    overlap_per_file_with_3_annotators.items(), key=lambda x: x[1], default=(None, 0)
)

# Print results
print(f'Total files with 2 annotators: {total_number_files_with_2_annotators}')
print(f'Max overlapping regions: {num_overlap_max_2_annotators} (file: {file_max_overlap_2_annotators})')
print(f'Files with ≥1 overlap (2 annotators): {num_of_files_with_overlaps_2_annotators}')

print(f'\nTotal files with 3 annotators: {total_number_files_with_3_annotators}')
print(f'Max overlapping regions: {num_overlap_max_3_annotators} (file: {file_max_overlap_3_annotators})')
print(f'Files with ≥1 overlap (3 annotators): {num_of_files_with_overlaps_3_annotators}')

<h5>(b) Text annotation similarity</h5>

In [None]:
# Text similarity between annotations that different annotators gave to the
# same region. Rows match when they come from different annotators, share a
# common time interval, and their durations differ by at most `duration_tolerance`.
# NOTE(review): the overlap-count cell uses precision_threshold = 0.1 s while this
# cell has always used 0.9 s - confirm which tolerance is intended.
duration_tolerance = 0.9  # seconds


def _same_region(rows):
    """Return True when the rows are from distinct annotators and mark one region.

    Every pair must have different annotators, all rows must share a common
    time interval, and every pair of durations must differ by at most
    `duration_tolerance`.
    """
    pairs = list(combinations(rows, 2))
    if any(a['annotator'] == b['annotator'] for a, b in pairs):
        return False
    # Written as "not (<)" so that NaN timestamps never count as an overlap.
    if not (max(row['onset'] for row in rows) < min(row['offset'] for row in rows)):
        return False
    return all(abs(a['duration'] - b['duration']) <= duration_tolerance for a, b in pairs)


def _pair_similarity(label_a, label_b):
    """Cosine similarity between the embeddings stored at two row labels."""
    # Assumes annotations_df keeps an index whose labels equal the row
    # positions in annotations_emb - TODO confirm against the loader.
    first = annotations_emb[label_a].reshape(1, -1)
    second = annotations_emb[label_b].reshape(1, -1)
    return cosine_similarity(first, second)[0][0]


def _format_mean(values):
    """Format the mean with three decimals, or a placeholder for an empty list."""
    # np.mean([]) would return nan and emit a RuntimeWarning.
    return f'{np.mean(values):.3f}' if len(values) > 0 else 'n/a (no matching regions)'


similarities_2_annotators = []
similarities_3_annotators = []

for _filename, group in annotations_df.groupby('filename'):
    n_annotators = group['annotator'].nunique()
    if n_annotators not in (2, 3):
        continue

    # One row per annotator is combined: pairs for 2 annotators, triples for 3.
    for combo in combinations(group.iterrows(), n_annotators):
        labels = [label for label, _row in combo]
        if not _same_region([row for _label, row in combo]):
            continue

        if n_annotators == 2:
            similarities_2_annotators.append(_pair_similarity(labels[0], labels[1]))
        else:
            # Average of the three pairwise similarities within the triple.
            sim1 = _pair_similarity(labels[0], labels[1])
            sim2 = _pair_similarity(labels[0], labels[2])
            sim3 = _pair_similarity(labels[1], labels[2])
            similarities_3_annotators.append((sim1 + sim2 + sim3) / 3)

print(f'\nText similarity results:')
print(f'Pairs compared: {len(similarities_2_annotators)}')
print(f'Triples compared: {len(similarities_3_annotators)}')
print(f'Mean similarity (2 annotators): {_format_mean(similarities_2_annotators)}')
print(f'Mean similarity (3 annotators): {_format_mean(similarities_3_annotators)}')