In [None]:
from pathlib import Path
import string

import pandas as pd
import seaborn as sns
from nltk.tokenize import word_tokenize

In [None]:
# Switch seaborn to its "paper" context and keep the object returned by
# sns.plotting_context for the same context.
sns.set_theme(context="paper")
plotting_context = sns.plotting_context(context="paper")

In [None]:
# Both survey tables are read from CSV files under data/survey.
base_path = Path("data") / "survey"
annotations_df = pd.read_csv(base_path.joinpath("annotations.csv"))
participants_df = pd.read_csv(base_path.joinpath("participants.csv"))

In [None]:
# Keep only the rows selected by the "passed_instructions" column.
passed_mask = participants_df["passed_instructions"]
participants_df = participants_df[passed_mask]

In [None]:
# Compare the size of the participants table with the number of distinct
# participant ids that occur in the annotations.
signed_up_count = len(participants_df)
annotating_count = annotations_df["participant_id"].nunique()
print(f"Participants signed up: {signed_up_count}. Actual participants: {annotating_count}")

In [None]:
# Summary statistics of the annotations table as computed by DataFrame.describe().
annotations_df.describe()

In [None]:
# Count the annotation rows of every participant and show the distribution
# of those counts as a histogram.
annotations_per_participant = annotations_df.groupby(by="participant_id").size()
annotations_per_participant.plot.hist()

In [None]:
# Summary of the per-participant annotation counts.
# The mean is formatted with an explicit fixed-point spec (".2f"): a bare ".2"
# means "two significant digits" and switches to scientific notation for
# values >= 10 (12.3 would be printed as "1.2e+01").
print(f"Average number of annotations per participant: Mean {annotations_per_participant.mean():.2f} Median {annotations_per_participant.median()} Min {annotations_per_participant.min()} Max {annotations_per_participant.max()}")

In [None]:
# Median of the "time" column, printed with one decimal place.
# The previous spec ".03" had no type code, i.e. three significant digits,
# which turns values >= 100 into scientific notation (e.g. "1.23e+02").
print(f"Median annotation duration in seconds {annotations_df['time'].median():.1f}")

In [None]:
# Number of annotation rows per distinct value of "stimulus_type".
annotations_df["stimulus_type"].value_counts()

In [None]:
def parse_aspects(aspects):
    """Split a ";"-separated aspects string into a list of aspect names.

    Args:
        aspects: Raw cell value such as "pitch; rhythm; ". A list or tuple
            is treated as already parsed and returned as a new list, so
            applying the function twice to the same column is harmless.
            Any other non-string value yields an empty list.

    Returns:
        The non-empty, whitespace-stripped parts in their original order.
        An empty or separator-only string yields [] (not [""]).
    """
    if isinstance(aspects, (list, tuple)):
        return list(aspects)
    if not isinstance(aspects, str):
        return []
    # Split on the bare ";" and strip each part so that "a;b" and "a; b"
    # parse the same way; empty parts (trailing separator, empty input)
    # are dropped.
    stripped_parts = (part.strip() for part in aspects.split(";"))
    return [part for part in stripped_parts if part]

# Replace missing values by "" and turn both raw aspect columns into lists.
for asp in ("aspects1", "aspects2"):
    raw_aspects = annotations_df[asp].fillna("")
    annotations_df[asp] = raw_aspects.map(parse_aspects)

In [None]:
# Aspect keys used in the experiment, mapped to their display labels.
experiment_aspects = {
    "temporal_order": "Temporal order",
    "location": "Recording setting",
    "number_sources": "Number of sources",
    "pitch": "Pitch",
    "color_density": "Color & Density",
    "duration": "Duration",
    "rhythm": "Rhythm",
    "loudness": "Loudness",
    "main_source": "Main source",
    "usage": "Usage context",
    "emotion": "Perceived emotion",
    "quality": "Recording quality",
}


def _known_aspects(selected):
    """Return the entries of `selected` that are keys of experiment_aspects, in order."""
    return [name for name in selected if name in experiment_aspects]


# The *_clean columns hold only aspects that are keys of experiment_aspects.
annotations_df["aspects1_clean"] = annotations_df["aspects1"].map(_known_aspects)
annotations_df["aspects2_clean"] = annotations_df["aspects2"].map(_known_aspects)

In [None]:
# Display the annotations DataFrame.
# NOTE(review): this shows the whole frame; annotations_df.head() may be enough here.
annotations_df

In [None]:
# Frequency of each value in "aspects1_clean" (one row per list element after explode()).
annotations_df["aspects1_clean"].explode().value_counts()

In [None]:
# Show one randomly drawn entry of the "aspects1" column.
# No random_state is passed, so a different entry may be shown on every run.
annotations_df["aspects1"].sample(1).item()

In [None]:
# Frequency of each value in "aspects2_clean" (one row per list element after explode()).
annotations_df["aspects2_clean"].explode().value_counts()

In [None]:
# One row per (annotation, aspects1_clean entry); keep the rows whose entry is "location".
exploded_aspects1 = annotations_df.explode("aspects1_clean")
exploded_aspects1[exploded_aspects1["aspects1_clean"] == "location"]

In [None]:
# Pie chart of the aspect frequencies summed over both aspect columns.
aspects1_counts = annotations_df["aspects1_clean"].explode().value_counts()
aspects2_counts = annotations_df["aspects2_clean"].explode().value_counts()
# fill_value=0 keeps aspects that occur in only one of the two columns.
aspects_counts = aspects1_counts.add(aspects2_counts, fill_value=0).sort_values(ascending=False)
# Wedges are labelled with the display names from experiment_aspects.
labels = list(map(experiment_aspects.__getitem__, aspects_counts.index))
ax = aspects_counts.plot.pie(autopct='%1.f%%', colors=sns.color_palette("tab10"), labels=labels)
# Apply tight_layout before the figure is written to disk.
figure = ax.figure
figure.tight_layout()
figure.savefig("aspects_pie.pdf")

In [None]:
# Pie chart of the aspects1_clean frequencies only.
aspects1_counts.plot.pie(autopct='%1.1f%%', colors=sns.color_palette("tab10"))

In [None]:
# Pie chart of the aspects2_clean frequencies only.
aspects2_counts.plot.pie(autopct='%1.1f%%', colors=sns.color_palette("tab10"))

In [None]:
def tokenize_query(query, tokenizer=None):
    """Lower-case a query, tokenize it and drop punctuation-only tokens.

    Args:
        query: Query text. Non-string values (e.g. NaN for a missing
            answer) yield an empty token list instead of raising.
        tokenizer: Optional callable mapping a string to a sequence of
            tokens. Defaults to nltk's word_tokenize.

    Returns:
        List of tokens of the lower-cased query, without tokens that
        consist solely of punctuation characters.
    """
    if not isinstance(query, str):
        return []
    if tokenizer is None:
        tokenizer = word_tokenize
    punctuation_chars = set(string.punctuation)
    # The tokenizer can emit multi-character punctuation tokens such as
    # "``", "''" or "..."; an exact match against the single punctuation
    # characters would keep those, so every character of a token is checked.
    return [
        token
        for token in tokenizer(query.lower())
        if not all(char in punctuation_chars for char in token)
    ]

# For both query columns, store the token list and the number of tokens.
for field in ["query1", "query2"]:
    field_tokens = annotations_df[field].map(tokenize_query)
    annotations_df[f"{field}_tokens"] = field_tokens
    annotations_df[f"{field}_length"] = field_tokens.map(len)

In [None]:
# Frequency of every token across all "query1_tokens" lists.
annotations_df["query1_tokens"].explode().value_counts()

In [None]:
def is_refined(query1, query2):
    """Tell whether the second query is a refinement of the first one.

    Returns False when query2 is the "<<SKIPPED>>" marker; otherwise
    returns whether query1 and query2 differ.
    """
    if query2 == "<<SKIPPED>>":
        return False
    return query1 != query2

# Row-wise flag: True where is_refined(query1, query2) holds.
annotations_df["refined"] = [
    is_refined(first_query, second_query)
    for first_query, second_query in zip(annotations_df["query1"], annotations_df["query2"])
]

In [None]:
# Share of rows per value of "refined" (counts divided by the number of rows).
annotations_df["refined"].value_counts().div(len(annotations_df))

In [None]:
# Percentage of rows with refined == True, per stimulus type.
annotations_df.groupby("stimulus_type")["refined"].mean().mul(100)

In [None]:
# Histogram of query1 length (row counts), coloured by the "refined" flag.
sns.histplot(data=annotations_df, x="query1_length", hue="refined")

In [None]:
def refined_diff(refined, query1, query2):
    """Length difference between the second and the first query.

    Returns len(query2) - len(query1) when `refined` is truthy and 0
    otherwise.
    """
    return len(query2) - len(query1) if refined else 0

# Token-count difference (query2 - query1) per row; 0 for rows that are not refined.
annotations_df["refined_longer"] = [
    refined_diff(flag, first_tokens, second_tokens)
    for flag, first_tokens, second_tokens in zip(
        annotations_df["refined"],
        annotations_df["query1_tokens"],
        annotations_df["query2_tokens"],
    )
]
refined_annotations = annotations_df[annotations_df["refined"]]
# Percentage of refined rows per length difference, rounded.
refined_longer_share = refined_annotations["refined_longer"].value_counts() / len(refined_annotations)
(refined_longer_share * 100).round()

In [None]:
# Combined percentage of refined rows whose second query has fewer tokens than the first.
shorter_mask = refined_annotations["refined_longer"] < 0
shorter_share = refined_annotations[shorter_mask]["refined_longer"].value_counts() / len(refined_annotations)
(shorter_share * 100).sum().round()

In [None]:
# Row with the largest positive "refined_longer" value, i.e. the biggest
# increase in token count from query1 to query2.
grew_mask = annotations_df["refined_longer"] > 0
annotations_df[grew_mask].sort_values("refined_longer", ascending=False).head(1)

In [None]:
# Refined rows whose second query has the same number of tokens as the first
# ("refined_longer" == 0).
refined_annotations[refined_annotations["refined_longer"] == 0]

In [None]:
# Mean / median token count of all first queries and of the second queries
# of refined rows.
first_lengths = annotations_df["query1_length"]
refined_lengths = refined_annotations["query2_length"]
print(f"Average query length: {first_lengths.mean():.1f} Median {first_lengths.median()}")
print(f"Average refined query length: {refined_lengths.mean():.1f} Median {refined_lengths.median()}")

In [None]:
# Mean and median query1 length per stimulus type.
annotations_df.groupby("stimulus_type")[["query1_length"]].agg(["mean", "median"])

In [None]:
# Mean and median query2 length per stimulus type, leaving out rows whose
# query2 is the "<<SKIPPED>>" marker.
not_skipped = annotations_df[annotations_df["query2"] != "<<SKIPPED>>"]
not_skipped.groupby("stimulus_type")[["query2_length"]].agg(["mean", "median"])

In [None]:
# Box plot of query1 length (x axis) for each stimulus type (y axis).
sns.boxplot(data=annotations_df, x="query1_length", y="stimulus_type")

In [None]:
# Relevance summary and count plot of the relevance values per stimulus type.
# The mean uses an explicit fixed-point spec (".2f"): a bare ".2" means two
# significant digits and falls back to scientific notation for values >= 10.
print(f"Relevance mean {annotations_df['result_relevance'].mean():.2f} median {annotations_df['result_relevance'].median()}")
sns.catplot(data=annotations_df, x="result_relevance", kind="count", hue="stimulus_type")

In [None]:
# Token frequencies per "aspects1" entry: one row per (aspect, token) pair
# after the double explode, keeping only pairs counted more than 3 times.
exploded_tokens = annotations_df.explode("aspects1").explode("query1_tokens")
aspect_tokens = exploded_tokens.groupby("aspects1")["query1_tokens"].value_counts()
aspect_tokens = aspect_tokens.loc[aspect_tokens > 3]
aspect_tokens