In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from typing import List, Tuple, Dict, Callable

# disable warnings
import warnings

warnings.filterwarnings("ignore")

In [None]:
# NOTE(review): this cell reads `labels` and `data`. In the visible notebook
# both are only assigned by the per-dataset plotting cells further down, so on
# a fresh kernel this cell raises NameError unless one of them ran first, and
# it then describes whichever file that cell loaded last — confirm intent.


def _mean_or_nan(values):
    """Return the arithmetic mean of `values`, or NaN when `values` is empty.

    NaN keeps a label/text without any entries from aborting the whole cell
    with a ZeroDivisionError while still being visibly "no data".
    """
    return sum(values) / len(values) if values else float("nan")


# Compute avg similarity for each base sample
# Result: {label: {text: mean of the first element of every entry for that text}}
similarities_avg_individual = {k: {} for k in labels}
for label, dict_with_similarities in data.items():
    for text, entries in dict_with_similarities.items():
        # only the first element of each entry is used as its similarity
        similarities_avg_individual[label][text] = _mean_or_nan(
            [x[0] for x in entries]
        )

# Compute avg similarity for each label
# similarities_list_label: {label: [per-text averages]}
# similarities_avg_label:  {label: mean of those per-text averages}
similarities_list_label = {
    label: list(similarities_avg_individual[label].values()) for label in labels
}
similarities_avg_label = {
    label: _mean_or_nan(per_text_averages)
    for label, per_text_averages in similarities_list_label.items()
}

# Boxplot with all similarities for all files

## Ten-dim

In [None]:
# Gather every similarity value stored in the ten-dim result files found in the
# current working directory and draw one boxplot per label, split by dataset.

# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the dataset (hue) order differ between machines and runs.
files = sorted(file for file in os.listdir() if file.startswith('ten-dim'))
labels: List[str] = [
    "social_support",
    "conflict",
    "trust",
    "neutral",
    "fun",
    "respect",
    "knowledge",
    "power",
    "similarity_identity",
]

# One (label, similarity, dataset) tuple per similarity value.
rows = []

for filename in files:
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)

    # Dataset name = file name up to the '_augmented_similarity.json' suffix.
    dataset = filename.split('_augmented_similarity.json')[0]

    # Pre-seeding with `labels` fixes the label order of the rows per file;
    # a label in the file that is not listed in `labels` raises KeyError.
    similarities_concatenated = {k: [] for k in labels}
    for label, dict_with_similarities in data.items():
        for text in dict_with_similarities:
            # only the first element of each entry is used as its similarity
            list_similarities = [x[0] for x in dict_with_similarities[text]]
            similarities_concatenated[label].extend(list_similarities)

    for label, similarities in similarities_concatenated.items():
        rows.extend((label, similarity, dataset) for similarity in similarities)

# Build the frame once: DataFrame.append was removed in pandas 2.0, copied the
# whole frame on every call and left duplicated index values behind.
df = pd.DataFrame(rows, columns=["label", "similarity", "dataset"])

# Explicit figure/axes instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(15, 15))

sns.boxplot(data=df, x="similarity", y="label", hue="dataset", ax=ax)
ax.set_title("Ten-dim: similarity by label and dataset")

fig.tight_layout()
plt.show()

## Sentiment

In [None]:
# Gather every similarity value stored in the sentiment result files found in
# the current working directory and draw one boxplot per label, split by dataset.

# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the dataset (hue) order differ between machines and runs.
files = sorted(file for file in os.listdir() if file.startswith('sentiment'))
labels: List[str] = [
    "negative",
    "neutral",
    "positive",
]

# One (label, similarity, dataset) tuple per similarity value.
rows = []

for filename in files:
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)

    # Dataset name = file name up to the '_augmented_similarity.json' suffix.
    dataset = filename.split('_augmented_similarity.json')[0]

    # Pre-seeding with `labels` fixes the label order of the rows per file;
    # a label in the file that is not listed in `labels` raises KeyError.
    similarities_concatenated = {k: [] for k in labels}
    for label, dict_with_similarities in data.items():
        for text in dict_with_similarities:
            # only the first element of each entry is used as its similarity
            list_similarities = [x[0] for x in dict_with_similarities[text]]
            similarities_concatenated[label].extend(list_similarities)

    for label, similarities in similarities_concatenated.items():
        rows.extend((label, similarity, dataset) for similarity in similarities)

# Build the frame once: DataFrame.append was removed in pandas 2.0, copied the
# whole frame on every call and left duplicated index values behind.
df = pd.DataFrame(rows, columns=["label", "similarity", "dataset"])

# Explicit figure/axes instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(15, 15))

sns.boxplot(data=df, x="similarity", y="label", hue="dataset", ax=ax)
ax.set_title("Sentiment: similarity by label and dataset")

fig.tight_layout()
plt.show()

## Hate-speech

In [None]:
# Gather every similarity value stored in the hate-speech result files found in
# the current working directory and draw one boxplot per label, split by dataset.

# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the dataset (hue) order differ between machines and runs.
files = sorted(file for file in os.listdir() if file.startswith('hate-speech'))
labels: List[str] = ["NOT", "OFF"]

# One (label, similarity, dataset) tuple per similarity value.
rows = []

for filename in files:
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)

    # Dataset name = file name up to the '_augmented_similarity.json' suffix.
    dataset = filename.split('_augmented_similarity.json')[0]

    # Pre-seeding with `labels` fixes the label order of the rows per file;
    # a label in the file that is not listed in `labels` raises KeyError.
    similarities_concatenated = {k: [] for k in labels}
    for label, dict_with_similarities in data.items():
        for text in dict_with_similarities:
            # only the first element of each entry is used as its similarity
            list_similarities = [x[0] for x in dict_with_similarities[text]]
            similarities_concatenated[label].extend(list_similarities)

    for label, similarities in similarities_concatenated.items():
        rows.extend((label, similarity, dataset) for similarity in similarities)

# Build the frame once: DataFrame.append was removed in pandas 2.0, copied the
# whole frame on every call and left duplicated index values behind.
df = pd.DataFrame(rows, columns=["label", "similarity", "dataset"])

# Explicit figure/axes instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(15, 15))

sns.boxplot(data=df, x="similarity", y="label", hue="dataset", ax=ax)
ax.set_title("Hate-speech: similarity by label and dataset")

fig.tight_layout()
plt.show()