In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import json
import numpy as np
import warnings
from detoxify import Detoxify

# Notebook-wide display and runtime settings.
pd.options.display.max_colwidth = None  # never truncate long text columns
tqdm.pandas()  # enable tqdm progress bars for pandas apply-style calls
warnings.filterwarnings(action="ignore", category=UserWarning)

In [None]:
# Read the CSV into a DataFrame and print a summary of the resulting frame.
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory so the notebook runs elsewhere.
CLEANED_DATA_CSV = "/vol/bitbucket/es1519/detecting-hidden-purpose-in-nlp-models/detoxify/war_data/cleaned_data.csv"
reduced_data = pd.read_csv(CLEANED_DATA_CSV)
reduced_data.info()

In [None]:
# Load the per-tweet label scores. The encoding is pinned to UTF-8 so that
# non-ASCII tweet text is decoded the same way on every platform instead of
# depending on the locale's default encoding.
with open('sentiment_results.json', encoding='utf-8') as f:
    results = json.load(f)
print(f"{len(results)} entries in JSON file")

In [None]:
# Group the scores by label across all entries of `results`
# (mapping: tweet -> {label: score}).
# The labels are discovered from the data itself, so this cell does not depend
# on a label list defined elsewhere and cannot fail on an unexpected label.
# Labels that never occur in `results` therefore get no entry.
label_scores = {}
for scores in results.values():
    for label, score in scores.items():
        label_scores.setdefault(label, []).append(score)

In [None]:
def _draw_score_histogram(ax, scores, title, bins):
    """Draw one log-scale histogram of ``scores`` over [0, 1] on ``ax``."""
    ax.hist(scores, bins=bins, range=(0, 1))
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.set_yscale('log')


def plot_histogram(label_scores, ncols=3, bins=50):
    """Plot a log-scale score histogram per label, plus one for all scores pooled.

    One subplot is drawn for every key of ``label_scores`` (in iteration
    order), followed by a final subplot that pools the scores of all labels.
    After the figure is shown, the pooled scores are summarised on stdout as
    counts in ten equal-width bins over [0, 1].

    The subplot grid is sized from the number of labels, so any number of
    labels can be plotted; grid cells that are left over are hidden rather
    than shown as empty axes.

    Args:
        label_scores: Mapping of label -> sequence of numeric scores. Values
            outside [0, 1] fall outside the histogram range and are not counted.
        ncols: Number of subplot columns (default 3).
        bins: Number of bins used in each plotted histogram (default 50).

    Raises:
        ValueError: If ``ncols`` is smaller than 1.
    """
    if ncols < 1:
        raise ValueError("ncols must be at least 1")

    # One panel per label plus the pooled "all prompts" panel.
    n_plots = len(label_scores) + 1
    nrows = (n_plots + ncols - 1) // ncols  # ceiling division

    # squeeze=False keeps `axs` two-dimensional even for a single row/column,
    # so flatten() always works.
    fig, axs = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(4 * ncols, 4 * nrows),
        squeeze=False,
    )
    axs = axs.flatten()

    for ax, (label, scores) in zip(axs, label_scores.items()):
        _draw_score_histogram(ax, scores, f'Histogram for "{label}"', bins)

    all_scores = [value for values in label_scores.values() for value in values]
    _draw_score_histogram(
        axs[n_plots - 1], all_scores, "Histogram for All Prompts", bins
    )

    # Hide grid cells that did not receive a histogram.
    for unused_ax in axs[n_plots:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    # Coarse textual summary of the pooled distribution.
    counts, edges = np.histogram(all_scores, bins=10, range=(0, 1))
    for lower_bound, upper_bound, count in zip(edges[:-1], edges[1:], counts):
        print(f"{lower_bound:.2f} - {upper_bound:.2f}: {count}")


In [None]:
# Rebuild the mapping with its keys in sorted order, then plot it.
label_scores = {label: label_scores[label] for label in sorted(label_scores)}
plot_histogram(label_scores)

In [None]:
# Score every tweet (the keys of `results`) with the 'original' Detoxify model
# and collect the returned scores per toxicity label.
toxify = Detoxify('original')
toxicity_scores = {
    name: []
    for name in (
        'toxicity',
        'severe_toxicity',
        'obscene',
        'threat',
        'insult',
        'identity_attack',
    )
}
for tweet in tqdm(results):
    prediction = toxify.predict(tweet)
    for label in prediction:
        toxicity_scores[label].append(prediction[label])

In [None]:
# Plot the Detoxify score distributions: one histogram per toxicity label plus
# a pooled histogram, followed by the printed 10-bin counts of the pooled scores.
plot_histogram(toxicity_scores)


In [None]:
# For each score threshold, count the tweets that have at least one label
# score strictly above it.
tweets_per_threshold = {
    threshold: 0 for threshold in (0.6, 0.7, 0.75, 0.8, 0.9, 0.95)
}
for scores in results.values():
    for threshold in tweets_per_threshold:
        if any(score > threshold for score in scores.values()):
            tweets_per_threshold[threshold] += 1

total_tweets = len(results)
for threshold, count in tweets_per_threshold.items():
    # An empty results file would otherwise raise ZeroDivisionError here.
    percentage = round(count / total_tweets * 100, 2) if total_tweets else 0.0
    print(f"{threshold}: {count} tweets ({percentage}%)")