# Introduction

This notebook explores how comments vary with their toxicity score. In the Jigsaw Unintended Bias dataset, a comment's toxicity value is the fraction of human annotators who judged that comment to be toxic, so it can be read as a probability-like score between 0 and 1.

**TODO:**
Research questions I hope to answer in this notebook:\
Do comments with a toxicity score above 0.5 … *(question left unfinished — complete it)*

### Imports

In [1]:
import pandas as pd
import numpy as np
import os

### Data

In [2]:
# Directory holding the dataset files, relative to this notebook.
data_path = '../../data/'
train_csv = os.path.join(data_path, 'train.csv')

# Load the training set, then rename the raw `target` column to `toxicity`
# so the rest of the notebook can refer to it by that name.
df = pd.read_csv(train_csv)
df = df.rename(columns={'target': 'toxicity'})

# Comments

In [3]:
# Toxicity-related score columns. 'toxicity' is the name given to the raw
# `target` column when the data is loaded; the remaining names are presumably
# sub-type score columns of the same frame -- confirm against train.csv.
main_indicators = [
    'toxicity',
    'severe_toxicity',
    'obscene',
    'identity_attack',
    'insult',
    'threat',
    'sexual_explicit',
]

# Identity labels of interest (presumably identity columns in the data -- confirm).
main_identities = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'white',
    'black',
    'psychiatric_or_mental_illness',
]

# Seed used when drawing random samples, so results are reproducible.
RANDOM_STATE = 42

In [4]:
def save_short_comments_in_toxicity_range(
    df,
    toxicity_min=0.0,
    toxicity_max=0.2,
    n=50,
    max_len=300,
    filename="short_comments.txt",
    random_state=None,
):
    """
    Save up to n random short comments within a toxicity range to a text file.

    A row is eligible when ``toxicity_min <= toxicity < toxicity_max`` (the
    upper bound is exclusive) and its ``comment_text`` is at most ``max_len``
    characters long. Rows with a missing ``comment_text`` are never eligible,
    because their length is NaN and the comparison is False.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``toxicity`` and ``comment_text`` columns. If an ``id``
        column is present it is written alongside each comment.
    toxicity_min : float
        Inclusive lower bound on ``toxicity``.
    toxicity_max : float
        Exclusive upper bound on ``toxicity``.
    n : int
        Maximum number of comments to write; fewer are written when fewer
        rows are eligible.
    max_len : int
        Maximum comment length, in characters.
    filename : str
        Output path. The file is overwritten and written as UTF-8.
    random_state : int or None
        Seed passed to ``DataFrame.sample``. ``None`` (the default) uses the
        notebook-level ``RANDOM_STATE`` constant, which keeps existing calls
        reproducible exactly as before.

    Returns
    -------
    None
        The function writes the file and prints a one-line summary.

    Notes
    -----
    Each record starts with a blank line, then an optional ``ID:`` line, a
    ``Toxicity:`` line (two decimals), and the comment text between two
    40-dash rules.
    """
    # Only look up the global when the caller did not supply a seed, so the
    # function can also be used where RANDOM_STATE is not defined.
    seed = RANDOM_STATE if random_state is None else random_state

    in_range = (df['toxicity'] >= toxicity_min) & (df['toxicity'] < toxicity_max)
    is_short = df['comment_text'].str.len() <= max_len
    mask = in_range & is_short

    has_id = 'id' in df.columns
    cols = ['id', 'toxicity', 'comment_text'] if has_id else ['toxicity', 'comment_text']

    # Cap n at the number of eligible rows; sample() raises if asked for more.
    sample = df.loc[mask, cols].sample(n=min(n, int(mask.sum())), random_state=seed)

    rule = "-" * 40 + "\n"
    with open(filename, "w", encoding="utf-8") as f:
        for record in sample.itertuples(index=False):
            # The blank separator line is written for every record, not only
            # when an ID line follows, so files without ids stay readable.
            f.write("\n")
            if has_id:
                f.write(f"ID: {record.id}\n")
            f.write(f"Toxicity: {record.toxicity:.2f}\n")
            f.write(rule)
            f.write(record.comment_text + "\n")
            f.write(rule)

    print(f"Saved {len(sample)} comments to {filename}")

In [5]:
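# Example exports, currently disabled: uncomment a line to write that toxicity band to a text file.
#save_short_comments_in_toxicity_range(df, toxicity_min=0.3, toxicity_max=0.4, n=100, max_len=250, filename="toxicity_0.3_0.4.txt")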
#save_short_comments_in_toxicity_range(df, toxicity_min=0.4, toxicity_max=0.5, n=100, max_len=250, filename="toxicity_0.4_0.5.txt")
#save_short_comments_in_toxicity_range(df, toxicity_min=0.5, toxicity_max=0.6, n=100, max_len=250, filename="toxicity_0.5_0.6.txt")