In [24]:
import pandas as pd

**Dataset**
https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/data?select=train.csv

In [53]:
# Load the three CSV inputs from the current working directory.
train, test, toxicity_individual_annotations = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "toxicity_individual_annotations.csv")
)

**Merge the training data with `toxicity_individual_annotations.csv` (the individual rater decisions for the toxicity questions), aggregated per comment `id`**

In [32]:
# Collapse the per-rater rows into one row per comment `id`:
#   total_n_workers - number of non-null `worker` entries for the comment
#   toxic_sum       - sum of the `toxic` column for the comment
#   pct_toxic       - toxic_sum divided by total_n_workers
toxicity_individual_annotations_agg = (
    toxicity_individual_annotations
    .groupby('id')
    .agg(
        total_n_workers=('worker', 'count'),
        toxic_sum=('toxic', 'sum'),
    )
    .reset_index()
    .assign(pct_toxic=lambda agg: agg['toxic_sum'] / agg['total_n_workers'])
)

In [87]:
# train[train['comment_text'] == 'The ideology of Islam is in direct conflict with the constitution on many counts. It is unconstitutional to let muslims into the US.']

In [88]:
# toxicity_individual_annotations[toxicity_individual_annotations["id"] == 917315]

In [54]:
# Inner join on `id`: attach the aggregated rater columns to each training row.
# Training rows with no matching aggregate row are not kept by an inner join.
train_with_label = pd.merge(
    train,
    toxicity_individual_annotations_agg,
    on="id",
    how="inner",
)
# test_with_label = test.merge(toxicity_individual_annotations_agg, how='inner', on="id")

In [63]:
# Sanity-check the merge: the number of merged rows carrying a non-null
# `pct_toxic` must equal the number of rows in `train`.
assert len(train) == train_with_label['pct_toxic'].notna().sum()
# assert test.shape[0] == test_with_label[~test_with_label['pct_toxic'].isnull()].shape[0]

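**Split into toxic and non-toxic**

Comments with `pct_toxic >= 0.5` form the toxic subset and comments with `pct_toxic == 0` form the non-toxic subset; comments with `0 < pct_toxic < 0.5` fall into neither subset.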

In [64]:
# Toxic subset: rows whose `pct_toxic` is at least 0.5.
train_toxic = train_with_label.loc[train_with_label['pct_toxic'].ge(0.5)]

In [66]:
# Non-toxic subset: rows whose `pct_toxic` is exactly 0.
train_non_toxic = train_with_label.loc[train_with_label['pct_toxic'].eq(0)]

In [84]:
# Number of rows in the toxic subset.
len(train_toxic)

144334

In [85]:
# Number of rows in the non-toxic subset.
len(train_non_toxic)

1264764

**Train/Val Split**

In [100]:
# Fraction of each subset held out for validation.
# NOTE(review): the origin of this exact value is not recorded in the notebook - confirm and document it.
val_size = 0.2017404076655535

# Validation row counts per subset; int() truncates the fractional part.
val_n = int(len(train_toxic) * val_size)
val_n_non_toxic = int(len(train_non_toxic) * val_size)

In [101]:
# Fixed random state so that sampling is reproducible across runs.
seed = 0

In [102]:
def _split_off_validation(frame, n_val, random_state):
    """Sample `n_val` rows of `frame` and return `(validation, remainder)`.

    The remainder is `frame` with the sampled index labels dropped.
    """
    held_out = frame.sample(n=n_val, random_state=random_state)
    return held_out, frame.drop(held_out.index)


# Both subsets are split with the same random state.
val_toxic, train_toxic_final = _split_off_validation(train_toxic, val_n, seed)
val_non_toxic, train_non_toxic_final = _split_off_validation(train_non_toxic, val_n_non_toxic, seed)

In [103]:
# Each subset must be fully accounted for by its train and validation parts.
assert len(train_toxic) == len(train_toxic_final) + len(val_toxic)
assert len(train_non_toxic) == len(train_non_toxic_final) + len(val_non_toxic)

In [105]:
# Preview the text column of the toxic validation rows.
val_toxic.loc[:, ['comment_text']]

Unnamed: 0,comment_text
39853,"Well, maybe social media will give justice kee..."
835139,You are correct...the world would be better of...
560255,I'd like to see a ban on Trump entering countr...
922719,So typical of republicans. They love to lock p...
146379,Classic Useful idiot of the Red kind! They jus...
...,...
1487884,All you nay-sayers (sp) do you also think that...
962764,".\n\n""The Pope gave Mr. Trump ..... Laudato si..."
1154039,There is not reasoning with someone who sympat...
894440,"Shades of Don Young! Jeez, guess it wasn't rea..."


In [108]:
# Write only the `comment_text` column of the toxic splits, without the index.
val_toxic.to_csv(
    "../detoxifying-text-marco/datasets/val_toxic.csv",
    columns=['comment_text'],
    index=False,
)
train_toxic_final.to_csv(
    "../detoxifying-text-marco/datasets/train_toxic.csv",
    columns=['comment_text'],
    index=False,
)

In [109]:
# Write only the `comment_text` column of the non-toxic splits, without the index.
val_non_toxic.to_csv(
    "../detoxifying-text-marco/datasets/val_non_toxic.csv",
    columns=['comment_text'],
    index=False,
)
train_non_toxic_final.to_csv(
    "../detoxifying-text-marco/datasets/train_non_toxic.csv",
    columns=['comment_text'],
    index=False,
)