In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Checkpoint whose tokenizer is used to encode the text.
model_name = 'cardiffnlp/twitter-roberta-base-hate-latest'
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_batch(batch):
    """Tokenize the `clean_text` column of a batch, padded/truncated to 70 tokens."""
    return tokenizer(
        batch['clean_text'],
        padding="max_length",
        truncation=True,
        max_length=70,
    )


# Read the merged CSV and cast the label column to integers.
df = pd.read_csv("data/merged_wo_emojis.csv").astype({'labels': int})

# Wrap the frame in a HF Dataset and tokenize it batch by batch.
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize_batch, batched=True)

Map:   0%|          | 0/534172 [00:00<?, ? examples/s]

In [3]:
# Inspect the tokenized dataset (feature names and row count).
tokenized_dataset

Dataset({
    features: ['text', 'labels', 'word_count', 'clean_text', 'input_ids', 'attention_mask'],
    num_rows: 534172
})

In [4]:
# Remove unused columns
# The list of columns to drop is derived from the dataset instead of being
# hard-coded, so the same cell works for both tokenizers used with this data:
#   after my bertweet: ['text', 'clean_text', 'word_count', 'token_type_ids']
#   after cardiffnlp:  ['text', 'clean_text', 'word_count']
model_columns = ["input_ids", "attention_mask", "labels"]
unused_columns = [
    name for name in tokenized_dataset.column_names if name not in model_columns
]

# remove_columns raises on names that are not present, so only columns that
# actually exist are passed; running this cell twice in a row is then a no-op.
if unused_columns:
    tokenized_dataset = tokenized_dataset.remove_columns(unused_columns)

# Set format for PyTorch
tokenized_dataset.set_format(type="torch", columns=model_columns)

In [5]:
# Convert to pandas for stratified splitting
df = tokenized_dataset.to_pandas()

# Hold out 10,000 rows as the test set while preserving the label distribution.
# random_state is fixed so the split is reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=10_000,
    stratify=df["labels"],
    random_state=42
)

# Convert back to HF datasets. preserve_index=False keeps the shuffled pandas
# index from being stored as an extra "__index_level_0__" column, so there is
# nothing to remove afterwards.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

tokenized_dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [6]:
# Inspect the train/test splits produced by the stratified split.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 524172
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [7]:
# Convert the current train split to pandas for a second stratified split
train_df = tokenized_dataset["train"].to_pandas()

# Hold out a fixed 30,000 rows for validation, stratified by label.
# Note: test_size is an absolute row count here, not a 90% / 10% ratio.
train_split, val_split = train_test_split(
    train_df,
    test_size=30_000,
    stratify=train_df["labels"],
    random_state=42
)

# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra "__index_level_0__" column in the resulting datasets.
train_dataset = Dataset.from_pandas(train_split, preserve_index=False)
val_dataset = Dataset.from_pandas(val_split, preserve_index=False)

# Replace the train split with its reduced version and add the validation split.
tokenized_dataset["train"] = train_dataset
tokenized_dataset["validation"] = val_dataset

In [8]:
from collections import Counter

# Print the label counts of every split.
# with_format(None) reads the labels as plain Python values, so the counts stay
# correct even if a tensor format (e.g. "torch") is already set on the splits;
# tensor elements hash by identity and would each be counted as a separate key.
for title, split in (("Train", "train"), ("Val", "validation"), ("Test", "test")):
    labels = tokenized_dataset[split].with_format(None)["labels"]
    print(f"{title} Label Distribution:", Counter(labels))

Train Label Distribution: Counter({0: 331917, 1: 162255})
Val Label Distribution: Counter({0: 20150, 1: 9850})
Test Label Distribution: Counter({0: 6717, 1: 3283})


In [9]:
# Return the model inputs and the labels as PyTorch tensors.
tensor_columns = ["input_ids", "attention_mask", "labels"]
tokenized_dataset.set_format(type="torch", columns=tensor_columns)

In [10]:
# Final look at the train/test/validation splits before saving.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 494172
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 30000
    })
})

In [11]:
# Write every split of the DatasetDict to disk.
output_dir = "data/tokenized_merged_wo_emojis_cardiffnlp_latest"
tokenized_dataset.save_to_disk(output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/494172 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/30000 [00:00<?, ? examples/s]

In [12]:
import shutil

# Zip the saved dataset directory. The archive is created next to it with a
# ".zip" suffix, and its path is shown as the cell output.
saved_dir = "data/tokenized_merged_wo_emojis_cardiffnlp_latest"
shutil.make_archive(base_name=saved_dir, format='zip', root_dir=saved_dir)

'f:\\0_System Folders\\Desktop\\SDP\\3_Coding\\1_Textual\\Working\\data\\tokenized_merged_wo_emojis_cardiffnlp_latest.zip'

In [13]:
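# Optional: reload a previously saved tokenized dataset instead of rebuilding it
# (uncomment the lines below to use).
# NOTE(review): this path differs from the one passed to save_to_disk above;
# confirm which dataset is intended.
# from datasets import load_from_disk

# tokenized_dataset = load_from_disk("data/tokenized_superset_IO")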