Load the Dataset: Load the .pkl file containing your dataset into a pandas DataFrame or any appropriate data structure in Python.

In [1]:
from tokenizers import BertWordPieceTokenizer
import pandas as pd

In [2]:
# Load the pickled tweet frame and discard its pre-computed 'tokens' column in
# one chained expression; a fresh `tokenized_text` column is built further down.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle('../preprocessed/tweets_bert.pkl').drop(columns=['tokens'])

df.head()

Unnamed: 0,tweet,label
0,as a woman you shouldnt complain about cleanin...,0
1,boy dats coldtyga dwn bad for cuffin dat hoe i...,0
2,dawg you ever fuck a bitch and she sta to cry ...,0
3,she look like a tranny,0
4,the shit you hear about me might be true or it...,0


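Pre-processing: Perform any necessary pre-processing steps on the tweets, such as cleaning, tokenization, and normalization. You may also need to handle any missing values or outliers in the dataset. In this notebook the tweets are loaded from an already pre-processed file, so the cell below only writes them to a plain-text file, one tweet per line, which is the input used to train the tokenizer.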

In [3]:
# Dump the tweets to a plain-text corpus, one tweet per line; this file is the
# training input for the WordPiece tokenizer.
with open("../preprocessed/bert_formatted_data.txt", "w", encoding="utf-8") as corpus:
    corpus.writelines(tweet + "\n" for tweet in df["tweet"])

Train Custom BERT Tokenizer: Use the pre-processed dataset to train a custom BERT tokenizer specifically tailored for your hate speech classification task. This tokenizer will convert the tweets into BERT-compatible tokens.

In [4]:
import os
from tokenizers import BertWordPieceTokenizer

# Configure an untrained WordPiece tokenizer.
# NOTE(review): lowercase=False / strip_accents=False are set here, but the
# vocabulary is later loaded through transformers.BertTokenizer with default
# arguments (the recorded sample output shows "This" lower-cased to "this").
# Confirm which normalisation is intended and make both sides agree.
tokenizer = BertWordPieceTokenizer(
    clean_text=True,  # Clean text before tokenization
    handle_chinese_chars=True,
    strip_accents=False,
    lowercase=False,  # Keep case-sensitive
    wordpieces_prefix="##"
)

# Learn the vocabulary from the one-tweet-per-line corpus file.
tokenizer.train(
    files=["../preprocessed/bert_formatted_data.txt"],  # Path to formatted data
    vocab_size=30000,  # Upper bound on the vocabulary size
    min_frequency=2,   # Minimum frequency to include a token in vocabulary
    limit_alphabet=1000,  # Limit alphabet characters during training
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]  # Special tokens to include
)

# Forward slashes are accepted on Windows and POSIX alike and match the other
# relative paths in this notebook; a back-slash literal only resolves to the
# parent directory on Windows.
save_dir = "../custom_bert_tokenizer"
os.makedirs(save_dir, exist_ok=True)

# Save the trained vocabulary into save_dir; the returned list of written
# paths is displayed as the cell output.
tokenizer.save_model(save_dir)

['..\\custom_bert_tokenizer\\vocab.txt']

Load the custom tokenizer to ensure it correctly tokenizes the tweets into BERT-compatible tokens.

In [5]:
from transformers import BertTokenizer

# Reload the saved vocabulary through the transformers BERT tokenizer.
# The forward-slash path resolves on both Windows and POSIX.
# NOTE(review): no casing/accent options are passed, so BertTokenizer's
# defaults apply; the recorded output shows the input being lower-cased, which
# differs from the lowercase=False setting used when the vocabulary was trained.
tokenizer = BertTokenizer.from_pretrained("../custom_bert_tokenizer")

# Smoke-test the tokenizer on a single sentence.
# In the recorded output the trailing "." maps to [UNK], presumably because
# punctuation was removed from the training corpus — verify.
sample_tweet = "This is a sample tweet for testing."
tokenized_input = tokenizer.tokenize(sample_tweet)
print("Tokenized input:", tokenized_input)

Tokenized input: ['this', 'is', 'a', 'sample', 'tweet', 'for', 'testing', '[UNK]']


Convert the pre-processed text data (tweets) into tokens using the custom tokenizer.

In [6]:
import os

from transformers import BertTokenizer
import pandas as pd

# Forward-slash path: valid on Windows and POSIX.
tokenizer = BertTokenizer.from_pretrained("../custom_bert_tokenizer")


def tokenize_text(text):
    """Split one tweet into WordPiece token strings using the custom tokenizer.

    Args:
        text: The tweet text to tokenize.

    Returns:
        The list of token strings produced by ``tokenizer.tokenize``.
    """
    return tokenizer.tokenize(text)


df["tokenized_text"] = df["tweet"].apply(tokenize_text)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
tokenized_dir = "../tokenized_dataset"
os.makedirs(tokenized_dir, exist_ok=True)

# Save tokenized data.
# NOTE: CSV is a text format, so each token list is written in its string
# form and must be parsed again to recover a list after reading the file back.
df.to_csv(f"{tokenized_dir}/tokenized_data.csv", index=False)

Define the parameters for the custom BERT model, including the vocabulary size, hidden size, number of layers, attention heads, and maximum position embeddings.

In [7]:
vocab_file_path = "../custom_bert_tokenizer/vocab.txt"

# Each line of the vocabulary file is counted as one token, so the number of
# lines is taken as the vocabulary size.
with open(vocab_file_path, "r", encoding="utf-8") as vocab_file:
    vocab_size = len(vocab_file.readlines())

print("Vocabulary size:", vocab_size)


Vocabulary size: 15537


In [8]:
from transformers import BertConfig

# Architecture hyper-parameters for the custom model.
hidden_size = 768              # Width of the hidden representations
num_hidden_layers = 12         # Number of transformer layers
num_attention_heads = 12       # Attention heads per layer
max_position_embeddings = 512  # Maximum number of positions the model supports

# Collect the settings first, then build the configuration from them.
# `vocab_size` is the line count of vocab.txt computed in the previous cell.
bert_kwargs = dict(
    vocab_size=vocab_size,
    hidden_size=hidden_size,
    num_hidden_layers=num_hidden_layers,
    num_attention_heads=num_attention_heads,
    max_position_embeddings=max_position_embeddings,
)
config = BertConfig(**bert_kwargs)

# Show the resulting configuration so the values can be checked by eye.
print(config)


BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 15537
}



Define Training Arguments: Specify the training arguments for training the custom BERT model, such as the output directory, number of epochs, batch size, and evaluation settings.

In [9]:
from transformers import Trainer, TrainingArguments

# Directory where checkpoints and the trained model are written.
# Forward slashes resolve on both Windows and POSIX.
output_dir = "../custom_bert_model"

# Define the training arguments.
# `eval_strategy` replaces the deprecated `evaluation_strategy` keyword in the
# transformers version recorded by this notebook (4.41.2).
# NOTE: metric_for_best_model="accuracy" only works if the Trainer is given a
# compute_metrics function that reports an "accuracy" value; none is defined in
# the cells shown here.
training_args = TrainingArguments(
    output_dir=output_dir,              # Output directory
    num_train_epochs=3,                 # Number of training epochs
    per_device_train_batch_size=8,      # Batch size per device during training
    logging_dir='./logs',               # Directory for storing logs
    logging_steps=100,                  # Log every N steps
    eval_strategy="steps",              # Evaluate on a step schedule
    eval_steps=500,                     # Run evaluation every N steps
    save_steps=500,                     # Save model checkpoint every N steps
    save_total_limit=2,                 # Limit the total number of saved checkpoints
    gradient_accumulation_steps=1,      # Number of gradient accumulation steps
    disable_tqdm=False,                 # False keeps the tqdm progress bar enabled
    load_best_model_at_end=True,        # Load the best model at the end of training
    metric_for_best_model="accuracy",   # Metric to use for selecting the best model
    greater_is_better=True,             # Higher metric values are better
    report_to="wandb"                   # Integration with Weights & Biases (optional)
)

# Print the defined training arguments
print(training_args)


TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=500,
eval_strategy=IntervalStrategy.STEPS,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=



Train the BERT Model: Train the custom BERT model using the specified training arguments and input data (tweet tokens).

In [18]:
# BertTokenizer is imported here so the cell does not rely on an import made
# in an earlier cell.
from transformers import BertTokenizer, PreTrainedTokenizerFast

# Initialize the custom tokenizer (forward-slash path: valid on Windows and POSIX).
tokenizer = BertTokenizer.from_pretrained("../custom_bert_tokenizer")


def tokenize_function(examples):
    """Encode raw tweet text with the custom tokenizer.

    Args:
        examples: Mapping with a 'tweet' entry holding the text(s) to encode.

    Returns:
        The tokenizer's encoding of ``examples['tweet']``, padded to and
        truncated at 128 tokens.
    """
    return tokenizer(examples['tweet'], padding='max_length', truncation=True, max_length=128)



In [21]:
# Load the saved tokenized CSV as a Hugging Face dataset
from datasets import Dataset, load_dataset

# Forward-slash path: valid on Windows and POSIX. With a single CSV file the
# result is a DatasetDict whose only split is 'train' (see the printed output).
dataset = load_dataset('csv', data_files='../tokenized_dataset/tokenized_data.csv')
print(dataset)

# Map the dataset (not enabled yet)
# tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["tweet", "tokenized_text"])
# print(tokenized_dataset)


DatasetDict({
    train: Dataset({
        features: ['tweet', 'label', 'tokenized_text'],
        num_rows: 24783
    })
})


In [10]:
## Model configuration and instantiation
from transformers import BertConfig, BertForSequenceClassification

# Architecture settings; `vocab_size` is the line count of vocab.txt computed
# in an earlier cell.
config = BertConfig(
    max_position_embeddings=512,
    intermediate_size=3072,
    num_attention_heads=12,
    num_hidden_layers=12,
    hidden_size=768,
    vocab_size=vocab_size,
)

# Build the classifier from the configuration alone (no from_pretrained call,
# so no pretrained weights are loaded).
# NOTE(review): num_labels is not set, so the config default is used — confirm
# it matches the number of classes in the `label` column.
model = BertForSequenceClassification(config)


In [22]:
## prepare tokenized data
from transformers import DataCollatorWithPadding
from datasets import Dataset, DatasetDict

# Read the saved CSV back and wrap it in a Hugging Face Dataset.
# Forward-slash path: valid on Windows and POSIX.
data = pd.read_csv("../tokenized_dataset/tokenized_data.csv")
dataset = Dataset.from_pandas(data)

# Example: Assuming tokenized_tweets is a list of tokenized tweets and labels is a list of corresponding labels
# tokenized_tweets = [{"input_ids": [..], "attention_mask": [..]}] 
# labels = [0, 1, 0, ...]
# NOTE(review): the disabled mapping below expects input_ids / attention_mask /
# labels columns, but the CSV written earlier holds tweet, label and
# tokenized_text; it also uses eval() on file contents, which should be
# avoided for data that is not fully trusted.
# dataset = dataset.map(lambda x: {"input_ids": eval(x["input_ids"]), "attention_mask": eval(x["attention_mask"]), "labels": int(x["labels"])})
# dataset = dataset.train_test_split(test_size=0.2)

# data_collator = DataCollatorWithPadding(tokenizer)


In [None]:
## define training arguments

from transformers import TrainingArguments

# Epoch-based schedule: evaluate and checkpoint once per epoch.
# `eval_strategy` replaces the deprecated `evaluation_strategy` keyword in the
# transformers version recorded by this notebook (4.41.2).
# NOTE: this rebinds `training_args`, replacing the step-based arguments
# defined in an earlier cell.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    disable_tqdm=False  # Show progress bar
)


Evaluate the Model: Evaluate the performance of the trained model on a validation or test dataset to assess its accuracy and effectiveness in classifying hate speech.

Model Inference: Once the model is trained and evaluated, you can use it to make predictions on new tweets to determine whether they contain hate speech or not.