In [2]:
pip install openprompt

Note: you may need to restart the kernel to use updated packages.


In [1]:
from openprompt.plms import load_plm
from openprompt.prompts.prompt_generator import LMBFFTemplateGenerationTemplate
from openprompt.pipeline_base import PromptDataLoader, PromptForClassification
from openprompt.prompts import ManualVerbalizer, ManualTemplate
from openprompt.trainer import ClassificationRunner
from openprompt.data_utils import InputExample
import copy
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
import pandas as pd
import random
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Load the dataset
df = pd.read_csv("HateSpeechDatasetBalanced.csv")

# Extract the needed columns
df = df[['Content', 'Label']]

df.head()

Unnamed: 0,Content,Label
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1
3,thought of a real badass mongol style declarat...,1
4,afro american basho,1


In [3]:
df['Label'].value_counts()

Label
1    364526
0    361597
Name: count, dtype: int64

In [4]:
df['Content'].str.len().describe()

count    726123.000000
mean        196.846960
std         247.350801
min           1.000000
25%          59.000000
50%         109.000000
75%         228.000000
max        9998.000000
Name: Content, dtype: float64

In [5]:
df = df[df['Content'].str.len() < 128]
df

Unnamed: 0,Content,Label
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1
3,thought of a real badass mongol style declarat...,1
4,afro american basho,1
...,...,...
726110,banning bare breasts ha gayyyyyy,1
726119,I'm gonna kill you,1
726120,Someone is better than us,0
726121,someone is really,0


In [6]:
from sklearn.model_selection import train_test_split

# Split DataFrame
train_df, valid_df = train_test_split(df, test_size=0.2)  # 80% train, 20% validation

# Subset, this can be adjust
train_df = train_df.iloc[:3600, :]
valid_df = valid_df.iloc[:3600, :]

### Step 3: Initialize RoBERTa Tokenizer

the roberta is better than bert!

In [7]:
# Load the PLM, tokenizer, and wrapper class
plm, tokenizer, model_config, WrapperClass = load_plm("roberta", "roberta-large")

In [8]:
hateful_df = df[df['Label'] == 1]
non_hateful_df = df[df['Label'] == 0]

print("Non-Hateful Data size:", non_hateful_df.shape[0])
print("Hateful Data size:", hateful_df.shape[0])

Non-Hateful Data size: 178229
Hateful Data size: 233893


In [9]:
# number of demonstrations
num_demonstrations = 1  # try different number

demonstrations = []

for _ in range(num_demonstrations):
    # Random choice training set example with label 0 (non-hateful)
    random_example_1 = non_hateful_df.sample(1).iloc[0]['Content']
    # Random choice training set example with label 1 (hateful)
    random_example_2 = hateful_df.sample(1).iloc[0]['Content']

    demonstration = f'{random_example_1} Sentence for analyze: This sentence is positive.'\
                    f' '\
                    f'{random_example_2} Sentence for analyze: This sentence is negative.'
    demonstrations.append(demonstration)

In [10]:
template = ManualTemplate(
    tokenizer=tokenizer, 
    text='{"placeholder":"text_a"} Sentence for analyze: This sentence is {"mask"}.' 
        + ' '.join(demonstrations)
)

In [11]:
verbalizer = ManualVerbalizer(
    tokenizer=tokenizer, 
    num_classes=2, 
    label_words=['positive','negative']
)

In [12]:
# Access the first data point
first_data_point = train_df.iloc[0]
text = first_data_point['Content']
label = int(first_data_point['Label'])  # Convert label to int if it's in string format

# Create an InputExample object
sample_example = InputExample(text_a=text, label=label)

# Use wrap_one_example
wrapped_example = template.wrap_one_example(sample_example)

# Print the wrapped example
wrapped_example

[[{'text': 'the national organization for women should be banned it is not exactly like women deserve rights anyway',
   'loss_ids': 0,
   'shortenable_ids': 1},
  {'text': ' Sentence for analyze: This sentence is',
   'loss_ids': 0,
   'shortenable_ids': 0},
  {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0},
  {'text': '.afro american you wear a size in a i m at a now still growing Sentence for analyze: This sentence is positive. in some extent we are not exactly overwhelmed against those options choices Sentence for analyze: This sentence is negative.',
   'loss_ids': 0,
   'shortenable_ids': 0}],
 {'label': 1}]

In [13]:
def df_to_inputexamples(df):
    input_examples = []
    for _, row in df.iterrows():
        text = row['Content']
        label = row['Label']

        input_example = InputExample(text_a=text, label=label)
        input_examples.append(input_example)
    return input_examples


# Convert DataFrame to InputExamples
train_input_examples = df_to_inputexamples(train_df)
valid_input_examples = df_to_inputexamples(valid_df)

# Create PromptDataLoaders for training and validation
train_dataloader = PromptDataLoader(
    dataset=train_input_examples, 
    template=template, 
    tokenizer=tokenizer, 
    tokenizer_wrapper_class=WrapperClass, 
    decoder_max_length=128, 
    max_seq_length=128,
    batch_size=16, 
    shuffle=True,
)

valid_dataloader = PromptDataLoader(
    dataset=valid_input_examples, 
    template=template, 
    tokenizer=tokenizer, 
    tokenizer_wrapper_class=WrapperClass, 
    decoder_max_length=128, 
    max_seq_length=128,
    batch_size=16
)

tokenizing: 3600it [00:02, 1601.91it/s]
tokenizing: 3600it [00:01, 1809.51it/s]


In [14]:
print('Train Sample: ', len(train_dataloader) * train_dataloader.batch_size)
print('Valid Sample: ', len(valid_dataloader) * valid_dataloader.batch_size)

Train Sample:  3600
Valid Sample:  3600


In [15]:
def fit(model, train_dataloader, val_dataloader, loss_func, optimizer, epochs=5):
    """
    Train and evaluate the model.

    Args:
        model (torch.nn.Module): The model to be trained and evaluated.
        train_dataloader (DataLoader): Dataloader for the training data.
        val_dataloader (DataLoader): Dataloader for the validation data.
        loss_func (torch.nn.Module): Loss function used for training.
        optimizer (torch.optim.Optimizer): Optimizer used for training.
        epochs (int): Number of training epochs.

    Returns:
        float: Best evaluation score achieved during training.
    """
    best_score = 0.0
    for epoch in range(epochs):
        # Train the model for one epoch and calculate training loss
        train_loss = train_epoch(model, train_dataloader, loss_func, optimizer)

        # Evaluate the model on the validation set and get the score (accuracy)
        score = evaluate(model, val_dataloader)

        # Save the model if the current score is better than the best score so far
        if score > best_score:
            best_score = score
            torch.save(model.state_dict(), 'best_model_by_template.pt')

        # Print the results for this epoch
        print(f"Epoch {epoch+1}: Train loss={train_loss:.4f}, Eval score={score:.4f}")

    # Return the best score achieved during training
    return best_score

def train_epoch(model, train_dataloader, loss_func, optimizer):
    """
    Train the model for one epoch.

    Args:
        model (torch.nn.Module): The model to be trained.
        train_dataloader (DataLoader): Dataloader for the training data.
        loss_func (torch.nn.Module): Loss function used for training.
        optimizer (torch.optim.Optimizer): Optimizer used for training.

    Returns:
        float: Average training loss for the epoch.
    """
    model.train()
    loss_all = []

    for batch in tqdm(train_dataloader, desc="Training"):
        # Move batch to the appropriate device
        batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}

        optimizer.zero_grad()
        
        # Forward pass: Compute the logits from the model
        outputs = model(batch=batch)

        # Extract labels and compute loss
        labels = batch['label']
        loss = loss_func(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        loss_all.append(loss.item())

    return np.mean(loss_all)

def evaluate(model, val_dataloader):
    """
    Evaluate the model on the validation set.

    Args:
        model (torch.nn.Module): The model to be evaluated.
        val_dataloader (DataLoader): Dataloader for the validation data.

    Returns:
        float: Accuracy of the model on the validation set.
    """
    model.eval()
    allpreds = []
    alllabels = []

    with torch.no_grad():
        for inputs in tqdm(val_dataloader, desc="Evaluating"):
            inputs = inputs.to(device)  # Move inputs to GPU if CUDA is available

            # Forward pass: Compute the logits from the model
            logits = model(batch=inputs)

            # Get the ground truth labels from the inputs
            labels = inputs['label']

            alllabels.extend(labels.cpu().numpy())

            preds = torch.argmax(logits, dim=1)
            allpreds.extend(preds.cpu().numpy())

    acc = sum([int(i == j) for i, j in zip(allpreds, alllabels)]) / len(allpreds)
    return acc

In [16]:
# Create an instance of PromptForClassification model.
# This model combines the pre-trained language model (PLM) with the defined template and verbalizer.
model = PromptForClassification(
    copy.deepcopy(plm),  # Deep copy of the pre-trained language model to ensure original is not modified.
    template,            # The template that formats the input data for the PLM.
    verbalizer           # The verbalizer that maps the PLM's output to specific task labels.
)

# Define the loss function for the classification task.
# CrossEntropyLoss is commonly used for classification problems.
loss_func = torch.nn.CrossEntropyLoss()

# Parameters that should not undergo weight decay during optimization.
# Typically, biases and LayerNorm weights are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']

# Grouping model parameters into those that should and shouldn't have weight decay applied.
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Define the optimizer for training, using the AdamW algorithm with grouped parameters.
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

# Determine the device to run the model on (GPU if available, otherwise CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the specified device.
model = model.to(device)

# Train and evaluate the model using the fit function, and store the best score.
score = fit(model, train_dataloader, valid_dataloader, loss_func, optimizer)

# Load the best model state from training for further use or evaluation.
model.load_state_dict(torch.load('best_model_by_template1.pt'))

Training:  22%|██▏       | 49/225 [07:32<27:06,  9.24s/it]


KeyboardInterrupt: 

In [41]:
import pickle
# Save the tokenizer
tokenizer.save_pretrained('tokenizer_hateful_speech')

with open('model_config.pkl', 'wb') as f:
    pickle.dump(model_config, f)

In [42]:
from transformers import RobertaTokenizer
import pickle

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('tokenizer_hateful_speech')

# Load the saved model_config if needed
with open('model_config.pkl', 'rb') as f:
    model_config = pickle.load(f)


In [43]:
model = PromptForClassification(plm, template, verbalizer)
model.load_state_dict(torch.load('best_model_by_template.pt'))
model = model.to(device)

In [45]:
# Save the tokenizer
tokenizer.save_pretrained('my_tokenizer')

('my_tokenizer\\tokenizer_config.json',
 'my_tokenizer\\special_tokens_map.json',
 'my_tokenizer\\vocab.json',
 'my_tokenizer\\merges.txt',
 'my_tokenizer\\added_tokens.json')