# For colab environment

In [1]:
# !pip install nltk transformers==4.35.0 torch==2.6.0 torchvision==0.21.0 datasets accelerate==0.24.0 huggingface==0.0.1 datasets==2.14.7

In [1]:
import torch 
print(torch.cuda.is_available())
print(torch.__version__)

True
2.6.0+cu124


In [3]:
# !git clone https://github.com/BernardMoy/NLP-PCL-Classification.git

In [4]:
# %cd NLP-PCL-Classification/

In [2]:
!nvidia-smi

Sun Mar  1 13:05:37 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.163.01             Driver Version: 550.163.01     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA TITAN Xp                Off |   00000000:02:00.0  On |                  N/A |
| 24%   38C    P8             18W /  250W |     506MiB /  12288MiB |     30%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Load train and validation data set

In [3]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt
from collections import Counter

df = pd.read_csv('data/dontpatronizeme_pcl.tsv', sep='\t')

# Remove rows with NA labels 
df = df.dropna() 

# Add a bool_labels column for binary classification
df["bool_labels"] = df["label"] > 1   # is PCL if >1

# train val split 
train_labels = pd.read_csv('data/train_semeval_parids-labels.csv')["par_id"]
val_labels = pd.read_csv('data/dev_semeval_parids-labels.csv')["par_id"]
df_train = df[df["par_id"].isin(train_labels)].reset_index() 
df_val = df[df["par_id"].isin(val_labels)].reset_index() 


# Text Cleaning

In [4]:
# Remove special characters
SPECIAL_CHARACTERS = ['&amp;', '&lt;', '&gt;', '<h>', '\n', '\t']
for char in SPECIAL_CHARACTERS:
    df_train["text"] = df_train["text"].str.replace(char, "")
    df_val["text"] = df_val["text"].str.replace(char, "")


print(df_train["text"].iloc[55])


# Replace numbers with 0
df_train["text"] = df_train["text"].str.replace(r"\d+", "0", regex=True)
df_val["text"] = df_val["text"].str.replace(r"\d+", "0", regex=True)

print(df_train["text"].iloc[3])

People who are homeless , those who were once homeless , those working with the homeless and concerned New Zealanders are being asked to share their experiences and solutions to this growing issue with the Cross-Party Homelessness Inquiry . More
Council customers only signs would be displayed . Two of the spaces would be reserved for disabled persons and there would be five P0 spaces and eight P0 ones .


# Oversample the minority class
For each keyword category, inflate the number of positive examples to a certain percentage

In [5]:
POSITIVE_PERCENTAGE = 25


# Find all the unique keywords in the training dataset
keywords = pd.unique(df_train["keyword"])


# Extract the sub-dataset for each keyword
for keyword in keywords:
    subdata = df_train[df_train["keyword"] == keyword]
    rows = subdata.shape[0]


    # Find the number of positive entires x
    subdata_positive = subdata[subdata["bool_labels"] == True]
    positive_rows = subdata_positive.shape[0]


    # Calculate the number of additional samples needed to make the positive class reach the desired percentage
    # (p+x)/(r+x) = POS PERCENTAGE
    n_samples = round((100*positive_rows-POSITIVE_PERCENTAGE*rows)/(POSITIVE_PERCENTAGE-100)*1.0)


    # Sample with replacement from the sub dataset and add new rows
    sampled = subdata_positive.sample(n_samples, replace=True).reset_index(drop=True)
   
    # concat with the main df
    df_train = pd.concat([df_train, sampled], ignore_index=True)


df_train["bool_labels"].value_counts()


bool_labels
False    7581
True     2527
Name: count, dtype: int64

# Coreference resolution

In [6]:
from fastcoref import FCoref

# define the model once
model = FCoref(device='cuda:0' if torch.cuda.is_available() else 'cpu')


def coreference_resolution(model, text):
    # Batch coreference resolution for all texts 
    preds = model.predict(
        texts = text
    )

    # Iterate each row of the list to substitute the pronouns / references with entity names 
    result = [] 
    for i in range(len(text)): 
        sent = text[i] 
        clusters = preds[i].get_clusters(as_strings = False) 

        # create mappings from each pronoun indices -> entities TEXT
        d = {}
        for cluster in clusters:
            entity = cluster[0]    # IMPORTANT - The first entity is assumed to be the main entity here. Use POS tagging to further improve this. 
            refs = cluster[1::]

            for ref in refs: 
                d[ref] = sent[entity[0]:entity[1]]

        # for each pronoun index (key), replace by their entity text (value) 
        sorted_keys = sorted(d.keys(), reverse = True) 
        for key in sorted_keys: 
            start, end = key 
            sent = sent[:start] + d[key] + sent[end:]

        result.append(sent) 

    return result 

   

test = coreference_resolution(model, ['We are so happy to see you using our coref package . This package is very fast !', 
                                     'He said the CEO of Apple was happy. Tim Cook later confirmed it .', 
                                     "Dr. Lester Keith , doctor and professor of business administration , and others are checking with local transportation groups to see if they can bring those in need of a meal to the college for the 4 p.m. dinner . We will also be contacting local soup kitchens as a pickup location and will work with them to transport any leftovers to them so there is no wasted food , Dr. Keith said ."])
print(test[0]) 
print(test[1]) 
print(test[2])


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
03/01/2026 13:06:09 - INFO - 	 missing_keys: []
03/01/2026 13:06:09 - INFO - 	 unexpected_keys: []
03/01/2026 13:06:09 - INFO - 	 mismatched_keys: []
03/01/2026 13:06:09 - INFO - 	 error_msgs: []
03/01/2026 13:06:09 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
03/01/2026 13:06:09 - INFO - 	 Tokenize 3 inputs...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

03/01/2026 13:06:09 - INFO - 	 ***** Running Inference on 3 texts *****


Inference:   0%|          | 0/3 [00:00<?, ?it/s]

We are so happy to see you using We coref package . our coref package is very fast !
He said the CEO of Apple was happy. the CEO of Apple later confirmed it .
Dr. Lester Keith , doctor and professor of business administration , and others are checking with local transportation groups to see if local transportation groups can bring those in need of a meal to the college for the 4 p.m. dinner . Dr. Lester Keith , doctor and professor of business administration , and others will also be contacting local soup kitchens as a pickup location and will work with local soup kitchens to transport any leftovers to local soup kitchens so there is no wasted food , Dr. Lester Keith , doctor and professor of business administration said .


In [7]:
train_coref = coreference_resolution(model, df_train["text"].tolist())
df_train["text_cr"] = pd.Series(train_coref) 

val_coref = coreference_resolution(model, df_val["text"].tolist())
df_val["text_cr"] = pd.Series(val_coref) 

03/01/2026 13:06:09 - INFO - 	 Tokenize 10108 inputs...


Map:   0%|          | 0/10108 [00:00<?, ? examples/s]

03/01/2026 13:06:25 - INFO - 	 ***** Running Inference on 10108 texts *****


Inference:   0%|          | 0/10108 [00:00<?, ?it/s]

03/01/2026 13:06:43 - INFO - 	 Tokenize 2093 inputs...


Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

03/01/2026 13:06:47 - INFO - 	 ***** Running Inference on 2093 texts *****


Inference:   0%|          | 0/2093 [00:00<?, ?it/s]

In [8]:
print(df_train["text"].iloc[23])
print(df_train["text_cr"].iloc[23])

print(df_val["text"].iloc[23])
print(df_val["text_cr"].iloc[23])

" The regional brands so far lag behind the global and big international Chinese handset players in 0G and they have looked vulnerable to failing to jump the generation successfully and lose their place . "
" The regional brands so far lag behind the global and big international Chinese handset players in 0G and The regional brands have looked vulnerable to failing to jump the generation successfully and lose The regional brands place . "
BUSINESSMAN Norberto Quisumbing Jr . of the Norkis Group of Companies has a challenge for families who can spare some of what they have : why not adopt poor families and help them break the cycle of poverty ?
BUSINESSMAN Norberto Quisumbing Jr . of the Norkis Group of Companies has a challenge for families who can spare some of what families who can spare some of what they have have : why not adopt poor families and help poor families break the cycle of poverty ?


# Add contextual information to the text tokens

In [9]:
def add_info(df): 
    # Append the keyword and country code to the text, and separate them with additional separator tokens
    # Remove dashes in the keyword to match the format in the texts 
    return df["keyword"].str.replace('-', " ") + "[SEP]" + df["country_code"] + "[SEP]" + df["text_cr"]

# Tokenization

In [10]:
from transformers import BertTokenizer, BertForSequenceClassification, AutoConfig, Trainer, TrainingArguments

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 

# Create text with contextual information 
def tokenize(df): 
    text_with_context = add_info(df) 

    encoding = tokenizer(
        text_with_context.tolist(), 
        padding="max_length",   # Add padding to shorter sentences 
        max_length=256,
        truncation = True, 
        return_attention_mask = True 
    )

    return encoding

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Convert to pyTorch dataset

In [12]:
import torch 
from torch.utils.data import DataLoader, TensorDataset
from datasets import Dataset

def to_dataset(df): 
    # Obtain tokens (input_ids, attention_mask) from the dataset 
    encoding = tokenize(df) 

    # Return huggingface dataset 
    return Dataset.from_dict({
        "input_ids": encoding["input_ids"], 
        "attention_mask": encoding["attention_mask"], 
        "label": df["label"].values 
    })

In [13]:
train_dataset = to_dataset(df_train)
val_dataset = to_dataset(df_val) 

# set to torch format 
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Training 

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)

    # Calculate metrics 
    accuracy = accuracy_score(labels, predictions) 
    precision = precision_score(labels, predictions, average='macro') 
    recall = recall_score(labels, predictions, average='macro') 
    f1 = f1_score(labels, predictions, average='macro') 

    return {
        "accuracy": accuracy, 
        "precision": precision, 
        "recall": recall, 
        "f1": f1 
    }


In [24]:
# Load roberta sequence classification model 
config = AutoConfig.from_pretrained("bert-base-uncased", num_labels=5)  # Predict labels instead of binary classification which is done later 
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", config = config)
model.resize_token_embeddings(len(tokenizer)) 

# Core hyperparameters 
BATCH_SIZE = 32
N_EPOCHS = 5 

# Set up training arguments 
training_args = TrainingArguments(
    fp16=True, 
    num_train_epochs=N_EPOCHS, 
    learning_rate=2e-5, 
    weight_decay=0.01,
    warmup_steps=500, 
    save_strategy="epoch",  # low disk space 
    load_best_model_at_end=True, 
    metric_for_best_model='f1',
    logging_steps=50,
    output_dir="./checkpoints/bert_improved", 
    evaluation_strategy="epoch", 
    per_device_eval_batch_size=BATCH_SIZE, 
    per_device_train_batch_size=BATCH_SIZE, 
)

# Set up trainer 
trainer = Trainer(
    model = model, 
    args = training_args, 
    train_dataset=train_dataset, 
    eval_dataset=val_dataset, 
    compute_metrics=compute_metrics
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [25]:
trainer.train() 

{'loss': 1.7987, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.16}
{'loss': 1.3142, 'learning_rate': 3.96e-06, 'epoch': 0.32}
{'loss': 1.0449, 'learning_rate': 5.9600000000000005e-06, 'epoch': 0.47}
{'loss': 0.9498, 'learning_rate': 7.960000000000002e-06, 'epoch': 0.63}
{'loss': 0.9078, 'learning_rate': 9.920000000000002e-06, 'epoch': 0.79}
{'loss': 0.8397, 'learning_rate': 1.1920000000000001e-05, 'epoch': 0.95}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


{'eval_loss': 0.5948612093925476, 'eval_accuracy': 0.8103201146679407, 'eval_precision': 0.215835533760845, 'eval_recall': 0.2758489644843535, 'eval_f1': 0.23956264624178933, 'eval_runtime': 17.5729, 'eval_samples_per_second': 119.104, 'eval_steps_per_second': 3.756, 'epoch': 1.0}
{'loss': 0.7876, 'learning_rate': 1.392e-05, 'epoch': 1.11}
{'loss': 0.7259, 'learning_rate': 1.5920000000000003e-05, 'epoch': 1.27}
{'loss': 0.7082, 'learning_rate': 1.792e-05, 'epoch': 1.42}
{'loss': 0.6669, 'learning_rate': 1.9920000000000002e-05, 'epoch': 1.58}
{'loss': 0.6064, 'learning_rate': 1.9111111111111113e-05, 'epoch': 1.74}
{'loss': 0.6361, 'learning_rate': 1.8185185185185186e-05, 'epoch': 1.9}
{'eval_loss': 0.5668193101882935, 'eval_accuracy': 0.8155757286192069, 'eval_precision': 0.2984151341843481, 'eval_recall': 0.3356793566111242, 'eval_f1': 0.3159116659424944, 'eval_runtime': 17.4032, 'eval_samples_per_second': 120.265, 'eval_steps_per_second': 3.792, 'epoch': 2.0}
{'loss': 0.5387, 'learnin

TrainOutput(global_step=1580, training_loss=0.5241040136240706, metrics={'train_runtime': 1344.6331, 'train_samples_per_second': 37.586, 'train_steps_per_second': 1.175, 'train_loss': 0.5241040136240706, 'epoch': 5.0})

In [26]:
trainer.evaluate()

{'eval_loss': 0.7682408094406128, 'eval_accuracy': 0.7978977544194935, 'eval_precision': 0.40264876908068026, 'eval_recall': 0.3477541401016615, 'eval_f1': 0.36613421906193344, 'eval_runtime': 16.6769, 'eval_samples_per_second': 125.503, 'eval_steps_per_second': 3.958, 'epoch': 5.0}


{'eval_loss': 0.7682408094406128,
 'eval_accuracy': 0.7978977544194935,
 'eval_precision': 0.40264876908068026,
 'eval_recall': 0.3477541401016615,
 'eval_f1': 0.36613421906193344,
 'eval_runtime': 16.6769,
 'eval_samples_per_second': 125.503,
 'eval_steps_per_second': 3.958,
 'epoch': 5.0}

# Save model

In [27]:
trainer.save_model('models/model_bert_improved')