# For colab environment

In [2]:
# !pip install nltk transformers==4.35.0 torch==2.6.0 torchvision==0.21.0 datasets accelerate==0.24.0 huggingface==0.0.1 datasets==2.14.7

In [3]:
import torch 
print(torch.cuda.is_available())
print(torch.__version__)

True
2.6.0+cu124


In [4]:
# !git clone https://github.com/BernardMoy/NLP-PCL-Classification.git

In [5]:
# %cd NLP-PCL-Classification/

In [6]:
!nvidia-smi

Sat Feb 28 21:17:03 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.163.01             Driver Version: 550.163.01     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX TITAN X     Off |   00000000:02:00.0  On |                  N/A |
| 22%   44C    P8             31W /  250W |     172MiB /  12288MiB |     26%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Load train and validation data set

In [7]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt
from collections import Counter

df = pd.read_csv('data/dontpatronizeme_pcl.tsv', sep='\t')

# Remove rows with NA labels 
df = df.dropna() 

# Add a bool_labels column for binary classification
df["bool_labels"] = df["label"] > 1   # is PCL if >1

# train val split 
train_labels = pd.read_csv('data/train_semeval_parids-labels.csv')["par_id"]
val_labels = pd.read_csv('data/dev_semeval_parids-labels.csv')["par_id"]
df_train = df[df["par_id"].isin(train_labels)].reset_index() 
df_val = df[df["par_id"].isin(val_labels)].reset_index() 


# Oversample the minority class
For each keyword category, inflate the number of positive examples to a certain percentage

In [8]:
POSITIVE_PERCENTAGE = 25


# Coreference resolution

In [9]:
from fastcoref import FCoref

# define the model once
model = FCoref(device='cuda:0' if torch.cuda.is_available() else 'cpu')


def coreference_resolution(model, text):
    # Batch coreference resolution for all texts 
    preds = model.predict(
        texts = text
    )

    # Iterate each row of the list to substitute the pronouns / references with entity names 
    result = [] 
    for i in range(len(text)): 
        sent = text[i] 
        clusters = preds[i].get_clusters(as_strings = False) 

        # create mappings from each pronoun indices -> entities TEXT
        d = {}
        for cluster in clusters:
            entity = cluster[0]    # IMPORTANT - The first entity is assumed to be the main entity here. Use POS tagging to further improve this. 
            refs = cluster[1::]

            for ref in refs: 
                d[ref] = sent[entity[0]:entity[1]]

        # for each pronoun index (key), replace by their entity text (value) 
        sorted_keys = sorted(d.keys(), reverse = True) 
        for key in sorted_keys: 
            start, end = key 
            sent = sent[:start] + d[key] + sent[end:]

        result.append(sent) 

    return result 

   

test = coreference_resolution(model, ['We are so happy to see you using our coref package . This package is very fast !', 
                                     'He said the CEO of Apple was happy. Tim Cook later confirmed it .', 
                                     "Dr. Lester Keith , doctor and professor of business administration , and others are checking with local transportation groups to see if they can bring those in need of a meal to the college for the 4 p.m. dinner . We will also be contacting local soup kitchens as a pickup location and will work with them to transport any leftovers to them so there is no wasted food , Dr. Keith said ."])
print(test[0]) 
print(test[1]) 
print(test[2])


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
02/28/2026 21:17:38 - INFO - 	 missing_keys: []
02/28/2026 21:17:38 - INFO - 	 unexpected_keys: []
02/28/2026 21:17:38 - INFO - 	 mismatched_keys: []
02/28/2026 21:17:38 - INFO - 	 error_msgs: []
02/28/2026 21:17:38 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
02/28/2026 21:17:38 - INFO - 	 Tokenize 3 inputs...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

02/28/2026 21:17:39 - INFO - 	 ***** Running Inference on 3 texts *****


Inference:   0%|          | 0/3 [00:00<?, ?it/s]

We are so happy to see you using We coref package . our coref package is very fast !
He said the CEO of Apple was happy. the CEO of Apple later confirmed it .
Dr. Lester Keith , doctor and professor of business administration , and others are checking with local transportation groups to see if local transportation groups can bring those in need of a meal to the college for the 4 p.m. dinner . Dr. Lester Keith , doctor and professor of business administration , and others will also be contacting local soup kitchens as a pickup location and will work with local soup kitchens to transport any leftovers to local soup kitchens so there is no wasted food , Dr. Lester Keith , doctor and professor of business administration said .


In [10]:
train_coref = coreference_resolution(model, df_train["text"].tolist())
df_train["text_cr"] = pd.Series(train_coref) 

val_coref = coreference_resolution(model, df_val["text"].tolist())
df_val["text_cr"] = pd.Series(val_coref) 

02/28/2026 21:17:39 - INFO - 	 Tokenize 8375 inputs...


Map:   0%|          | 0/8375 [00:00<?, ? examples/s]

02/28/2026 21:17:52 - INFO - 	 ***** Running Inference on 8375 texts *****


Inference:   0%|          | 0/8375 [00:00<?, ?it/s]

02/28/2026 21:18:12 - INFO - 	 Tokenize 2093 inputs...


Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

02/28/2026 21:18:16 - INFO - 	 ***** Running Inference on 2093 texts *****


Inference:   0%|          | 0/2093 [00:00<?, ?it/s]

In [11]:
print(df_train["text"].iloc[23])
print(df_train["text_cr"].iloc[23])

print(df_val["text"].iloc[23])
print(df_val["text_cr"].iloc[23])

" The regional brands so far lag behind the global and big international Chinese handset players in 4G and they have looked vulnerable to failing to jump the generation successfully and lose their place . "
" The regional brands so far lag behind the global and big international Chinese handset players in 4G and The regional brands have looked vulnerable to failing to jump the generation successfully and lose The regional brands place . "
BUSINESSMAN Norberto Quisumbing Jr . of the Norkis Group of Companies has a challenge for families who can spare some of what they have : why not adopt poor families and help them break the cycle of poverty ?
BUSINESSMAN Norberto Quisumbing Jr . of the Norkis Group of Companies has a challenge for families who can spare some of what families who can spare some of what they have have : why not adopt poor families and help poor families break the cycle of poverty ?


# Add contextual information to the text tokens

In [12]:
def add_info(df): 
    # Append the keyword and country code to the text, and separate them with roberta separators </s><s> 
    return df["keyword"] + "</s><s>" + df["country_code"] + "</s><s>" + df["text_cr"]

# Tokenization

In [13]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoConfig, Trainer, TrainingArguments

tokenizer = RobertaTokenizer.from_pretrained("roberta-base") 

# Create text with contextual information 
def tokenize(df): 
    text_with_context = add_info(df) 

    encoding = tokenizer(
        text_with_context.tolist(), 
        padding="max_length",   # Add padding to shorter sentences 
        max_length=256,
        truncation = True, 
        return_attention_mask = True 
    )

    return encoding

# Convert to pyTorch dataset

In [14]:
import torch 
from torch.utils.data import DataLoader, TensorDataset
from datasets import Dataset

def to_dataset(df): 
    # Obtain tokens (input_ids, attention_mask) from the dataset 
    encoding = tokenize(df) 

    # Return huggingface dataset 
    return Dataset.from_dict({
        "input_ids": encoding["input_ids"], 
        "attention_mask": encoding["attention_mask"], 
        "label": df["bool_labels"].values 
    })

In [15]:
train_dataset = to_dataset(df_train)
val_dataset = to_dataset(df_val) 

# set to torch format 
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Training 

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)

    # Calculate metrics 
    accuracy = accuracy_score(labels, predictions) 
    precision = precision_score(labels, predictions) 
    recall = recall_score(labels, predictions) 
    f1 = f1_score(labels, predictions) 

    return {
        "accuracy": accuracy, 
        "precision": precision, 
        "recall": recall, 
        "f1": f1 
    }


In [17]:
# Load roberta sequence classification model 
config = AutoConfig.from_pretrained("roberta-base", num_labels=2)  # Binary classification
model = RobertaForSequenceClassification.from_pretrained("roberta-base", config = config)

# Core hyperparameters 
BATCH_SIZE = 32
N_EPOCHS = 5 

# Set up training arguments 
training_args = TrainingArguments(
    fp16=True, 
    num_train_epochs=N_EPOCHS, 
    learning_rate=2e-5, 
    weight_decay=0.01,
    warmup_steps=500, 
    save_strategy="no",  # low disk space 
    load_best_model_at_end=True, 
    logging_steps=50,
    output_dir="./predictions", 
    evaluation_strategy="no", 
    per_device_eval_batch_size=BATCH_SIZE, 
    per_device_train_batch_size=BATCH_SIZE, 
)

# Set up trainer 
trainer = Trainer(
    model = model, 
    args = training_args, 
    train_dataset=train_dataset, 
    eval_dataset=val_dataset, 
    compute_metrics=compute_metrics
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [18]:
trainer.train() 

{'loss': 0.7168, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.19}
{'loss': 0.4227, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.38}
{'loss': 0.2993, 'learning_rate': 5.9600000000000005e-06, 'epoch': 0.57}
{'loss': 0.2531, 'learning_rate': 7.960000000000002e-06, 'epoch': 0.76}
{'loss': 0.2307, 'learning_rate': 9.960000000000001e-06, 'epoch': 0.95}
{'loss': 0.2168, 'learning_rate': 1.196e-05, 'epoch': 1.15}
{'loss': 0.2268, 'learning_rate': 1.396e-05, 'epoch': 1.34}
{'loss': 0.1873, 'learning_rate': 1.5960000000000003e-05, 'epoch': 1.53}
{'loss': 0.2218, 'learning_rate': 1.796e-05, 'epoch': 1.72}
{'loss': 0.1972, 'learning_rate': 1.9960000000000002e-05, 'epoch': 1.91}
{'loss': 0.1726, 'learning_rate': 1.8790123456790124e-05, 'epoch': 2.1}
{'loss': 0.1481, 'learning_rate': 1.7580246913580247e-05, 'epoch': 2.29}
{'loss': 0.1137, 'learning_rate': 1.6345679012345682e-05, 'epoch': 2.48}
{'loss': 0.1891, 'learning_rate': 1.5111111111111112e-05, 'epoch': 2.67}
{'loss': 0.1583, 'lea

TrainOutput(global_step=1310, training_loss=0.173153081877541, metrics={'train_runtime': 1717.1918, 'train_samples_per_second': 24.386, 'train_steps_per_second': 0.763, 'train_loss': 0.173153081877541, 'epoch': 5.0})

In [19]:
trainer.evaluate()

{'eval_loss': 0.34337538480758667, 'eval_accuracy': 0.9211657907310081, 'eval_precision': 0.5955056179775281, 'eval_recall': 0.5326633165829145, 'eval_f1': 0.5623342175066313, 'eval_runtime': 28.8834, 'eval_samples_per_second': 72.464, 'eval_steps_per_second': 2.285, 'epoch': 5.0}


{'eval_loss': 0.34337538480758667,
 'eval_accuracy': 0.9211657907310081,
 'eval_precision': 0.5955056179775281,
 'eval_recall': 0.5326633165829145,
 'eval_f1': 0.5623342175066313,
 'eval_runtime': 28.8834,
 'eval_samples_per_second': 72.464,
 'eval_steps_per_second': 2.285,
 'epoch': 5.0}