In [10]:
!pip install nltk transformers datasets torch

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: C:\Users\User\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [11]:
!python -m pip install transformers accelerate




[notice] A new release of pip is available: 24.0 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Load train and validation data set

In [12]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt
from collections import Counter

df = pd.read_csv('data/dontpatronizeme_pcl.tsv', sep='\t')

# Remove rows with NA labels 
df = df.dropna() 

# train val split 
train_labels = pd.read_csv('data/train_semeval_parids-labels.csv')["par_id"]
val_labels = pd.read_csv('data/dev_semeval_parids-labels.csv')["par_id"]
df_train = df[df["par_id"].isin(train_labels)]
df_val = df[df["par_id"].isin(val_labels)]


In [13]:
text_with_context = (
    df_val["keyword"] + "</s><s>" +
    df_val["country_code"] + "</s><s>" +
    df_val["text"]
)

for i, x in enumerate(text_with_context):
    if not isinstance(x, str):
        print("Bad row:", i, type(x))

In [14]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoConfig, Trainer, TrainingArguments

tokenizer = RobertaTokenizer.from_pretrained("roberta-base") 

# Create text with contextual information 
def tokenize(df): 
    text_with_context = df["keyword"] + "</s><s>" + df["country_code"] + "</s><s>" + df["text"]

    encoding = tokenizer(
        text_with_context.tolist(), 
        padding="max_length",   # Add padding to shorter sentences 
        truncation = True, 
        return_attention_mask = True 
    )

    return encoding

# Tokenization

In [15]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoConfig, Trainer, TrainingArguments

tokenizer = RobertaTokenizer.from_pretrained("roberta-base") 

# Create text with contextual information 
def tokenize(df): 
    text_with_context = df["keyword"] + "</s><s>" + df["country_code"] + "</s><s>" + df["text"]

    encoding = tokenizer(
        text_with_context.tolist(), 
        padding="max_length",   # Add padding to shorter sentences 
        truncation = True, 
        return_attention_mask = True 
    )

    return encoding

# Convert to pyTorch dataset

In [16]:
BATCH_SIZE = 16 
import torch 
from torch.utils.data import DataLoader, TensorDataset

def to_dataset(df): 
    encoding = tokenize(df) 

    # Create torch tensors 
    inputs = torch.Tensor(encoding["input_ids"])
    attention = torch.Tensor(encoding["attention_mask"])
    labels = torch.Tensor(df["label"].values)

    dataset = TensorDataset(inputs, attention, labels) 
    dataLoader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    return dataset, dataLoader

In [17]:
train_dataset, train_dataLoader = to_dataset(df_train) 
val_dataset, val_dataLoader = to_dataset(df_val) 

# Training 

In [19]:
# Load roberta sequence classification model 
config = AutoConfig.from_pretrained("roberta-base", num_labels=2)  # Binary classification
model = RobertaForSequenceClassification.from_pretrained("roberta-base", config = config)

# Set up training arguments 
training_args = TrainingArguments(
    num_train_epochs=5, 
    learning_rate=2e-5, 
    weight_decay=0.01,
    warmup_steps=500, 
    save_strategy="epoch", 
    load_best_model_at_end=True, 
    logging_steps=50,
    output_dir="./predictions", 
    eval_strategy="epoch", 
    per_device_eval_batch_size=8, 
    per_device_train_batch_size=8
)

# Set up trainer 
trainer = Trainer(
    model = model, 
    args = training_args, 
    train_dataset=train_dataset, 
    eval_dataset=val_dataset
)


Loading weights: 100%|██████████| 197/197 [00:00<00:00, 844.65it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the

In [20]:
trainer.train() 



TypeError: vars() argument must have __dict__ attribute