# Main imports and code

In [1]:
# check which gpu we're using
!nvidia-smi

Thu Feb 13 13:17:48 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install simpletransformers
!pip install tensorboardx
!pip install transformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets-

In [25]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval
from transformers import pipeline

In [26]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  False


In [27]:
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      raise SystemError('GPU device not found')

# Fetch Don't Patronize Me! data manager module

In [28]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [29]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [30]:
from dont_patronize_me import DontPatronizeMe

In [31]:
dpm = DontPatronizeMe('.', '.')

In [10]:
# Import the files below (can be found in spec github)
# Should remove this step in final so can be run end to end w/o invervention

from google.colab import files
uploaded = files.upload()

'''
train_semeval_parids-labels.csv
dev_semeval_parids-labels.csv
dontpatronizeme_pcl.tsv
dontpatronizeme_unlabeled_pcl.tv
'''

Saving train_semeval_parids-labels.csv to train_semeval_parids-labels.csv
Saving dev_semeval_parids-labels.csv to dev_semeval_parids-labels.csv
Saving dontpatronizeme_categories.tsv to dontpatronizeme_categories.tsv
Saving dontpatronizeme_pcl.tsv to dontpatronizeme_pcl.tsv


'\ntrain_semeval_parids-labels.csv\ndev_semeval_parids-labels.csv\ndontpatronizeme_pcl.tsv\ndontpatronizeme_unlabeled_pcl.tv\n'

In [32]:
dpm.load_task1()
dpm.load_task2(return_one_hot=True)

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


# Load paragraph IDs

In [33]:
trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

In [34]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

In [35]:
data=dpm.train_task1_df

In [36]:
data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4




# Rebuild training set (Task 1)

In [37]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [38]:
import random

In [39]:
trdf1 = pd.DataFrame(rows)

In [40]:
trdf1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
8370,8380,refugee,Rescue teams search for survivors on the rubbl...,0
8371,8381,hopeless,The launch of ' Happy Birthday ' took place la...,0
8372,8382,homeless,"The unrest has left at least 20,000 people dea...",0
8373,8383,hopeless,You have to see it from my perspective . I may...,0


In [20]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from transformers import pipeline

In [21]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [22]:
# Initialise bert-base-uncased to predict missing words
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

# Replace max 5 words with synonyms
def synonym_replacement(text):
    num_replacements = 5

    # tokenise with nltk
    words = word_tokenize(text)

    # Map words to their 'POS tag'
    # [Noun -> NN, Adj -> JJ, Adverb -> RB, Verb -> VB, Pronoun -> PRP, Determiner -> DT]
    pos_tags = pos_tag(words)

    # Find candidate words for replacement (adjectives or adverbs only)
    candidates = [word for word, tag in pos_tags if tag in ["JJ", "RB"]]
    random.shuffle(candidates)

    # Select 5 words to replace
    selected_words = candidates[:num_replacements]

    # Initialise list for augmented text
    new_texts = []

    # For each word we will replace
    for word_to_replace in selected_words:

        # Get sentence with old word replaced by "[MASK]"
        masked_sentence = text.replace(word_to_replace, "[MASK]", 1)

        # Attemept to find synonym and replace
        try:
            # Get synonyms
            predictions = fill_mask(masked_sentence)
            synonyms = [pred["token_str"] for pred in predictions[:5]]

            # Choose random synonym and replace with it
            new_word = random.choice(synonyms)
            augmented_text = text.replace(word_to_replace, new_word, 1)

            # Add augmented text to new_texts list
            new_texts.append(augmented_text)
        except:
            continue
    # Return new generated texts (or old one if non generated)
    return new_texts if new_texts else [text]




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (567 > 512). Running this sequence through the model will result in indexing errors


In [43]:
# Initialise augmented data as empty
augmented_data = []

# Get most recent par_id and add one for new text to use
curr_par_id = int(trdf1['par_id'].iloc[-1]) + 1

# For each paragraph in the training data
for _, row in trdf1.iterrows():
    # Generate 5 augmented (synonym replaced) new paragraphs
    augmented_texts = synonym_replacement(row["text"])

    #
    for aug_text in augmented_texts:
        augmented_data.append({"par_id": str(curr_par_id),
                               "community": row["community"],
                               "text": aug_text,
                               "label": row["label"]})
        curr_par_id += 1

# Convert to DataFrame
augmented_trdf1 = pd.DataFrame(augmented_data)

# Combine original and augmented datasets
trdf1 = pd.concat([trdf1, augmented_trdf1]).drop_duplicates().reset_index(drop=True)

# Save augmented dataset
trdf1.to_csv("augmented_pcl_dataset.csv", index=False)

In [44]:
trdf1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
197241,197251,disabled,but there was one occasion when we went to the...,0
197242,197252,disabled,but there was one occasion when we went to the...,0
197243,197253,disabled,well there was one occasion when we went to th...,0
197244,197254,disabled,Yet there was one occasion when we went to the...,0


In [41]:
# Once we have run the above once we can just load from the file
trdf1 = pd.read_csv('synonym_replaced_dataset.csv')

In [42]:
trdf1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
197241,197251,disabled,but there was one occasion when we went to the...,0
197242,197252,disabled,but there was one occasion when we went to the...,0
197243,197253,disabled,well there was one occasion when we went to th...,0
197244,197254,disabled,Yet there was one occasion when we went to the...,0


# Rebuild test set (Task 1)

In [43]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [44]:
len(rows)

2094

In [45]:
tedf1 = pd.DataFrame(rows)

In [46]:
tedf1 = tedf1.sample(frac=1, random_state=42).reset_index(drop=True)

# RoBERTa Baseline for Task 1

In [47]:
# downsample negative instances
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])

In [48]:
training_set1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
44675,44685,poor-families,"""According to Santhosh Ramdoss , director , In...",0
44676,44686,poor-families,"""According to Santhosh Ramdoss , director , In...",0
44677,44687,in-need,"In Wajir , more than 100,000 people are in nee...",0
44678,44688,immigrant,"The story begins on October 16 , 1817 with a s...",0


In [None]:

task1_model_args = ClassificationArgs(num_train_epochs=1,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(training_set1[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(tedf1.text.tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
114it [02:18,  1.22s/it]                                                       
Epoch 1 of 1:   0%|                                      | 0/1 [00:00<?, ?it/s]
Running Epoch 1 of 1:   0%|                           | 0/7120 [00:00<?, ?it/s][A
Epochs 1/1. Running Loss:    0.6967:   0%|            | 0/7120 [00:05<?, ?it/s][A
Epochs 1/1. Running Loss:    0.6967:   0%| | 1/7120 [00:09<18:44:39,  9.48s/it][A
Epochs 1/1. Running Loss:    0.7328:   0%| | 1/7120 [00:11<18:44:39,  9.48s/it][A
Epochs 1/1. Running Loss:    0.7328

In [40]:
Counter(preds_task1)

Counter({0: 1963, 1: 131})

In [41]:
labels2file([[k] for k in preds_task1], 'task1.txt')

In [42]:
from sklearn.metrics import f1_score

y_true = tedf1.label.tolist()
y_pred = preds_task1

f1 = f1_score(y_true, y_pred)
print("F1 Score:", f1)

F1 Score: 0.503030303030303
