In [12]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval

In [13]:
# Configure logging: INFO for the notebook in general, WARNING for the
# "transformers" logger.
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Check whether PyTorch can see a CUDA device; this flag is passed to the
# model as `use_cuda` later in the notebook.
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

if cuda_available:
  # Ask PyTorch (the framework that actually runs the model) for the device
  # name instead of importing TensorFlow just for this check: TensorFlow may
  # be missing, or may not see a GPU that PyTorch can use, which previously
  # aborted the notebook with a SystemError.
  device_name = torch.cuda.get_device_name(0)
  print('Found GPU at: {}'.format(device_name))

Cuda available?  False


In [14]:
# Download the `dont_patronize_me.py` helper module into the working
# directory so that a later cell can import it.
module_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
# Copy the payload as raw bytes. Decoding it and re-writing it in text mode
# would re-encode with the platform's default encoding (and translate
# newlines), which can corrupt the file on non-UTF-8 systems.
with request.urlopen(module_url) as f, open(module_name,'wb') as outf:
  outf.write(f.read())

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [15]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	"""Write predictions to a text file, one comma-separated line per entry.

	p: iterable of iterables; each inner item is converted with `str`.
	outf_path: path of the file to (over)write.
	Returns None.
	"""
	# Lazy generator: rows are formatted only once the file is open.
	formatted = (','.join(map(str, row)) + '\n' for row in p)
	with open(outf_path, 'w') as sink:
		sink.writelines(formatted)

In [16]:
# Import the dataset helper class from the `dont_patronize_me` module.
from dont_patronize_me import DontPatronizeMe
# Data directory; the same value is passed for both constructor arguments.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = '/Users/gouse/PycharmProjects/NLP/labs_2023_2024/CW/data'
dpm = DontPatronizeMe(path,path)
# Load the task 1 data; `dpm.train_task1_df` is read by later cells.
dpm.load_task1()
# Load the task 2 data, requesting one-hot labels.
dpm.load_task2(return_one_hot=True)

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


In [17]:
# Paragraph ids (and labels) for the official train split.
trids = pd.read_csv(path+'/train_semeval_parids-labels.csv')
# Ids are cast to str — presumably to match the dtype of `data.par_id`,
# which later cells compare them against (confirm).
trids['par_id'] = trids['par_id'].astype(str)

# Same for the dev split, which this notebook uses as its test set.
teids = pd.read_csv(path+'/dev_semeval_parids-labels.csv')
teids['par_id'] = teids['par_id'].astype(str)

# Full task 1 frame loaded by `dpm`; displayed as the cell output.
data = dpm.train_task1_df
data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4


#### Train

In [57]:
import random  # NOTE(review): unused in this cell; kept in case a later cell relies on it

# Build the task 1 training frame: one row per paragraph id listed in `trids`,
# with keyword, text, binary label and country looked up in `data`.
#
# `data` is indexed by par_id once and all ids are fetched in a single `.loc`
# call, instead of running four full-frame boolean scans per paragraph.
# `drop_duplicates` keeps the first occurrence of an id, matching the previous
# `.values[0]` behaviour. An id missing from `data` raises KeyError.
train_par_ids = trids.par_id.tolist()
train_lookup = data.drop_duplicates(subset='par_id').set_index('par_id')
train_selected = train_lookup.loc[train_par_ids]

rows = [ # will contain par_id, community, text, label and country
    {
        'par_id': par_id,
        'community': keyword,
        'text': text,
        'label': label,
        'country': country,
    }
    for par_id, keyword, text, label, country in zip(
        train_par_ids,
        train_selected.keyword.values,
        train_selected.text.values,
        train_selected.label.values,
        train_selected.country.values,
    )
]
trdf1 = pd.DataFrame(rows)
# Convert countries to categorical label values; codes are assigned in order
# of first appearance in the training frame.
trdf1["country"] = pd.Categorical(trdf1["country"], categories=trdf1["country"].unique()).codes
trdf1

Unnamed: 0,par_id,community,text,label,country
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1,0
1,4136,homeless,Durban 's homeless communities reconciliation ...,1,1
2,10352,poor-families,The next immediate problem that cropped up was...,1,2
3,8279,vulnerable,Far more important than the implications for t...,1,3
4,1164,poor-families,To strengthen child-sensitive social protectio...,1,4
...,...,...,...,...,...
8370,8380,refugee,Rescue teams search for survivors on the rubbl...,0,0
8371,8381,hopeless,The launch of ' Happy Birthday ' took place la...,0,19
8372,8382,homeless,"The unrest has left at least 20,000 people dea...",0,14
8373,8383,hopeless,You have to see it from my perspective . I may...,0,14


In [58]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the integer country codes of the training frame.
#
# The category list is fixed to 0..n_countries-1 (n_countries = number of
# distinct countries in the full `data` frame) rather than inferred from the
# codes that happen to occur in this split, so every split encoded this way
# gets vectors of the same width with the same column meaning.
# handle_unknown='ignore' turns any code outside that range (e.g. -1 for a
# missing country) into an all-zero vector instead of raising.
n_countries = data["country"].nunique()
X = trdf1['country'].values.reshape(-1, 1)
enc = OneHotEncoder(categories=[list(range(n_countries))], handle_unknown='ignore').fit(X)

X = enc.transform(X).toarray()
# Store each one-hot row as a list; align explicitly on the frame's index.
trdf1['country'] = pd.Series(X.tolist(), index=trdf1.index)
trdf1

Unnamed: 0,par_id,community,text,label,country
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,4136,homeless,Durban 's homeless communities reconciliation ...,1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,10352,poor-families,The next immediate problem that cropped up was...,1,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,8279,vulnerable,Far more important than the implications for t...,1,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1164,poor-families,To strengthen child-sensitive social protectio...,1,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
8370,8380,refugee,Rescue teams search for survivors on the rubbl...,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8371,8381,hopeless,The launch of ' Happy Birthday ' took place la...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8372,8382,homeless,"The unrest has left at least 20,000 people dea...",0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8373,8383,hopeless,You have to see it from my perspective . I may...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [61]:
# Re-point `path` at the folder for preprocessed CSVs and write the training
# frame (including the country feature) there.
# NOTE(review): absolute, machine-specific path; the folder must already exist.
path = '/Users/gouse/PycharmProjects/NLP/labs_2023_2024/CW/data/baseline_preprocessed_csvs'
trdf1.to_csv(path_or_buf=path+'/trdf1_countries.csv', encoding='utf-8', index=False)

#### Test

In [62]:
# Build the task 1 dev/test frame: one row per paragraph id listed in `teids`,
# with keyword, text, binary label and country looked up in `data`.
#
# `data` is indexed by par_id once (first occurrence wins, as with the
# previous `.values[0]`) and all ids are fetched with a single `.loc` call
# instead of four full-frame scans per paragraph. A missing id raises KeyError.
dev_par_ids = teids.par_id.tolist()
par_lookup = data.drop_duplicates(subset='par_id').set_index('par_id')
dev_selected = par_lookup.loc[dev_par_ids]

rows = [ # will contain par_id, community, text, label and country
    {
        'par_id': par_id,
        'community': keyword,
        'text': text,
        'label': label,
        'country': country,
    }
    for par_id, keyword, text, label, country in zip(
        dev_par_ids,
        dev_selected.keyword.values,
        dev_selected.text.values,
        dev_selected.label.values,
        dev_selected.country.values,
    )
]
tedf1 = pd.DataFrame(rows)
# Convert countries to categorical label values using the SAME numbering as
# the training frame. The training codes follow the order in which countries
# first appear among the `trids` paragraphs, so that order is rebuilt here;
# countries seen only in this split are appended after it. Numbering this
# split by its own order of appearance would give the same country different
# codes in train and test.
train_countries = par_lookup.loc[trids.par_id.tolist(), 'country']
country_order = pd.concat([train_countries, tedf1['country']], ignore_index=True).unique()
tedf1["country"] = pd.Categorical(tedf1["country"], categories=country_order).codes
tedf1

Unnamed: 0,par_id,community,text,label,country
0,4046,hopeless,We also know that they can benefit by receivin...,1,0
1,1279,refugee,Pope Francis washed and kissed the feet of Mus...,1,1
2,8330,refugee,Many refugees do n't want to be resettled anyw...,1,1
3,4063,in-need,"""Budding chefs , like """" Fred """" , """" Winston ...",1,2
4,4089,homeless,"""In a 90-degree view of his constituency , one...",1,3
...,...,...,...,...,...
2089,10462,homeless,"The sad spectacle , which occurred on Saturday...",0,16
2090,10463,refugee,""""""" The Pakistani police came to our house and...",0,3
2091,10464,disabled,"""When Marie O'Donoghue went looking for a spec...",0,2
2092,10465,women,"""Sri Lankan norms and culture inhibit women fr...",0,9


In [63]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the integer country codes of the dev/test frame.
#
# Fitting a fresh encoder on this split alone makes the vector width and
# column order depend on which codes occur here, so they may not match the
# training vectors. The category list is therefore fixed to 0..n_countries-1
# (n_countries = number of distinct countries in the full `data` frame).
# handle_unknown='ignore' turns any code outside that range into an all-zero
# vector instead of raising.
# NOTE(review): columns only line up with the training vectors if the integer
# codes in `tedf1['country']` use the same country numbering as training.
n_countries = data["country"].nunique()
X = tedf1['country'].values.reshape(-1, 1)
enc = OneHotEncoder(categories=[list(range(n_countries))], handle_unknown='ignore').fit(X)

X = enc.transform(X).toarray()
# Store each one-hot row as a list; align explicitly on the frame's index.
tedf1['country'] = pd.Series(X.tolist(), index=tedf1.index)
tedf1

Unnamed: 0,par_id,community,text,label,country
0,4046,hopeless,We also know that they can benefit by receivin...,1,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1279,refugee,Pope Francis washed and kissed the feet of Mus...,1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,8330,refugee,Many refugees do n't want to be resettled anyw...,1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4063,in-need,"""Budding chefs , like """" Fred """" , """" Winston ...",1,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4089,homeless,"""In a 90-degree view of his constituency , one...",1,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
2089,10462,homeless,"The sad spectacle , which occurred on Saturday...",0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2090,10463,refugee,""""""" The Pakistani police came to our house and...",0,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2091,10464,disabled,"""When Marie O'Donoghue went looking for a spec...",0,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2092,10465,women,"""Sri Lankan norms and culture inhibit women fr...",0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [64]:
# Folder for preprocessed CSVs; write the dev/test frame (including the
# country feature) there.
# NOTE(review): absolute, machine-specific path; the folder must already exist.
path = '/Users/gouse/PycharmProjects/NLP/labs_2023_2024/CW/data/baseline_preprocessed_csvs'
tedf1.to_csv(path_or_buf=path+'/tedf1_countries.csv', encoding='utf-8', index=False)

#### RoBERTa task 1

In [11]:
# downsample negative instances
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])

In [13]:
# Sizes, one per line: downsampled training set, all positives, all negatives.
print(
    len(training_set1),
    len(trdf1[trdf1.label==1]),
    len(trdf1[trdf1.label==0]),
    sep='\n',
)

2382
794
7581


In [14]:
# Display the downsampled training frame.
training_set1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
2377,1775,refugee,Last but not the least element of culpability ...,0
2378,1776,refugee,"Then , taking the art of counter-intuitive non...",0
2379,1777,refugee,Kagunga village was reported to lack necessary...,0
2380,1778,vulnerable,"""After her parents high-profile divorce after ...",0


In [15]:
from sklearn.metrics import f1_score, accuracy_score

# Baseline configuration: a single training epoch, with the no_save, no_cache
# and overwrite_output_dir flags all switched on.
task1_model_args = ClassificationArgs(
    overwrite_output_dir=True,
    no_cache=True,
    no_save=True,
    num_train_epochs=1,
)
# Binary roberta-base classifier; runs on the GPU only when one was detected.
task1_model = ClassificationModel(
    "roberta",
    'roberta-base',
    num_labels=2,
    use_cuda=cuda_available,
    args=task1_model_args,
)
# train model on the text / label columns of the downsampled set
task1_model.train_model(training_set1.loc[:, ['text', 'label']])
# run predictions on the dev texts; the second return value is discarded
preds_task1, _ = task1_model.predict(tedf1['text'].tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/298 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/21 [00:00<?, ?it/s]

In [16]:
# Show the raw predictions and how many paragraphs fall in each class.
print(preds_task1)
print(Counter(preds_task1))
# Write one prediction per line to task1.txt. `labels2file` only writes the
# file and returns nothing, so it is called directly rather than wrapped in
# print(), which just emitted a stray "None".
labels2file([[k] for k in preds_task1], 'task1.txt')

[0 0 0 ... 0 0 0]
Counter({0: 1784, 1: 310})
None


In [17]:
from sklearn.metrics import f1_score

# F1 of the task 1 predictions against the dev-set labels.
print(f"F1-score on dev:{f1_score(y_true=tedf1.label, y_pred=preds_task1)}")

F1-score on dev:0.550098231827112


In [18]:
# Display the task 1 predictions.
preds_task1

array([0, 0, 0, ..., 0, 0, 0])

**Rebuild training set (Task 2)**

In [20]:
# Build the task 2 training rows: par_id and label come from `trids`, the
# paragraph text from the task 1 frame.
#
# The text column is indexed by par_id once (first occurrence wins, as with
# the previous `.values[0]`) and fetched with a single `.loc` call instead of
# scanning the whole frame for every paragraph. A missing id raises KeyError.
task2_par_ids = trids.par_id.tolist()
text_by_par_id = dpm.train_task1_df.drop_duplicates(subset='par_id').set_index('par_id')['text']
task2_texts = text_by_par_id.loc[task2_par_ids]

rows2 = [ # will contain par_id, label and text
    {
        'par_id': par_id,
        'text': text,
        'label': label,
    }
    for par_id, text, label in zip(task2_par_ids, task2_texts.values, trids.label.values)
]
  

In [21]:
# Collect the task 2 rows (par_id, text, label) into a DataFrame and display it.
trdf2 = pd.DataFrame(rows2)
trdf2

Unnamed: 0,par_id,text,label
0,4341,"The scheme saw an estimated 150,000 children f...","[1, 0, 0, 1, 0, 0, 0]"
1,4136,Durban 's homeless communities reconciliation ...,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,The next immediate problem that cropped up was...,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,Far more important than the implications for t...,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,To strengthen child-sensitive social protectio...,"[1, 0, 0, 1, 1, 1, 0]"
...,...,...,...
8370,8380,Rescue teams search for survivors on the rubbl...,"[0, 0, 0, 0, 0, 0, 0]"
8371,8381,The launch of ' Happy Birthday ' took place la...,"[0, 0, 0, 0, 0, 0, 0]"
8372,8382,"The unrest has left at least 20,000 people dea...","[0, 0, 0, 0, 0, 0, 0]"
8373,8383,You have to see it from my perspective . I may...,"[0, 0, 0, 0, 0, 0, 0]"


In [38]:
# Join trdf1 and trdf2 on par_id (pandas' default inner join). Overlapping
# columns get the _x (trdf1) / _y (trdf2) suffixes: the task 1 label stays
# `label`, the task 2 label becomes `categories`, and the duplicate text and
# the community column are dropped.
new_df = (
    trdf1.merge(trdf2, on='par_id')
    .drop(columns=['community', 'text_x'])
    .rename(columns={'label_x': 'label', 'text_y': 'text', 'label_y': 'categories'})
)
new_df

Unnamed: 0,par_id,label,text,categories
0,4341,1,"The scheme saw an estimated 150,000 children f...","[1, 0, 0, 1, 0, 0, 0]"
1,4136,1,Durban 's homeless communities reconciliation ...,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,1,The next immediate problem that cropped up was...,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,1,Far more important than the implications for t...,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,1,To strengthen child-sensitive social protectio...,"[1, 0, 0, 1, 1, 1, 0]"
...,...,...,...,...
8370,8380,0,Rescue teams search for survivors on the rubbl...,"[0, 0, 0, 0, 0, 0, 0]"
8371,8381,0,The launch of ' Happy Birthday ' took place la...,"[0, 0, 0, 0, 0, 0, 0]"
8372,8382,0,"The unrest has left at least 20,000 people dea...","[0, 0, 0, 0, 0, 0, 0]"
8373,8383,0,You have to see it from my perspective . I may...,"[0, 0, 0, 0, 0, 0, 0]"


In [39]:
# Folder for preprocessed CSVs; write the merged task 1 / task 2 frame there.
# NOTE(review): absolute, machine-specific path; the folder must already exist.
path = '/Users/gouse/PycharmProjects/NLP/labs_2023_2024/CW/data/baseline_preprocessed_csvs'
new_df.to_csv(path_or_buf=path+'/trdf1_categories.csv', encoding='utf-8', index=False)