# Main imports and code

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval
import random
import os
from sklearn.metrics import accuracy_score, f1_score


os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [15]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()


print('Cuda available? ',cuda_available)

Cuda available?  True


In [16]:
torch.cuda.set_device(1)

# Fetch Don't Patronize Me! data manager module

In [17]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [18]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [19]:
from dont_patronize_me import DontPatronizeMe

In [20]:
dpm = DontPatronizeMe('./data', './data')

In [21]:
dpm.load_task1()

# Load paragraph IDs

In [22]:
trids = pd.read_csv('./data/train_semeval_parids-labels.csv')
teids = pd.read_csv('./data/dev_semeval_parids-labels.csv')

In [23]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

In [24]:
data=dpm.train_task1_df

In [25]:
data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4




# Rebuild training set (Task 1)

In [26]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [27]:
trdf1 = pd.DataFrame(rows)

In [28]:
trdf1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
8370,8380,refugee,Rescue teams search for survivors on the rubbl...,0
8371,8381,hopeless,The launch of ' Happy Birthday ' took place la...,0
8372,8382,homeless,"The unrest has left at least 20,000 people dea...",0
8373,8383,hopeless,You have to see it from my perspective . I may...,0


# Rebuild test set (Task 1)

In [29]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [30]:
len(rows)

2094

In [31]:
tedf1 = pd.DataFrame(rows)

In [32]:
tedf1.head()

Unnamed: 0,par_id,community,text,label
0,4046,hopeless,We also know that they can benefit by receivin...,1
1,1279,refugee,Pope Francis washed and kissed the feet of Mus...,1
2,8330,refugee,Many refugees do n't want to be resettled anyw...,1
3,4063,in-need,"""Budding chefs , like """" Fred """" , """" Winston ...",1
4,4089,homeless,"""In a 90-degree view of his constituency , one...",1


# RoBERTa Baseline for Task 1

In [33]:
# downsample negative instances
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])

In [34]:
training_set1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
2377,1775,refugee,Last but not the least element of culpability ...,0
2378,1776,refugee,"Then , taking the art of counter-intuitive non...",0
2379,1777,refugee,Kagunga village was reported to lack necessary...,0
2380,1778,vulnerable,"""After her parents high-profile divorce after ...",0


In [35]:

task1_model_args = ClassificationArgs(num_train_epochs=1,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True,
                                      use_multiprocessing= False, 
                                      use_multiprocessing_for_evaluation=False,
                                      process_count= 1)

task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available,)
# train model
task1_model.train_model(training_set1[['text', 'label']])
# run predictions


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 1:   0%|          | 0/298 [00:00<?, ?it/s]

  with amp.autocast():
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


(298, 0.5328495251702382)

In [36]:
test_preds_task1, _ = task1_model.predict(tedf1.text.tolist())

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/21 [00:00<?, ?it/s]

  with amp.autocast():


In [37]:
train_preds_task1, _ = task1_model.predict(trdf1.text.tolist())

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/84 [00:00<?, ?it/s]

In [None]:
# Get true labels
test_true_labels = tedf1['label'].tolist()
train_true_labels = trdf1['label'].tolist()

print(f'Accuracy train: {accuracy_score(train_true_labels, train_preds_task1)}')
print(f'Accuracy test: {accuracy_score(test_true_labels, test_preds_task1)}')
print(f'F1 train: {f1_score(train_true_labels, train_preds_task1)}')
print(f'F1 test: {f1_score(test_true_labels, test_preds_task1)}')

Accuracy train: 0.8494328358208956
Accuracy test: 0.8233046800382043
F1 train: 0.5025641025641026
F1 test: 0.4376899696048632


In [None]:
wrong_predictions_train = trdf1[train_preds_task1 != trdf1['label']][['par_id', 'text', 'label']]
# Add a column for text length and sort by it
wrong_predictions_train = wrong_predictions_train.assign(text_length=wrong_predictions_train.text.str.len())
wrong_predictions_train = wrong_predictions_train.sort_values(by="text_length", ascending=True).drop(columns=["text_length"])

# Print the first sentence along with its label
for row in wrong_predictions_train.iterrows():
    print(f"{row[1]["label"]}: {row[1]["text"]}")



par_id                    6021
text      Pretty much hopeless
label                        0
Name: 6244, dtype: object: par_id                    6021
text      Pretty much hopeless
label                        0
Name: 6244, dtype: object
par_id                     5132
text      Focus on the homeless
label                         0
Name: 5448, dtype: object: par_id                     5132
text      Focus on the homeless
label                         0
Name: 5448, dtype: object
par_id                        4003
text      It just seems hopeless .
label                            0
Name: 4411, dtype: object: par_id                        4003
text      It just seems hopeless .
label                            0
Name: 4411, dtype: object
par_id                           6680
text      Stephens in need of HK lift
label                               0
Name: 6831, dtype: object: par_id                           6680
text      Stephens in need of HK lift
label                               

In [None]:
Counter(test_preds_task1)

Counter({np.int64(0): 1718, np.int64(1): 376})

In [None]:
labels2file([[k] for k in test_preds_task1], 'task1.txt')

## Prepare submission

In [None]:
!cat task1.txt | head -n 10

0
1
0
1
0
0
1
1
0
1


In [None]:
!zip submission.zip task1.txt

  adding: task1.txt (deflated 92%)
