In [None]:
#dependencies
! python -m pip install nltk
! python -m pip install wordcloud
! python -m pip install Unidecode
! python -m pip install beautifulsoup4

In [None]:
from dpm_preprocessing import DPMProprocessed
from collections import Counter
import torch
#from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, RobertaConfig
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.metrics import f1_score

device = torch.device("cuda"  if torch.cuda.is_available() else "cpu")

import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"]= "0"

model_name = "microsoft/deberta-v2-xlarge"
assert model_name in ['roberta-base', 'bert-base-uncased', 'google/electra-small-discriminator', "microsoft/deberta-v2-xlarge"]

model_path = f'./models/pcl_{model_name}_finetuned/model/'
tokenizer_path = f'./models/pcl_{model_name}_finetuned/tokenizer/'
MAX_SEQ_LEN = 256

WORKING_ENV = 'SERVER' # Can be JONAS, SERVER
assert WORKING_ENV in ['JONAS', 'SERVER','BACKUP']

if WORKING_ENV == 'SERVER':
    temp_model_path = f'/hy-tmp/pcl/{model_name}/'
if WORKING_ENV == 'JONAS': 
    temp_model_path = f'./experiment/pcl/{model_name}/'
if WORKING_ENV == 'BACKUP': 
    temp_model_path = f'/hy-tmp/models/pcl_{model_name}_finetuned/model/'
    temp_tokenizer_path = f'/hy-tmp/models/pcl_{model_name}_finetuned/tokenizer/'


In [None]:
class PCLDataset(torch.utils.data.Dataset):

    def __init__(self, tokenizer, input_set):

        self.tokenizer = tokenizer
        self.texts = list(input_set['text'])
        self.labels = list(input_set['label'])
        
    def collate_fn(self, batch):

        texts = []
        labels = []

        for b in batch:
            texts.append(b['text'])
            labels.append(b['label'])

        encodings = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=MAX_SEQ_LEN)
        encodings['labels'] =  torch.tensor(labels)
        return encodings
    
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
       
        item = {'text': self.texts[idx],
                'label': self.labels[idx]}
        return item

In [None]:
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name , config = config).to(device)


In [None]:
dpm_pp = DPMProprocessed('.', 'task4_test.tsv')


df_type = 'BACKTRANS' # Can be UNBALANCED, BACKTRANS, OVERSAMPLING
assert df_type in ['UNBALANCED', 'BACKTRANS', 'OVERSAMPLING']

if df_type == 'UNBALANCED':
    train_df_path = 'traindf.pickle'
    val_df_path = 'valdf.pickle'
if df_type == 'BACKTRANS': 
    train_df_path = 'traindf_backtrans.pickle'
    val_df_path = 'valdf_backtrans.pickle'


if not os.path.isfile(train_df_path) or not os.path.isfile(val_df_path):
  train_df, val_df = dpm_pp.get_unbalanced_split(0.1)
  train_df.to_pickle('traindf.pickle')
  val_df.to_pickle('valdf.pickle')
else:
  train_df = pd.read_pickle(train_df_path)
  val_df = pd.read_pickle(val_df_path)

print("Training set length: ",len(train_df))
print("Validation set length: ",len(val_df))

train_dataset = PCLDataset(tokenizer, train_df)
eval_dataset = PCLDataset(tokenizer, val_df)


In [None]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.logits
        # weight_scale = len(train_df[train_df['label']==0])/len(train_df[train_df['label']==1])
        weight_scale = 1
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, weight_scale]).to(device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return ((loss, outputs) if return_outputs else loss)

In [None]:
validation_loader = DataLoader(eval_dataset)
def compute_metric_eval(arg):
    logits, labels_gold = arg[0], arg[1]
    labels_pred = np.argmax(logits, axis = 1)
    return {'f1_macro' :f1_score(labels_gold, labels_pred, average='macro'), 
            'pcl_f1': classification_report(labels_gold, labels_pred, target_names=["Not PCL","PCL"], output_dict= True)['PCL']['f1-score']} #more metrics can be added

training_args = TrainingArguments(
        output_dir=temp_model_path,
        learning_rate = 1e-6,
        logging_steps= 100,
        eval_steps = 500,
        per_device_train_batch_size=4,
        per_device_eval_batch_size =4,
        num_train_epochs = 4,
        evaluation_strategy= "steps",
        load_best_model_at_end=True,
        metric_for_best_model='pcl_f1'
        )

trainer = CustomTrainer(
        model=model,                         
        args=training_args,                 
        train_dataset=train_dataset,                   
        data_collator=eval_dataset.collate_fn,
        compute_metrics = compute_metric_eval,
        eval_dataset = eval_dataset
    )
trainer.train()

In [None]:
trainer.save_model(model_path)
tokenizer.save_pretrained(tokenizer_path)

train_df.to_pickle('train_df.pickle')
val_df.to_pickle('val_df.pickle')

In [None]:
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path , config = config).to(device)


In [None]:

train_df = pd.read_pickle('train_df.pickle')
val_df = pd.read_pickle('val_df.pickle')

train_dataset = PCLDataset(tokenizer, train_df)
eval_dataset = PCLDataset(tokenizer, val_df)

In [None]:
def predict_pcl(input, tokenizer, model, threshold=0.5):
    model.eval()
    encodings = tokenizer(input, return_tensors='pt', padding=True, truncation=True, max_length=256)
    encodings = encodings.to(device)
    output = model(**encodings)
    logits = output.logits
    prob = nn.functional.softmax(logits)[: , 1].cpu()
    preds = np.array([int(prob>threshold)])
    return {'prediction': preds, 'confidence': prob}


def evaluate(model, tokenizer, data_loader, threshold=0.5):
    preds = []
    tot_labels = []
    confidences = []
    with torch.no_grad():
        for data in (data_loader):
            labels = {}
            labels['label'] = data['label']

            tweets = data['text']

            pred = predict_pcl(tweets, tokenizer, model, threshold)

            preds.append(np.array(pred['prediction']))
            tot_labels.append(np.array(labels['label'].cpu()))
            confidences.append(np.array(pred['confidence'].cpu()))

    # with the saved predictions and labels we can compute accuracy, precision, recall and f1-score

    return preds, tot_labels, confidences

In [None]:
validation_loader = DataLoader(eval_dataset)

preds, tot_labels, confidences = evaluate(model, tokenizer, validation_loader)
tot_labels = np.array(tot_labels)
preds = np.array(preds)
confidences = np.array(confidences)
report = classification_report(tot_labels, preds, target_names=["Not PCL", "PCL"], output_dict=True)
print(report)

print(report['accuracy'])
print(report['Not PCL']['f1-score'])
print(report['PCL']['f1-score'])

In [None]:
confidences

In [None]:
# define threshold
pcl_count_by_threshold = []
non_pcl_count_by_threshold = []
f1_by_threshold = []
precision_by_threshold = []
recall_by_threshold = []
for percentage in range(100):
    threshold = percentage / 100
    pcl_count = (confidences > threshold).sum()
    non_pcl_count = (confidences <= threshold).sum()
    pred = np.zeros(tot_labels.shape)
    pred[confidences > threshold] = 1
    pcl_count_by_threshold.append(pcl_count)
    non_pcl_count_by_threshold.append(non_pcl_count)
    f1_by_threshold.append(classification_report(tot_labels, pred, target_names=["Not PCL", "PCL"], output_dict=True)['PCL']['f1-score'])
    precision_by_threshold.append(classification_report(tot_labels, pred, target_names=["Not PCL", "PCL"], output_dict=True)['PCL']['precision'])
    recall_by_threshold.append(classification_report(tot_labels, pred, target_names=["Not PCL", "PCL"], output_dict=True)['PCL']['recall'])


best_threshold = np.argmax(f1_by_threshold) / 100

In [None]:
# best_threshold
np.max(f1_by_threshold)

In [None]:
import matplotlib.pyplot as plt

x=np.arange(0,100)
l1=plt.plot(x,pcl_count_by_threshold,'r',label='pcl_count_by_threshold')
l2=plt.plot(x,non_pcl_count_by_threshold,'g',label='non_pcl_count_by_threshold')

# plt.plot(x1,y1,'ro-',x2,y2,'g+-',x3,y3,'b^-')
plt.title('Frequency by threshold')
plt.xlabel('threshold')
plt.ylabel('column')
plt.legend()
plt.show()

plt.clf()
l3=plt.plot(x, f1_by_threshold,'r',label='f1_by_threshold')
l4=plt.plot(x, precision_by_threshold,'g',label='precision_by_threshold')
l5=plt.plot(x, recall_by_threshold,'b',label='recall_by_threshold')
plt.title('Metrics by threshold')
plt.xlabel('threshold')
plt.ylabel('column')
plt.legend()
plt.show()

# Test set

In [None]:
dpm_pp.load_test()
test_df = dpm_pp.test_set_df
test_df['label'] = 0
test_dataset = PCLDataset(tokenizer, test_df)

test_loader = DataLoader(test_dataset)

preds, tot_labels, confidences = evaluate(model, tokenizer, test_loader, 0.5)
tot_labels = np.array(tot_labels)
preds = np.array(preds)
# report = classification_report(tot_labels, preds, target_names=["Not PCL","PCL"], output_dict= True)
# print(report)

# print(report['accuracy'])
# print(report['Not PCL']['f1-score'])
# print(report['PCL']['f1-score'])

In [None]:
# preds.shape
preds.shape

In [None]:

preds = preds.reshape(-1)
Counter(preds)

In [None]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [None]:
labels2file([[k] for k in preds], 'task1.txt')
!cat task1.txt | head -n 10
!zip submission.zip task1.txt 

In [None]:
best_threshold

## Ensemble deberta and roberta

In [1]:
from dpm_preprocessing import DPMProprocessed
import torch
#from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, RobertaConfig
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.metrics import f1_score
from collections import Counter
device = torch.device("cuda"  if torch.cuda.is_available() else "cpu")
from copy import deepcopy
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"]= "0"

Looking in indexes: https://mirrors.aliyun.com/pypi/simple


You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.


Looking in indexes: https://mirrors.aliyun.com/pypi/simple


You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class PCLDataset(torch.utils.data.Dataset):

    def __init__(self, tokenizer, input_set):

        self.tokenizer = tokenizer
        self.texts = list(input_set['text'])
        self.labels = list(input_set['label'])
        
    def collate_fn(self, batch):

        texts = []
        labels = []

        for b in batch:
            texts.append(b['text'])
            labels.append(b['label'])

        encodings = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=MAX_SEQ_LEN)
        encodings['labels'] =  torch.tensor(labels)
        return encodings
    
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
       
        item = {'text': self.texts[idx],
                'label': self.labels[idx]}
        return item

In [3]:
dpm_pp = DPMProprocessed('.', 'task4_test.tsv')


df_type = 'BACKTRANS' # Can be UNBALANCED, BACKTRANS, OVERSAMPLING
assert df_type in ['UNBALANCED', 'BACKTRANS', 'OVERSAMPLING']

if df_type == 'UNBALANCED':
    train_df_path = 'traindf.pickle'
    val_df_path = 'valdf.pickle'
if df_type == 'BACKTRANS': 
    train_df_path = 'traindf_backtrans.pickle'
    val_df_path = 'valdf_backtrans.pickle'


if not os.path.isfile(train_df_path) or not os.path.isfile(val_df_path):
  train_df, val_df = dpm_pp.get_unbalanced_split(0.1)
  train_df.to_pickle('traindf.pickle')
  val_df.to_pickle('valdf.pickle')
else:
  train_df = pd.read_pickle(train_df_path)
  val_df = pd.read_pickle(val_df_path)

print("Training set length: ",len(train_df))
print("Validation set length: ",len(val_df))

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}
      par_id      art_id     keyword country  \
0          1  @@24942188    hopeless      ph   
1          2  @@21968160     migrant      gh   
2          3  @@16584954   immigrant      ie   
3          4   @@7811231    disabled      nz   
4          5   @@1494111     refugee      ca   
...      ...         ...         ...     ...   
10464  10465  @@14297363       women      lk   
10465  10466  @@70091353  vulnerable      ph   
10466  10467  @@20282330     in-need      ng   
10467  10468  @@16753236    hopeless      in   
10468  10469  @@16779383    homeless      ie   

                                                    text  label orig_label  \
0      We are living in times of absolute insanity , ...      0          0   
1      In Libya today , there are countless number of...      0         

In [4]:
def predict_pcl(input, tokenizer, model, threshold=0.5):
    model.eval()
    encodings = tokenizer(input, return_tensors='pt', padding=True, truncation=True, max_length=256)
    encodings = encodings.to(device)
    output = model(**encodings)
    logits = output.logits
    prob = nn.functional.softmax(logits)[: , 1].cpu()
    preds = np.array([int(prob>threshold)])
    return {'prediction': preds, 'confidence': prob}


def evaluate(model, tokenizer, data_loader, threshold=0.5):
    preds = []
    tot_labels = []
    confidences = []
    with torch.no_grad():
        for data in (data_loader):
            labels = {}
            labels['label'] = data['label']

            tweets = data['text']

            pred = predict_pcl(tweets, tokenizer, model, threshold)

            preds.append(np.array(pred['prediction']))
            tot_labels.append(np.array(labels['label'].cpu()))
            confidences.append(np.array(pred['confidence'].cpu()))

    # with the saved predictions and labels we can compute accuracy, precision, recall and f1-score

    return preds, tot_labels, confidences

# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [5]:
# Path of the best deberta
deberta_model_name = "microsoft/deberta-v2-xlarge"
deberta_model_path = f'/hy-tmp/models/pcl_{deberta_model_name}_finetuned/model/'
deberta_tokenizer_path = f'/hy-tmp/models/pcl_{deberta_model_name}_finetuned/tokenizer/'
# Path of the roberta
roberta_model_name = "roberta-base"
roberta_model_path = f'./models/pcl_{roberta_model_name}_finetuned/model/'
roberta_tokenizer_path = f'./models/pcl_{roberta_model_name}_finetuned/tokenizer/'
# Path for bert
bert_model_name = "bert-base-uncased"
bert_model_path = f'./models/pcl_{bert_model_name}_finetuned/model/'
bert_tokenizer_path = f'./models/pcl_{bert_model_name}_finetuned/tokenizer/'

In [6]:
# Read the tokenizer and the model seperately
config = AutoConfig.from_pretrained(deberta_model_name)
deberta_tokenizer = AutoTokenizer.from_pretrained(deberta_tokenizer_path)
deberta_model = AutoModelForSequenceClassification.from_pretrained(deberta_model_path , config = config).to(device)

In [7]:
# Read the tokenizer and the model seperately
config = AutoConfig.from_pretrained(bert_model_name)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_tokenizer_path)
bert_model = AutoModelForSequenceClassification.from_pretrained(bert_model_path , config = config).to(device)

In [8]:
config = AutoConfig.from_pretrained(roberta_model_name)
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_tokenizer_path)
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta_model_path , config = config).to(device)

In [9]:
# Predict use deberta
train_dataset = PCLDataset(deberta_tokenizer, train_df)
eval_dataset = PCLDataset(deberta_tokenizer, val_df)
dpm_pp.load_test()
test_df = dpm_pp.test_set_df
test_df['label'] = 0
test_dataset = PCLDataset(deberta_tokenizer, test_df)
test_loader = DataLoader(test_dataset)
preds, tot_labels, confidences = evaluate(deberta_model, deberta_tokenizer, test_loader, 0.65)
tot_labels = np.array(tot_labels)
preds = np.array(preds)
deberta_preds = preds.reshape(-1)


  prob = nn.functional.softmax(logits)[: , 1].cpu()


In [22]:
# Predict use roberta
train_dataset = PCLDataset(roberta_tokenizer, train_df)
eval_dataset = PCLDataset(roberta_tokenizer, val_df)
dpm_pp.load_test()
test_df = dpm_pp.test_set_df
test_df['label'] = 0
test_dataset = PCLDataset(roberta_tokenizer, test_df)
test_loader = DataLoader(test_dataset)
preds, tot_labels, confidences = evaluate(roberta_model, roberta_tokenizer, test_loader, 0.65)
tot_labels = np.array(tot_labels)
preds = np.array(preds)
roberta_preds = preds.reshape(-1)

  prob = nn.functional.softmax(logits)[: , 1].cpu()


In [23]:
# Predict use bert
train_dataset = PCLDataset(bert_tokenizer, train_df)
eval_dataset = PCLDataset(bert_tokenizer, val_df)
dpm_pp.load_test()
test_df = dpm_pp.test_set_df
test_df['label'] = 0
test_dataset = PCLDataset(bert_tokenizer, test_df)
test_loader = DataLoader(test_dataset)
preds, tot_labels, confidences = evaluate(bert_model, bert_tokenizer, test_loader, 0.65)
tot_labels = np.array(tot_labels)
preds = np.array(preds)
bert_preds = preds.reshape(-1)

  prob = nn.functional.softmax(logits)[: , 1].cpu()


In [24]:
Counter(roberta_preds)

Counter({0: 3593, 1: 239})

In [25]:
Counter(deberta_preds)

Counter({0: 3517, 1: 315})

In [26]:
Counter(bert_preds)

Counter({0: 3649, 1: 183})

In [33]:
final_pred = np.ones(len(deberta_preds),dtype=int)
for i in range(len(deberta_preds)):
    if deberta_preds[i]==0 and bert_preds[i]==0:
        final_pred[i] =0
Counter(final_pred)

Counter({0: 3482, 1: 350})

In [36]:
labels2file([[k] for k in final_pred], 'task1.txt')
!zip submission.zip task1.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
updating: task1.txt (deflated 95%)
