## Set Global Seed 

In [1]:
import os
import random
import numpy as np
import torch
import transformers

def set_all_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed = 260615
set_all_seeds(seed)

print("The global seed " + str(seed))

The global seed 260615


## Hyperparameters

In [2]:
# LANGUAGE

_LANGUAGE_         = 'en'
_DATASET_          = '2017'

In [3]:
# MODEL CLASSIFICATION

_PRETRAINED_LM_    = 'ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli'
_PREPROCESS_TEXT_  = True
_TWEET_BATCH_SIZE_ = 5
_ADAPTER_CONFIG_   = transformers.HoulsbyConfig()
_MAX_SEQ_LEN_      = 150

In [4]:
# TRAIN

_OUTPUT_DIR_       = 'checkPointsHoulsbyNLIall'
_LOGGING_STEPS_    = 50
_NUM_AUTHORS_      = 64
_K_FOLD_CV_        = 5
_NO_GPUS_          = 1
_BATCH_SIZE_       = int(8 / _NO_GPUS_)
_EPOCHS_           = 10
_LEARNING_RATE_    = 1e-5

# PREDICTIONS

_PRED_DIR_         = 'Houlsby'

## Other parameters

In [5]:
# LABEL DICTONARIES -----------------------------------------------------------------------

# 2015

age_dict  = {'18-24': 0, '25-34': 1, '35-49': 2, '50-XX': 3}
ageEN_hyp = {0: '18-24', 1: '25-34', 2: '35-49', 3: '50-XX'}
ageES_hyp = {0: 'La edad de esta persona es entre 18 y 24 años', 
             1: 'La edad de esta persona es entre 25 y 34 años', 
             2: 'La edad de esta persona es entre 35 y 49 años', 
             3: 'La edad de esta persona es más de 50 años'}

# 2017

gender_dict    = {'female': 0, 'male':   1}
varietyEN_dict = {'australia': 0, 'canada': 1, 'great britain': 2, 'ireland': 3, 'new zealand': 4, 'united states': 5}
varietyES_dict = {'argentina': 0, 'chile': 1, 'colombia': 2, 'mexico': 3, 'peru': 4, 'spain': 5, 'venezuela': 6}  

genderEN_hyp  = {0: 'I’m a female', 1: 'I’m a male'}
genderES_hyp  = {0: 'Mi nombre es María', 1: 'Mi nombre es José'}

# 2019

bots_dict  = {'human': 0, 'bot': 1}
botsEN_hyp = {0: 'This is a text from a person', 1: 'This is a text from a machine'}
botsES_hyp = {0: 'Humano', 1: 'Bot'}

# 2020 

fakeNews_dict  = {'0': 0, '1': 1}
fakeNewsEN_hyp = {0: 'This author is a normal user', 1: 'This author spreads fake news'}
fakeNewsES_hyp = {0: 'Este autor es un usuario normal', 1: 'Este autor publica noticias falsas'}

# 2021

hateSpeech_dict  = {'0': 0, '1': 1}
hateSpeechEN_hyp = {0: 'This text does not contain hate speech', 1: 'This text expresses prejudice and hate speech'}
hateSpeechES_hyp = {0: 'Este texto es moderado, respetuoso, cortés y civilizado', 1: 'Este texto expresa odio o prejuicios'}

In [6]:
# SET LANGUAGE DICTIONARIES --------------------------------------------------

if _LANGUAGE_ == 'en':
    age_hyp        = ageEN_hyp
    gender_hyp     = genderEN_hyp
    variety_dict   = varietyEN_dict
    fakeNews_hyp   = fakeNewsEN_hyp
    hateSpeech_hyp = hateSpeechEN_hyp
    bots_hyp       = botsEN_hyp 

elif _LANGUAGE_ == 'es':
    age_hyp        = ageES_hyp
    gender_hyp     = genderES_hyp
    variety_dict   = varietyES_dict
    fakeNews_hyp   = fakeNewsES_hyp
    hateSpeech_hyp = hateSpeechES_hyp
    bots_hyp       = botsES_hyp
    
    
# SET LANGUAGE AND DATASET PARAMETERS ----------------------------------------
    
if   _DATASET_ == '2015':
    label_idx  = 2
    class_dict = age_dict
    label_name = 'age'
    label_hyp  = age_hyp
    
elif _DATASET_ == '2017':
    label_idx  = 1
    class_dict = gender_dict
    label_name = 'gender'
    label_hyp  = gender_hyp
    
elif _DATASET_ == '2019':
    label_idx  = 1
    class_dict = bots_dict
    label_name = 'bots'
    label_hyp  = bots_hyp
    
elif _DATASET_ == '2020':
    label_idx  = 1
    class_dict = fakeNews_dict
    label_name = 'fakeNews'
    label_hyp  = fakeNews_hyp
    
elif _DATASET_ == '2021':
    label_idx  = 1
    class_dict = hateSpeech_dict
    label_name = 'hateSpeech'
    label_hyp  = hateSpeech_hyp
    

In [7]:
from transformers import AutoTokenizer, PretrainedConfig

tokenizer = AutoTokenizer.from_pretrained(_PRETRAINED_LM_)
vocab = tokenizer.get_vocab()

config             = PretrainedConfig.from_pretrained(_PRETRAINED_LM_)
nli_label2id       = config.label2id
is_encoder_decoder = config.is_encoder_decoder

You are using a model of type bart to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


## Datasets

In [8]:
# GET AUTHORS AND LABELS -----------------------------------------------------

from tools.DataLoaders import BasePAN

baseTrain  = BasePAN(Dir        = 'data/' + _DATASET_,
                     split      = 'train',
                     language   = _LANGUAGE_,
                     label_idx  = label_idx,
                     class_dict = class_dict,
                     label_name = label_name)

baseTest   = BasePAN(Dir        = 'data/' + _DATASET_,
                     split      = 'test',
                     language   = _LANGUAGE_,
                     label_idx  = label_idx,
                     class_dict = class_dict,
                     label_name = label_name)

In [9]:
# GET K-FOLD SPLITS -----------------------------------------------------

crossVal_splits = baseTrain.cross_val(_K_FOLD_CV_, _NUM_AUTHORS_)

crossVal_splits[0][0]

['5fb799c39e1c8a92f6c580ff516bdc',
 'c3fc612f9fe498210613a7991a1c7f2a',
 '7a2b6e351032615aa3c19d8c252bd552',
 '5849edbc23ee81353911f951e0903ea',
 '42dc691f860354271b77171b152967cf',
 '7ffdaa0d7d70f8c485e3f645aa9ba7d9',
 '49a6b94266d28dd3e241905d2300e504',
 'ad6faf0d1624a7015f0eab10a6516fc2',
 'c93fd1e3c9e5ba437a544db7b84e375e',
 '5fc0d0a6f90891a3cb4b32d7169e6676',
 '60705218ce29389bb7181837d701556e',
 '56b4ff0baba162a5c53323fe3dcd0d64',
 'a9430efa05e5abc56310ff9a5173cd07',
 '87318fe9b132b934dd298c3f7f3fbcf6',
 'd052413e23540da3e613f466e53a1317',
 '1342f1fee84567cda4741b41a770b4e0',
 '1de5e53b813163d9c5105cd15ccaa842',
 '4dbd49cb6fe542993bfd0b968be669bd',
 'e9cd1a184a4c34f480748df1bd3d2bf6',
 '505da57fd58bb00c6a164789c9658c58',
 'b123667f9d6697eb2828dc00920099fc',
 'b89bdf194952b1e65de3ba5d8cf35305',
 '47a9df75b78c187edebc2a740b118dc8',
 'ad1b99bd41ef75f70cd2e8406e254015',
 '31e8a4b69e210ed705c0901d41818ad',
 'f860e1654982c95af3c7da43c3d0ef94',
 '7fd945221f3a712a1223810e2b1ddc10',
 'b2e

In [10]:
# GET TWEETS -----------------------------------------------------

baseTrain.get_all_data(_TWEET_BATCH_SIZE_, tokenizer, _MAX_SEQ_LEN_, _PREPROCESS_TEXT_, NLI=True, label_hyp=label_hyp, nli_label2id=nli_label2id)


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done
Merging data...
    Done

Total Instances: 144000



## Training with all

In [11]:
from tools.DataLoaders import DatasetPANnli

baseTest.get_all_data(_TWEET_BATCH_SIZE_, tokenizer, _MAX_SEQ_LEN_, _PREPROCESS_TEXT_, NLI=True, label_hyp=label_hyp, nli_label2id=nli_label2id)

Test  = DatasetPANnli(baseTest)
Train = DatasetPANnli(baseTrain)


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done
Merging data...
    Done

Total Instances: 96000



In [12]:
from transformers import TrainingArguments

samples = 4 * _NUM_AUTHORS_ * int(100 / _TWEET_BATCH_SIZE_)
_LOGGING_STEPS_ = int(samples / _BATCH_SIZE_)

training_args = TrainingArguments(
    learning_rate               = _LEARNING_RATE_,
    num_train_epochs            = _EPOCHS_,
    per_device_train_batch_size = _BATCH_SIZE_,
    per_device_eval_batch_size  = 200,
    logging_steps               = _LOGGING_STEPS_,
    output_dir                  = _OUTPUT_DIR_,
    save_total_limit            = 5,
    overwrite_output_dir        = True,
    remove_unused_columns       = False,
)

In [None]:
from transformers import AutoAdapterModel, AutoModelForSequenceClassification
from tools.DataLoaders import DatasetCrossValnli
from transformers import AdapterTrainer
from tools.Testing import compute_author_predictions_nli, compute_author_predictions_nli_LR
from sklearn.metrics import f1_score, classification_report
import pickle


# train

task = label_name

# initialize model ---------------------------------------------------


#static_head_model = AutoModelForSequenceClassification.from_pretrained(_PRETRAINED_LM_)
#static_head_model.add_adapter(task, config = _ADAPTER_CONFIG_) 
#static_head_model.save_adapter('aux_adapter', task)

model = AutoAdapterModel.from_pretrained(_PRETRAINED_LM_)
#model.load_adapter('aux_adapter', load_as=task)
model.load_adapter('pretrained_adapter_NLI', load_as=task)

model.set_active_adapters(task)
model.train_adapter(task)


# create trainer and train -------------------------------------------

trainer = AdapterTrainer(
    model           = model,
    args            = training_args,
    train_dataset   = Train,
)
trainer.args._n_gpu = _NO_GPUS_

trainer.train()


# get predictions ----------------------------------------------------


ignore_keys = None
if is_encoder_decoder:
    ignore_keys = ['encoder_last_hidden_state']

results            = trainer.predict(Test , ignore_keys = ignore_keys)
author_predictions = compute_author_predictions_nli(baseTest, results.predictions, task, len(class_dict), nli_label2id)

# report metrics 

report = {'soft': classification_report(author_predictions['true'], author_predictions['pred_soft'], digits=4), 
           'hard': classification_report(author_predictions['true'], author_predictions['pred_hard'], digits=4)}

print("Results:\n")
print("soft voting:\n", report['soft'], '\n')
print("hard voting:\n", report['hard'])


# get predictions with Logistic Regression----------------------------


resultsTrain = trainer.predict(Train, ignore_keys = ignore_keys)
author_predictions_LR = compute_author_predictions_nli_LR(baseTrain, baseTest, resultsTrain.predictions, results.predictions, task, len(class_dict))

# report metrics 

report_LR = {'soft': classification_report(author_predictions_LR['true'], author_predictions_LR['pred_soft'], digits=4), 
           'hard': classification_report(author_predictions_LR['true'], author_predictions_LR['pred_hard'], digits=4)}

print("Results using LOGISTIC REGRESSION:\n")
print("soft voting:\n", report_LR['soft'], '\n')
print("hard voting:\n", report_LR['hard'])



Some weights of the model checkpoint at ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing BartAdapterModel: ['classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
- This IS expected if you are initializing BartAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
***** Running training *****
  Num examples = 144000
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps =

Step,Training Loss
640,0.5114
1280,0.5113
1920,0.5176
2560,0.5118
3200,0.5181
3840,0.5137
4480,0.506
5120,0.5017
5760,0.5156
6400,0.5095


Saving model checkpoint to checkPointsHoulsbyNLIall/checkpoint-500
Configuration saved in checkPointsHoulsbyNLIall/checkpoint-500/gender/adapter_config.json
Module weights saved in checkPointsHoulsbyNLIall/checkpoint-500/gender/pytorch_adapter.bin
Configuration saved in checkPointsHoulsbyNLIall/checkpoint-500/gender/head_config.json
Module weights saved in checkPointsHoulsbyNLIall/checkpoint-500/gender/pytorch_model_head.bin
Configuration saved in checkPointsHoulsbyNLIall/checkpoint-500/gender/head_config.json
Module weights saved in checkPointsHoulsbyNLIall/checkpoint-500/gender/pytorch_model_head.bin
Saving model checkpoint to checkPointsHoulsbyNLIall/checkpoint-1000
Configuration saved in checkPointsHoulsbyNLIall/checkpoint-1000/gender/adapter_config.json
Module weights saved in checkPointsHoulsbyNLIall/checkpoint-1000/gender/pytorch_adapter.bin
Configuration saved in checkPointsHoulsbyNLIall/checkpoint-1000/gender/head_config.json
Module weights saved in checkPointsHoulsbyNLIall/ch

Module weights saved in checkPointsHoulsbyNLIall/checkpoint-6500/gender/pytorch_adapter.bin
Configuration saved in checkPointsHoulsbyNLIall/checkpoint-6500/gender/head_config.json
Module weights saved in checkPointsHoulsbyNLIall/checkpoint-6500/gender/pytorch_model_head.bin
Configuration saved in checkPointsHoulsbyNLIall/checkpoint-6500/gender/head_config.json
Module weights saved in checkPointsHoulsbyNLIall/checkpoint-6500/gender/pytorch_model_head.bin
Deleting older checkpoint [checkPointsHoulsbyNLIall/checkpoint-4000] due to args.save_total_limit
Saving model checkpoint to checkPointsHoulsbyNLIall/checkpoint-7000
Configuration saved in checkPointsHoulsbyNLIall/checkpoint-7000/gender/adapter_config.json
Module weights saved in checkPointsHoulsbyNLIall/checkpoint-7000/gender/pytorch_adapter.bin
Configuration saved in checkPointsHoulsbyNLIall/checkpoint-7000/gender/head_config.json
Module weights saved in checkPointsHoulsbyNLIall/checkpoint-7000/gender/pytorch_model_head.bin
Configurat

Module weights saved in checkPointsHoulsbyNLIall/checkpoint-12000/gender/pytorch_model_head.bin
Deleting older checkpoint [checkPointsHoulsbyNLIall/checkpoint-9500] due to args.save_total_limit
Saving model checkpoint to checkPointsHoulsbyNLIall/checkpoint-12500
Configuration saved in checkPointsHoulsbyNLIall/checkpoint-12500/gender/adapter_config.json
Module weights saved in checkPointsHoulsbyNLIall/checkpoint-12500/gender/pytorch_adapter.bin
Configuration saved in checkPointsHoulsbyNLIall/checkpoint-12500/gender/head_config.json
Module weights saved in checkPointsHoulsbyNLIall/checkpoint-12500/gender/pytorch_model_head.bin
Configuration saved in checkPointsHoulsbyNLIall/checkpoint-12500/gender/head_config.json
Module weights saved in checkPointsHoulsbyNLIall/checkpoint-12500/gender/pytorch_model_head.bin
Deleting older checkpoint [checkPointsHoulsbyNLIall/checkpoint-10000] due to args.save_total_limit
Saving model checkpoint to checkPointsHoulsbyNLIall/checkpoint-13000
Configuration 

In [17]:

print("Results:\n")
print("soft voting:\n", report['soft'], '\n')
print("hard voting:\n", report['hard'])

Results:

soft voting:
               precision    recall  f1-score   support

           0     0.7814    0.8908    0.8326      1200
           1     0.8731    0.7508    0.8073      1200

    accuracy                         0.8208      2400
   macro avg     0.8272    0.8208    0.8200      2400
weighted avg     0.8272    0.8208    0.8200      2400
 

hard voting:
               precision    recall  f1-score   support

           0     0.7691    0.8992    0.8290      1200
           1     0.8786    0.7300    0.7975      1200

    accuracy                         0.8146      2400
   macro avg     0.8239    0.8146    0.8132      2400
weighted avg     0.8239    0.8146    0.8132      2400



In [18]:

print("Results using LOGISTIC REGRESSION:\n")
print("soft voting:\n", report_LR['soft'], '\n')
print("hard voting:\n", report_LR['hard'])


Results using LOGISTIC REGRESSION:

soft voting:
               precision    recall  f1-score   support

           0     0.8076    0.8500    0.8283      1200
           1     0.8417    0.7975    0.8190      1200

    accuracy                         0.8237      2400
   macro avg     0.8246    0.8237    0.8236      2400
weighted avg     0.8246    0.8237    0.8236      2400
 

hard voting:
               precision    recall  f1-score   support

           0     0.7943    0.8592    0.8255      1200
           1     0.8466    0.7775    0.8106      1200

    accuracy                         0.8183      2400
   macro avg     0.8205    0.8183    0.8180      2400
weighted avg     0.8205    0.8183    0.8180      2400



In [None]:
model.save_adapter('Pretrained_Adapters_NLI/'+_DATASET_+'_'+_PRED_DIR_, task)