## Set Global Seed and Hyperparameters

In [1]:
import os
import random
import numpy as np
import torch
import transformers

def set_all_seeds(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU plus every CUDA device), then configures cuDNN so that
    repeated runs select the same kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs rather than only the current device; this call is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmark mode on, cuDNN may autotune to different algorithms
    # from run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    
# Fixed seed so the run can be repeated; to explore a fresh seed, draw one
# with random.randint instead of using this constant.
seed = 559967
set_all_seeds(seed)

print(f"The global seed {seed}")

The global seed 559967


In [2]:
# MODEL ------------------------------------------------------------------------

# Language code; selects the English ('en') or Spanish ('es') dictionaries.
_LANGUAGE_ = 'en'
# Checkpoint identifier used for the tokenizer, the config and the model.
_PRETRAINED_LM_ = 'ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli'
# Forwarded to the dataset loader as `preprocess_text`.
_PREPROCESS_TEXT_ = True
# Forwarded to the dataset loader as `tweet_batch_size`.
_TWEET_BATCH_SIZE_ = 2
# Adapter configuration; not referenced by the cells visible in this notebook.
_ADAPTER_CONFIG_ = transformers.ParallelConfig(reduction_factor=256)
# Forwarded to the dataset loader as `max_seq_len`.
_MAX_SEQ_LEN_ = 150

# TRAIN ------------------------------------------------------------------------

# Root directory for Trainer outputs; a per-task sub-directory is appended.
_OUTPUT_DIR_ = 'checkPoints_FT'
_LOGGING_STEPS_ = 2
_NUM_AUTHORS_ = [8]
_K_FOLD_CV_ = 5
# Number of GPUs the Trainer is told to use.
_NO_GPUS_ = 1
# Per-device batch size: a total batch of 32 split across the GPUs.
_BATCH_SIZE_ = 32 // _NO_GPUS_
# Number of training epochs, keyed by task name.
_EPOCHS_ = dict(gender=30, variety=30)
_LEARNING_RATE_ = 1e-5

# PREDICTIONS ------------------------------------------------------------------
_DATASET_ = 'PAN17'
_PRED_DIR_ = 'gender_FT'

## Base

In [3]:
# LABEL DICTIONARIES -----------------------------------------------------------------------

# Class name -> integer label id; ids follow the order in which names are listed.
gender_dict = {name: idx for idx, name in enumerate(('female', 'male'))}
varietyEN_dict = {
    name: idx
    for idx, name in enumerate(
        ('australia', 'canada', 'great britain', 'ireland', 'new zealand', 'united states')
    )
}
varietyES_dict = {
    name: idx
    for idx, name in enumerate(
        ('argentina', 'chile', 'colombia', 'mexico', 'peru', 'spain', 'venezuela')
    )
}

# Gender label id -> sentence passed to the loader as `label_hip`
# (presumably the NLI hypothesis text for that class).
genderEN_hip = dict(enumerate(('I’m a female', 'I’m a male')))
genderES_hip = dict(enumerate(('Mi nombre es María', 'Mi nombre es José')))

In [4]:
# SET LANGUAGE DICTIONARY
# Bind the language-neutral names `gender_hip` and `variety_dict` to the
# dictionaries of the configured language.

if _LANGUAGE_ == 'en':
    gender_hip   = genderEN_hip
    variety_dict = varietyEN_dict

elif _LANGUAGE_ == 'es':
    gender_hip   = genderES_hip
    variety_dict = varietyES_dict

else:
    # Fail at the point of misconfiguration instead of with a NameError in a
    # later cell, where `gender_hip` / `variety_dict` would be undefined.
    raise ValueError(
        "Unsupported _LANGUAGE_ " + repr(_LANGUAGE_) + "; expected 'en' or 'es'"
    )

In [5]:
# SET LANGUAGE TOKENIZER
from transformers import AutoTokenizer, PretrainedConfig

# Tokenizer that belongs to the pretrained checkpoint, plus its vocabulary.
tokenizer = AutoTokenizer.from_pretrained(_PRETRAINED_LM_)
vocab = tokenizer.get_vocab()

# Let AutoConfig resolve the checkpoint's own config class. Loading a BART
# checkpoint through the bare PretrainedConfig base class emits the
# "You are using a model of type bart to instantiate a model of type ."
# warning, which the library says can yield errors for some configurations.
config             = transformers.AutoConfig.from_pretrained(_PRETRAINED_LM_)
# Label-name -> id mapping stored in the checkpoint config.
nli_label2id       = config.label2id
# Used later to decide which extra model outputs to drop at prediction time.
is_encoder_decoder = config.is_encoder_decoder

You are using a model of type bart to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


## Dataset

In [6]:
from tools.DataLoaders import BasePAN17nli, DatasetPAN17nli, DatasetCrossValnli

In [7]:
# Build the PAN17 test split for the gender task. The recorded output shows the
# loader reading, preprocessing, tokenizing and merging the data.
baseTest = BasePAN17nli(
    Dir='data/2017',
    split='test',
    language=_LANGUAGE_,
    tokenizer=tokenizer,
    gender_dict=gender_dict,
    variety_dict=variety_dict,
    tweet_batch_size=_TWEET_BATCH_SIZE_,
    max_seq_len=_MAX_SEQ_LEN_,
    preprocess_text=_PREPROCESS_TEXT_,
    # Task label and the per-class sentences paired with it.
    label='gender',
    label_hip=gender_hip,
    nli_label2id=nli_label2id,
)


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done
Merging data...
    Done

Total Instances: 240000



In [8]:
# Wrap the loaded test split in the dataset object that is handed to the
# Trainer and to trainer.predict() further down.
Test  = DatasetPAN17nli(Base_Dataset = baseTest)

## Compute predictions

In [9]:
# Task list; the first entry selects the epoch count and the output
# sub-directory used when building the TrainingArguments.
tasks = ['gender']

In [10]:
from transformers import AutoModelForSequenceClassification, AutoAdapterModel
from transformers import TrainingArguments, Trainer, AdapterTrainer, EarlyStoppingCallback
from tools.Testing import compute_accuracy, compute_author_predictions
from sklearn.metrics import classification_report, f1_score

# Sequence-classification model loaded straight from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(_PRETRAINED_LM_)

training_args = TrainingArguments(
    output_dir=f"{_OUTPUT_DIR_}/{tasks[0]}",
    overwrite_output_dir=True,
    save_total_limit=10,
    learning_rate=_LEARNING_RATE_,
    num_train_epochs=_EPOCHS_[tasks[0]],
    per_device_train_batch_size=_BATCH_SIZE_,
    per_device_eval_batch_size=200,
    # Keep every column the dataset yields instead of letting the Trainer drop
    # those that do not match the model's forward() signature.
    remove_unused_columns=False,
)

# NOTE(review): this cell only calls predict(), yet the test set is also
# registered as train_dataset; an accidental trainer.train() would fit on
# test data. Confirm whether train_dataset is needed here at all.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=Test,
    eval_dataset=Test,
)

# Overrides a private attribute of the arguments object with the configured
# GPU count. NOTE(review): presumably to restrict multi-GPU use; confirm.
trainer.args._n_gpu = _NO_GPUS_

# For encoder-decoder checkpoints, exclude the encoder hidden state from the
# gathered outputs (presumably so predictions hold only the logits).
ignore_keys = ['encoder_last_hidden_state'] if is_encoder_decoder else None

results = trainer.predict(Test, ignore_keys=ignore_keys)

***** Running Prediction *****
  Num examples = 240000
  Batch size = 200


## Test metrics

In [11]:
from tools.Testing import compute_author_predictions_nli

# Turn the raw model predictions into author-level results; the returned
# mapping is indexed below with 'true', 'pred_soft' and 'pred_hard'.
# NOTE(review): the literal 2 matches both len(gender_dict) and
# _TWEET_BATCH_SIZE_; confirm which parameter it feeds and pass the named
# constant instead. Likewise 'gender' duplicates tasks[0].
author_predictions = compute_author_predictions_nli(baseTest, results.predictions, 'gender', 2, nli_label2id)

acc: 0.7491666666666666: 100%|██████████████| 2400/2400 [03:08<00:00, 12.73it/s]


In [12]:
# One classification report per aggregation mode, scored against the same
# ground-truth labels.
report = {
    mode: classification_report(
        author_predictions['true'],
        author_predictions['pred_' + mode],
        digits=4,
    )
    for mode in ('soft', 'hard')
}

# NOTE(review): the header names "Maria, Jose" although _LANGUAGE_ = 'en'
# selects the "I’m a female" / "I’m a male" sentences; confirm the label.
print("Results Maria, Jose\n\n")
print(report['soft'])
print(report['hard'])

Results Maria, Jose


              precision    recall  f1-score   support

           0     0.7001    0.8717    0.7765      1200
           1     0.8300    0.6267    0.7142      1200

    accuracy                         0.7492      2400
   macro avg     0.7651    0.7492    0.7453      2400
weighted avg     0.7651    0.7492    0.7453      2400

              precision    recall  f1-score   support

           0     0.6520    0.8742    0.7469      1200
           1     0.8091    0.5333    0.6429      1200

    accuracy                         0.7037      2400
   macro avg     0.7305    0.7037    0.6949      2400
weighted avg     0.7305    0.7037    0.6949      2400

