## Set Global Seed 

In [1]:
import os
import random
import numpy as np
import torch
import transformers

def set_all_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed = 260615
set_all_seeds(seed)

print("The global seed " + str(seed))

The global seed 260615


## Hyperparameters

In [2]:
# LANGUAGE

_LANGUAGE_         = 'en'

In [3]:
# MODEL CLASSIFICATION

_PRETRAINED_LM_    = 'ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli'
_PREPROCESS_TEXT_  = True
_TWEET_BATCH_SIZE_ = 5
_ADAPTER_CONFIG_   = transformers.ParallelConfig(reduction_factor = 256)
_MAX_SEQ_LEN_      = 150

In [4]:
# TRAIN

_OUTPUT_DIR_       = 'checkPointsNLI'
_LOGGING_STEPS_    = 50
_NUM_AUTHORS_      = 8
_K_FOLD_CV_        = 5
_NO_GPUS_          = 1
_BATCH_SIZE_       = int(8 / _NO_GPUS_)
_EPOCHS_           = 10
_LEARNING_RATE_    = 1e-8

# PREDICTIONS

_DATASET_          = 'PAN17_NLI'
_PRED_DIR_         = 'NLI_5tweet'

## Other parameters

In [5]:
# LABEL DICTONARIES -----------------------------------------------------------------------

gender_dict    = {'female': 0, 'male':   1}
varietyEN_dict = {'australia': 0, 'canada': 1, 'great britain': 2, 'ireland': 3, 'new zealand': 4, 'united states': 5}
varietyES_dict = {'argentina': 0, 'chile': 1, 'colombia': 2, 'mexico': 3, 'peru': 4, 'spain': 5, 'venezuela': 6}  

genderEN_hip  = {0: 'I’m a female', 1: 'I’m a male'}
genderES_hip  = {0: 'Mi nombre es María', 1: 'Mi nombre es José'}

In [6]:
# SET LANGUAGE DICTIONARIES

if _LANGUAGE_ == 'en':
    gender_hip   = genderEN_hip
    variety_dict = varietyEN_dict

elif _LANGUAGE_ == 'es':
    gender_hip   = genderES_hip
    variety_dict = varietyES_dict

In [7]:
# SET LANGUAGE TOKENIZER

from transformers import AutoTokenizer, PretrainedConfig

tokenizer = AutoTokenizer.from_pretrained(_PRETRAINED_LM_)
vocab = tokenizer.get_vocab()

config             = PretrainedConfig.from_pretrained(_PRETRAINED_LM_)
nli_label2id       = config.label2id
is_encoder_decoder = config.is_encoder_decoder

You are using a model of type bart to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


## Datasets

In [8]:
from tools.DataLoaders import BasePAN17

baseTrain  = BasePAN17(Dir             = 'data/2017',
                      split            = 'train',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = 1,
                      max_seq_len      = _MAX_SEQ_LEN_,
                      preprocess_text  = _PREPROCESS_TEXT_)

baseTest  = BasePAN17(Dir              = 'data/2017',
                      split            = 'test',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = 1,
                      max_seq_len      = _MAX_SEQ_LEN_,
                      preprocess_text  = _PREPROCESS_TEXT_)


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done
Merging data...
    Done

Total Instances: 360000


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done
Merging data...
    Done

Total Instances: 240000



In [9]:
crossVal_splits = []

for val_idx in range(_K_FOLD_CV_):
    
    authors_train, authors_val = baseTrain.cross_val(_K_FOLD_CV_, val_idx, _NUM_AUTHORS_)
    
    crossVal_splits.append( (authors_train, authors_val) )

In [10]:
from tools.DataLoaders import BasePAN17nli

baseTrain  = BasePAN17nli(Dir             = 'data/2017',
                      split            = 'train',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = _TWEET_BATCH_SIZE_,
                      max_seq_len      = _MAX_SEQ_LEN_,
                      preprocess_text  = _PREPROCESS_TEXT_,
                      label            = 'gender',
                      label_hip        = gender_hip,
                      nli_label2id     = nli_label2id)

baseTest  = BasePAN17nli(Dir              = 'data/2017',
                      split            = 'test',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = _TWEET_BATCH_SIZE_,
                      max_seq_len      = _MAX_SEQ_LEN_,
                      preprocess_text  = _PREPROCESS_TEXT_,
                      label            = 'gender',
                      label_hip        = gender_hip,
                      nli_label2id     = nli_label2id)


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done
Merging data...
    Done

Total Instances: 144000


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done
Merging data...
    Done

Total Instances: 96000



In [11]:
from tools.DataLoaders import DatasetPAN17

Test = DatasetPAN17(baseTest, 'gender')

In [12]:
crossVal_splits[0][0]

['3d8285a6183b250bf7810f1110ebd408',
 '75369e6c54e6b643c7b5112fe484d048',
 'e15ff8259c2b18778594e47a4bce375a',
 'b2e5086a0e2f263f48ba1bec23dcc32',
 '7c61c34e980e22bda49e63f235a08c50',
 '5a61761418a8db2ccdff2b2aacc3a64e',
 'fa0d4331d8a79340d0720556f04dcc79',
 '8bbaf8237695dffe77a19e05d1bdc10c',
 '66eabf9f244ccf162fda0500d9d6891a',
 'ecc242c3785dcfb89f71cafbc2607ead',
 'ba6f1a42a9f0e593eb9a9ed239bae00',
 'd82ec6fc92fccd520194d1ddd14ea2fe',
 'c0fa91662b3c3a014136e483c5041dda',
 'b51a60c4f5dd990bc9975c8fee9f3f1b',
 '9efac3abcc9f592c074fd8f214e2fcf4',
 '5a2567c48d7d3a4fde1beb6f8fcebe3c']

## Training

In [13]:
from transformers import TrainingArguments

samples = 2 * _NUM_AUTHORS_ * int(100 / _TWEET_BATCH_SIZE_)
_LOGGING_STEPS_ = int(samples / _BATCH_SIZE_)

training_args = TrainingArguments(
    learning_rate               = _LEARNING_RATE_,
    num_train_epochs            = _EPOCHS_,
    per_device_train_batch_size = _BATCH_SIZE_,
    per_device_eval_batch_size  = 200,
    logging_steps               = _LOGGING_STEPS_,
    output_dir                  = _OUTPUT_DIR_,
    save_total_limit            = 10,
    overwrite_output_dir        = True,
    remove_unused_columns       = False,
)

In [14]:
from transformers import AutoModelForSequenceClassification
from tools.DataLoaders import DatasetCrossValnli
from transformers import Trainer
from tools.Testing import compute_author_predictions_nli, compute_author_predictions_nli_LR
from sklearn.metrics import f1_score, classification_report
import pickle


# train

task = 'gender'

f1s_soft = []
f1s_hard = []

f1s_soft_LR = []
f1s_hard_LR = []

for split in range( _K_FOLD_CV_ ):
    
    # loaders for current split ------------------------------------------
    
    authors_train, authors_val = crossVal_splits[split]
    
    Train = DatasetCrossValnli(baseTrain, authors_train)
    Val   = DatasetCrossValnli(baseTrain, authors_val)
    
    
    # initialize model ---------------------------------------------------
    
    model = AutoModelForSequenceClassification.from_pretrained(_PRETRAINED_LM_)
    
    
    # create trainer and train -------------------------------------------
        
    trainer = Trainer(
        model           = model,
        args            = training_args,
        train_dataset   = Train,
    )
    trainer.args._n_gpu = _NO_GPUS_

    trainer.train()
    
    
    # get predictions ----------------------------------------------------
    
    
    ignore_keys = None
    if is_encoder_decoder:
        ignore_keys = ['encoder_last_hidden_state']

    results            = trainer.predict(Val , ignore_keys = ignore_keys)
    author_predictions = compute_author_predictions_nli(Val, results.predictions, 'gender', 2, nli_label2id)

    # report metrics 

    report = {'soft': classification_report(author_predictions['true'], author_predictions['pred_soft'], digits=4), 
               'hard': classification_report(author_predictions['true'], author_predictions['pred_hard'], digits=4)}

    f1s_soft.append( f1_score(author_predictions['true'], author_predictions['pred_soft'], average = 'macro') )
    f1s_hard.append( f1_score(author_predictions['true'], author_predictions['pred_hard'], average = 'macro') )
    
    print("Results with split " + str(split + 1) + ":\n")
    print("soft voting:\n", report['soft'], '\n')
    print("hard voting:\n", report['hard'])


    # get predictions with Logistic Regression----------------------------

    resultsTrain = trainer.predict(Train, ignore_keys = ignore_keys)
    author_predictions_LR = compute_author_predictions_nli_LR(Train, Val, resultsTrain.predictions, results.predictions, 'gender', 2)
    
    f1s_soft_LR.append( f1_score(author_predictions_LR['true'], author_predictions_LR['pred_soft'], average = 'macro') )
    f1s_hard_LR.append( f1_score(author_predictions_LR['true'], author_predictions_LR['pred_hard'], average = 'macro') )
    
    # report metrics 

    report_LR = {'soft': classification_report(author_predictions_LR['true'], author_predictions_LR['pred_soft'], digits=4), 
               'hard': classification_report(author_predictions_LR['true'], author_predictions_LR['pred_hard'], digits=4)}

    print("Results with split " + str(split + 1) + " using LOGISTIC REGRESSION:\n")
    print("soft voting:\n", report_LR['soft'], '\n')
    print("hard voting:\n", report_LR['hard'])

     
    
    # save predictions ----------------------------------------------------
    
    DIR = 'results/' + _DATASET_ + '/' + _LANGUAGE_ + '/' + _PRED_DIR_ + '/' + str(_NUM_AUTHORS_) + '_authors/split_' + str(split + 1) + '/'
    if not os.path.exists(DIR):
        os.makedirs(DIR)

    with open(DIR + 'predictions.pickle', 'wb') as f:
        pickle.dump(author_predictions, f)
    
    with open(DIR + 'predictions_LR.pickle', 'wb') as f:
        pickle.dump(author_predictions_LR, f)

    with open(DIR + 'report.txt', 'w') as f:
        f.write("soft voting:\n" + report['soft'] + '\n\n')
        f.write("hard voting:\n" + report['hard'])
        
    with open(DIR + 'report_LR.txt', 'w') as f:
        f.write("soft voting:\n" + report_LR['soft'] + '\n\n')
        f.write("hard voting:\n" + report_LR['hard'])
    


***** Running training *****
  Num examples = 640
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 800


Step,Training Loss
40,8.2148
80,8.0363
120,8.1151
160,7.6248
200,7.4965
240,7.7492
280,7.6718
320,7.0923
360,6.9118
400,7.3531


Saving model checkpoint to checkPointsNLI/checkpoint-500
Configuration saved in checkPointsNLI/checkpoint-500/config.json
Model weights saved in checkPointsNLI/checkpoint-500/pytorch_model.bin
Deleting older checkpoint [checkPointsNLI/checkpoint-21000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 28800
  Batch size = 200


acc: 0.7458333333333333: 100%|██████████████████████████████████████████████| 720/720 [01:05<00:00, 10.97it/s]
***** Running Prediction *****
  Num examples = 640
  Batch size = 200


Results with split 1:

soft voting:
               precision    recall  f1-score   support

           0     0.6855    0.9083    0.7814       360
           1     0.8642    0.5833    0.6965       360

    accuracy                         0.7458       720
   macro avg     0.7749    0.7458    0.7389       720
weighted avg     0.7749    0.7458    0.7389       720
 

hard voting:
               precision    recall  f1-score   support

           0     0.6680    0.8944    0.7648       360
           1     0.8403    0.5556    0.6689       360

    accuracy                         0.7250       720
   macro avg     0.7542    0.7250    0.7169       720
weighted avg     0.7542    0.7250    0.7169       720



acc: 0.7888888888888889: 100%|██████████████████████████████████████████████| 720/720 [01:04<00:00, 11.17it/s]


Results with split 1 using LOGISTIC REGRESSION:

soft voting:
               precision    recall  f1-score   support

           0     0.7430    0.8833    0.8071       360
           1     0.8562    0.6944    0.7669       360

    accuracy                         0.7889       720
   macro avg     0.7996    0.7889    0.7870       720
weighted avg     0.7996    0.7889    0.7870       720
 

hard voting:
               precision    recall  f1-score   support

           0     0.7822    0.7583    0.7701       360
           1     0.7655    0.7889    0.7770       360

    accuracy                         0.7736       720
   macro avg     0.7739    0.7736    0.7736       720
weighted avg     0.7739    0.7736    0.7736       720



loading configuration file https://huggingface.co/ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli/resolve/main/config.json from cache at /001/usuarios/isaac.bribiesca/.cache/huggingface/transformers/bf704e14bcd921d2d4cfcad78a3add263a85a5d067122102d3add0fb620085c7.88e321f78373dda73f5c421340751fd102e1cf513f3e985ac0ca9a0865c4e94a
Model config BartConfig {
  "_name_or_path": "ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop":

Step,Training Loss
40,8.6474
80,8.6188
120,8.2412
160,8.3542
200,8.292
240,7.9821
280,7.9803
320,7.7545
360,7.7516
400,7.4793


Saving model checkpoint to checkPointsNLI/checkpoint-500
Configuration saved in checkPointsNLI/checkpoint-500/config.json
Model weights saved in checkPointsNLI/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 28800
  Batch size = 200


acc: 0.7277777777777777: 100%|██████████████████████████████████████████████| 720/720 [00:44<00:00, 16.34it/s]
***** Running Prediction *****
  Num examples = 640
  Batch size = 200


Results with split 2:

soft voting:
               precision    recall  f1-score   support

           0     0.6660    0.9139    0.7705       360
           1     0.8628    0.5417    0.6655       360

    accuracy                         0.7278       720
   macro avg     0.7644    0.7278    0.7180       720
weighted avg     0.7644    0.7278    0.7180       720
 

hard voting:
               precision    recall  f1-score   support

           0     0.6515    0.9139    0.7607       360
           1     0.8558    0.5111    0.6400       360

    accuracy                         0.7125       720
   macro avg     0.7536    0.7125    0.7003       720
weighted avg     0.7536    0.7125    0.7003       720



acc: 0.7625: 100%|██████████████████████████████████████████████████████████| 720/720 [01:25<00:00,  8.44it/s]


Results with split 2 using LOGISTIC REGRESSION:

soft voting:
               precision    recall  f1-score   support

           0     0.7926    0.7111    0.7496       360
           1     0.7380    0.8139    0.7741       360

    accuracy                         0.7625       720
   macro avg     0.7653    0.7625    0.7619       720
weighted avg     0.7653    0.7625    0.7619       720
 

hard voting:
               precision    recall  f1-score   support

           0     0.8408    0.5722    0.6810       360
           1     0.6758    0.8917    0.7689       360

    accuracy                         0.7319       720
   macro avg     0.7583    0.7319    0.7249       720
weighted avg     0.7583    0.7319    0.7249       720



loading configuration file https://huggingface.co/ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli/resolve/main/config.json from cache at /001/usuarios/isaac.bribiesca/.cache/huggingface/transformers/bf704e14bcd921d2d4cfcad78a3add263a85a5d067122102d3add0fb620085c7.88e321f78373dda73f5c421340751fd102e1cf513f3e985ac0ca9a0865c4e94a
Model config BartConfig {
  "_name_or_path": "ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop":

Step,Training Loss
40,8.5428
80,8.633
120,8.491
160,8.0966
200,8.072
240,8.0174
280,7.8147
320,7.673
360,7.5225
400,7.5674


Saving model checkpoint to checkPointsNLI/checkpoint-500
Configuration saved in checkPointsNLI/checkpoint-500/config.json
Model weights saved in checkPointsNLI/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 28800
  Batch size = 200


acc: 0.7180555555555556: 100%|██████████████████████████████████████████████| 720/720 [00:39<00:00, 18.42it/s]
***** Running Prediction *****
  Num examples = 640
  Batch size = 200


Results with split 3:

soft voting:
               precision    recall  f1-score   support

           0     0.6632    0.8861    0.7586       360
           1     0.8285    0.5500    0.6611       360

    accuracy                         0.7181       720
   macro avg     0.7458    0.7181    0.7099       720
weighted avg     0.7458    0.7181    0.7099       720
 

hard voting:
               precision    recall  f1-score   support

           0     0.6447    0.8972    0.7503       360
           1     0.8311    0.5056    0.6287       360

    accuracy                         0.7014       720
   macro avg     0.7379    0.7014    0.6895       720
weighted avg     0.7379    0.7014    0.6895       720



acc: 0.7333333333333333: 100%|██████████████████████████████████████████████| 720/720 [01:20<00:00,  8.92it/s]


Results with split 3 using LOGISTIC REGRESSION:

soft voting:
               precision    recall  f1-score   support

           0     0.7485    0.7028    0.7249       360
           1     0.7199    0.7639    0.7412       360

    accuracy                         0.7333       720
   macro avg     0.7342    0.7333    0.7331       720
weighted avg     0.7342    0.7333    0.7331       720
 

hard voting:
               precision    recall  f1-score   support

           0     0.7270    0.6583    0.6910       360
           1     0.6878    0.7528    0.7188       360

    accuracy                         0.7056       720
   macro avg     0.7074    0.7056    0.7049       720
weighted avg     0.7074    0.7056    0.7049       720



loading configuration file https://huggingface.co/ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli/resolve/main/config.json from cache at /001/usuarios/isaac.bribiesca/.cache/huggingface/transformers/bf704e14bcd921d2d4cfcad78a3add263a85a5d067122102d3add0fb620085c7.88e321f78373dda73f5c421340751fd102e1cf513f3e985ac0ca9a0865c4e94a
Model config BartConfig {
  "_name_or_path": "ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop":

Step,Training Loss
40,8.1549
80,7.9575
120,7.7985
160,7.5615
200,7.4103
240,7.3805
280,7.1453
320,7.1617
360,6.8536
400,7.0484


Saving model checkpoint to checkPointsNLI/checkpoint-500
Configuration saved in checkPointsNLI/checkpoint-500/config.json
Model weights saved in checkPointsNLI/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 28800
  Batch size = 200


acc: 0.7277777777777777: 100%|██████████████████████████████████████████████| 720/720 [00:54<00:00, 13.11it/s]
***** Running Prediction *****
  Num examples = 640
  Batch size = 200


Results with split 4:

soft voting:
               precision    recall  f1-score   support

           0     0.6694    0.9000    0.7678       360
           1     0.8475    0.5556    0.6711       360

    accuracy                         0.7278       720
   macro avg     0.7584    0.7278    0.7195       720
weighted avg     0.7584    0.7278    0.7195       720
 

hard voting:
               precision    recall  f1-score   support

           0     0.6519    0.9000    0.7561       360
           1     0.8386    0.5194    0.6415       360

    accuracy                         0.7097       720
   macro avg     0.7452    0.7097    0.6988       720
weighted avg     0.7452    0.7097    0.6988       720



acc: 0.7736111111111111: 100%|██████████████████████████████████████████████| 720/720 [00:34<00:00, 21.12it/s]


Results with split 4 using LOGISTIC REGRESSION:

soft voting:
               precision    recall  f1-score   support

           0     0.7872    0.7500    0.7681       360
           1     0.7613    0.7972    0.7788       360

    accuracy                         0.7736       720
   macro avg     0.7742    0.7736    0.7735       720
weighted avg     0.7742    0.7736    0.7735       720
 

hard voting:
               precision    recall  f1-score   support

           0     0.8195    0.6306    0.7127       360
           1     0.6998    0.8611    0.7721       360

    accuracy                         0.7458       720
   macro avg     0.7596    0.7458    0.7424       720
weighted avg     0.7596    0.7458    0.7424       720



loading configuration file https://huggingface.co/ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli/resolve/main/config.json from cache at /001/usuarios/isaac.bribiesca/.cache/huggingface/transformers/bf704e14bcd921d2d4cfcad78a3add263a85a5d067122102d3add0fb620085c7.88e321f78373dda73f5c421340751fd102e1cf513f3e985ac0ca9a0865c4e94a
Model config BartConfig {
  "_name_or_path": "ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop":

Step,Training Loss
40,8.207
80,8.3429
120,7.9243
160,8.0719
200,7.6641
240,7.7254
280,7.603
320,7.5282
360,7.303
400,7.3964


Saving model checkpoint to checkPointsNLI/checkpoint-500
Configuration saved in checkPointsNLI/checkpoint-500/config.json
Model weights saved in checkPointsNLI/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 28800
  Batch size = 200


acc: 0.7458333333333333: 100%|██████████████████████████████████████████████| 720/720 [01:13<00:00,  9.81it/s]
***** Running Prediction *****
  Num examples = 640
  Batch size = 200


Results with split 5:

soft voting:
               precision    recall  f1-score   support

           0     0.6911    0.8889    0.7776       360
           1     0.8444    0.6028    0.7034       360

    accuracy                         0.7458       720
   macro avg     0.7678    0.7458    0.7405       720
weighted avg     0.7678    0.7458    0.7405       720
 

hard voting:
               precision    recall  f1-score   support

           0     0.6758    0.8917    0.7689       360
           1     0.8408    0.5722    0.6810       360

    accuracy                         0.7319       720
   macro avg     0.7583    0.7319    0.7249       720
weighted avg     0.7583    0.7319    0.7249       720



acc: 0.7666666666666667: 100%|██████████████████████████████████████████████| 720/720 [00:42<00:00, 16.80it/s]


Results with split 5 using LOGISTIC REGRESSION:

soft voting:
               precision    recall  f1-score   support

           0     0.7945    0.7194    0.7551       360
           1     0.7437    0.8139    0.7772       360

    accuracy                         0.7667       720
   macro avg     0.7691    0.7667    0.7661       720
weighted avg     0.7691    0.7667    0.7661       720
 

hard voting:
               precision    recall  f1-score   support

           0     0.8092    0.6361    0.7123       360
           1     0.7002    0.8500    0.7679       360

    accuracy                         0.7431       720
   macro avg     0.7547    0.7431    0.7401       720
weighted avg     0.7547    0.7431    0.7401       720



In [15]:
# report statistics

print('Soft results: ', f1s_soft)
print('\nHard results: ', f1s_hard)

f1s_soft = np.array(f1s_soft)
f1s_hard = np.array(f1s_hard)

FewShot_Results = {'soft': [f1s_soft.mean(), f1s_soft.std()], 'hard': [f1s_hard.mean(), f1s_hard.std()]}

print('\n\nSoft statistics: ')
print('\t[avg, std]:', FewShot_Results['soft'])

print('\nHard statistics: ')
print('\t[avg, std]:', FewShot_Results['hard'])

Soft results:  [0.7389397100518911, 0.7180104067587981, 0.7098612630245812, 0.7194567257228283, 0.7405231679962818]

Hard results:  [0.7168709633854733, 0.7003468208092485, 0.6894802404722788, 0.6988177274828824, 0.7249270054931459]


Soft statistics: 
	[avg, std]: [0.7253582547108761, 0.012193491619569664]

Hard statistics: 
	[avg, std]: [0.7060885515286058, 0.012905937419082796]


In [16]:
# report statistics

print('Soft results: ', f1s_soft_LR)
print('\nHard results: ', f1s_hard_LR)

f1s_soft_LR = np.array(f1s_soft_LR)
f1s_hard_LR = np.array(f1s_hard_LR)

FewShot_Results_LR = {'soft': [f1s_soft_LR.mean(), f1s_soft_LR.std()], 'hard': [f1s_hard_LR.mean(), f1s_hard_LR.std()]}

print('\n\nSoft statistics with LOGISTIC REGRESSION: ')
print('\t[avg, std]:', FewShot_Results_LR['soft'])

print('\nHard statistics with LOGISTIC REGRESSION: ')
print('\t[avg, std]:', FewShot_Results_LR['hard'])

Soft results:  [0.7869888823144717, 0.7618711450570663, 0.733084129472733, 0.773484832400779, 0.7661451848643968]

Hard results:  [0.7735582572321087, 0.7249270054931459, 0.7048974951860244, 0.7424102316470222, 0.7400828534687818]


Soft statistics with LOGISTIC REGRESSION: 
	[avg, std]: [0.7643148348218893, 0.01779258296506357]

Hard statistics with LOGISTIC REGRESSION: 
	[avg, std]: [0.7371751686054167, 0.022589623083733315]


## Testing

In [21]:
from transformers import AutoModelForSequenceClassification
from tools.DataLoaders import DatasetCrossValnli
from transformers import Trainer
from tools.Testing import compute_author_predictions_nli, compute_author_predictions_nli_LR
from sklearn.metrics import f1_score, classification_report
import pickle


# train

task = 'gender'

split = 0
    
# loaders for current split ------------------------------------------

authors_train, authors_val = crossVal_splits[split]

Train = DatasetCrossValnli(baseTrain, authors_train)
Val   = DatasetCrossValnli(baseTrain, authors_val)


# initialize model ---------------------------------------------------

model = AutoModelForSequenceClassification.from_pretrained(_PRETRAINED_LM_)


# create trainer and train -------------------------------------------

trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = Train,
)
trainer.args._n_gpu = _NO_GPUS_

trainer.train()


# get predictions ----------------------------------------------------

ignore_keys = None
if is_encoder_decoder:
    ignore_keys = ['encoder_last_hidden_state']

results            = trainer.predict(Test , ignore_keys = ignore_keys)
author_predictions = compute_author_predictions_nli(baseTest, results.predictions, 'gender', 2, nli_label2id)

# report metrics 

report = {'soft': classification_report(author_predictions['true'], author_predictions['pred_soft'], digits=4), 
           'hard': classification_report(author_predictions['true'], author_predictions['pred_hard'], digits=4)}

print("Results with split " + str(split + 1) + ":\n")
print("soft voting:\n", report['soft'], '\n')
print("hard voting:\n", report['hard'])


# get predictions with Logistic Regression----------------------------

resultsTrain = trainer.predict(Train, ignore_keys = ignore_keys)
author_predictions_LR = compute_author_predictions_nli_LR(Train, baseTest, resultsTrain.predictions, results.predictions, 'gender', 2)

# report metrics 

report_LR = {'soft': classification_report(author_predictions_LR['true'], author_predictions_LR['pred_soft'], digits=4), 
           'hard': classification_report(author_predictions_LR['true'], author_predictions_LR['pred_hard'], digits=4)}

print("Results with split " + str(split + 1) + " using LOGISTIC REGRESSION:\n")
print("soft voting:\n", report_LR['soft'], '\n')
print("hard voting:\n", report_LR['hard'])


loading configuration file https://huggingface.co/ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli/resolve/main/config.json from cache at /001/usuarios/isaac.bribiesca/.cache/huggingface/transformers/bf704e14bcd921d2d4cfcad78a3add263a85a5d067122102d3add0fb620085c7.88e321f78373dda73f5c421340751fd102e1cf513f3e985ac0ca9a0865c4e94a
Model config BartConfig {
  "_name_or_path": "ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop":

Step,Training Loss
1280,5.547
2560,1.8439
3840,1.0672
5120,0.9317
6400,0.8697
7680,0.8061
8960,0.7802
10240,0.7649
11520,0.7447
12800,0.7357


Saving model checkpoint to checkPointsNLI/checkpoint-500
Configuration saved in checkPointsNLI/checkpoint-500/config.json
Model weights saved in checkPointsNLI/checkpoint-500/pytorch_model.bin
Deleting older checkpoint [checkPointsNLI/checkpoint-21000] due to args.save_total_limit
Saving model checkpoint to checkPointsNLI/checkpoint-1000
Configuration saved in checkPointsNLI/checkpoint-1000/config.json
Model weights saved in checkPointsNLI/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [checkPointsNLI/checkpoint-21500] due to args.save_total_limit
Saving model checkpoint to checkPointsNLI/checkpoint-1500
Configuration saved in checkPointsNLI/checkpoint-1500/config.json
Model weights saved in checkPointsNLI/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [checkPointsNLI/checkpoint-22000] due to args.save_total_limit
Saving model checkpoint to checkPointsNLI/checkpoint-2000
Configuration saved in checkPointsNLI/checkpoint-2000/config.json
Model weights saved in c

Saving model checkpoint to checkPointsNLI/checkpoint-15000
Configuration saved in checkPointsNLI/checkpoint-15000/config.json
Model weights saved in checkPointsNLI/checkpoint-15000/pytorch_model.bin
Deleting older checkpoint [checkPointsNLI/checkpoint-10000] due to args.save_total_limit
Saving model checkpoint to checkPointsNLI/checkpoint-15500
Configuration saved in checkPointsNLI/checkpoint-15500/config.json
Model weights saved in checkPointsNLI/checkpoint-15500/pytorch_model.bin
Deleting older checkpoint [checkPointsNLI/checkpoint-10500] due to args.save_total_limit
Saving model checkpoint to checkPointsNLI/checkpoint-16000
Configuration saved in checkPointsNLI/checkpoint-16000/config.json
Model weights saved in checkPointsNLI/checkpoint-16000/pytorch_model.bin
Deleting older checkpoint [checkPointsNLI/checkpoint-11000] due to args.save_total_limit
Saving model checkpoint to checkPointsNLI/checkpoint-16500
Configuration saved in checkPointsNLI/checkpoint-16500/config.json
Model weig

acc: 0.7895833333333333: 100%|████████████████████████████████████████████| 2400/2400 [05:10<00:00,  7.73it/s]
***** Running Prediction *****
  Num examples = 20480
  Batch size = 200


Results with split 1:

soft voting:
               precision    recall  f1-score   support

           0     0.7647    0.8367    0.7990      1200
           1     0.8197    0.7425    0.7792      1200

    accuracy                         0.7896      2400
   macro avg     0.7922    0.7896    0.7891      2400
weighted avg     0.7922    0.7896    0.7891      2400
 

hard voting:
               precision    recall  f1-score   support

           0     0.7598    0.8067    0.7825      1200
           1     0.7940    0.7450    0.7687      1200

    accuracy                         0.7758      2400
   macro avg     0.7769    0.7758    0.7756      2400
weighted avg     0.7769    0.7758    0.7756      2400



acc: 0.7895833333333333: 100%|████████████████████████████████████████████| 2400/2400 [05:18<00:00,  7.54it/s]

Results with split 1 using LOGISTIC REGRESSION:

soft voting:
               precision    recall  f1-score   support

           0     0.7870    0.7942    0.7905      1200
           1     0.7923    0.7850    0.7886      1200

    accuracy                         0.7896      2400
   macro avg     0.7896    0.7896    0.7896      2400
weighted avg     0.7896    0.7896    0.7896      2400
 

hard voting:
               precision    recall  f1-score   support

           0     0.7867    0.7683    0.7774      1200
           1     0.7736    0.7917    0.7825      1200

    accuracy                         0.7800      2400
   macro avg     0.7802    0.7800    0.7800      2400
weighted avg     0.7802    0.7800    0.7800      2400




