# Paquetes

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import transformers
import os

# Parámetros

In [2]:
# MODEL / DATA SETTINGS

# Language code; selects the variety label dictionary and is passed to BasePAN17.
_LANGUAGE_ = 'es'
# Checkpoint identifier used for both the tokenizer and the model.
_PRETRAINED_LM_ = 'pysentimiento/robertuito-base-cased'
_PREPROCESS_TEXT_ = True
_TWEET_BATCH_SIZE_ = 5
# NOTE(review): not read by any of the visible cells.
_ADAPTER_CONFIG_ = None
_MAX_SEQ_LEN_ = 128
_OUTPUT_DIR_ = 'FT_checkPoints_es'
_LOGGING_STEPS_ = 50
# Training-set sizes swept by the training loop (passed as `num_authors`).
_NUM_AUTHORS_ = [15, 30, 60, 90]


# TRAINING SETTINGS

_NO_GPUS_ = 2
# 100 divided by the GPU count, truncated to an integer
# (presumably a per-GPU batch size -- confirm against train_models).
_BATCH_SIZE_ = 100 // _NO_GPUS_
_EPOCHS_ = {'gender': 8, 'variety': 10}
_LEARNING_RATE_ = 1e-5

# Dataset

In [3]:
# LABEL DICTIONARIES -----------------------------------------------------------------------
# Each dictionary maps a class name to its integer label id.

gender_dict = dict(female=0, male=1)

# Variety labels selected when _LANGUAGE_ is 'en'; ids follow the listing order.
varietyEN_dict = {
    name: label_id
    for label_id, name in enumerate((
        'australia',
        'canada',
        'great britain',
        'ireland',
        'new zealand',
        'united states',
    ))
}

# Variety labels selected when _LANGUAGE_ is 'es'; ids follow the listing order.
varietyES_dict = {
    name: label_id
    for label_id, name in enumerate((
        'argentina',
        'chile',
        'colombia',
        'mexico',
        'peru',
        'spain',
        'venezuela',
    ))
}

In [4]:
# SET LANGUAGE DICTIONARY
# Bind `variety_dict` to the variety label mapping of the configured language.

if _LANGUAGE_ == 'en':
    variety_dict = varietyEN_dict

elif _LANGUAGE_ == 'es':
    variety_dict = varietyES_dict

else:
    # Without this branch an unsupported language code would leave
    # `variety_dict` undefined and only fail later with a NameError.
    raise ValueError(
        f"Unsupported _LANGUAGE_ {_LANGUAGE_!r}; expected 'en' or 'es'."
    )

In [5]:
# SET LANGUAGE TOKENIZER

from transformers import AutoTokenizer


# Load the tokenizer for the checkpoint named in _PRETRAINED_LM_.
tokenizer = AutoTokenizer.from_pretrained(_PRETRAINED_LM_)
    
# Keep the tokenizer's vocabulary (presumably a token -> id mapping; confirm
# against the tokenizer API).
# NOTE(review): `vocab` is not read by any of the visible cells.
vocab = tokenizer.get_vocab()

In [6]:
# CREATE THE TEST SPLIT (TRAINING SPLITS ARE BUILT PER RUN INSIDE THE TRAINING LOOP)

from DatasetPAN17 import BasePAN17, DatasetPAN17


# Evaluation split, built once and reused to score every trained model.
# NOTE(review): 200 is a hard-coded author count (presumably per label, like
# the values in _NUM_AUTHORS_) -- confirm against BasePAN17.
baseTest = BasePAN17(
    Dir='../data',
    split='test',
    language=_LANGUAGE_,
    tokenizer=tokenizer,
    gender_dict=gender_dict,
    variety_dict=variety_dict,
    tweet_batch_size=_TWEET_BATCH_SIZE_,
    max_seq_len=_MAX_SEQ_LEN_,
    preprocess_text=_PREPROCESS_TEXT_,
    num_authors=200,
)


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 56000



In [7]:
# TASKS TO TRAIN AND EVALUATE
# Each name listed here gets its own dataset, model and classification head in
# the training loop; it is used as the dataset label, as the head name, and as
# a key into `num_labels_dict`.

tasks = ['gender']

# Training and testing complete model

In [8]:
from transformers import AutoAdapterModel
from Training import train_models
from TestingPAN17 import test_models

# Number of output classes for each task's classification head.
num_v = len(baseTest.variety_dict)
num_labels_dict = {"gender": 2, "variety": num_v}

# Value returned by test_models for each training-set size.
accuracy = {}

for num in _NUM_AUTHORS_:
    # Report which training-set size this run uses.
    print(f"Working with {num} authors per label ... ")

    # Training split built with the current `num_authors` value.
    baseTrain = BasePAN17(
        Dir='../data',
        split='train',
        language=_LANGUAGE_,
        tokenizer=tokenizer,
        gender_dict=gender_dict,
        variety_dict=variety_dict,
        tweet_batch_size=_TWEET_BATCH_SIZE_,
        max_seq_len=_MAX_SEQ_LEN_,
        preprocess_text=_PREPROCESS_TEXT_,
        num_authors=num,
    )

    # Start every run with empty per-task containers.
    dataset_dict, models = {}, {}

    for task_name in tasks:

        # Task-specific view of the training split.
        dataset_dict[task_name] = DatasetPAN17(Base_Dataset=baseTrain, label=task_name)

        # Reload the pretrained checkpoint and attach a classification head
        # named after the task.
        models[task_name] = AutoAdapterModel.from_pretrained(_PRETRAINED_LM_)
        models[task_name].add_classification_head(
            head_name=task_name,
            num_labels=num_labels_dict[task_name],
        )

        # Train. No adapter is added in this cell; which parameters get
        # updated is decided inside train_models.
        # NOTE(review): this call sits inside the task loop and receives the
        # whole `models` / `dataset_dict`; with more than one entry in `tasks`
        # the earlier models are handed to train_models again -- confirm that
        # this is intended.
        train_models(
            models=models,
            dataset_dict=dataset_dict,
            epochs=_EPOCHS_,
            batch_size=_BATCH_SIZE_,
            no_gpus=_NO_GPUS_,
            output_dir=_OUTPUT_DIR_,
            logging_steps=_LOGGING_STEPS_,
            learning_rate=_LEARNING_RATE_,
        )

        # Evaluate on the test split and record the result.
        # NOTE(review): the key is `num` only, so a second task would
        # overwrite the first task's value.
        accuracy[num] = test_models(models, baseTest, task_name)
        print(f"accuracy with {num} authors per label: {accuracy[num]!s}")

Working with 15 authors per label ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 4200



Some weights of the model checkpoint at pysentimiento/robertuito-base-cased were not used when initializing RobertaAdapterModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAdapterModel were not initialized from the model checkpoint at pysentimiento/robertuito-base-cased and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able

Step,Training Loss
50,0.6289
100,0.4958
150,0.4312
200,0.3892
250,0.3486
300,0.3261




Training completed. Do not forget to share your model on huggingface.co/models =)


100%|███████████████████████████████████████████████████████████████████████████████| 2800/2800 [10:46<00:00,  4.33it/s]


accuracy with 15 authors per label: 0.74
Working with 30 authors per label ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 8400



loading configuration file https://huggingface.co/pysentimiento/robertuito-base-cased/resolve/main/config.json from cache at /001/usuarios/isaac.bribiesca/.cache/huggingface/transformers/3f85c0ee804baf604258892a88dd52cdf051d2418a511dcab7cab99a85a3a1b3.4cce50d5a926bf18fe43f2ea8d4596b505e97a64e6e700e993def66b06f1c83b
Model config RobertaConfig {
  "_name_or_path": "pysentimiento/robertuito-base-cased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21

Step,Training Loss
50,0.6221
100,0.5441
150,0.5188
200,0.4798
250,0.4484
300,0.4165
350,0.395
400,0.368
450,0.3583
500,0.3314


Saving model checkpoint to FT_checkPoints_es/gender/checkpoint-500
Configuration saved in FT_checkPoints_es/gender/checkpoint-500/config.json
Model weights saved in FT_checkPoints_es/gender/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|███████████████████████████████████████████████████████████████████████████████| 2800/2800 [10:30<00:00,  4.44it/s]


accuracy with 30 authors per label: 0.7828571428571428
Working with 60 authors per label ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 16800



loading configuration file https://huggingface.co/pysentimiento/robertuito-base-cased/resolve/main/config.json from cache at /001/usuarios/isaac.bribiesca/.cache/huggingface/transformers/3f85c0ee804baf604258892a88dd52cdf051d2418a511dcab7cab99a85a3a1b3.4cce50d5a926bf18fe43f2ea8d4596b505e97a64e6e700e993def66b06f1c83b
Model config RobertaConfig {
  "_name_or_path": "pysentimiento/robertuito-base-cased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21

Step,Training Loss
50,0.643
100,0.5784
150,0.5487
200,0.5205
250,0.5194
300,0.4885
350,0.468
400,0.4472
450,0.4333
500,0.4258


Saving model checkpoint to FT_checkPoints_es/gender/checkpoint-500
Configuration saved in FT_checkPoints_es/gender/checkpoint-500/config.json
Model weights saved in FT_checkPoints_es/gender/checkpoint-500/pytorch_model.bin
Saving model checkpoint to FT_checkPoints_es/gender/checkpoint-1000
Configuration saved in FT_checkPoints_es/gender/checkpoint-1000/config.json
Model weights saved in FT_checkPoints_es/gender/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|███████████████████████████████████████████████████████████████████████████████| 2800/2800 [10:43<00:00,  4.35it/s]


accuracy with 60 authors per label: 0.8207142857142857
Working with 90 authors per label ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 25200



loading configuration file https://huggingface.co/pysentimiento/robertuito-base-cased/resolve/main/config.json from cache at /001/usuarios/isaac.bribiesca/.cache/huggingface/transformers/3f85c0ee804baf604258892a88dd52cdf051d2418a511dcab7cab99a85a3a1b3.4cce50d5a926bf18fe43f2ea8d4596b505e97a64e6e700e993def66b06f1c83b
Model config RobertaConfig {
  "_name_or_path": "pysentimiento/robertuito-base-cased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21

Step,Training Loss
50,0.6473
100,0.5858
150,0.5629
200,0.5542
250,0.5357
300,0.5066
350,0.4957
400,0.4943
450,0.4715
500,0.4837


Saving model checkpoint to FT_checkPoints_es/gender/checkpoint-500
Configuration saved in FT_checkPoints_es/gender/checkpoint-500/config.json
Model weights saved in FT_checkPoints_es/gender/checkpoint-500/pytorch_model.bin
Saving model checkpoint to FT_checkPoints_es/gender/checkpoint-1000
Configuration saved in FT_checkPoints_es/gender/checkpoint-1000/config.json
Model weights saved in FT_checkPoints_es/gender/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to FT_checkPoints_es/gender/checkpoint-1500
Configuration saved in FT_checkPoints_es/gender/checkpoint-1500/config.json
Model weights saved in FT_checkPoints_es/gender/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to FT_checkPoints_es/gender/checkpoint-2000
Configuration saved in FT_checkPoints_es/gender/checkpoint-2000/config.json
Model weights saved in FT_checkPoints_es/gender/checkpoint-2000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|████████

accuracy with 90 authors per label: 0.8310714285714286





In [9]:
# Display the accuracy recorded for each training-set size (authors per label).
accuracy

{15: 0.74,
 30: 0.7828571428571428,
 60: 0.8207142857142857,
 90: 0.8310714285714286}