# Paquetes

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import transformers
import os

# Parámetros

In [2]:
# MODEL

# Language code; selects the variety label dictionary and is forwarded to BasePAN17.
_LANGUAGE_ = 'es'

# Checkpoint name used to load both the tokenizer and the adapter model.
_PRETRAINED_LM_ = 'pysentimiento/robertuito-base-cased'

# Forwarded to BasePAN17 as preprocess_text / tweet_batch_size / max_seq_len.
_PREPROCESS_TEXT_ = True
_TWEET_BATCH_SIZE_ = 1
_MAX_SEQ_LEN_ = 128

# Adapter configuration handed to model.add_adapter for every task.
_ADAPTER_CONFIG_ = transformers.ParallelConfig(
    mh_adapter=True,
    reduction_factor=32,
)

# Forwarded to train_model_with_adapters as output_dir / logging_steps.
_OUTPUT_DIR_ = 'MAM_adapter_checkPoints_es'
_LOGGING_STEPS_ = 50

# Training-set portions to compare; each one is passed to BasePAN17 as tweets_portion.
_TWEETS_PORTION_ = [0.03, 0.04, 0.05]


# TRAIN

_NO_GPUS_ = 2

# Per-GPU batch size, derived from a total of 100 split across the GPUs.
_BATCH_SIZE_ = int(100 / _NO_GPUS_)

# Number of epochs, keyed by task name.
_EPOCHS_ = {'gender': 8, 'variety': 10}

_LEARNING_RATE_ = 5e-4

# Dataset

In [3]:
# LABEL DICTIONARIES -----------------------------------------------------------------------
# Each dictionary maps a class name to its integer label id.

gender_dict = {'female': 0, 'male': 1}

# English varieties; the id of each name is its position in the list.
varietyEN_dict = {
    name: idx
    for idx, name in enumerate([
        'australia',
        'canada',
        'great britain',
        'ireland',
        'new zealand',
        'united states',
    ])
}

# Spanish varieties; the id of each name is its position in the list.
varietyES_dict = {
    name: idx
    for idx, name in enumerate([
        'argentina',
        'chile',
        'colombia',
        'mexico',
        'peru',
        'spain',
        'venezuela',
    ])
}

In [4]:
# SET LANGUAGE DICTIONARY
# Pick the variety label mapping that matches the configured language.

if _LANGUAGE_ == 'en':
    variety_dict = varietyEN_dict

elif _LANGUAGE_ == 'es':
    variety_dict = varietyES_dict

else:
    # Fail here rather than with a NameError on `variety_dict` in a later cell.
    raise ValueError(
        "Unsupported _LANGUAGE_ {!r}; expected 'en' or 'es'.".format(_LANGUAGE_)
    )

In [5]:
# LOAD THE TOKENIZER OF THE PRETRAINED LANGUAGE MODEL

from transformers import AutoTokenizer

# Tokenizer for the checkpoint named in _PRETRAINED_LM_.
tokenizer = AutoTokenizer.from_pretrained(_PRETRAINED_LM_)

# Vocabulary as exposed by the tokenizer.
vocab = tokenizer.get_vocab()

In [6]:
# BUILD THE TEST SPLIT
# Only the test split is created here; the train split is rebuilt for every
# portion inside the training loop.

from DatasetPAN17 import BasePAN17, DatasetPAN17

# The whole test split is used (tweets_portion = 1.0), whatever training
# portion is being evaluated.
baseTest = BasePAN17(
    Dir='../data',
    split='test',
    language=_LANGUAGE_,
    tokenizer=tokenizer,
    gender_dict=gender_dict,
    variety_dict=variety_dict,
    tweet_batch_size=_TWEET_BATCH_SIZE_,
    max_seq_len=_MAX_SEQ_LEN_,
    preprocess_text=_PREPROCESS_TEXT_,
    tweets_portion=1.0,
)


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 280000



In [7]:
# TASKS TO TRAIN AND EVALUATE
# Each name listed here gets its own adapter and classification head in the
# training loop.

tasks = ["gender"]

# Model

In [8]:
from transformers import AutoAdapterModel

# Load the pretrained checkpoint as an adapter-capable model; task adapters and
# classification heads are attached to it in the training loop.
model = AutoAdapterModel.from_pretrained(_PRETRAINED_LM_)

Some weights of the model checkpoint at pysentimiento/robertuito-base-cased were not used when initializing RobertaAdapterModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAdapterModel were not initialized from the model checkpoint at pysentimiento/robertuito-base-cased and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able

# Training and testing Adapters

In [9]:
from Training import train_model_with_adapters
from TestingPAN17 import test_model_with_adapters

# Number of output classes per task; the variety count is taken from the label
# dictionary held by the test split.
num_v           = len(baseTest.variety_dict)
num_labels_dict = {"gender": 2, "variety": num_v,}

# Test accuracy keyed by training portion.
# NOTE(review): the key is the portion only, so with more than one entry in
# `tasks` each task would overwrite the previous task's score for that portion.
accuracy = {}

for portion in _TWEETS_PORTION_:
    # Whole-percent label used in the log lines and in the saved adapter's
    # directory name. round() is used instead of a bare int() because
    # portion * 100 is a float product that can land just below the intended
    # integer (0.29 * 100 == 28.999999999999996), which int() would truncate
    # to the wrong percentage.
    portion_pct = str(int(round(portion * 100)))

    # SHOW CURRENT PORTION
    print("Working with portion " + portion_pct + "% ... ")

    # GENERATES DATASET WITH CURRENT PORTION ----------------------
    baseTrain  = BasePAN17(Dir         = '../data',
                      split            = 'train',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = _TWEET_BATCH_SIZE_,
                      max_seq_len      = _MAX_SEQ_LEN_,
                      preprocess_text  = _PREPROCESS_TEXT_,
                      tweets_portion   = portion)
    dataset_dict = {}

    for task_name in tasks:

        dataset_dict[task_name] = DatasetPAN17(Base_Dataset = baseTrain, label = task_name)

        # ADD ADAPTER AND CLASSIFICATION HEAD----------------------
        # A fresh adapter and head are created for every portion so that each
        # run starts from the same pretrained weights.
        model.add_adapter(
            adapter_name = task_name,
            config       = _ADAPTER_CONFIG_
        )

        model.add_classification_head(
            head_name    = task_name,
            num_labels   = num_labels_dict[task_name],
        )

        try:
            # TRAIN ADAPTER----------------------------------------
            # NOTE(review): _OUTPUT_DIR_ is the same for every portion, so the
            # checkpoints written during training land in the same directory
            # on each iteration (see the repeated paths in the cell output).
            train_model_with_adapters(model     = model,
                                  dataset_dict  = dataset_dict,
                                  epochs        = _EPOCHS_,
                                  batch_size    = _BATCH_SIZE_,
                                  no_gpus       = _NO_GPUS_,
                                  output_dir    = _OUTPUT_DIR_,
                                  logging_steps = _LOGGING_STEPS_,
                                  learning_rate = _LEARNING_RATE_)

            # SAVE ACCURACY----------------------------------------
            accuracy[portion] = test_model_with_adapters(model, baseTest, task_name)
            print("accuracy with portion " + portion_pct + "% : " + str(accuracy[portion]))

            # SAVE ADAPTER-----------------------------------------
            model.save_adapter("portion" + portion_pct, task_name)

        finally:
            # DELETE ADAPTER AND HEAD------------------------------
            # Always remove them, even when training or evaluation fails, so
            # the shared `model` is left clean and this cell can be re-run
            # without the task name already being registered.
            model.delete_adapter(task_name)
            model.delete_head(task_name)

Working with portion 3% ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 12598



***** Running training *****
  Num examples = 12598
  Num Epochs = 8
  Instantaneous batch size per device = 50
  Total train batch size (w. parallel, distributed & accumulation) = 100
  Gradient Accumulation steps = 1
  Total optimization steps = 1008


Step,Training Loss
50,0.7029
100,0.6613
150,0.6373
200,0.6234
250,0.6256
300,0.5951
350,0.5888
400,0.5553
450,0.5317
500,0.5012


Saving model checkpoint to MAM_adapter_checkPoints_es/gender/checkpoint-500
Configuration saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/adapter_config.json
Module weights saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_adapter.bin
Configuration saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/head_config.json
Module weights saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_model_head.bin
Configuration saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/head_config.json
Module weights saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_model_head.bin
Saving model checkpoint to MAM_adapter_checkPoints_es/gender/checkpoint-1000
Configuration saved in MAM_adapter_checkPoints_es/gender/checkpoint-1000/gender/adapter_config.json
Module weights saved in MAM_adapter_checkPoints_es/gender/checkpoint-1000/gender/pytorch_adapter.bin
Configuration saved in MAM_adapter_checkPoints_es

accuracy with portion 3% : 0.7489285714285714
Working with portion 4% ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 16798



Adding adapter 'gender'.
Adding head 'gender' with config {'head_type': 'classification', 'num_labels': 2, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'use_pooler': False, 'bias': True}.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 16798
  Num Epochs = 8
  Instantaneous batch size per device = 50
  Total train batch size (w. parallel, distributed & accumulation) = 100
  Gradient Accumulation steps = 1
  Total optimization steps = 1344


Step,Training Loss
50,0.7049
100,0.669
150,0.654
200,0.6389
250,0.6335
300,0.6334
350,0.6147
400,0.5908
450,0.5888
500,0.5888


Saving model checkpoint to MAM_adapter_checkPoints_es/gender/checkpoint-500
Configuration saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/adapter_config.json
Module weights saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_adapter.bin
Configuration saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/head_config.json
Module weights saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_model_head.bin
Configuration saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/head_config.json
Module weights saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_model_head.bin
Saving model checkpoint to MAM_adapter_checkPoints_es/gender/checkpoint-1000
Configuration saved in MAM_adapter_checkPoints_es/gender/checkpoint-1000/gender/adapter_config.json
Module weights saved in MAM_adapter_checkPoints_es/gender/checkpoint-1000/gender/pytorch_adapter.bin
Configuration saved in MAM_adapter_checkPoints_es

accuracy with portion 4% : 0.7607142857142857
Working with portion 5% ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 20998



Adding adapter 'gender'.
Adding head 'gender' with config {'head_type': 'classification', 'num_labels': 2, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'use_pooler': False, 'bias': True}.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 20998
  Num Epochs = 8
  Instantaneous batch size per device = 50
  Total train batch size (w. parallel, distributed & accumulation) = 100
  Gradient Accumulation steps = 1
  Total optimization steps = 1680


Step,Training Loss
50,0.7089
100,0.64
150,0.641
200,0.6579
250,0.6274
300,0.6282
350,0.6111
400,0.6208
450,0.6039
500,0.5802


Saving model checkpoint to MAM_adapter_checkPoints_es/gender/checkpoint-500
Configuration saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/adapter_config.json
Module weights saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_adapter.bin
Configuration saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/head_config.json
Module weights saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_model_head.bin
Configuration saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/head_config.json
Module weights saved in MAM_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_model_head.bin
Saving model checkpoint to MAM_adapter_checkPoints_es/gender/checkpoint-1000
Configuration saved in MAM_adapter_checkPoints_es/gender/checkpoint-1000/gender/adapter_config.json
Module weights saved in MAM_adapter_checkPoints_es/gender/checkpoint-1000/gender/pytorch_adapter.bin
Configuration saved in MAM_adapter_checkPoints_es

accuracy with portion 5% : 0.7725


In [10]:
# Show the test accuracy recorded for each training portion.
accuracy

{0.03: 0.7489285714285714, 0.04: 0.7607142857142857, 0.05: 0.7725}