# Paquetes

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import transformers
import os

# Parámetros

In [3]:
# MODEL

_LANGUAGE_         = 'es'                                   # selects the variety label dictionary and the output folder suffix
_PRETRAINED_LM_    = 'pysentimiento/robertuito-base-cased'  # checkpoint loaded for both the tokenizer and the model
_PREPROCESS_TEXT_  = True
_TWEET_BATCH_SIZE_ = 5                                      # forwarded to BasePAN17 as tweet_batch_size
_ADAPTER_CONFIG_   = transformers.IA3Config() #transformers.ParallelConfig(mh_adapter = True, reduction_factor = 64)
_MAX_SEQ_LEN_      = 128
# The suffix is derived from _LANGUAGE_ so that switching language does not
# write checkpoints into the folder of another language.
# NOTE(review): the folder name still says "parallel" although the active
# adapter config is IA3 -- confirm whether it should be renamed.
_OUTPUT_DIR_       = 'parallel_adapter_checkPoints_' + _LANGUAGE_
_LOGGING_STEPS_    = 2
_NUM_AUTHORS_      = [1, 3, 6, 12, 15, 30, 60, 90]          # training-set sizes (authors per label) swept by the training loop


# TRAIN

_NO_GPUS_          = 1
# 100 samples divided evenly across the GPUs; floor division keeps the
# arithmetic in integers instead of truncating a float.
_BATCH_SIZE_       = 100 // _NO_GPUS_
_EPOCHS_           = {'gender': 8, 'variety': 10}           # number of epochs per task name
_LEARNING_RATE_    = 1e-4

# Dataset

In [4]:
# LABEL DICTIONARIES -----------------------------------------------------------------------
# Every dictionary maps a class name to its integer id; ids follow the listing order.

gender_dict = dict(female = 0, male = 1)

varietyEN_dict = {
    country: idx
    for idx, country in enumerate((
        'australia',
        'canada',
        'great britain',
        'ireland',
        'new zealand',
        'united states',
    ))
}

varietyES_dict = {
    country: idx
    for idx, country in enumerate((
        'argentina',
        'chile',
        'colombia',
        'mexico',
        'peru',
        'spain',
        'venezuela',
    ))
}

In [5]:
# SET LANGUAGE DICTIONARY
# Pick the variety label mapping that matches _LANGUAGE_.

if _LANGUAGE_ == 'en':
    variety_dict = varietyEN_dict

elif _LANGUAGE_ == 'es':
    variety_dict = varietyES_dict

else:
    # Without this branch an unsupported language leaves variety_dict undefined
    # and the notebook only fails later with a NameError far from the cause.
    raise ValueError("Unsupported _LANGUAGE_ " + repr(_LANGUAGE_) + "; expected 'en' or 'es'")

In [6]:
# SET LANGUAGE TOKENIZER

from transformers import AutoTokenizer


# Tokenizer that ships with the checkpoint configured in _PRETRAINED_LM_.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path = _PRETRAINED_LM_,
)

# Vocabulary as returned by the tokenizer.
vocab = tokenizer.get_vocab()

In [7]:
# CREATE THE TEST SPLIT INSTANCE
# The training split is rebuilt inside the training loop for every value of
# _NUM_AUTHORS_; the test split is built once here.

from DatasetPAN17 import BasePAN17, DatasetPAN17


baseTest = BasePAN17(
    # where the data comes from
    Dir              = '../data',
    split            = 'test',
    language         = _LANGUAGE_,
    # NOTE(review): 200 is hard-coded while the training sizes live in
    # _NUM_AUTHORS_ -- consider a config constant.
    num_authors      = 200,
    # text processing
    tokenizer        = tokenizer,
    preprocess_text  = _PREPROCESS_TEXT_,
    max_seq_len      = _MAX_SEQ_LEN_,
    tweet_batch_size = _TWEET_BATCH_SIZE_,
    # label encodings
    gender_dict      = gender_dict,
    variety_dict     = variety_dict,
)


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 56000



In [8]:
# TASKS TO TRAIN AND EVALUATE
# Each name is used by the training loop as adapter name, head name and key
# into num_labels_dict. No data loader is created in this cell.

tasks = ['gender']

# Model

In [9]:
from transformers import AutoAdapterModel


# Base language model; adapters and classification heads are attached to it
# (and removed again) by the training loop.
model = AutoAdapterModel.from_pretrained(
    pretrained_model_name_or_path = _PRETRAINED_LM_,
)

Some weights of the model checkpoint at pysentimiento/robertuito-base-cased were not used when initializing RobertaAdapterModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAdapterModel were not initialized from the model checkpoint at pysentimiento/robertuito-base-cased and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able

# Training and testing Adapters

In [10]:
from Training import train_model_with_adapters
from TestingPAN17 import test_model_with_adapters

# Number of output classes of each task's classification head.
num_v           = len(baseTest.variety_dict)
num_labels_dict = {"gender": 2, "variety": num_v,}

# Maps the number of training authors per label to the measured test accuracy.
# NOTE(review): keyed by `num` only, so with more than one entry in `tasks` the
# later task overwrites the earlier one; the same holds for the
# "num_authors<N>" save folder below.
accuracy = {}

for num in _NUM_AUTHORS_:
    # SHOW CURRENT PORTION
    print("Working with " + str(num) + " authors per label ... ")

    # GENERATES DATASET WITH CURRENT PORTION ----------------------
    baseTrain  = BasePAN17(Dir         = '../data',
                      split            = 'train',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = _TWEET_BATCH_SIZE_,
                      max_seq_len      = _MAX_SEQ_LEN_,
                      preprocess_text  = _PREPROCESS_TEXT_,
                      num_authors      = num)

    for task_name in tasks:

        # Only the task trained in this iteration goes into the dictionary.
        # Accumulating entries across tasks would hand the trainer datasets
        # whose adapter and head were already deleted in a previous iteration.
        dataset_dict = {
            task_name: DatasetPAN17(Base_Dataset = baseTrain, label = task_name)
        }

        try:
            # ADD ADAPTER AND CLASSIFICATION HEAD----------------------
            model.add_adapter(
                adapter_name = task_name,
                config       = _ADAPTER_CONFIG_
            )

            model.add_classification_head(
                head_name    = task_name,
                num_labels   = num_labels_dict[task_name],
            )

            # TRAIN ADAPTER--------------------------------------------
            # NOTE(review): the captured logs show every run writing its
            # checkpoints to the same folder under _OUTPUT_DIR_, so later runs
            # overwrite earlier checkpoints -- confirm this is intended.
            train_model_with_adapters(model     = model,
                                  dataset_dict  = dataset_dict,
                                  epochs        = _EPOCHS_,
                                  batch_size    = _BATCH_SIZE_,
                                  no_gpus       = _NO_GPUS_,
                                  output_dir    = _OUTPUT_DIR_,
                                  logging_steps = _LOGGING_STEPS_,
                                  learning_rate = _LEARNING_RATE_)

            # SAVE ACCURACY--------------------------------------------
            accuracy[num] = test_model_with_adapters(model, baseTest, task_name)
            print("accuracy with " + str(num) + " authors per label: " + str(accuracy[num]))

            # SAVE ADAPTER---------------------------------------------
            model.save_adapter("num_authors" + str(num), task_name)

        finally:
            # DELETE ADAPTER AND HEAD----------------------------------
            # Runs even when training or testing raises or is interrupted, so
            # the model is left clean and the cell can be re-run without
            # add_adapter failing on an already existing name.
            model.delete_adapter(task_name)
            model.delete_head(task_name)

Working with 1 authors per label ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 280



***** Running training *****
  Num examples = 280
  Num Epochs = 8
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 100
  Gradient Accumulation steps = 1
  Total optimization steps = 24


Step,Training Loss
2,0.7048
4,0.628
6,0.5798
8,0.5156
10,0.5091
12,0.4939
14,0.4516
16,0.4261
18,0.4248
20,0.4004




Training completed. Do not forget to share your model on huggingface.co/models =)


acc: 0.635: 100%|███████████████████████████████████████████████████████████████████| 2800/2800 [11:55<00:00,  3.92it/s]
Configuration saved in num_authors1/adapter_config.json
Module weights saved in num_authors1/pytorch_adapter.bin
Configuration saved in num_authors1/head_config.json
Module weights saved in num_authors1/pytorch_model_head.bin


accuracy with 1 authors per label: 0.635
Working with 3 authors per label ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...


Adding adapter 'gender'.
Adding head 'gender' with config {'head_type': 'classification', 'num_labels': 2, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'use_pooler': False, 'bias': True}.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 840
  Num Epochs = 8
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 100
  Gradient Accumulation steps = 1
  Total optimization steps = 72


    Done

Total Instances: 840



Step,Training Loss
2,0.7188
4,0.6654
6,0.6433
8,0.5911
10,0.5684
12,0.5184
14,0.5295
16,0.5311
18,0.5284
20,0.5119




Training completed. Do not forget to share your model on huggingface.co/models =)


acc: 0.67: 100%|████████████████████████████████████████████████████████████████████| 2800/2800 [12:02<00:00,  3.87it/s]
Configuration saved in num_authors3/adapter_config.json
Module weights saved in num_authors3/pytorch_adapter.bin
Configuration saved in num_authors3/head_config.json
Module weights saved in num_authors3/pytorch_model_head.bin


accuracy with 3 authors per label: 0.67
Working with 6 authors per label ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...


Adding adapter 'gender'.
Adding head 'gender' with config {'head_type': 'classification', 'num_labels': 2, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'use_pooler': False, 'bias': True}.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 1680
  Num Epochs = 8
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 100
  Gradient Accumulation steps = 1
  Total optimization steps = 136


    Done

Total Instances: 1680



Step,Training Loss
2,0.6828
4,0.6644
6,0.6521
8,0.653
10,0.5899
12,0.5451
14,0.6214
16,0.6103
18,0.5816
20,0.5395




Training completed. Do not forget to share your model on huggingface.co/models =)


acc: 0.7164285714285714: 100%|██████████████████████████████████████████████████████| 2800/2800 [11:57<00:00,  3.90it/s]
Configuration saved in num_authors6/adapter_config.json
Module weights saved in num_authors6/pytorch_adapter.bin
Configuration saved in num_authors6/head_config.json
Module weights saved in num_authors6/pytorch_model_head.bin


accuracy with 6 authors per label: 0.7164285714285714
Working with 12 authors per label ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...


Adding adapter 'gender'.
Adding head 'gender' with config {'head_type': 'classification', 'num_labels': 2, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'use_pooler': False, 'bias': True}.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 3360
  Num Epochs = 8
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 100
  Gradient Accumulation steps = 1
  Total optimization steps = 272


    Done

Total Instances: 3360



Step,Training Loss
2,0.7067
4,0.6611
6,0.6609
8,0.6259
10,0.5931
12,0.5943
14,0.5828
16,0.6011
18,0.5501
20,0.562




Training completed. Do not forget to share your model on huggingface.co/models =)


acc: 0.7328571428571429: 100%|██████████████████████████████████████████████████████| 2800/2800 [11:57<00:00,  3.90it/s]
Configuration saved in num_authors12/adapter_config.json
Module weights saved in num_authors12/pytorch_adapter.bin
Configuration saved in num_authors12/head_config.json
Module weights saved in num_authors12/pytorch_model_head.bin


accuracy with 12 authors per label: 0.7328571428571429
Working with 15 authors per label ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...


Adding adapter 'gender'.
Adding head 'gender' with config {'head_type': 'classification', 'num_labels': 2, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'use_pooler': False, 'bias': True}.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 4200
  Num Epochs = 8
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 100
  Gradient Accumulation steps = 1
  Total optimization steps = 336


    Done

Total Instances: 4200



Step,Training Loss
2,0.6891
4,0.6913
6,0.6371
8,0.604
10,0.6204
12,0.6229
14,0.5672
16,0.599
18,0.5506
20,0.5834




Training completed. Do not forget to share your model on huggingface.co/models =)


acc: 0.735: 100%|███████████████████████████████████████████████████████████████████| 2800/2800 [12:01<00:00,  3.88it/s]
Configuration saved in num_authors15/adapter_config.json
Module weights saved in num_authors15/pytorch_adapter.bin
Configuration saved in num_authors15/head_config.json
Module weights saved in num_authors15/pytorch_model_head.bin


accuracy with 15 authors per label: 0.735
Working with 30 authors per label ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...


Adding adapter 'gender'.
Adding head 'gender' with config {'head_type': 'classification', 'num_labels': 2, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'use_pooler': False, 'bias': True}.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 8400
  Num Epochs = 8
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 100
  Gradient Accumulation steps = 1
  Total optimization steps = 672


    Done

Total Instances: 8400



Step,Training Loss
2,0.691
4,0.6665
6,0.6629
8,0.6272
10,0.6458
12,0.6354
14,0.6441
16,0.5783
18,0.5949
20,0.6005


Saving model checkpoint to parallel_adapter_checkPoints_es/gender/checkpoint-500
Configuration saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/adapter_config.json
Module weights saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_adapter.bin
Configuration saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/head_config.json
Module weights saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_model_head.bin
Configuration saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/head_config.json
Module weights saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_model_head.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


acc: 0.7460714285714286: 100%|██████████████████████████████████████████████████████| 2800/2800 [12:09<00:00,  3.84it/s]
Configuration saved in num_authors30/adapter_config.json
Module weights saved in num_

accuracy with 30 authors per label: 0.7460714285714286
Working with 60 authors per label ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...


Adding adapter 'gender'.
Adding head 'gender' with config {'head_type': 'classification', 'num_labels': 2, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'use_pooler': False, 'bias': True}.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 16800
  Num Epochs = 8
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 100
  Gradient Accumulation steps = 1
  Total optimization steps = 1344


    Done

Total Instances: 16800



Step,Training Loss
2,0.6909
4,0.6862
6,0.6842
8,0.6699
10,0.6376
12,0.6346
14,0.6263
16,0.6256
18,0.637
20,0.6742


Saving model checkpoint to parallel_adapter_checkPoints_es/gender/checkpoint-500
Configuration saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/adapter_config.json
Module weights saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_adapter.bin
Configuration saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/head_config.json
Module weights saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_model_head.bin
Configuration saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/head_config.json
Module weights saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_model_head.bin
Saving model checkpoint to parallel_adapter_checkPoints_es/gender/checkpoint-1000
Configuration saved in parallel_adapter_checkPoints_es/gender/checkpoint-1000/gender/adapter_config.json
Module weights saved in parallel_adapter_checkPoints_es/gender/checkpoint-1000/gender/pytorch_adapter.bin

accuracy with 60 authors per label: 0.7678571428571429
Working with 90 authors per label ... 

Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 25200



Adding adapter 'gender'.
Adding head 'gender' with config {'head_type': 'classification', 'num_labels': 2, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'use_pooler': False, 'bias': True}.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 25200
  Num Epochs = 8
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 100
  Gradient Accumulation steps = 1
  Total optimization steps = 2016


Step,Training Loss
2,0.7152
4,0.6957
6,0.6699
8,0.6734
10,0.6706
12,0.6719
14,0.6414
16,0.639
18,0.6695
20,0.6628


Saving model checkpoint to parallel_adapter_checkPoints_es/gender/checkpoint-500
Configuration saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/adapter_config.json
Module weights saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_adapter.bin
Configuration saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/head_config.json
Module weights saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_model_head.bin
Configuration saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/head_config.json
Module weights saved in parallel_adapter_checkPoints_es/gender/checkpoint-500/gender/pytorch_model_head.bin
Saving model checkpoint to parallel_adapter_checkPoints_es/gender/checkpoint-1000
Configuration saved in parallel_adapter_checkPoints_es/gender/checkpoint-1000/gender/adapter_config.json
Module weights saved in parallel_adapter_checkPoints_es/gender/checkpoint-1000/gender/pytorch_adapter.bin

accuracy with 90 authors per label: 0.7767857142857143


In [11]:
# Test accuracy recorded by the training loop, keyed by the number of training authors per label.
accuracy

{1: 0.635,
 3: 0.67,
 6: 0.7164285714285714,
 12: 0.7328571428571429,
 15: 0.735,
 30: 0.7460714285714286,
 60: 0.7678571428571429,
 90: 0.7767857142857143}