## Paquetes

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import transformers
import os

## Parámetros

In [2]:
# Corpus language; the notebook selects a variety dictionary for 'en' or 'es'.
_LANGUAGE_ = 'es'

# Checkpoint identifier passed to from_pretrained() for both the tokenizer and the model.
_PRETRAINED_LM_ = 'pysentimiento/robertuito-base-cased'

# Options forwarded unchanged to the BasePAN17 dataset constructor.
_MAX_SEQ_LEN_ = 128
_TWEET_BATCH_SIZE_ = 5
_PREPROCESS_TEXT_ = True

## Dataset

In [3]:
# LABEL DICTIONARIES: class name -> integer id (ids follow the listing order) ---------------

gender_dict = {name: idx for idx, name in enumerate(('female', 'male'))}

# English-speaking varieties.
varietyEN_dict = {
    name: idx
    for idx, name in enumerate((
        'australia',
        'canada',
        'great britain',
        'ireland',
        'new zealand',
        'united states',
    ))
}

# Spanish-speaking varieties.
varietyES_dict = {
    name: idx
    for idx, name in enumerate((
        'argentina',
        'chile',
        'colombia',
        'mexico',
        'peru',
        'spain',
        'venezuela',
    ))
}

In [4]:
# SELECT THE VARIETY LABEL DICTIONARY FOR THE CONFIGURED LANGUAGE

if _LANGUAGE_ == 'en':
    variety_dict = varietyEN_dict

elif _LANGUAGE_ == 'es':
    variety_dict = varietyES_dict

else:
    # Fail here with a clear message instead of leaving `variety_dict` undefined,
    # which would only surface later as a NameError when the dataset is built.
    raise ValueError("Unsupported _LANGUAGE_ " + repr(_LANGUAGE_) + "; expected 'en' or 'es'")

In [5]:
# LOAD THE TOKENIZER FOR THE PRETRAINED LANGUAGE MODEL

from transformers import AutoTokenizer


# Tokenizer for the checkpoint named by _PRETRAINED_LM_; it is later passed to BasePAN17.
tokenizer = AutoTokenizer.from_pretrained(_PRETRAINED_LM_)
    
# Result of the tokenizer's get_vocab(); not referenced again in the cells visible here.
vocab = tokenizer.get_vocab()

In [6]:
# CREATE THE DATASET INSTANCE FOR THE TEST SPLIT

from DatasetPAN17 import BasePAN17, DatasetPAN17


# Test-split dataset. The recorded cell output shows that construction reads the data,
# preprocesses the text and tokenizes it.
baseTest = BasePAN17(
    Dir = 'data',
    split = 'test',
    language = _LANGUAGE_,
    tokenizer = tokenizer,
    gender_dict = gender_dict,
    variety_dict = variety_dict,
    tweet_batch_size = _TWEET_BATCH_SIZE_,
    max_seq_len = _MAX_SEQ_LEN_,
    preprocess_text = _PREPROCESS_TEXT_,
)


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 56000



## Modelo con Adapters pre-entrenados

In [11]:
# TASKS TO EVALUATE
# The first entry is used to build the adapter weight paths loaded further down.
# No data loader is created in this cell.

tasks = ['variety']

In [8]:
# NOTE(review): AutoAdapterModel is imported from `transformers`; this presumably needs the
# adapter-enabled distribution of the library to be installed — confirm the environment.
from transformers import AutoAdapterModel

# Base model loaded from the _PRETRAINED_LM_ checkpoint; the trained adapters are attached
# to it afterwards with load_adapter().
model = AutoAdapterModel.from_pretrained(_PRETRAINED_LM_)

Some weights of the model checkpoint at pysentimiento/robertuito-base-cased were not used when initializing RobertaAdapterModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAdapterModel were not initialized from the model checkpoint at pysentimiento/robertuito-base-cased and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able

In [9]:
# Adapter variants whose saved weights are attached to the model.
_ADAPTERS_ = [
    'Pfeiffer',
    'Houlsby',
    'Parallel',
    'PrefixTuning',
    'LoRA',
    'MAM',
    'UniPELT',
]

# Each adapter is read from "./<adapter>_weights_<first task>" and registered under its own name.
for adapter in _ADAPTERS_:
    name = model.load_adapter(
        adapter_name_or_path = f"./{adapter}_weights_{tasks[0]}",
        load_as = adapter,
    )
    print(name)

Pfeiffer
Houlsby
Parallel
PrefixTuning
LoRA
MAM
UniPELT


In [10]:
# Use the first CUDA device when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Move the model (with its loaded adapters) to the selected device.
model = model.to(device)

## Testing

In [12]:
from tqdm import tqdm
import torch
import transformers.adapters.composition as AC  
import numpy as np

# NOTE(review): this module-level counter is not read by test_model_with_adapters, which
# keeps its own local `count`; it looks like a leftover — confirm before removing.
count = 0

def test_model_with_adapters(model, baseTest, adapters):
    
    label  = "variety"
    device = "cuda:0" if torch.cuda.is_available() == True else "cpu"
    
    num_labels_dict  = {'gender': 2, 'variety': len(baseTest.variety_dict)}
    successful_preds = 0
    
    with torch.no_grad():
        
        count = 0
        pbar  = tqdm(baseTest.authors)
        
        for author in pbar:
            # finds all instances of author
            author_idx = [idx for idx in range(len(baseTest.data)) if baseTest.data[idx]['author'] == author]

            # get truth labels with fst instance and initialize scores
            fst      = baseTest.data[author_idx[0]]
            truth    = fst[label]
            scores   = np.zeros( (len(adapters), num_labels_dict[label]) )

            for idx in author_idx:
                # creates case in device
                case = {key: torch.tensor(val[idx]).to(device) for key, val in baseTest.encodings.items()}

                # computes all task predictions in parallel
                preds = []
                
                for adapter in adapters:
                    model.set_active_adapters(adapter)
                    preds.append( model(**case) )
                
                # get prediction and accumulate
                for i in range(len(adapters)):
                    y = torch.nn.functional.softmax(preds[i]['logits'], dim = 1).cpu().numpy()[0]
                    scores[i] += y
            
            votes = [0]*num_labels_dict[label]
            for i in range(len(adapters)):
                adapter_label = np.argmax( scores[i] )
                votes[adapter_label] += 1
            
            # Discreto
            #final_prediction = votes.index( max(votes) )
            
            # Continuo
            final_prediction = np.argmax( scores.sum(axis = 0) )
            
            if final_prediction == truth:
                successful_preds += 1
            
            count += 1
            pbar.set_description("acc: " + str(successful_preds/count))
            
            
    accuracy = successful_preds / len(baseTest.authors)
    
    return accuracy#, case

In [13]:
accuracy = test_model_with_adapters(model, baseTest, _ADAPTERS_)

acc: 0.9525: 100%|████████████████████████████████████████████████████████████████| 2800/2800 [1:34:56<00:00,  2.03s/it]


In [14]:
accuracy

0.9525