# Initial commands

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import transformers
import os

# Parámetros

In [7]:
# MODEL

_LANGUAGE_        = 'es'
_TWEET_BATCH_SIZE_ = 3
_ADAPTER_CONFIG_   = transformers.ParallelConfig()


# TRAIN

_NO_GPUS_          = 5
_BATCH_SIZE_       = 100
_EPOCHS_           = 10

# Dataset

---
Para cada lenguaje se va a utilizar un modelo diferente. En el caso de español, se utilizará Robertuito, el cual tiene su propio tokenizador.

In [8]:
from transformers import AutoTokenizer

if _LANGUAGE_ == 'es':
    tokenizer = AutoTokenizer.from_pretrained('pysentimiento/robertuito-base-cased')
    
vocab = tokenizer.get_vocab()

---
Por cada lenguaje se utilizarán tres modelos diferentes. Un modelo base (Robertuito para español) y un conjunto de adapters por cada etiqueta que se quiera predecir: **gender**, **variety** y **joint**. 

Hay que representar las etiquetas de forma numérica: 

* Se utilizará siempre 0 para female y 1 para male, en el caso de **gender**. 
* Para **variety** depende de cada lenguaje. Los diccionarios asociados a cada lenguaje se crean en la siguiente celda de código. 
* Los valores numéricos de **joint** siempre estarán en función de los de gender y variety: si $ g\in \{ 0, 1\} $ representa la etiqueta numérica de gender y $ v\in \{ 1, ..., m \} $ la de la variety, entonces la de joint estará dada por: $$ j = g*m + v $$

In [9]:
# LABEL DICTONARIES -----------------------------------------------------------------------

gender_dict    = {'female': 0, 
                  'male':   1}

varietyAR_dict = {'egypt'    : 0,
                  'gulf'     : 1,
                  'levantine': 2,
                  'maghrebi' : 3}

varietyEN_dict = {'australia'    : 0,
                  'canada'       : 1,
                  'gran britain' : 2,
                  'ireland'      : 3,
                  'new zeland'   : 4,
                  'united states': 5}

varietyES_dict = {'argentina': 0,
                  'chile'    : 1,
                  'colombia' : 2,
                  'mexico'   : 3,
                  'peru'     : 4,
                  'spain'    : 5,
                  'venezuela': 6}

varietyPT_dict = {'brazil'  : 0,
                  'portugal': 1}

if _LANGUAGE_ == 'es':
    variety_dict = varietyES_dict

---
La siguiente celda contiene el bloque de código de la clase **BasePAN17**. Ésta se utilizará para cargar todos los tweets de un solo lenguaje. 

Ya que se utilizarán tres modelos por cada lenguaje, con diferentes etiquetas, es necesario tener tres DataLoaders. Sin embargo, estos usan los mismos tweets como entrada. La clase **BasePAN17** sirve para tener una sola instancia de los tweets.

In [10]:
# CREATE BASE CLASS -----------------------------------------------------------------------

import xml.etree.ElementTree as ET
from random import shuffle
from pysentimiento.preprocessing import preprocess_tweet


class BasePAN17():
    
    def __init__(self, Dir, split, language, tokenizer, gender_dict, variety_dict, tweet_batch_size):
        self.Dir          = Dir
        self.split        = split
        self.language     = language
        self.tokenizer    = tokenizer
        self.gender_dict  = gender_dict
        self.variety_dict = variety_dict
        self.tw_bsz       = tweet_batch_size
        
        print("\nReading data...")
        
        self.authors   = self.get_authors(Dir, split, language)
        self.author_lb = self.get_author_labels(Dir, split, language)
        self.data      = self.get_tweets_in_batches(Dir, split, language)
        
        shuffle(self.data)
        
        
        print("    Done\nPreprocessing text...")
        
        preprocessed   = [preprocess_tweet(instance['text']) for instance in self.data]
        
        print("    Done\nTokenizing...")
        
        self.encodings = self.tokenizer(preprocessed, max_length = 128, 
                                                      truncation = True, 
                                                      padding    = True,
                                                      return_token_type_ids = False)
         
        print("    Done\n\nTotal Instances: " + str(len(self.data)) + '\n')

        
    def get_authors(self, Dir, split, language):
        path    = os.path.join(Dir, split, language)
        files   = os.listdir(path)
        authors = [ file[0:-4] for file in files ] 
        
        return authors
    
    
    def get_author_labels(self, Dir, split, language):
        lb_file_name = os.path.join(Dir, split, language + '.txt')
        lb_file      = open(lb_file_name, "r")
        author_lb    = dict()

        for line in lb_file:
            author, gender, variety = line.split(':::')
            variety = variety[:-1]                       

            gl = self.gender_dict[gender]
            vl = self.variety_dict[variety]
            jl = gl*len(self.variety_dict) + vl

            author_lb[author] = {'gender': gl, 'variety': vl, 'joint': jl}

        lb_file.close()
        
        return author_lb
     
    def get_tweets_in_batches(self, Dir, split, language):
        data   = []

        for author in self.authors:
            tw_file_name = os.path.join(Dir, split, language, author + '.xml')
            tree         = ET.parse(tw_file_name)
            root         = tree.getroot()
            documents    = root[0]

            for i in range(0, len(documents), self.tw_bsz):
                doc_batch = documents[i : i + self.tw_bsz]
                tweets    = ''

                for document in doc_batch:
                    tweets += document.text + '\n'

                data.append( {'author': author, 'text': tweets, **self.author_lb[author]} )
        
        return data


In [11]:
# CREATE JUST ONE INSTANCE PER LANGUAGE ---------------------------------------------------



baseTrain = BasePAN17(Dir              = 'data',
                      split            = 'train',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = _TWEET_BATCH_SIZE_)

baseTest  = BasePAN17(Dir              = 'data',
                      split            = 'test',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = _TWEET_BATCH_SIZE_)


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 142798


Reading data...
    Done
Preprocessing text...
    Done
Tokenizing...
    Done

Total Instances: 95200



In [12]:
c = 0
sizes_dict = {}
for idx in range(len(baseTrain.data)):
    
    if baseTrain.encodings['input_ids'][idx][-1] != 1:
        c += 1
    
    size = baseTrain.encodings['input_ids'][idx].index(2) + 1
    if size in sizes_dict:
        sizes_dict[size] += 1
    else:
        sizes_dict[size] = 1

print("Total train truncated instances: ", c)

Total train truncated instances:  1464


In [13]:
c = 0
sizes_dict = {}
for idx in range(len(baseTest.data)):
    
    if baseTest.encodings['input_ids'][idx][-1] != 1:
        c += 1
    
    size = baseTest.encodings['input_ids'][idx].index(2) + 1
    if size in sizes_dict:
        sizes_dict[size] += 1
    else:
        sizes_dict[size] = 1

print("Total test truncated instances: ", c)

Total test truncated instances:  938


---
La siguiente clase será el data loader, se usará un dataloader para cada etiqueta (gender, variety y joint) y cada splot (train, test)

In [14]:
from torch.utils.data import Dataset

class DatasetPAN17(Dataset):
    
    def __init__(self, Base_Dataset, label):
        self.Base_Dataset = Base_Dataset
        self.label        = label
        
    def __len__(self):
        
        return len(self.Base_Dataset.data)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.Base_Dataset.encodings.items()}
        item['labels'] = torch.tensor(self.Base_Dataset.data[idx][self.label])
        
        return item

In [15]:
train_gender  = DatasetPAN17(Base_Dataset = baseTrain, label = 'gender')
train_variety = DatasetPAN17(Base_Dataset = baseTrain, label = 'variety')
train_joint   = DatasetPAN17(Base_Dataset = baseTrain, label = 'joint')

# Model

In [16]:
from transformers import RobertaConfig, RobertaModelWithHeads

if _LANGUAGE_ == 'es':
    config = RobertaConfig.from_pretrained(
        "pysentimiento/robertuito-base-cased",
        num_labels=2,
    )
    model = RobertaModelWithHeads.from_pretrained(
        "pysentimiento/robertuito-base-cased",
        config=config,
    )

Some weights of the model checkpoint at pysentimiento/robertuito-base-cased were not used when initializing RobertaModelWithHeads: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at pysentimiento/robertuito-base-cased and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to

# Adapters

In [17]:
# name the task

tasks           = ["gender", "variety", "joint"]
num_v           = len(baseTrain.variety_dict)
num_labels_dict = {"gender": 2, "variety": num_v, "joint": 2*num_v}


# Add adapters

for task_name in tasks:
    
    model.add_adapter(
        adapter_name = task_name, 
        config       = _ADAPTER_CONFIG_
    )
    
    model.add_classification_head(
        head_name    = task_name,
        num_labels   = num_labels_dict[task_name],
      )

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)

RobertaModelWithHeads(
  (shared_parameters): ModuleDict()
  (roberta): RobertaModel(
    (shared_parameters): ModuleDict()
    (invertible_adapters): ModuleDict()
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
              (prefix_tuning): PrefixTuningShim(


# Training

In [19]:
from transformers import TrainingArguments, AdapterTrainer

In [21]:
dataset_dict = {"gender": train_gender, "variety": train_variety, "joint": train_joint}
test_dict    = {"gender": train_gender, "variety": train_variety, "joint": train_joint}

for task_name in tasks:
    
    model.set_active_adapters(task_name)
    model.train_adapter(task_name)
    
    training_args = TrainingArguments(
        learning_rate               = 1e-4,
        #weight_decay                 = 0.01,
        num_train_epochs            = _EPOCHS_,
        per_device_train_batch_size = _BATCH_SIZE_,
        per_device_eval_batch_size  = _BATCH_SIZE_,
        logging_steps               = (len(baseTrain.data)/(_BATCH_SIZE_*_NO_GPUS_))/5 ,
        output_dir                  = "./training_output_" + task_name,
        overwrite_output_dir        = True,
        # The next line is important to ensure the dataset labels are properly passed to the model
        remove_unused_columns       = False,
    )

    trainer = AdapterTrainer(
        model           = model,
        args            = training_args,
        train_dataset   = dataset_dict[task_name],
    )
    trainer.args._n_gpu = _NO_GPUS_
    
    trainer.train()

***** Running training *****
  Num examples = 142798
  Num Epochs = 10
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 500
  Gradient Accumulation steps = 1
  Total optimization steps = 2860


Step,Training Loss


Saving model checkpoint to ./training_output_gender/checkpoint-500
Configuration saved in ./training_output_gender/checkpoint-500/gender/adapter_config.json
Module weights saved in ./training_output_gender/checkpoint-500/gender/pytorch_adapter.bin
Configuration saved in ./training_output_gender/checkpoint-500/gender/head_config.json
Module weights saved in ./training_output_gender/checkpoint-500/gender/pytorch_model_head.bin
Configuration saved in ./training_output_gender/checkpoint-500/variety/adapter_config.json
Module weights saved in ./training_output_gender/checkpoint-500/variety/pytorch_adapter.bin
Configuration saved in ./training_output_gender/checkpoint-500/variety/head_config.json
Module weights saved in ./training_output_gender/checkpoint-500/variety/pytorch_model_head.bin
Configuration saved in ./training_output_gender/checkpoint-500/joint/adapter_config.json
Module weights saved in ./training_output_gender/checkpoint-500/joint/pytorch_adapter.bin
Configuration saved in ./t

Module weights saved in ./training_output_gender/checkpoint-2500/gender/pytorch_adapter.bin
Configuration saved in ./training_output_gender/checkpoint-2500/gender/head_config.json
Module weights saved in ./training_output_gender/checkpoint-2500/gender/pytorch_model_head.bin
Configuration saved in ./training_output_gender/checkpoint-2500/variety/adapter_config.json
Module weights saved in ./training_output_gender/checkpoint-2500/variety/pytorch_adapter.bin
Configuration saved in ./training_output_gender/checkpoint-2500/variety/head_config.json
Module weights saved in ./training_output_gender/checkpoint-2500/variety/pytorch_model_head.bin
Configuration saved in ./training_output_gender/checkpoint-2500/joint/adapter_config.json
Module weights saved in ./training_output_gender/checkpoint-2500/joint/pytorch_adapter.bin
Configuration saved in ./training_output_gender/checkpoint-2500/joint/head_config.json
Module weights saved in ./training_output_gender/checkpoint-2500/joint/pytorch_model_he

Step,Training Loss


Saving model checkpoint to ./training_output_variety/checkpoint-500
Configuration saved in ./training_output_variety/checkpoint-500/gender/adapter_config.json
Module weights saved in ./training_output_variety/checkpoint-500/gender/pytorch_adapter.bin
Configuration saved in ./training_output_variety/checkpoint-500/gender/head_config.json
Module weights saved in ./training_output_variety/checkpoint-500/gender/pytorch_model_head.bin
Configuration saved in ./training_output_variety/checkpoint-500/variety/adapter_config.json
Module weights saved in ./training_output_variety/checkpoint-500/variety/pytorch_adapter.bin
Configuration saved in ./training_output_variety/checkpoint-500/variety/head_config.json
Module weights saved in ./training_output_variety/checkpoint-500/variety/pytorch_model_head.bin
Configuration saved in ./training_output_variety/checkpoint-500/joint/adapter_config.json
Module weights saved in ./training_output_variety/checkpoint-500/joint/pytorch_adapter.bin
Configuration s

Configuration saved in ./training_output_variety/checkpoint-2500/gender/adapter_config.json
Module weights saved in ./training_output_variety/checkpoint-2500/gender/pytorch_adapter.bin
Configuration saved in ./training_output_variety/checkpoint-2500/gender/head_config.json
Module weights saved in ./training_output_variety/checkpoint-2500/gender/pytorch_model_head.bin
Configuration saved in ./training_output_variety/checkpoint-2500/variety/adapter_config.json
Module weights saved in ./training_output_variety/checkpoint-2500/variety/pytorch_adapter.bin
Configuration saved in ./training_output_variety/checkpoint-2500/variety/head_config.json
Module weights saved in ./training_output_variety/checkpoint-2500/variety/pytorch_model_head.bin
Configuration saved in ./training_output_variety/checkpoint-2500/joint/adapter_config.json
Module weights saved in ./training_output_variety/checkpoint-2500/joint/pytorch_adapter.bin
Configuration saved in ./training_output_variety/checkpoint-2500/joint/he

Step,Training Loss


Saving model checkpoint to ./training_output_joint/checkpoint-500
Configuration saved in ./training_output_joint/checkpoint-500/gender/adapter_config.json
Module weights saved in ./training_output_joint/checkpoint-500/gender/pytorch_adapter.bin
Configuration saved in ./training_output_joint/checkpoint-500/gender/head_config.json
Module weights saved in ./training_output_joint/checkpoint-500/gender/pytorch_model_head.bin
Configuration saved in ./training_output_joint/checkpoint-500/variety/adapter_config.json
Module weights saved in ./training_output_joint/checkpoint-500/variety/pytorch_adapter.bin
Configuration saved in ./training_output_joint/checkpoint-500/variety/head_config.json
Module weights saved in ./training_output_joint/checkpoint-500/variety/pytorch_model_head.bin
Configuration saved in ./training_output_joint/checkpoint-500/joint/adapter_config.json
Module weights saved in ./training_output_joint/checkpoint-500/joint/pytorch_adapter.bin
Configuration saved in ./training_out

Configuration saved in ./training_output_joint/checkpoint-2500/gender/head_config.json
Module weights saved in ./training_output_joint/checkpoint-2500/gender/pytorch_model_head.bin
Configuration saved in ./training_output_joint/checkpoint-2500/variety/adapter_config.json
Module weights saved in ./training_output_joint/checkpoint-2500/variety/pytorch_adapter.bin
Configuration saved in ./training_output_joint/checkpoint-2500/variety/head_config.json
Module weights saved in ./training_output_joint/checkpoint-2500/variety/pytorch_model_head.bin
Configuration saved in ./training_output_joint/checkpoint-2500/joint/adapter_config.json
Module weights saved in ./training_output_joint/checkpoint-2500/joint/pytorch_adapter.bin
Configuration saved in ./training_output_joint/checkpoint-2500/joint/head_config.json
Module weights saved in ./training_output_joint/checkpoint-2500/joint/pytorch_model_head.bin
Configuration saved in ./training_output_joint/checkpoint-2500/gender/head_config.json
Module w

# Testing

In [22]:
import transformers.adapters.composition as AC  

model.set_active_adapters(AC.Parallel(*tasks))

In [23]:
from tqdm import tqdm

successful_preds = { task: 0 for task in tasks }

with torch.no_grad():
    for author in tqdm(baseTest.authors):
        # finds all instances of author
        author_idx = [idx for idx in range(len(baseTest.data)) if baseTest.data[idx]['author'] == author]
        
        # get truth labels with fst instance and initialize scores
        fst      = baseTest.data[author_idx[0]]
        truth    = { task: fst[task]                         for task in tasks }
        scores   = { task: np.zeros( num_labels_dict[task] ) for task in tasks }
        
        for idx in author_idx:
            # creates case in device
            case = {key: torch.tensor(val[idx]).to(device) for key, val in baseTest.encodings.items()}

            # computes all task predictions in parallel
            preds = list( model(**case) )
            
            # get prediction and accumulate
            for task, pred in zip(tasks, preds):
                y = torch.nn.functional.softmax(pred['logits'], dim = 1).cpu().numpy()[0]
                #print(task, y)
                scores[task] += y
        
        for task in tasks:
            if np.argmax( scores[task] ) == truth[task]:
                successful_preds[task] += 1

accuracy = { task: val/len(baseTest.authors) for task, val in successful_preds.items() }

100%|███████████████████████████████████████| 2800/2800 [25:08<00:00,  1.86it/s]


In [24]:
print("Accuracy in the three tasks")
print(accuracy)

Accuracy in the three tasks
{'gender': 0.8232142857142857, 'variety': 0.9439285714285715, 'joint': 0.7667857142857143}


In [16]:
print("Accuracy in the three tasks")
print(accuracy)

Accuracy in the three tasks
{'gender': 0.8442857142857143, 'variety': 0.95, 'joint': 0.7960714285714285}


In [17]:
best_accuracy = {"gender": 0.8321, "variety": 0.9625, "joint": 0.8036}
print("Best accuracy")
print(best_accuracy)

Best accuracy
{'gender': 0.8321, 'variety': 0.9625, 'joint': 0.8036}
