# Paquetes

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import transformers
import os

# Parámetros

In [2]:
# MODEL / DATA SETTINGS

_LANGUAGE_         = "en"                          # 'en' or 'es'; selects the variety label map
_PRETRAINED_LM_    = "vinai/bertweet-base"         # checkpoint name used for the tokenizer and the models
_TWEET_BATCH_SIZE_ = 5                             # forwarded to BasePAN17 as tweet_batch_size
_MAX_SEQ_LEN_      = 128                           # forwarded to BasePAN17 as max_seq_len
_ADAPTER_CONFIG_   = None                          # not referenced by the cells visible in this notebook
_OUTPUT_DIR_       = "fine_tuning_checkPoints_en"  # forwarded to train_models as output_dir
_LOGGING_STEPS_    = 50                            # forwarded to train_models as logging_steps


# TRAINING SETTINGS

_NO_GPUS_          = 2                                # forwarded to train_models as no_gpus
_BATCH_SIZE_       = 100                              # forwarded to train_models as batch_size
_LEARNING_RATE_    = 1e-5                             # forwarded to train_models as learning_rate
_EPOCHS_           = dict(gender = 20, variety = 25)  # number of epochs, keyed by task name

# Dataset

In [3]:
# LABEL DICTIONARIES ----------------------------------------------------------------------
# Each dictionary maps a class name to its integer label id.
# Ids are assigned by position, so the order of the names below is significant.

gender_dict = dict(female = 0, male = 1)

# Language varieties for the English ('en') data
varietyEN_dict = {
    name: idx
    for idx, name in enumerate((
        'australia',
        'canada',
        'great britain',
        'ireland',
        'new zealand',
        'united states',
    ))
}

# Language varieties for the Spanish ('es') data
varietyES_dict = {
    name: idx
    for idx, name in enumerate((
        'argentina',
        'chile',
        'colombia',
        'mexico',
        'peru',
        'spain',
        'venezuela',
    ))
}

In [4]:
# SET LANGUAGE DICTIONARY
# Pick the variety label map that matches the configured language.
# An unsupported language code must fail here: without the final branch
# `variety_dict` would stay undefined (NameError in a later cell) or, in a
# kernel that already ran this cell, silently keep the previous language's map.

if _LANGUAGE_ == 'en':
    variety_dict = varietyEN_dict

elif _LANGUAGE_ == 'es':
    variety_dict = varietyES_dict

else:
    raise ValueError(f"Unsupported _LANGUAGE_ {_LANGUAGE_!r}; expected 'en' or 'es'.")

In [5]:
# SET LANGUAGE TOKENIZER
# Load the tokenizer that belongs to the pretrained checkpoint named in _PRETRAINED_LM_.

from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained(_PRETRAINED_LM_)
# Vocabulary as returned by the tokenizer; not referenced again in the cells visible here.
vocab = tokenizer.get_vocab()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# BUILD ONE BasePAN17 INSTANCE PER DATA SPLIT

from DatasetPAN17 import BasePAN17, DatasetPAN17

# Settings common to both splits; the two calls below differ only in `split`.
shared_base_args = dict(
    language         = _LANGUAGE_,
    tokenizer        = tokenizer,
    gender_dict      = gender_dict,
    variety_dict     = variety_dict,
    tweet_batch_size = _TWEET_BATCH_SIZE_,
    max_seq_len      = _MAX_SEQ_LEN_,
)

baseTrain = BasePAN17(Dir = 'data', split = 'train', **shared_base_args)
baseTest  = BasePAN17(Dir = 'data', split = 'test',  **shared_base_args)


Reading data...
    Done
Tokenizing...
    Done

Total Instances: 72000


Reading data...
    Done
Tokenizing...
    Done

Total Instances: 48000



In [7]:
# BUILD ONE TRAINING DATASET PER TASK
# Every task wraps the same training base (baseTrain) and differs only in
# which label it is asked for.

tasks = ['gender', 'variety']

dataset_dict = {
    task: DatasetPAN17(Base_Dataset = baseTrain, label = task)
    for task in tasks
}

# Model

In [8]:
from transformers import AutoAdapterModel

# Number of output classes per task, derived from the label dictionaries so the
# classification heads stay in sync with the labels (no hard-coded class counts).
num_v           = len(baseTrain.variety_dict)
num_labels_dict = {"gender": len(gender_dict), "variety": num_v,}

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"


# One independent model per task: each task loads its own copy of the
# pretrained checkpoint and receives a classification head named after the task.
models = {}

for task in tasks:

    models[task] = AutoAdapterModel.from_pretrained(_PRETRAINED_LM_)

    models[task].add_classification_head(
        head_name    = task,
        num_labels   = num_labels_dict[task],
    )

    models[task] = models[task].to(device)


Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaAdapterModel: ['lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaAdapterModel: ['lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.w

# Training

In [9]:
from Training import train_models

# Training options, all taken from the constants defined in the parameters cell.
training_options = dict(
    epochs        = _EPOCHS_,
    batch_size    = _BATCH_SIZE_,
    no_gpus       = _NO_GPUS_,
    output_dir    = _OUTPUT_DIR_,
    logging_steps = _LOGGING_STEPS_,
    learning_rate = _LEARNING_RATE_,
)

# Fine-tune the per-task models on their per-task datasets.
train_models(models = models, dataset_dict = dataset_dict, **training_options)

***** Running training *****
  Num examples = 72000
  Num Epochs = 20
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 200
  Gradient Accumulation steps = 1
  Total optimization steps = 7200


Step,Training Loss
50,0.6581
100,0.5719
150,0.5605
200,0.5355
250,0.5256
300,0.52
350,0.518
400,0.5208
450,0.4934
500,0.4936


Saving model checkpoint to fine_tuning_checkPoints_en/gender/checkpoint-500
Configuration saved in fine_tuning_checkPoints_en/gender/checkpoint-500/config.json
Model weights saved in fine_tuning_checkPoints_en/gender/checkpoint-500/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/gender/checkpoint-1000
Configuration saved in fine_tuning_checkPoints_en/gender/checkpoint-1000/config.json
Model weights saved in fine_tuning_checkPoints_en/gender/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/gender/checkpoint-1500
Configuration saved in fine_tuning_checkPoints_en/gender/checkpoint-1500/config.json
Model weights saved in fine_tuning_checkPoints_en/gender/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/gender/checkpoint-2000
Configuration saved in fine_tuning_checkPoints_en/gender/checkpoint-2000/config.json
Model weights saved in fine_tuning_checkPoints_en/gender/checkpoint-2000/pytorch_m

***** Running training *****
  Num examples = 72000
  Num Epochs = 25
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 200
  Gradient Accumulation steps = 1
  Total optimization steps = 9000


Step,Training Loss
50,1.6998
100,1.4635
150,1.3467
200,1.2687
250,1.2013
300,1.1277
350,1.0714
400,1.0339
450,0.984
500,0.9798


Saving model checkpoint to fine_tuning_checkPoints_en/variety/checkpoint-500
Configuration saved in fine_tuning_checkPoints_en/variety/checkpoint-500/config.json
Model weights saved in fine_tuning_checkPoints_en/variety/checkpoint-500/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/variety/checkpoint-1000
Configuration saved in fine_tuning_checkPoints_en/variety/checkpoint-1000/config.json
Model weights saved in fine_tuning_checkPoints_en/variety/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/variety/checkpoint-1500
Configuration saved in fine_tuning_checkPoints_en/variety/checkpoint-1500/config.json
Model weights saved in fine_tuning_checkPoints_en/variety/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/variety/checkpoint-2000
Configuration saved in fine_tuning_checkPoints_en/variety/checkpoint-2000/config.json
Model weights saved in fine_tuning_checkPoints_en/variety/checkpoint-20

Saving model checkpoint to fine_tuning_checkPoints_en/variety/checkpoint-8000
Configuration saved in fine_tuning_checkPoints_en/variety/checkpoint-8000/config.json
Model weights saved in fine_tuning_checkPoints_en/variety/checkpoint-8000/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/variety/checkpoint-8500
Configuration saved in fine_tuning_checkPoints_en/variety/checkpoint-8500/config.json
Model weights saved in fine_tuning_checkPoints_en/variety/checkpoint-8500/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/variety/checkpoint-9000
Configuration saved in fine_tuning_checkPoints_en/variety/checkpoint-9000/config.json
Model weights saved in fine_tuning_checkPoints_en/variety/checkpoint-9000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




# Testing

In [10]:
# Evaluate the trained models on baseTest, the BasePAN17 instance built with split = 'test'.
from TestingPAN17 import test_models

# The result is kept in `accuracy` so it can be displayed in the next cell.
accuracy = test_models(models, baseTest)

100%|███████████████████████████████████████| 2400/2400 [14:55<00:00,  2.68it/s]


In [11]:
# Display the result of test_models; the saved output shows 'gender', 'variety' and 'joint' entries.
# NOTE(review): 'joint' presumably counts cases where both tasks are predicted correctly — confirm in TestingPAN17.
accuracy

{'gender': 0.8204166666666667, 'variety': 0.8491666666666666, 'joint': 0.7}