# Paquetes

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import transformers
import os

# Parametros

In [2]:
# MODEL

_LANGUAGE_         = 'es'
_PRETRAINED_LM_    = 'dccuchile/bert-base-spanish-wwm-cased'
_PREPROCESS_TEXT_  = False
_TWEET_BATCH_SIZE_ = 5
_ADAPTER_CONFIG_   = None
_MAX_SEQ_LEN_      = 128
_OUTPUT_DIR_       = 'fine_tuning_checkPoints_en'
_LOGGING_STEPS_    = 50


# TRAIN

_NO_GPUS_          = 2
_BATCH_SIZE_       = 100
_EPOCHS_           = {'gender': 10, 'variety': 10}
_LEARNING_RATE_    = 1e-5

# Dataset

In [3]:
# LABEL DICTONARIES -----------------------------------------------------------------------

gender_dict    = {'female': 0, 
                  'male':   1}

varietyEN_dict = {'australia'    : 0,
                  'canada'       : 1,
                  'great britain' : 2,
                  'ireland'      : 3,
                  'new zealand'   : 4,
                  'united states': 5}

varietyES_dict = {'argentina': 0,
                  'chile'    : 1,
                  'colombia' : 2,
                  'mexico'   : 3,
                  'peru'     : 4,
                  'spain'    : 5,
                  'venezuela': 6}    

In [4]:
# SET LANGUAGE DIRECTORY

if _LANGUAGE_ == 'en':
    variety_dict = varietyEN_dict

elif _LANGUAGE_ == 'es':
    variety_dict = varietyES_dict

In [5]:
# SET LANGUAGE TOKENIZER

from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained(_PRETRAINED_LM_)
vocab = tokenizer.get_vocab()

In [6]:
# CREATE ONE INSTANCE PER DATA SPLIT

from DatasetPAN17 import BasePAN17, DatasetPAN17

baseTrain = BasePAN17(Dir              = 'data',
                      split            = 'train',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = _TWEET_BATCH_SIZE_,
                      max_seq_len      = _MAX_SEQ_LEN_,
                      preprocess_text  = _PREPROCESS_TEXT_)

baseTest  = BasePAN17(Dir              = 'data',
                      split            = 'test',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = _TWEET_BATCH_SIZE_,
                      max_seq_len      = _MAX_SEQ_LEN_,
                      preprocess_text  = _PREPROCESS_TEXT_)


Reading data...
    Done
Tokenizing...
    Done

Total Instances: 84000


Reading data...
    Done
Tokenizing...
    Done

Total Instances: 56000



In [7]:
# CREATE DATA LOADER FOR EVERY TASK

tasks = ['gender', 'variety']

dataset_dict = {}
for task in tasks:
    dataset_dict[task] = DatasetPAN17(Base_Dataset = baseTrain, label = task)

# Model

In [8]:
from transformers import AutoAdapterModel

num_v           = len(baseTrain.variety_dict)
num_labels_dict = {"gender": 2, "variety": num_v,}
device = "cuda:0" if torch.cuda.is_available() else "cpu"


models = {}

for task in tasks:

    models[task] = AutoAdapterModel.from_pretrained(_PRETRAINED_LM_)

    models[task].add_classification_head(
        head_name    = task,
        num_labels   = num_labels_dict[task],
    )

    models[task] = models[task].to(device)


Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertAdapterModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertAdapterModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initial

# Training

In [9]:
from Training import train_models

train_models(models        = models,
             dataset_dict  = dataset_dict,
             epochs        = _EPOCHS_,
             batch_size    = _BATCH_SIZE_,
             no_gpus       = _NO_GPUS_,
             output_dir    = _OUTPUT_DIR_, 
             logging_steps = _LOGGING_STEPS_,
             learning_rate = _LEARNING_RATE_)

***** Running training *****
  Num examples = 84000
  Num Epochs = 10
  Instantaneous batch size per device = 100
  Total train batch size (w. parallel, distributed & accumulation) = 200
  Gradient Accumulation steps = 1
  Total optimization steps = 4200


Step,Training Loss
50,0.6439
100,0.5966
150,0.5931
200,0.5868
250,0.571
300,0.5746
350,0.5673
400,0.5633
450,0.5423
500,0.5411


Saving model checkpoint to fine_tuning_checkPoints_en/gender/checkpoint-500
Configuration saved in fine_tuning_checkPoints_en/gender/checkpoint-500/config.json
Model weights saved in fine_tuning_checkPoints_en/gender/checkpoint-500/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/gender/checkpoint-1000
Configuration saved in fine_tuning_checkPoints_en/gender/checkpoint-1000/config.json
Model weights saved in fine_tuning_checkPoints_en/gender/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/gender/checkpoint-1500
Configuration saved in fine_tuning_checkPoints_en/gender/checkpoint-1500/config.json
Model weights saved in fine_tuning_checkPoints_en/gender/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/gender/checkpoint-2000
Configuration saved in fine_tuning_checkPoints_en/gender/checkpoint-2000/config.json
Model weights saved in fine_tuning_checkPoints_en/gender/checkpoint-2000/pytorch_m

Step,Training Loss
50,1.8351
100,1.4894
150,1.2783
200,1.1509
250,1.1017
300,1.0575
350,1.0386
400,1.0017
450,0.9699
500,0.9063


Saving model checkpoint to fine_tuning_checkPoints_en/variety/checkpoint-500
Configuration saved in fine_tuning_checkPoints_en/variety/checkpoint-500/config.json
Model weights saved in fine_tuning_checkPoints_en/variety/checkpoint-500/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/variety/checkpoint-1000
Configuration saved in fine_tuning_checkPoints_en/variety/checkpoint-1000/config.json
Model weights saved in fine_tuning_checkPoints_en/variety/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/variety/checkpoint-1500
Configuration saved in fine_tuning_checkPoints_en/variety/checkpoint-1500/config.json
Model weights saved in fine_tuning_checkPoints_en/variety/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to fine_tuning_checkPoints_en/variety/checkpoint-2000
Configuration saved in fine_tuning_checkPoints_en/variety/checkpoint-2000/config.json
Model weights saved in fine_tuning_checkPoints_en/variety/checkpoint-20

# Testing

In [10]:
from TestingPAN17 import test_models

accuracy = test_models(models, baseTest)

100%|███████████████████████████████████████| 2800/2800 [15:33<00:00,  3.00it/s]


In [11]:
accuracy

{'gender': 0.8232142857142857,
 'variety': 0.9353571428571429,
 'joint': 0.7767857142857143}