In [1]:
import torch
import transformers
import numpy as np

In [2]:
# MODEL

_LANGUAGE_         = 'en'
_TWEET_BATCH_SIZE_ = 5
_ADAPTER_CONFIG_   = None
_MAX_SEQ_LEN_      = 128
_OUTPUT_DIR_       = 'feature_extractor_checkPoints'
_LOGGING_STEPS_    = 50


# TRAIN

_NO_GPUS_          = 1
_BATCH_SIZE_       = 100
_EPOCHS_           = 10
_LEARNING_RATE_    = 1e-4

In [3]:
# LABEL DICTONARIES -----------------------------------------------------------------------

gender_dict    = {'female': 0, 
                  'male':   1}

varietyEN_dict = {'australia'    : 0,
                  'canada'       : 1,
                  'great britain' : 2,
                  'ireland'      : 3,
                  'new zealand'   : 4,
                  'united states': 5}

varietyES_dict = {'argentina': 0,
                  'chile'    : 1,
                  'colombia' : 2,
                  'mexico'   : 3,
                  'peru'     : 4,
                  'spain'    : 5,
                  'venezuela': 6}    

In [4]:
# SET LANGUAGE DIRECTORY

if _LANGUAGE_ == 'en':
    variety_dict = varietyEN_dict

elif _LANGUAGE_ == 'es':
    variety_dict = varietyES_dict

In [5]:
# SET LANGUAGE TOKENIZER

from transformers import AutoTokenizer

if _LANGUAGE_ == 'es':
    tokenizer = AutoTokenizer.from_pretrained('pysentimiento/robertuito-base-cased')
    
elif _LANGUAGE_ == 'en':
    tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
    
vocab = tokenizer.get_vocab()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# CREATE ONE INSTANCE PER DATA SPLIT

from DatasetPAN17 import BasePAN17, DatasetPAN17

baseTrain = BasePAN17(Dir              = 'data',
                      split            = 'train',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = _TWEET_BATCH_SIZE_,
                      max_seq_len      = _MAX_SEQ_LEN_)

baseTest  = BasePAN17(Dir              = 'data',
                      split            = 'test',
                      language         = _LANGUAGE_,
                      tokenizer        = tokenizer,
                      gender_dict      = gender_dict,
                      variety_dict     = variety_dict,
                      tweet_batch_size = _TWEET_BATCH_SIZE_,
                      max_seq_len      = _MAX_SEQ_LEN_)


Reading data...
    Done
Tokenizing...
    Done

Total Instances: 72000


Reading data...
    Done
Tokenizing...
    Done

Total Instances: 48000



In [7]:
# CREATE DATA LOADER FOR EVERY TASK

tasks = ['gender', 'variety']

dataset_dict = {}
for task in tasks:
    dataset_dict[task] = DatasetPAN17(Base_Dataset = baseTrain, label = task)

In [10]:
baseTrain.data[0]

{'author': '6bd70e9370f477d1da563a495f5de9f4',
 'text': '@Joe_Ralls Cheers Joe, well done today, absolutely buzzing! Good luck for the rest of the season.\nMy boy Rhys met his favourite player last week in Brighton. Thanks @Joe_Ralls you were awesome. https://t.co/c2znsU88CY\nWatching #TopOfThePops 1983 BBC4. If pop music could be bottled then this surely would be a vintage year\nGame called off! What to do in Brighton?? Oh well only one thing for it https://t.co/HJbMU1DUHL\nRhys just had a word in their ears about ensuring we get 3 points tonight at #Brighton #bhfcvccfc #ccfc https://t.co/e195MM9NOD\n',
 'gender': 1,
 'variety': 2}

In [13]:
case = dataset_dict['gender'].__getitem__(0)
case

{'input_ids': tensor([    0,  5238,  2966,  6685, 26894,  3286,  7070, 59568,     7,   182,
           270, 18814,     7,  1518, 29329,   856,    12,   417,  1002,    19,
             6,   765,    15,     6, 22531,   639,     3,   122,   529, 51433,
          1137,    91,  2060,  1082,   175,   223,    16,  5987,  2376,  3036,
             4,   404,  5238,  2966,  6685, 26894,  3286,    14,   147, 54663,
             4, 62060,  8798,     3, 11412,   715,   674,   685, 20084,   684,
          5744, 16403,     3,  1873, 13256, 13853, 26021, 10813, 30871,   567,
             4,   125,  1877,   389,   152,    31, 30196,   133,    33,  5416,
            86,    31,    11, 10649, 11186,     3,  1367,   488,  6386,    12,
           165,     9,    32,    16,  5987,  2376,  3036, 50239,    21,   385,
           182,   121,    63,   195,    19,    18, 62060,  8798,     3, 11412,
           544,   611,   607,  9029,  1032,  8435, 30829,     3, 51433,    45,
           118,    11,   755,    16,   

In [15]:
for idx in case['input_ids']:
    print(id2w[idx.item()])

<s>
@@@
Jo@@
e_@@
Ral@@
ls
Cheers
Joe@@
,
well
done
today@@
,
absolutely
buzz@@
ing@@
!
Good
luck
for
the
rest
of
the
season@@
.@@
<unk>
My
boy
Rhys
met
his
favourite
player
last
week
in
Bri@@
gh@@
ton@@
.
Thanks
@@@
Jo@@
e_@@
Ral@@
ls
you
were
awesome@@
.
htt@@
ps@@
<unk>
t.co/@@
c@@
2@@
z@@
ns@@
U@@
88@@
CY@@
<unk>
Watching
#Top@@
OfThe@@
Pops
1983
BBC@@
4@@
.
If
pop
music
could
be
bottled
then
this
surely
would
be
a
vintage
year@@
<unk>
Game
called
off@@
!
What
to
do
in
Bri@@
gh@@
ton@@
?@@
?
Oh
well
only
one
thing
for
it
htt@@
ps@@
<unk>
t.co/@@
H@@
J@@
b@@
MU@@
1@@
DU@@
HL@@
<unk>
Rhys
just
had
a
word
in
their
ears
about
</s>


In [11]:
id2w = {}
for word in vocab:
    id2w[vocab[word]] = word

In [12]:
id2w


{0: '<s>',
 1: '<pad>',
 2: '</s>',
 3: '<unk>',
 4: '.',
 5: '@USER',
 6: 'the',
 7: ',',
 8: 'I',
 9: 'to',
 10: 'HTTPURL',
 11: 'a',
 12: '!',
 13: 'and',
 14: 'you',
 15: 'of',
 16: 'in',
 17: 'is',
 18: 'it',
 19: 'for',
 20: "'s",
 21: '?',
 22: ':',
 23: 'my',
 24: 'on',
 25: 'that',
 26: '"',
 27: 'me',
 28: '...',
 29: "n't",
 30: 'with',
 31: 'be',
 32: 'do',
 33: 'this',
 34: '-',
 35: 'at',
 36: 'have',
 37: 'i',
 38: 'was',
 39: 'so',
 40: "'m",
 41: 'are',
 42: 'but',
 43: 'like',
 44: 'your',
 45: 'just',
 46: 'not',
 47: 'The',
 48: 'all',
 49: 'up',
 50: 'out',
 51: 'get',
 52: 'as',
 53: 'from',
 54: 'we',
 55: '&',
 56: 'can',
 57: '(',
 58: 'he',
 59: 'they',
 60: ')',
 61: 'by',
 62: 'about',
 63: 'one',
 64: 'when',
 65: 'if',
 66: 'what',
 67: '..',
 68: 'know',
 69: "'",
 70: 'will',
 71: 'love',
 72: 'or',
 73: 'You',
 74: 'an',
 75: '/',
 76: 'It',
 77: 'now',
 78: 'time',
 79: 'u',
 80: 'no',
 81: "'re",
 82: 'go',
 83: 'people',
 84: 'how',
 85: '#@@',
 86: 

In [18]:
from transformers import AutoAdapterModel

model = AutoAdapterModel.from_pretrained('vinai/bertweet-base')

Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaAdapterModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
model

RobertaAdapterModel(
  (shared_parameters): ModuleDict()
  (roberta): RobertaModel(
    (shared_parameters): ModuleDict()
    (invertible_adapters): ModuleDict()
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
              (prefix_tuning): PrefixTuningShim(
  