In [1]:
import torch
import transformers
import numpy as np

In [2]:
# MODEL ------------------------------------------------------------------------------------
# Settings for the language model, tokenization and checkpoint/logging output.

_LANGUAGE_ = 'en'                      # picks the variety label set and the data language
_PRETRAINED_LM_ = 'bert-base-uncased'  # checkpoint name handed to AutoTokenizer
_TWEET_BATCH_SIZE_ = 5                 # forwarded to BasePAN17 as tweet_batch_size
_ADAPTER_CONFIG_ = transformers.ParallelConfig()
_MAX_SEQ_LEN_ = 128                    # forwarded to BasePAN17 as max_seq_len
# NOTE(review): the directory name carries a fixed '_en' suffix even though
# _LANGUAGE_ is configurable -- confirm this is intended before switching language.
_OUTPUT_DIR_ = 'parallel_adapter_checkPoints_en'
_LOGGING_STEPS_ = 50


# TRAIN ------------------------------------------------------------------------------------
# Training hyper-parameters; _EPOCHS_ holds one epoch count per task name.

_NO_GPUS_ = 2
_BATCH_SIZE_ = 100
_EPOCHS_ = dict(gender=20, variety=25)
_LEARNING_RATE_ = 1e-4

In [3]:
# LABEL DICTIONARIES ----------------------------------------------------------------------
# Every mapping assigns consecutive integer ids, starting at 0, to the class
# names in the order they are listed.

gender_dict = {label: idx for idx, label in enumerate(['female', 'male'])}

varietyEN_dict = {
    label: idx
    for idx, label in enumerate([
        'australia',
        'canada',
        'great britain',
        'ireland',
        'new zealand',
        'united states',
    ])
}

varietyES_dict = {
    label: idx
    for idx, label in enumerate([
        'argentina',
        'chile',
        'colombia',
        'mexico',
        'peru',
        'spain',
        'venezuela',
    ])
}

In [4]:
# SET LANGUAGE DICTIONARY
#
# Bind variety_dict to the label mapping that matches _LANGUAGE_.
# An unsupported language code fails right here with an explicit error;
# without the final branch variety_dict would stay undefined (or keep a stale
# value from an earlier run in the same kernel) and the problem would only
# surface later, far from its cause.

if _LANGUAGE_ == 'en':
    variety_dict = varietyEN_dict

elif _LANGUAGE_ == 'es':
    variety_dict = varietyES_dict

else:
    raise ValueError(
        "Unsupported _LANGUAGE_ {!r}; expected 'en' or 'es'".format(_LANGUAGE_)
    )

In [6]:
# SET LANGUAGE TOKENIZER

from transformers import AutoTokenizer

# Load the tokenizer that belongs to the checkpoint named in _PRETRAINED_LM_.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=_PRETRAINED_LM_
)

# Vocabulary of the tokenizer (token string -> integer id, per the
# tokenizer's get_vocab() contract).
vocab = tokenizer.get_vocab()

In [7]:
# CREATE ONE INSTANCE PER DATA SPLIT

from DatasetPAN17 import BasePAN17, DatasetPAN17


def _make_base_split(split):
    """Build a BasePAN17 for ``split`` using the notebook-wide settings.

    Both splits share every argument except ``split`` itself, so the common
    configuration lives in one place.
    """
    return BasePAN17(Dir              = 'data',
                     split            = split,
                     language         = _LANGUAGE_,
                     tokenizer        = tokenizer,
                     gender_dict      = gender_dict,
                     variety_dict     = variety_dict,
                     tweet_batch_size = _TWEET_BATCH_SIZE_,
                     max_seq_len      = _MAX_SEQ_LEN_)


# Train split first, then test split (same order as the progress output).
baseTrain = _make_base_split('train')

baseTest  = _make_base_split('test')


Reading data...
    Done
Tokenizing...
    Done

Total Instances: 72000


Reading data...
    Done
Tokenizing...
    Done

Total Instances: 48000



In [19]:
# Fetch and display the first example of the gender dataset.
# NOTE(review): dataset_dict is created in the "CREATE DATA LOADER FOR EVERY TASK"
# cell, which appears further down in this notebook -- that cell must run first.
case = dataset_dict['gender'][0]
case

{'input_ids': tensor([  101,  2008,  1005,  1055,  1996,  2518,  1012,  2057,  2123,  1005,
          1056,  2113,  2593,  1012, 16770,  1024,  1013,  1013,  1056,  1012,
          2522,  1013,  1022,  2226,  2475,  2278, 18684,  4710,  2546,  4160,
          2023,  1012,  1030,  2149,  3654,  1030,  2149, 26915, 24141,  2546,
         16770,  1024,  1013,  1013,  1056,  1012,  2522,  1013,  2033,  2480,
          2213,  2575, 16558, 13871,  2860,  6865,  1996,  5384,  2058,   100,
          1998,  3288,  2041,  1996, 10535,  2015,   999, 16770,  1024,  1013,
          1013,  1056,  1012,  2522,  1013,  1022,  8458,  2683,  2243,  2595,
         21906,  2581,  2063,  1030, 10978, 15143, 10230,  2100,  1004, 14181,
          1025,  3598, 10439, 23142,  1029,  8440,  1005,  1056,  7172,  1999,
          1037,  2261,  2847,  1012,  1012,  1030,  4012, 10526, 16302,  2015,
          1004, 14181,  1025,  6209,  5353,  2041,  4270,  1012,  5171,  1012,
         17111,  9541,  5580,  1045,  7

In [10]:
# CREATE ONE DatasetPAN17 PER TASK
#
# Each task gets its own DatasetPAN17 built on the training split (baseTrain)
# and keyed by the task name.

tasks = ['gender', 'variety']

dataset_dict = {}
for task in tasks:
    task_dataset = DatasetPAN17(Base_Dataset = baseTrain, label = task)
    dataset_dict[task] = task_dataset

In [20]:
# Print the token string behind every id in the example, one token per line.
#
# The id -> token lookup is built here from `vocab` (token -> id) so the cell
# does not depend on an `id2w` left over from an earlier kernel session.
id2w = {token_id: token for token, token_id in vocab.items()}

for idx in case['input_ids']:
    # idx is a tensor element; .item() turns it into a plain Python int key.
    print(id2w[idx.item()])

[CLS]
that
'
s
the
thing
.
we
don
'
t
know
either
.
https
:
/
/
t
.
co
/
8
##u
##2
##c
##xa
##ay
##f
##q
this
.
@
us
##ga
@
us
##open
##gol
##f
https
:
/
/
t
.
co
/
me
##z
##m
##6
##bl
##gg
##w
hang
the
trophy
over
[UNK]
and
bring
out
the
ladder
##s
!
https
:
/
/
t
.
co
/
8
##ph
##9
##k
##x
##cp
##7
##e
@
espn
##fan
##tas
##y
&
gt
;
baseball
app
busted
?
hasn
'
t
updated
in
a
few
hours
.
.
@
com
##cast
##care
##s
&
gt
;
holiday
weekend
out
##age
.
typical
.
soo
##oo
glad
i
switched
to
you
[SEP]


In [21]:
# Display the first raw record of the training split (author, text, gender, variety).
# NOTE(review): the rendered output contains an author id and verbatim tweet text;
# consider clearing it before sharing the notebook.
baseTrain.data[0]

{'author': '2c3fa19946483f791727f494fe856c5a',
 'text': "That's the thing. We don't know either.  https://t.co/8U2CXAaYfq\nThis. @USGA @usopengolf  https://t.co/Mezm6bLGGw\nHang the trophy over 18⛳️ and bring out the ladders! https://t.co/8Ph9KXcP7E\n@ESPNFantasy &gt; baseball app busted? Hasn't updated in a few hours.\n.@comcastcares &gt; holiday weekend outage. Typical. Soooo glad I switched to you this month. No #Indy500 #Nats #pgatour\n",
 'gender': 1,
 'variety': 5}

In [18]:
from transformers import AutoAdapterModel

# Load the adapter-capable model from the 'vinai/bertweet-base' checkpoint.
# NOTE(review): this checkpoint differs from _PRETRAINED_LM_, which the tokenizer
# is loaded from -- confirm the model and tokenizer are meant to differ.
model = AutoAdapterModel.from_pretrained(
    pretrained_model_name_or_path='vinai/bertweet-base'
)

Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaAdapterModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Display the loaded model's module tree (its repr) for inspection.
model

RobertaAdapterModel(
  (shared_parameters): ModuleDict()
  (roberta): RobertaModel(
    (shared_parameters): ModuleDict()
    (invertible_adapters): ModuleDict()
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
              (prefix_tuning): PrefixTuningShim(
  