# IIC-3800 Tópicos en CC - NLP UC

- Versiones de librerías, python 3.8.10

- numpy 1.20.3
- flair 0.12
- allennlp 0.9.0


In [1]:
from flair.embeddings import TransformerWordEmbeddings
from flair.data import Sentence

embedding = TransformerWordEmbeddings('bert-base-uncased')
#embedding = TransformerWordEmbeddings('roberta-base')

sentence = Sentence('George Washington was born in Washington')

embedding.embed(sentence)

[Sentence[6]: "George Washington was born in Washington"]

In [2]:
for token in sentence:
    print(token)
    print(token.embedding)

Token[0]: "George"
tensor([-7.8309e-02,  5.0631e-01, -1.7367e-01, -5.7778e-01,  8.6016e-01,
        -4.4545e-01,  5.1505e-01,  1.9246e-01,  1.1061e-01, -8.2763e-01,
        -1.7925e-01, -5.7133e-01, -3.0537e-02,  1.1873e-01, -6.7880e-01,
        -1.3477e-01,  7.5955e-01,  1.0435e-02, -1.1160e-01,  1.5327e-01,
        -8.8353e-01,  3.9561e-01, -4.4999e-01,  2.6866e-01,  6.0894e-01,
         2.0193e-01,  1.2633e-01,  7.0724e-01,  3.7386e-02, -8.3702e-01,
         4.5177e-01, -5.0160e-01,  2.2627e-01,  5.7286e-01, -5.3280e-01,
        -2.9157e-01, -2.3168e-01,  8.5083e-01,  5.1500e-01, -3.4913e-01,
         1.7536e-01, -3.5834e-01,  7.0413e-01, -4.2270e-01,  1.7253e-01,
        -1.6541e-01,  2.0805e-01, -1.0085e+00,  6.8492e-02, -8.5723e-01,
         3.5498e-01, -1.4846e-01, -3.1606e-01,  7.2150e-01,  3.6088e-01,
         4.0602e-01, -2.5159e-01, -1.2356e-01,  5.4870e-02, -4.6386e-01,
        -1.1212e-02,  8.2393e-01,  6.5787e-01, -1.6008e-01, -4.9070e-01,
        -6.0823e-02, -1.5661e-01

In [3]:
embeddings = TransformerWordEmbeddings('bert-base-uncased', layers='-1', layer_mean=False)
embeddings.embed(sentence)
print(sentence[0].embedding.size())

sentence.clear_embeddings()

embeddings = TransformerWordEmbeddings('bert-base-uncased', layers='-1,-2', layer_mean=False)
embeddings.embed(sentence)
print(sentence[0].embedding.size())

sentence.clear_embeddings()


embeddings = TransformerWordEmbeddings('bert-base-uncased', layers='all', layer_mean=False)
embeddings.embed(sentence)
print(sentence[0].embedding.size())

torch.Size([768])
torch.Size([1536])
torch.Size([9984])


In [12]:
from flair.datasets import CONLL_03_SPANISH

Corpus = CONLL_03_SPANISH()

2023-04-12 18:49:06,293 Reading data from C:\Users\marce\.flair\datasets\conll_03_spanish
2023-04-12 18:49:06,294 Train: C:\Users\marce\.flair\datasets\conll_03_spanish\esp.train
2023-04-12 18:49:06,295 Dev: C:\Users\marce\.flair\datasets\conll_03_spanish\esp.testa
2023-04-12 18:49:06,295 Test: C:\Users\marce\.flair\datasets\conll_03_spanish\esp.testb


In [13]:
label_type = 'ner'

label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
print(label_dict)

embeddings = TransformerWordEmbeddings(model='xlm-roberta-large',
                                       layers="-1",
                                       subtoken_pooling="first",
                                       fine_tune=True,
                                       use_context=True,
                                       )


2023-04-12 18:52:03,063 Computing label dictionary. Progress:


8323it [00:00, 61562.05it/s]

2023-04-12 18:52:03,201 Dictionary created for label 'ner' with 4 values: ORG (seen 7390 times), LOC (seen 4914 times), PER (seen 4321 times), MISC (seen 2173 times)
Dictionary with 4 tags: ORG, LOC, PER, MISC





Downloading (…)lve/main/config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [18]:
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type='ner',
    use_crf=False,
    use_rnn=False,
    reproject_embeddings=False,
)

trainer = ModelTrainer(tagger, Corpus)

2023-04-12 19:16:22,743 SequenceTagger predicts: Dictionary with 17 tags: O, S-ORG, B-ORG, E-ORG, I-ORG, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-MISC, B-MISC, E-MISC, I-MISC


In [19]:
from torch.optim.lr_scheduler import OneCycleLR

trainer.train('resources/taggers/ner-spanish-large',
              learning_rate=5.0e-6,
              mini_batch_size=4,
              mini_batch_chunk_size=1,
              max_epochs=20,
              scheduler=OneCycleLR,
              embeddings_storage_mode='none',
              weight_decay=0.,
              )

2023-04-12 19:19:35,462 ----------------------------------------------------------------------------------------------------
2023-04-12 19:19:35,464 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250003, 1024)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-23): 24 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out

100%|████████████████████████████████████████████████████████████████████████████| 479/479 [12:49<00:00,  1.61s/it]

2023-04-12 22:30:23,099 Evaluating as a multi-label problem: False
2023-04-12 22:30:23,116 DEV : loss 1.8872603178024292 - f1-score (micro avg)  0.0005
2023-04-12 22:30:23,133 saving best model





2023-04-12 22:30:26,943 ----------------------------------------------------------------------------------------------------
2023-04-12 22:48:35,655 epoch 2 - iter 208/2081 - loss 2.23436166 - time (sec): 1088.71 - samples/sec: 24.00 - lr: 0.000005
2023-04-12 23:06:24,184 epoch 2 - iter 416/2081 - loss 2.24306099 - time (sec): 2157.24 - samples/sec: 24.16 - lr: 0.000005
2023-04-12 23:24:19,299 epoch 2 - iter 624/2081 - loss 2.22528858 - time (sec): 3232.36 - samples/sec: 24.68 - lr: 0.000005
2023-04-12 23:42:18,225 epoch 2 - iter 832/2081 - loss 2.22190112 - time (sec): 4311.28 - samples/sec: 24.62 - lr: 0.000005
2023-04-12 23:59:51,626 epoch 2 - iter 1040/2081 - loss 2.20691298 - time (sec): 5364.68 - samples/sec: 24.66 - lr: 0.000005
2023-04-13 00:18:33,720 epoch 2 - iter 1248/2081 - loss 2.19928067 - time (sec): 6486.78 - samples/sec: 24.47 - lr: 0.000005
2023-04-13 00:36:30,491 epoch 2 - iter 1456/2081 - loss 2.19414190 - time (sec): 7563.55 - samples/sec: 24.47 - lr: 0.000005
2023

100%|████████████████████████████████████████████████████████████████████████████| 479/479 [12:42<00:00,  1.59s/it]

2023-04-13 01:42:22,729 Evaluating as a multi-label problem: False
2023-04-13 01:42:22,741 DEV : loss 1.5603611469268799 - f1-score (micro avg)  0.0
2023-04-13 01:42:22,775 ----------------------------------------------------------------------------------------------------





2023-04-13 02:00:00,646 epoch 3 - iter 208/2081 - loss 2.01750472 - time (sec): 1057.87 - samples/sec: 24.46 - lr: 0.000005
2023-04-13 02:17:14,810 epoch 3 - iter 416/2081 - loss 1.99286675 - time (sec): 2092.03 - samples/sec: 25.14 - lr: 0.000005
2023-04-13 02:34:27,527 epoch 3 - iter 624/2081 - loss 1.98089556 - time (sec): 3124.75 - samples/sec: 25.06 - lr: 0.000005
2023-04-13 02:52:27,435 epoch 3 - iter 832/2081 - loss 1.96851395 - time (sec): 4204.66 - samples/sec: 25.49 - lr: 0.000005
2023-04-13 03:09:41,414 epoch 3 - iter 1040/2081 - loss 1.95118349 - time (sec): 5238.64 - samples/sec: 25.44 - lr: 0.000005
2023-04-13 03:26:55,126 epoch 3 - iter 1248/2081 - loss 1.93518053 - time (sec): 6272.35 - samples/sec: 25.38 - lr: 0.000005
2023-04-13 03:44:08,409 epoch 3 - iter 1456/2081 - loss 1.92197030 - time (sec): 7305.63 - samples/sec: 25.41 - lr: 0.000005
2023-04-13 04:01:13,795 epoch 3 - iter 1664/2081 - loss 1.90935814 - time (sec): 8331.02 - samples/sec: 25.38 - lr: 0.000005
2023

100%|████████████████████████████████████████████████████████████████████████████| 479/479 [12:47<00:00,  1.60s/it]

2023-04-13 04:48:31,933 Evaluating as a multi-label problem: False
2023-04-13 04:48:31,946 DEV : loss 1.327837586402893 - f1-score (micro avg)  0.0
2023-04-13 04:48:31,963 ----------------------------------------------------------------------------------------------------





2023-04-13 05:05:56,491 epoch 4 - iter 208/2081 - loss 1.69775915 - time (sec): 1044.53 - samples/sec: 26.24 - lr: 0.000005
2023-04-13 05:23:30,837 epoch 4 - iter 416/2081 - loss 1.69406585 - time (sec): 2098.87 - samples/sec: 25.37 - lr: 0.000005
2023-04-13 05:40:51,874 epoch 4 - iter 624/2081 - loss 1.67506003 - time (sec): 3139.91 - samples/sec: 25.38 - lr: 0.000005
2023-04-13 05:58:13,915 epoch 4 - iter 832/2081 - loss 1.65080314 - time (sec): 4181.95 - samples/sec: 25.45 - lr: 0.000005
2023-04-13 06:15:33,408 epoch 4 - iter 1040/2081 - loss 1.64316478 - time (sec): 5221.44 - samples/sec: 25.38 - lr: 0.000005
2023-04-13 06:33:10,052 epoch 4 - iter 1248/2081 - loss 1.62819633 - time (sec): 6278.09 - samples/sec: 25.44 - lr: 0.000005
2023-04-13 06:50:22,334 epoch 4 - iter 1456/2081 - loss 1.61147835 - time (sec): 7310.37 - samples/sec: 25.26 - lr: 0.000005
2023-04-13 07:07:35,866 epoch 4 - iter 1664/2081 - loss 1.59662599 - time (sec): 8343.90 - samples/sec: 25.32 - lr: 0.000005
2023

100%|████████████████████████████████████████████████████████████████████████████| 479/479 [12:48<00:00,  1.60s/it]

2023-04-13 07:55:08,130 Evaluating as a multi-label problem: False
2023-04-13 07:55:08,138 DEV : loss 1.073560357093811 - f1-score (micro avg)  0.0
2023-04-13 07:55:08,172 ----------------------------------------------------------------------------------------------------





2023-04-13 08:12:38,855 epoch 5 - iter 208/2081 - loss 1.42615114 - time (sec): 1050.68 - samples/sec: 24.90 - lr: 0.000004
2023-04-13 08:29:58,391 epoch 5 - iter 416/2081 - loss 1.40674360 - time (sec): 2090.22 - samples/sec: 25.05 - lr: 0.000004
2023-04-13 08:47:19,465 epoch 5 - iter 624/2081 - loss 1.38206853 - time (sec): 3131.29 - samples/sec: 25.21 - lr: 0.000004
2023-04-13 09:04:30,536 epoch 5 - iter 832/2081 - loss 1.36807796 - time (sec): 4162.36 - samples/sec: 25.15 - lr: 0.000004
2023-04-13 09:21:55,519 epoch 5 - iter 1040/2081 - loss 1.35751983 - time (sec): 5207.35 - samples/sec: 25.15 - lr: 0.000004
2023-04-13 09:39:26,539 epoch 5 - iter 1248/2081 - loss 1.35280917 - time (sec): 6258.37 - samples/sec: 25.40 - lr: 0.000004
2023-04-13 09:56:44,542 epoch 5 - iter 1456/2081 - loss 1.34392744 - time (sec): 7296.37 - samples/sec: 25.41 - lr: 0.000004
2023-04-13 10:13:48,398 epoch 5 - iter 1664/2081 - loss 1.33398492 - time (sec): 8320.23 - samples/sec: 25.50 - lr: 0.000004
2023

100%|████████████████████████████████████████████████████████████████████████████| 479/479 [12:39<00:00,  1.59s/it]

2023-04-13 11:00:43,344 Evaluating as a multi-label problem: False
2023-04-13 11:00:43,360 DEV : loss 0.9133843183517456 - f1-score (micro avg)  0.0
2023-04-13 11:00:43,376 ----------------------------------------------------------------------------------------------------





2023-04-13 11:18:14,633 epoch 6 - iter 208/2081 - loss 1.20566480 - time (sec): 1051.26 - samples/sec: 25.31 - lr: 0.000004
2023-04-13 11:35:28,594 epoch 6 - iter 416/2081 - loss 1.19668491 - time (sec): 2085.22 - samples/sec: 25.46 - lr: 0.000004
2023-04-13 11:52:54,586 epoch 6 - iter 624/2081 - loss 1.19965570 - time (sec): 3131.21 - samples/sec: 25.72 - lr: 0.000004
2023-04-13 12:09:47,920 epoch 6 - iter 832/2081 - loss 1.19452822 - time (sec): 4144.54 - samples/sec: 25.64 - lr: 0.000004
2023-04-13 12:27:38,963 epoch 6 - iter 1040/2081 - loss 1.18402255 - time (sec): 5215.59 - samples/sec: 25.25 - lr: 0.000004
2023-04-13 12:44:58,913 epoch 6 - iter 1248/2081 - loss 1.17920197 - time (sec): 6255.54 - samples/sec: 25.24 - lr: 0.000004
2023-04-13 13:02:22,911 epoch 6 - iter 1456/2081 - loss 1.17108408 - time (sec): 7299.53 - samples/sec: 25.18 - lr: 0.000004
2023-04-13 13:20:04,168 epoch 6 - iter 1664/2081 - loss 1.16540070 - time (sec): 8360.79 - samples/sec: 25.18 - lr: 0.000004
2023

100%|████████████████████████████████████████████████████████████████████████████| 380/380 [10:31<00:00,  1.66s/it]

2023-04-13 13:39:28,534 Evaluating as a multi-label problem: False
2023-04-13 13:39:28,541 0.0	0.0	0.0	0.0
2023-04-13 13:39:28,541 
Results:
- F-score (micro) 0.0
- F-score (macro) 0.0
- Accuracy 0.0

By class:
              precision    recall  f1-score   support

         ORG     0.0000    0.0000    0.0000    1400.0
         LOC     0.0000    0.0000    0.0000    1084.0
         PER     0.0000    0.0000    0.0000     735.0
        MISC     0.0000    0.0000    0.0000     340.0

   micro avg     0.0000    0.0000    0.0000    3559.0
   macro avg     0.0000    0.0000    0.0000    3559.0
weighted avg     0.0000    0.0000    0.0000    3559.0

2023-04-13 13:39:28,541 ----------------------------------------------------------------------------------------------------





{'test_score': 0.0,
 'dev_score_history': [0.0004581901489117984, 0.0, 0.0, 0.0, 0.0],
 'train_loss_history': [2.5066015035236564,
  2.1494552411330825,
  1.88082416773185,
  1.5650048094063327,
  1.3150867901417596],
 'dev_loss_history': [1.8872603178024292,
  1.5603611469268799,
  1.327837586402893,
  1.073560357093811,
  0.9133843183517456]}