# Tasks
- [x] 提取模长比较短的词.

In [1]:
from transformers import AutoModel

In [2]:
chekpoint = "bert-base-uncased"

In [3]:
model = AutoModel.from_pretrained(chekpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
model.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

## 为什么word embedding的输入是30522? 回答: 是字典的长度

In [5]:
from transformers import AutoTokenizer

In [6]:
tokenizer = AutoTokenizer.from_pretrained(chekpoint)

In [7]:
text = "I'm using bert."

In [8]:
tokenizer(text)

{'input_ids': [101, 1045, 1005, 1049, 2478, 14324, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
len(tokenizer.vocab)

30522

词典的个数就是 word_embeddings 的深度.

In [10]:
inputs = tokenizer(text)

In [11]:
tokenizer.save_vocabulary("./")

('./vocab.txt',)

# 拿出norm的值比较低的看一下.
1. 计算所有的vocabulary的norm的值.
2. 排序之后查看norm最大的10个对应的单词.

In [12]:
model.embeddings.state_dict().keys()

odict_keys(['position_ids', 'word_embeddings.weight', 'position_embeddings.weight', 'token_type_embeddings.weight', 'LayerNorm.weight', 'LayerNorm.bias'])

In [13]:
embeddings_weights = model.embeddings.state_dict()['word_embeddings.weight']

In [14]:
# calculate norms for embeddings.
words_norms = embeddings_weights.norm(dim=1)

In [47]:
# topk
K=20
indexes = words_norms.argsort(descending=True)[:K]
# change tokens 
tokens = tokenizer.convert_ids_to_tokens(indexes)
print(f"There are {len(tokens)} with longest norms.")
tokenizer.convert_tokens_to_string(tokens)
for token in tokens:
    print(tokenizer.convert_tokens_to_string([token]))

There are 20 with longest norms.
670
##omba
##rdon
[CLS]
##anor
##lho
840
##lland
930
690
##onte
740
##dna
910
570
381
##gall
##ango
##ibar
##oles


In [48]:
# topk
K=20
indexes = words_norms.argsort(descending=False)[:K]
# change tokens 
tokens = tokenizer.convert_ids_to_tokens(indexes)
print(f"There are {len(tokens)} with shortest norms.")
tokenizer.convert_tokens_to_string(tokens)
for token in tokens:
    print(tokenizer.convert_tokens_to_string([token]))

There are 20 with shortest norms.
[SEP]
.
;
the
,
of
his
(
in
her
"
a
is
several
at
john
william
with
was
him


更长的模长的单词确实都不知道是什么意思, 除了“[CLS]”这个token, 最短的模长的单词都是常见词.

模长 | 意思
-- | -- 
长 | 看不懂
短 | 常见词

- [x] TODO: 找到50个低频词汇和字典的交集
不需要作这一步, 这一步是循环论证了.

# 构建训练数据集
构建数据集需要在已有的vocabulary里面找到在wordnet对应的单词.
1. wordnet 和 bert 的 vocabulary 求交集.
2. 求完交集之后的单词列表再对 imdb 数据集筛选.

In [50]:
from typing import Counter
from datasets import load_dataset


def word_dict() -> dict:
    data_files = "/diskb/houbowei/clever_nlp/core-wordnet.txt"

    def _parse_words(words: list):
        word = words[1]
        word = word.strip("[]")
        explanation = ' '.join(words[2:]).strip()
        return word, explanation

    wordnet = {}
    with open(data_files, 'r') as f:
        text = f.readline()
        while text:
            words = text.split()
            assert len(words) >= 3, f"{words}"
            word, explanation = _parse_words(words[1:])
            if explanation and word:
                wordnet[word] = explanation
            text = f.readline()
        print("DONE")
    return wordnet


imdb_dataset = load_dataset("imdb")

Reusing dataset imdb (/diskb/houbowei/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [51]:
wordnet = word_dict()

DONE


In [55]:
words_wordnet = list(wordnet.keys())

In [60]:
words_bert = list(tokenizer.vocab.keys())

In [70]:
print(f"there are {len(words_wordnet)} in wordnet")

there are 3266 in wordnet


In [72]:
print(f"there are {len(words_bert)} in bert.")

there are 30522 in bert.


In [68]:
common_words = set(words_bert) & set(words_wordnet)

In [73]:
print(f"there are {len(common_words)} in wordnet and bert")

there are 3044 in wordnet and bert


- [ ] 由双方共有的单词表来构建训练数据集

In [76]:
imdb_dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

- [ ] 寻找common words里面的低频词汇

In [95]:
# words -> tokens --> norms
common_words_norms = {}
for word in common_words:
    token = tokenizer.tokenize(word)
    i = tokenizer.convert_tokens_to_ids(token)
    common_words_norms[word] = words_norms[i].numpy()

上一步得到了单词表里面的单词的词频, 下一步需要计算下模长最长的单词是啥.

In [104]:
jjj

{'gallon': array([1.6540791], dtype=float32),
 'sock': array([1.6361235], dtype=float32),
 'wrestle': array([1.6341456], dtype=float32),
 'shave': array([1.623518], dtype=float32),
 'devote': array([1.6151217], dtype=float32),
 'plead': array([1.5917], dtype=float32),
 'kettle': array([1.5911119], dtype=float32),
 'preach': array([1.5900376], dtype=float32),
 'weigh': array([1.5883232], dtype=float32),
 'spoil': array([1.5873092], dtype=float32),
 'tread': array([1.5854286], dtype=float32),
 'owe': array([1.5769775], dtype=float32),
 'bracket': array([1.5668452], dtype=float32),
 'scramble': array([1.5654179], dtype=float32),
 'courtesy': array([1.5640211], dtype=float32),
 'casualty': array([1.5632261], dtype=float32),
 'vain': array([1.5607172], dtype=float32),
 'appendix': array([1.5547643], dtype=float32),
 'chew': array([1.5537211], dtype=float32),
 'coma': array([1.5535696], dtype=float32),
 'deprivation': array([1.5535201], dtype=float32),
 'budge': array([1.5510107], dtype=floa

owe确实在wordnet里面, 不过这些单词我都认识呀....不过这样也确实证明了几个事情
1. embedding的模长没有取错.

现在拥有了一个低频(低模)的词典, 并且这个词典里面的词都是bert的语料里面出现过的.
现在我需要训练一个网络, 让这些低频词的模长发生变化, 集体变小.

- [ ] TODO: 利用wordnet的字典做一个recnn的net.
- [ ] 利用pandas做一个transformer的数据集, 先做train, 里面包含 features 是 [explanation] [word] [embeddings]

In [140]:
# Select common words from wordnet, common words is the word in bert.
common_words_dict = {k:v for k, v in wordnet.items() if k in common_words}

In [165]:
import pandas as pd
wordnet_series = pd.DataFrame({'word':common_words_dict.keys(), 'explanation':common_words_dict.values()}, index=None)

In [168]:
# change series to transformer's dataset
# 1. create txt wordnet
wordnet_series.to_csv("wordnet.csv", index=None)

In [199]:
wordnet_dataset = load_dataset('csv', data_files="wordnet.csv")

Using custom data configuration default-6904515b038bdcdf
Reusing dataset csv (/diskb/houbowei/.cache/huggingface/datasets/csv/default-6904515b038bdcdf/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

# 利用数据集训练网络
- [ ] 构建一个网络

In [279]:
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertModel, BertTokenizer


class DictNet(nn.Module):
    """Some Information about CleverNLP"""
    def __init__(self, model='bert-base-uncased', device='cuda'):
        super(DictNet, self).__init__()
        self.device = device
        self.bert = BertModel.from_pretrained(model)
        self.recnn = torch.nn.Sequential(nn.Linear(768, 768),
                                         nn.Linear(768, 768),
                                         nn.Linear(768, 768))

    def mean_pooling(self, model_output):
        return model_output.mean(axis=0)

    def forward(self, explanation):
        print(explanation)
        with torch.no_grad():
            explanation = self.bert(**explanation)[0][0]
        recnn_output = self.recnn(explanation)
        pred_embed = self.mean_pooling(recnn_output)
        return pred_embed

model = DictNet()

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /diskb/houbowei/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin f

In [280]:
# test single input
single_input = tokenizer("test", return_tensors='pt')
model(single_input).shape

{'input_ids': tensor([[ 101, 3231,  102]]), 'token_type_ids': tensor([[0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1]])}


torch.Size([768])

- [ ] test multi inputs(outputs is wrong, should be batched.)
``` python
mul_inputs = tokenizer(["I am a bird."], ["He is a chicken."], return_tensors='pt')
model(mul_inputs).shape
```

In [348]:
%pdb

Automatic pdb calling has been turned OFF


In [310]:
embeddings_weights.shape

torch.Size([30522, 768])

In [312]:
tokenizer.convert_tokens_to_ids('owl')

13547

In [375]:
# batched inputs
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors='pt')

def tokenize_function(example):
    res = tokenizer(example['explanation'], truncation=True, padding=True)
    res['word_ids']=tokenizer.convert_tokens_to_ids(example['word'])
    return res

tokenized_dataset = wordnet_dataset.map(tokenize_function, batched=True)

tokenized_dataset = tokenized_dataset.remove_columns(['explanation', 'word'])

data_collator([tokenized_dataset['train'][i] for i in range(8)])

Loading cached processed dataset at /diskb/houbowei/.cache/huggingface/datasets/csv/default-6904515b038bdcdf/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-456a1c2a22a7d084.arrow


{'attention_mask': tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'input_ids': tensor([[  101,  5214,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  3819,  2030,  3143,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  4493,  2069,  1999,  1996,  2568,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101, 20228,  4765, 18424,   102,     0,     0,     0,

In [377]:
for k in tokenized_dataset['train']:
    assert len(k['input_ids']) > 0

In [378]:
tokenized_dataset['train'][904]

{'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'input_ids': [101, 3013, 2460, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'word_ids': 10416}

In [379]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="recnn_test",
    per_device_train_batch_size=8,
    num_train_epochs=1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
#     data_collator=data_collator
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `DictNet.forward` and have been ignored: word_ids, attention_mask, token_type_ids, input_ids.
***** Running training *****
  Num examples = 0
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 48


IndexError: Invalid key: 904 is out of bounds for size 0

In [380]:
from torch.utils.data import DataLoader

tokenized_dataset.set_format("torch")
train_dataloader = DataLoader(tokenized_dataset['train'], shuffle=True, batch_size=8)

optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

from transformers import get_scheduler

num_epochs =3
num_train_step = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_step
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model.to(device)

from tqdm.auto import tqdm

progress_bar = tqdm(range(num_train_step))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        print(batch)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/1143 [00:00<?, ?it/s]

RuntimeError: stack expects each tensor to be equal size, but got [21] at entry 0 and [18] at entry 1