In [1]:
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

import jieba

import torch

from transformers import AutoTokenizer, AutoModel
# checkpoint = "bert-base-uncased"
checkpoint = "hfl/chinese-bert-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at hfl/chinese-bert-wwm-ext were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
word_embeddings = model.base_model.embeddings.word_embeddings.weight

计算按照单词的tokens来分词的embeddings的相似度，以及通过模型前向之后的embeddings的相似度

```python
def take_ids_from_bert_input(tokens, positions)->embeddings
def take_ids_from_bert_output(tokens, positions)->embeddings
def cosine_similarity(embeddings)
```

In [3]:
def cut_sentence_into_words(sentence)->list:
    """对句子分词
    
    注意：英文也会作为一个词组被分出来。
    """
    words = jieba.cut(sentence)
    words = filter(lambda x: len(x) > 1, words)
    return list(set(words))
# test
cut_sentence_into_words("我爱北京北京北京天安门")

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.643 seconds.
Prefix dict has been built successfully.


['北京', '天安门']

In [4]:
def get_words_position_in_tokens(tokens, word):
    length = len(word)
    positions = []
    for character in word:
        for position, token in enumerate(tokens):
            if character == token:
                positions.append(position)
                break
    assert len(word) == len(positions), f"length of {word} in {tokens} not equal to {positions}"
    return positions
# test
get_words_position_in_tokens("I love 我北京北京北京天安门北京。", "天安门")

[14, 15, 16]

In [5]:
def is_chinese(word):
    for ch in word:
        if not (u'\u4e00' <= ch <= u'\u9fff'):
            return False
    return True

In [6]:
def get_tokens_positions(sentence)->tuple:
    """提取中文词汇在sentence转变成tokens后的position位置。

    输入：
       sentence（str） ：我爱北京天安门。
    输出：
        （word, word_ids, word_positions）
    """
    words = cut_sentence_into_words(sentence)
    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sentence))

    for word in words:
        if is_chinese(word):
            word_ids = tokenizer.convert_tokens_to_ids([*word])
            word_positions = get_words_position_in_tokens(tokens, word)
            yield word, word_ids, word_positions
# test
list(get_tokens_positions('I love china, 我爱北京北京天安门。'))

[('北京', [1266, 776], [7, 8]), ('天安门', [1921, 2128, 7305], [11, 12, 13])]

由 get_tokens_positons 得到原始 embeddings 中的单词的的位置，以及在 tokens 中的位置，分别用于 bert 的 word embeddings 和输出的 logits 之间的判断。

In [7]:
def calculate_distance(a, b):
    return cosine_distances(a, b)

In [8]:
def sim_matrix(a, b, eps=1e-8):
    """
    added eps for numerical stability
    """
    a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None]
    a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n))
    b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n))
    sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1))
    return sim_mt

In [9]:
sentence = "我爱北京天安门。"
outpus = model(**tokenizer(sentence, return_tensors='pt'))[0][0]
for word, word_ids, token_positions in get_tokens_positions(sentence):
    print(word, "\n")
    input_embeddings = word_embeddings[word_ids]
    output_embeddings = outpus[token_positions]
    print(sim_matrix(input_embeddings, input_embeddings))
    print(sim_matrix(output_embeddings, output_embeddings))

北京 

tensor([[1.0000, 0.3443],
        [0.3443, 1.0000]], grad_fn=<MmBackward0>)
tensor([[1.0000, 0.8295],
        [0.8295, 1.0000]], grad_fn=<MmBackward0>)
天安门 

tensor([[1.0000, 0.0905, 0.0704],
        [0.0905, 1.0000, 0.0806],
        [0.0704, 0.0806, 1.0000]], grad_fn=<MmBackward0>)
tensor([[1.0000, 0.7364, 0.6571],
        [0.7364, 1.0000, 0.7519],
        [0.6571, 0.7519, 1.0000]], grad_fn=<MmBackward0>)


结论： 单单对于天安门，北京这两个词，训练之后的语意是相近的。

TODO list
- [ ] 对中文切词的时候，可能存在UNK token。

对于中文新华词典数据集里面的所有definition，计算一下 cosine similarity

In [10]:
import datasets

In [11]:
dataset = datasets.load_dataset("csv", data_files=["../data/xinhua2.csv"])

Using custom data configuration default-405a8ea61a95b513
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-405a8ea61a95b513/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
dataset = dataset.remove_columns('Unnamed: 0')

- [x] 对数据集塞选，保证 definition 里面的词必须出现在 tokenizer.vacab 里面
- [x] 对数据集塞选，保证 word 里面的词必须出现在 tokenizer.vacab 里面

In [19]:
vocab = tokenizer.vocab
def filter_function(x):
    for word in x.values():
        for ch in word:
            if ch not in vocab:
                return False
    return True
# test
assert filter_function({"word": "我们", "definition": "我们几个."}) 
assert filter_function({"word": "我们", "definition": "我们衽个."}) is False

In [16]:
dataset = dataset.filter(filter_function)

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-405a8ea61a95b513/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-eaf1b622cd38c32f.arrow


保证完之后，只剩下 284976 个词条。

In [73]:
for sentence in dataset['train']['definition']:
    print(sentence)
    outpus = model(**tokenizer(sentence, return_tensors='pt'))[0][0]
    for word, word_ids, token_positions in get_tokens_positions(sentence):
        print(word)
        input_embeddings = word_embeddings[word_ids]
        output_embeddings = outpus[token_positions]
        input_sim = sim_matrix(input_embeddings, input_embeddings)
        print(input_sim)
        output_sim = sim_matrix(output_embeddings, output_embeddings)
        print(output_sim)
        # if ((output_sim+0.00001) < input_sim).sum() != 0:
            # print(word)
    break

帝王的诏书﹑制令。
诏书
tensor([[1.0000, 0.1497],
        [0.1497, 1.0000]], grad_fn=<MmBackward0>)
tensor([[1.0000, 0.7741],
        [0.7741, 1.0000]], grad_fn=<MmBackward0>)
帝王
tensor([[1.0000, 0.3293],
        [0.3293, 1.0000]], grad_fn=<MmBackward0>)
tensor([[1.0000, 0.8300],
        [0.8300, 1.0000]], grad_fn=<MmBackward0>)
制令
tensor([[1.0000, 0.1327],
        [0.1327, 1.0000]], grad_fn=<MmBackward0>)
tensor([[1.0000, 0.6989],
        [0.6989, 1.0000]], grad_fn=<MmBackward0>)


还需要验证一点：是不是只要是经过 bert 的 tokens 的距离都会拉近。

设计实验：
1. 判断是不是只要经过bert，向量的距离就会变近
2. 输入：任意definition，任意token。

In [86]:
test_case = "中国天安门。"
inputs = tokenizer(test_case, return_tensors='pt')
print(tokenizer.convert_ids_to_tokens(*inputs['input_ids']))
output_embed = model(**inputs)[0][0]

input_ids = tokenizer.encode(test_case)

input_embed = word_embeddings[input_ids]

['[CLS]', '中', '国', '天', '安', '门', '。', '[SEP]']


In [87]:
sim_matrix(input_embed, input_embed).detach().cpu().numpy()

array([[ 1.0000001 ,  0.01460356,  0.09307431,  0.01304194, -0.05687577,
        -0.02262287,  0.34448862,  0.5507547 ],
       [ 0.01460356,  0.99999964,  0.16015863,  0.12534174,  0.06934987,
         0.03584412,  0.16264743,  0.07302652],
       [ 0.09307431,  0.16015863,  1.0000008 ,  0.09747883,  0.05757412,
         0.1834615 ,  0.12305939,  0.10989287],
       [ 0.01304194,  0.12534174,  0.09747883,  0.9999992 ,  0.09052134,
         0.07040579,  0.08258034,  0.04834021],
       [-0.05687577,  0.06934987,  0.05757412,  0.09052134,  0.9999995 ,
         0.08060817,  0.02444113, -0.00397609],
       [-0.02262287,  0.03584412,  0.1834615 ,  0.07040579,  0.08060817,
         1.0000001 ,  0.01778734,  0.02921772],
       [ 0.34448862,  0.16264743,  0.12305939,  0.08258034,  0.02444113,
         0.01778734,  0.99999976,  0.36590734],
       [ 0.5507547 ,  0.07302652,  0.10989287,  0.04834021, -0.00397609,
         0.02921772,  0.36590734,  1.0000006 ]], dtype=float32)

In [88]:
sim_matrix(output_embed, output_embed).detach().numpy()

array([[0.99999976, 0.47359958, 0.46130764, 0.4001449 , 0.36885065,
        0.5084616 , 0.6717285 , 0.62997663],
       [0.47359958, 1.0000005 , 0.72016007, 0.57480735, 0.57306653,
        0.55845803, 0.52736896, 0.39728093],
       [0.46130764, 0.72016007, 1.0000002 , 0.5515194 , 0.6074523 ,
        0.58084834, 0.56016934, 0.39864472],
       [0.4001449 , 0.57480735, 0.5515194 , 1.0000005 , 0.7175662 ,
        0.63551086, 0.51203763, 0.35867494],
       [0.36885065, 0.57306653, 0.6074523 , 0.7175662 , 0.9999999 ,
        0.7296189 , 0.4712883 , 0.37413248],
       [0.5084616 , 0.55845803, 0.58084834, 0.63551086, 0.7296189 ,
        0.9999994 , 0.6280204 , 0.47663143],
       [0.6717285 , 0.52736896, 0.56016934, 0.51203763, 0.4712883 ,
        0.6280204 , 0.9999999 , 0.51364434],
       [0.62997663, 0.39728093, 0.39864472, 0.35867494, 0.37413248,
        0.47663143, 0.51364434, 1.0000005 ]], dtype=float32)

从这个例子看, 经过 bert token 之间的距离都变近了，这个是合理的，距离最近的是 中， 国两个token。