In [19]:
from transformers import AutoModel, AutoTokenizer

In [20]:
checkpoint = "hfl/chinese-bert-wwm-ext"

In [21]:
model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at hfl/chinese-bert-wwm-ext were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [23]:
model.embeddings.word_embeddings

Embedding(21128, 768, padding_idx=0)

In [24]:
tokenizer.vocab

{'举': 715,
 '371': 12584,
 '瑄': 4440,
 'archives': 12318,
 '麼': 7938,
 '##gion': 13258,
 '##秣': 17967,
 '##复': 14965,
 '##诋': 19460,
 '[CLS]': 101,
 '︶': 7994,
 '婷': 2051,
 '茼': 5766,
 '##yy': 12642,
 'bear': 13265,
 '乩': 742,
 '##τ': 13397,
 '##蚪': 19077,
 '[unused1]': 1,
 '▽': 469,
 '嗚': 1627,
 '##day': 8758,
 '墓': 1867,
 '6a': 11692,
 '琦': 4425,
 '闰': 7311,
 'bh': 11030,
 '6gb': 12324,
 'am09': 12377,
 'スタッフ': 9903,
 '基': 1825,
 '399': 9612,
 '##顷': 20611,
 '呋': 1441,
 '##note': 13062,
 '秦': 4912,
 '##劵': 14286,
 '##淦': 16968,
 '##ily': 11779,
 '##奄': 14992,
 '##癢': 17680,
 '拂': 2855,
 '筹': 5040,
 '##蛟': 19091,
 '湖': 3959,
 '险': 7372,
 '##摧': 16096,
 '丁': 672,
 '緞': 5223,
 '饪': 7647,
 '搵': 3023,
 'fgo': 11401,
 'positioning': 11187,
 'sao': 12053,
 'rainer': 13076,
 '伸': 847,
 '¥': 175,
 '1010': 13266,
 '##渭': 17005,
 '珀': 4391,
 '##訳': 19316,
 'ы': 257,
 '##州': 15393,
 'られた': 11512,
 '萦': 5853,
 '##囑': 14777,
 '涝': 3876,
 '##93': 9676,
 'r9': 12674,
 '卢': 1306,
 '啊': 1557,
 '##熵': 

In [25]:
import pandas as pd
xinhua_words = pd.read_csv("../data/xinhua2.csv")

xinhua_words = xinhua_words.word.values

xinhua_words = set(xinhua_words)

vocab = tokenizer.vocab

vocab = set(vocab.keys())

clean_xinhua = []

def all_true(word):
    for ch in word:
        if ch not in vocab:
            return False
    return True

for word in xinhua_words:
    if all_true(word):
        clean_xinhua.append(word)
len(clean_xinhua)

234095

新华词典 common words

In [26]:
len(clean_xinhua)

234095

Make a Chinese Word Embeddings dict

In [27]:
def get_chinese_word_embeddings(word, embeddings, tokenizer):
    word_ids = tokenizer.convert_tokens_to_ids([*word])
    for word_id in word_ids:
        assert word_id != 100, word
    embed = embeddings[word_ids].mean(axis=0)
    return embed

# test
assert get_chinese_word_embeddings("中国", model.embeddings.word_embeddings.weight, tokenizer).size()[0] == 768

In [28]:
tokenizer.convert_tokens_to_ids([*clean_xinhua[0]])

[7498, 6792]

In [29]:
xinhua_embeddings = {}
for word in clean_xinhua:
    xinhua_embeddings[word] = get_chinese_word_embeddings(word, model.embeddings.word_embeddings.weight, tokenizer)


In [30]:
def cosine_recall(example_embedding, original_embeddings, K=10):
    # calculate cosine distance
    cosine_sim = torch.nn.CosineSimilarity(dim=1, eps=1e-08)
    indexes = []
    distances = []
    for embed in example_embedding:
        cosine_similarity_of_example = cosine_sim(original_embeddings, embed)
        index = cosine_similarity_of_example.argsort(descending=True)[1:K + 1]
        distance = cosine_similarity_of_example[index]
        # print("cosine distance: ", cosine_similarity_of_example[index])
        indexes.append(index)
        distances.append(distance)
    return torch.stack(indexes), torch.stack(distances)

In [31]:
def sim_matrix(a, b, eps=1e-8):
    """
    added eps for numerical stability
    """
    a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None]
    a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n))
    b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n))
    sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1))
    return sim_mt

In [32]:
import torch

In [33]:
import random
def recall_nearest_k(input_word, embed_dict, k=10):
    embed = embed_dict[input_word]
    nearest = float('-inf')
    res = ""
    count = 0 
    for word, value in embed_dict.items():
        distance = torch.nn.functional.cosine_similarity(embed.unsqueeze(0), value.unsqueeze(0))
        if word == input_word:
            continue
        if distance > nearest:
            res = word
            nearest = distance
            print(word, nearest)
       
    return res
# test
recall_nearest_k('女人', xinhua_embeddings)

鞭辟 tensor([-0.1029], grad_fn=<DivBackward0>)
月午 tensor([0.0878], grad_fn=<DivBackward0>)
暴兵 tensor([0.0974], grad_fn=<DivBackward0>)
笑影 tensor([0.1128], grad_fn=<DivBackward0>)
名察 tensor([0.1235], grad_fn=<DivBackward0>)
胆子 tensor([0.1319], grad_fn=<DivBackward0>)
时行病 tensor([0.1721], grad_fn=<DivBackward0>)
过子 tensor([0.1798], grad_fn=<DivBackward0>)
斗母元君 tensor([0.1989], grad_fn=<DivBackward0>)
遗女 tensor([0.5112], grad_fn=<DivBackward0>)
下女 tensor([0.6341], grad_fn=<DivBackward0>)
半男女 tensor([0.6487], grad_fn=<DivBackward0>)
女性 tensor([0.6489], grad_fn=<DivBackward0>)
小女 tensor([0.6535], grad_fn=<DivBackward0>)
女人拜 tensor([0.7528], grad_fn=<DivBackward0>)
男人 tensor([0.8264], grad_fn=<DivBackward0>)


'男人'

算法的问题在于，对于中文的词组，相近的含义变成了必须是包含有当前字的意思。如果只是简单的取平均，那么对于embeddings而言，这两个字对于词意的贡献是一样的

$$\text{embedding} = \frac{(\text{embedding}_a + \text{embedding}_b)}{2}$$

这样看来就是不合理的，需要重新调配每个 token 的 weight，所以对于中文而言，提取词语的 embeddings，应该是利用 bert 的最后一层 layer 的输出。