In [1]:
from transformers import AutoModel, AutoTokenizer

In [2]:
checkpoint = "hfl/chinese-bert-wwm-ext"

In [3]:
model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at hfl/chinese-bert-wwm-ext were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
model.embeddings.word_embeddings

Embedding(21128, 768, padding_idx=0)

In [6]:
tokenizer.vocab

{'##涡': 16937,
 '碴': 4824,
 '腑': 5577,
 'ー': 645,
 '##ol': 8798,
 '潜': 4052,
 '##縛': 18293,
 'ic': 8577,
 '##り': 8506,
 '##ston': 10229,
 'hr': 8967,
 '##明': 16266,
 '##理': 17472,
 '##揩': 16054,
 '##闹': 20374,
 '桩': 3445,
 '╠': 442,
 '##0mm': 12483,
 '飚': 7605,
 '又': 1348,
 '2004': 8258,
 '呂': 1436,
 '彧': 2505,
 '镌': 7254,
 '内': 1079,
 '##uan': 9680,
 '##can': 12632,
 'ᆯ': 323,
 '架': 3373,
 '##刎': 14208,
 '徬': 2543,
 '沅': 3754,
 '矢': 4759,
 'email': 8307,
 '##45': 9039,
 '簿': 5089,
 '鐳': 7135,
 '##卟': 14360,
 '##怦': 15651,
 '##蛭': 19093,
 '黝': 7952,
 '##之': 13779,
 '##匯': 14331,
 '##敛': 16194,
 '##鋁': 20136,
 '##飢': 20666,
 '##．': 21082,
 '来': 3341,
 '碗': 4813,
 '坑': 1778,
 '##搜': 16074,
 '念': 2573,
 '##甚': 17550,
 '##鸿': 20953,
 '##汀': 16779,
 '顺': 7556,
 '##秧': 17970,
 '##希': 15418,
 '##衞': 19186,
 '##鏢': 20186,
 '焊': 4184,
 '##舉': 18704,
 '籁': 5090,
 '驥': 7715,
 '來': 889,
 '##甬': 17560,
 '##镍': 20312,
 '痢': 4581,
 '宕': 2133,
 '绥': 5324,
 'ing': 10139,
 '##擼': 16157,
 '##曙': 16339,
 

In [91]:
import pandas as pd
xinhua_words = pd.read_csv("../data/xinhua2.csv")

xinhua_words = xinhua_words.word.values

xinhua_words = set(xinhua_words)

vocab = tokenizer.vocab

vocab = set(vocab.keys())

clean_xinhua = []

def all_true(word):
    for ch in word:
        if ch not in vocab:
            return False
    return True

for word in xinhua_words:
    if all_true(word):
        clean_xinhua.append(word)
len(clean_xinhua)

234095

新华词典 common words

In [38]:
len(clean_xinhua)

264037

Make a Chinese Word Embeddings dict

In [73]:
def get_chinese_word_embeddings(word, embeddings, tokenizer):
    word_ids = tokenizer.convert_tokens_to_ids([*word])
    for word_id in word_ids:
        assert word_id != 100, word
    embed = embeddings[word_ids].mean(axis=0)
    return embed

# test
assert get_chinese_word_embeddings("中国", model.embeddings.word_embeddings.weight, tokenizer).size()[0] == 768

In [127]:
tokenizer.convert_tokens_to_ids([*clean_xinhua[0]])

[1538, 7556, 722]

In [132]:
xinhua_embeddings = {}
for word in clean_xinhua:
    xinhua_embeddings[word] = get_chinese_word_embeddings(word, model.embeddings.word_embeddings.weight, tokenizer)


In [95]:
def cosine_recall(example_embedding, original_embeddings, K=10):
    # calculate cosine distance
    cosine_sim = torch.nn.CosineSimilarity(dim=1, eps=1e-08)
    indexes = []
    distances = []
    for embed in example_embedding:
        cosine_similarity_of_example = cosine_sim(original_embeddings, embed)
        index = cosine_similarity_of_example.argsort(descending=True)[1:K + 1]
        distance = cosine_similarity_of_example[index]
        # print("cosine distance: ", cosine_similarity_of_example[index])
        indexes.append(index)
        distances.append(distance)
    return torch.stack(indexes), torch.stack(distances)

In [96]:
def sim_matrix(a, b, eps=1e-8):
    """
    added eps for numerical stability
    """
    a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None]
    a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n))
    b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n))
    sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1))
    return sim_mt

In [102]:
import torch

In [151]:
def recall_nearest_k(input_word, embed_dict, k=10):
    embed = embed_dict[input_word]
    nearest = float('-inf')
    res = ""
    count = 0 
    for word, value in embed_dict.items():
        distance = torch.nn.functional.cosine_similarity(embed.unsqueeze(0), value.unsqueeze(0))
        if distance > nearest:
            res = word
            nearest = distance
            print(word)
            if word == input_word:
                break
    return res
# test
recall_nearest_k('作业', xinhua_embeddings)

唐顺之
略传
信节
乡壁虚造
口业
实业
业行
作
作业


'作业'