In [31]:
import pandas as pd

In [32]:
# Path (relative to the notebook's working directory) of the dictionary CSV
# holding the word/definition entries loaded below.
chinese_dict_path = "../data/xinhua2.csv"

In [33]:
# Read the dictionary CSV into a DataFrame; `clean_data` is derived from it later.
data = pd.read_csv(filepath_or_buffer=chinese_dict_path)

清洗数据集
1. 去掉无用的列

In [34]:
# Drop the leftover 'Unnamed: 0' column; the raw frame in `data` stays untouched.
clean_data = data.drop(labels=['Unnamed: 0'], axis='columns')

In [35]:
# Summarise column dtypes and non-null counts of the cleaned frame.
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340290 entries, 0 to 340289
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   word        340290 non-null  object
 1   definition  340290 non-null  object
dtypes: object(2)
memory usage: 5.2+ MB


In [36]:
# Per-column summary (count / unique / top / freq) of the text columns.
clean_data.describe()

Unnamed: 0,word,definition
count,340290,340290
unique,264037,303363
top,青龙,象声词。
freq,22,360


查看有没有重复数据

In [37]:
# Fail fast if any fully identical row appears more than once.
assert not clean_data.duplicated().any()

设计dataset

In [38]:
# Character count of every definition, stored in a new 'length' column.
clean_data['length'] = clean_data['definition'].str.len()

In [39]:
# Distribution of the new 'length' column (the only numeric column in the frame).
clean_data.describe()

Unnamed: 0,length
count,340290.0
mean,16.022037
std,25.230879
min,1.0
25%,5.0
50%,8.0
75%,14.0
max,968.0


设计 dataset 类，根据释义的长度构建数据集。

判断词语的高低频

检验原始 BERT 的词向量（embeddings）是否能够区分中文词语的高频与低频。

In [40]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Checkpoint shared by the tokenizer and the model; naming it once guarantees
# both are always loaded from the same pretrained weights and vocabulary.
MODEL_NAME = "hfl/chinese-bert-wwm-ext"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Masked-LM variant of the checkpoint; its embedding matrix is inspected below.
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at hfl/chinese-bert-wwm-ext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


假设：高频词汇的词向量应该具有较大的模长（范数）。
高频词汇如何获取：
1. 通过bert的vocab先看看有多少词汇

In [41]:
# Shape of the input (token) embedding matrix.
model.get_input_embeddings().weight.shape

torch.Size([21128, 768])

In [42]:
# File names the tokenizer uses for its vocabulary files.
tokenizer.vocab_files_names

{'vocab_file': 'vocab.txt', 'tokenizer_file': 'tokenizer.json'}

In [43]:
# Encode a sample sentence, then map the ids back to tokens to see the split.
sample_ids = tokenizer.encode("我爱北京。")
tokenizer.convert_ids_to_tokens(sample_ids)

['[CLS]', '我', '爱', '北', '京', '。', '[SEP]']

构建用于训练的dataset
1. dataset 需要一个 word_ids 字段，记录被释义词语的每个字在 tokenizer 词表（vocab）中的 id。

In [44]:
# Same CSV as the one read with pandas earlier in this notebook; reuse that
# path instead of repeating the literal so the two can never diverge.
xinhua_dict = chinese_dict_path

In [45]:
import datasets

In [46]:
# Load the CSV as a Hugging Face dataset, then discard the leftover index column.
raw_xinhua = datasets.load_dataset('csv', data_files=[xinhua_dict])
xinhua_dataset = raw_xinhua.remove_columns('Unnamed: 0')

Using custom data configuration default-405a8ea61a95b513
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-405a8ea61a95b513/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

In [50]:
# Show the splits, features and row count of the loaded dataset.
xinhua_dataset

DatasetDict({
    train: Dataset({
        features: ['word', 'definition'],
        num_rows: 340290
    })
})

In [55]:
# Filter the dataset.
# Drop entries containing characters unknown to the tokenizer; this is also
# intended to drop every word that is not Chinese.
# NOTE(review): a non-Chinese character that is present in the vocab would
# still pass the filter — confirm this matches the intent.
vocab = tokenizer.vocab
def filter_unk(x, known_tokens=None):
    """Tell whether every character of every field in an example is a known token.

    Args:
        x: Mapping of column name to text, e.g. ``{'word': ..., 'definition': ...}``.
        known_tokens: Container of accepted single-character tokens. Defaults to
            the notebook-level ``vocab``.

    Returns:
        ``True`` when each character of each value is in ``known_tokens``;
        ``False`` otherwise, including when a value is missing (``None``).
    """
    if known_tokens is None:
        known_tokens = vocab
    for text in x.values():
        # A missing cell cannot be tokenized: reject the row instead of raising
        # TypeError while trying to iterate over None.
        if text is None:
            return False
        if any(ch not in known_tokens for ch in text):
            return False
    return True

# Smoke test: both characters of this word are expected to be in the vocab.
filter_unk({'word':"你好"})

True

In [57]:
# Keep only the examples that filter_unk accepts.
xinhua_dataset = xinhua_dataset.filter(function=filter_unk)

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-405a8ea61a95b513/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-27275ebe44a8b2a7.arrow


In [65]:
def map_function(example, max_length=512):
    """Tokenize one dictionary entry and attach the vocabulary ids of its word.

    Args:
        example: Row with a 'definition' string and a 'word' string.
        max_length: Upper bound on the number of tokens kept for the definition
            (special tokens included). Pass ``None`` to disable truncation.

    Returns:
        The tokenizer output for the definition, extended with a 'word_ids'
        entry holding one vocabulary id per character of the word.
    """
    definition = example['definition']
    if max_length is None:
        inputs = tokenizer(definition)
    else:
        # Definitions in this data run up to 968 characters, which is more than
        # the 512 positions a BERT model accepts, so cap the sequence length.
        inputs = tokenizer(definition, truncation=True, max_length=max_length)
    # Each character of the word is looked up as its own token.
    inputs['word_ids'] = tokenizer.convert_tokens_to_ids(list(example['word']))
    return inputs

# Keep the tokenized result instead of discarding it, then display its summary.
tokenized_dataset = xinhua_dataset.map(map_function)
tokenized_dataset

  0%|          | 0/284976 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['word', 'definition', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 284976
    })
})