# 实验：观测 bert 对 embeddings 模长的影响。

实验步骤：
- [x] 所有的definiton取出来。
- [x] definition进入bert。
- [x] 保存所有单词的输出。
- [ ] 对输出做统计。
- [ ] 得到所有单词的embeddings的norm
- [ ] 对比两个norm

## 取出所有 definition，构建 dataset

In [1]:
from dataset import build_dataset
dataset,tokenizer = build_dataset((0,2))

train_dataset = dataset['train']

# test
assert len(dataset['train']) > 0

Using custom data configuration default-61bcd1b6d4c45256
Reusing dataset csv (/diskb/houbowei/.cache/huggingface/datasets/csv/default-61bcd1b6d4c45256/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /diskb/houbowei/.cache/huggingface/datasets/csv/default-61bcd1b6d4c45256/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-b3e42a1771e2ddf3.arrow
Loading cached processed dataset at /diskb/houbowei/.cache/huggingface/datasets/csv/default-61bcd1b6d4c45256/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-87bb01b47c528e30.arrow


## definition 都进入 bert

In [2]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

collate_function = DataCollatorWithPadding(tokenizer)
dataloader = DataLoader(train_dataset, collate_fn=collate_function, batch_size=1024)

In [3]:
from transformers import AutoModel

In [4]:
bert = AutoModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
import pandas as pd
wordnet_data = pd.read_csv("./wordnet_bert_common_words.csv")

In [6]:
def words_to_definitions(words):
    global wordnet_data
    definitions = dict()
    for word in words:
        definitions[word] = str(wordnet_data[wordnet_data.words == word].definition.values).strip("[]''")
    return definitions

# test words_to_definition
words_to_definitions(["owl", "butte"])

{'owl': 'nocturnal bird of prey with hawk-like beak and claws and large head with front-facing eyes',
 'butte': 'a hill that rises abruptly from the surrounding region; has a flat top and sloping sides'}

In [14]:
import torch

word_pooler_embeddings_dict = dict()
word_hidden_embeddings_dict = dict()
bert.to('cuda')
bert.eval()

for inputs in dataloader:
    inputs = {k:v.to('cuda') for k, v in inputs.items()}
    word_ids = inputs.pop('word_ids')
    words = tokenizer.convert_ids_to_tokens(word_ids)
    with torch.no_grad():
        outputs = bert(**inputs)
        assert bert.training is False
    last_hidden_state = outputs['last_hidden_state']
    pooler_output = outputs['pooler_output']
    word_pooler_embeddings_dict.update(dict(zip(words, pooler_output.cpu().numpy())))
    word_hidden_embeddings_dict.update(dict(zip(words, last_hidden_state.mean(dim=1).cpu().numpy())))

In [17]:
# calculate norm
import numpy as np

9.197718

In [30]:
word_hidden_embeddings_df = pd.DataFrame(word_hidden_embeddings_dict).transpose()

In [31]:
word_pooler_embeddings_df = pd.DataFrame(word_pooler_embeddings_dict).transpose()

In [50]:
def calculate_norms(word_dict:dict):
    d = dict(zip(word_dict.keys(), map(lambda x: np.linalg.norm(x), word_dict.values())))
    df = pd.DataFrame(d.items(), columns=['word', 'norm'])
    return df

In [52]:
# test calculate norms
hidden_df = calculate_norms(word_hidden_embeddings_dict)

In [53]:
pooler_df = calculate_norms(word_pooler_embeddings_dict)

In [57]:
# merge two norms
hidden_pooler_df = pd.merge(hidden_df, pooler_df, on='word', suffixes=['_hidden', '_pooler'])

In [58]:
hidden_pooler_df.sample(10)

Unnamed: 0,word,norm_hidden,norm_pooler
8564,magazine,8.653953,18.064882
10833,happily,8.552767,14.256584
11565,boer,8.501917,21.474812
3869,garbage,8.859794,18.211678
13840,liturgy,8.698764,18.120737
5831,spat,9.221212,14.954306
2519,cologne,8.350233,18.897917
4683,spencer,8.036829,21.776302
10604,gubernatorial,10.012076,17.228092
13775,unfortunate,8.956439,15.803585


In [59]:
hidden_pooler_df.describe()

Unnamed: 0,norm_hidden,norm_pooler
count,14510.0,14510.0
mean,8.980854,17.567572
std,0.549578,1.974311
min,7.491759,12.548604
25%,8.593598,15.979658
50%,8.934441,17.657382
75%,9.307274,19.11482
max,12.957676,23.039471


使用 hidden 层而不是 pooler 层的输出

In [64]:
# get word embeddings from bert embeddings layer.
word_embeddings_dict = dict()

for inputs in dataloader:
    inputs = {k:v.to('cuda') for k, v in inputs.items()}
    word_ids = inputs.pop('word_ids')
    words = tokenizer.convert_ids_to_tokens(word_ids)
    embeddings = bert.embeddings.word_embeddings.state_dict()['weight'][word_ids]
    word_embeddings_dict.update(dict(zip(words, embeddings.detach().cpu().numpy())))

original_word_norm_df = calculate_norms(word_embeddings_dict)

In [67]:
res = pd.merge(original_word_norm_df, hidden_pooler_df, on='word', suffixes=['original', 'hidden'])

In [68]:
res.describe()

Unnamed: 0,norm,norm_hidden,norm_pooler
count,14510.0,14510.0,14510.0
mean,1.339519,8.980854,17.567572
std,0.151899,0.549578,1.974311
min,0.866191,7.491759,12.548604
25%,1.233141,8.593598,15.979658
50%,1.355623,8.934441,17.657382
75%,1.452249,9.307274,19.11482
max,1.823229,12.957676,23.039471
