# 词向量代码实现

In [226]:
!python -V

Python 3.11.5


In [124]:
! pip install torch==2.4.0

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x2284d8230b0>

In [3]:
word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519]],
       grad_fn=<EmbeddingBackward0>)


In [227]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# we should tokenize the input, but we will ignore that for now
# build a list of tuples.
# Each tuple is ([ word_i-CONTEXT_SIZE, ..., word_i-1 ], target word)
ngrams = [
    (
        [test_sentence[i - j - 1] for j in range(CONTEXT_SIZE)],
        test_sentence[i]
    )
    for i in range(CONTEXT_SIZE, len(test_sentence))
]
# Print the first 3, just so you can see what they look like.
print(ngrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}

[(['forty', 'When'], 'winters'), (['winters', 'forty'], 'shall'), (['shall', 'winters'], 'besiege')]


In [6]:
class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

[(['forty', 'When'], 'winters'), (['winters', 'forty'], 'shall'), (['shall', 'winters'], 'besiege')]


In [232]:
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in ngrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
    print(total_loss)
# print(losses)  # The loss decreased every iteration over the training data!

# To get the embedding of a particular word, e.g. "beauty"
print(model.embeddings.weight[word_to_ix["beauty"]])

520.0139417648315
517.8337724208832
515.665666103363
513.509486913681
511.36414885520935
509.2290771007538
507.10294556617737
504.98508381843567
502.87503957748413
500.7723731994629
tensor([ 0.7493,  1.1847,  0.5673,  0.4106, -0.8592,  0.1352, -1.7765,  1.1096,
         0.4932, -0.4825], grad_fn=<SelectBackward0>)


In [9]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = (
        [raw_text[i - j - 1] for j in range(CONTEXT_SIZE)]
        + [raw_text[i + j + 1] for j in range(CONTEXT_SIZE)]
    )
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['are', 'We', 'to', 'study'], 'about'), (['about', 'are', 'study', 'the'], 'to'), (['to', 'about', 'the', 'idea'], 'study'), (['study', 'to', 'idea', 'of'], 'the'), (['the', 'study', 'of', 'a'], 'idea')]


In [10]:
class CBOW(nn.Module):

    def __init__(self):
        pass

    def forward(self, inputs):
        pass

# Create your model and train. Here are some functions to help you make
# the data ready for use by your module.


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


make_context_vector(data[0][0], word_to_ix)  # example

tensor([15, 34, 30, 29])

# 句向量代码实现

https://github.com/cbowdon/doc2vec-pytorch

https://huggingface.co/spacy/en_core_web_sm

In [125]:
!pip install spacy==3.7.6  

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [13]:
!pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting en-core-web-sm==any
  Downloading https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB 93.5 kB/s eta 0:02:17
     ---------------------------------------- 0.0/12.8 MB 93.5 kB/s eta 0:02:17
     --------------------------------------- 0.0/12.8 MB 109.3 kB/s eta 0:01:57
     --------------------------------------- 0.1/12.8 MB 114.1 kB/s eta 0:01:52
     ----------------------------------

en_core_web_sm：English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.  
加载了英语语言的核心模型，该模型包含了一系列的语言处理组件，如分词、词性标注、命名实体识别等。 它是SpaCy库中预训练好的一个小型模型，适用于一般的自然语言处理任务。  

In [6]:
import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")

pd.set_option("display.max_colwidth", 100)

example_df = pd.read_csv("doc2vec_example.csv")

In [7]:
i = 3
example_df.iloc[i, 0]

"By the half-light of a suspensor lamp, dimmed and hanging near the floor, the awakened boy could see a bulky female shape at his door, standing one step ahead of his mother. The old woman was a witch shadow - hair like matted spiderwebs, hooded 'round darkness of features, eyes like glittering jewels."

In [8]:
nlp(example_df.iloc[i, 0])

By the half-light of a suspensor lamp, dimmed and hanging near the floor, the awakened boy could see a bulky female shape at his door, standing one step ahead of his mother. The old woman was a witch shadow - hair like matted spiderwebs, hooded 'round darkness of features, eyes like glittering jewels.

In [9]:
type(nlp(example_df.iloc[i, 0]))

spacy.tokens.doc.Doc

In [10]:
def tokenize_text(df):
    df["tokens"] = df.text.str.lower().str.strip().apply(lambda x: [token.text.strip() for token in nlp(x) if token.text.isalnum()])
    return df

example_df = tokenize_text(example_df)

example_df

Unnamed: 0,text,tokens
0,"In the week before their departure to Arrakis, when all the final scurrying about had reached a ...","[in, the, week, before, their, departure, to, arrakis, when, all, the, final, scurrying, about, ..."
1,"It was a warm night at Castle Caladan, and the ancient pile of stone that had served the Atreide...","[it, was, a, warm, night, at, castle, caladan, and, the, ancient, pile, of, stone, that, had, se..."
2,The old woman was let in by the side door down the vaulted passage by Paul's room and she was al...,"[the, old, woman, was, let, in, by, the, side, door, down, the, vaulted, passage, by, paul, room..."
3,"By the half-light of a suspensor lamp, dimmed and hanging near the floor, the awakened boy could...","[by, the, half, light, of, a, suspensor, lamp, dimmed, and, hanging, near, the, floor, the, awak..."


生成一个词袋，给每个词袋增加ID  
We will need to construct a vocabulary so we can reference every word by an ID.

In [11]:
from collections import Counter

class Vocab:
    def __init__(self, all_tokens, min_count=2):
        self.min_count = min_count
        self.freqs = {t:n for t, n in Counter(all_tokens).items() if n >= min_count}
        self.words = sorted(self.freqs.keys())
        self.word2idx = {w: i for i, w in enumerate(self.words)}
        
vocab = Vocab([tok for tokens in example_df.tokens for tok in tokens], min_count=1)

print(f"Dataset comprises {len(example_df)} documents and {len(vocab.words)} unique words (over the limit of {vocab.min_count} occurrences)")

Dataset comprises 4 documents and 106 unique words (over the limit of 1 occurrences)


有一些词出现的很少的话，对结果会有影响，所以可以删除掉，但是这里由于min_count为2，所以实际上这步操作无影响   
Words that appear extremely rarely can harm performance, so we add a simple mechanism to strip those out.

In [12]:
def clean_tokens(df, vocab):
    df["length"] = df.tokens.apply(len)
    df["clean_tokens"] = df.tokens.apply(lambda x: [t for t in x if t in vocab.freqs.keys()])
    df["clean_length"] = df.clean_tokens.apply(len)
    return df

example_df = clean_tokens(example_df, vocab)
example_df[:5]

Unnamed: 0,text,tokens,length,clean_tokens,clean_length
0,"In the week before their departure to Arrakis, when all the final scurrying about had reached a ...","[in, the, week, before, their, departure, to, arrakis, when, all, the, final, scurrying, about, ...",32,"[in, the, week, before, their, departure, to, arrakis, when, all, the, final, scurrying, about, ...",32
1,"It was a warm night at Castle Caladan, and the ancient pile of stone that had served the Atreide...","[it, was, a, warm, night, at, castle, caladan, and, the, ancient, pile, of, stone, that, had, se...",39,"[it, was, a, warm, night, at, castle, caladan, and, the, ancient, pile, of, stone, that, had, se...",39
2,The old woman was let in by the side door down the vaulted passage by Paul's room and she was al...,"[the, old, woman, was, let, in, by, the, side, door, down, the, vaulted, passage, by, paul, room...",34,"[the, old, woman, was, let, in, by, the, side, door, down, the, vaulted, passage, by, paul, room...",34
3,"By the half-light of a suspensor lamp, dimmed and hanging near the floor, the awakened boy could...","[by, the, half, light, of, a, suspensor, lamp, dimmed, and, hanging, near, the, floor, the, awak...",53,"[by, the, half, light, of, a, suspensor, lamp, dimmed, and, hanging, near, the, floor, the, awak...",53


The difficulty with our "the cat _ on the mat" problem is that the missing word could be any one in the vocabulary V and so the network would have |V| outputs for each input e.g. a huge vector containing zero for every word in the vocabulary and some positive number for "sat" if the network was perfectly trained. For calculating loss we need to turn that into a probabilty distribution, i.e. _softmax_ it. Computing the softmax for such a large vector is expensive.

So the trick (one of many possible) we will use is _Noise Contrastive Estimation (NCE)_. We change our "the cat _ on the mat" problem into a multiple choice problem, asking the network to choose between "sat" and some random wrong answers like "hopscotch" and "luxuriated". This is easier to compute the softmax for since it's now a binary classifier (right or wrong answer) and the output is simply of a vector of size 1 + k where k is the number of random incorrect options.

Happily, this alternative problem still learns equally useful word representations. We just need to adjust the examples and the loss function. There is a simplified version of the NCE loss function called _Negative Sampling (NEG)_ that we can use here.

[Notes on Noise Contrastive Estimation and Negative Sampling (C. Dyer)](https://arxiv.org/abs/1410.8251) explains the derivation of the NCE and NEG loss functions.

When we implement the loss function, we assume that the first element in a samples/scores vector is the score for the positive sample and the rest are negative samples. This convention saves us from having to pass around an auxiliary vector indicating which sample was positive.

ChatGPT翻译：
m
这段话主要解释了如何优化神经网络处理词填空问题时的计算效率，特别是在处理大词汇表的情况下。

问题背景：
在“the cat _ on the mat”这样的句子中，_ 处缺失的单词可能是词汇表 V 中的任何一个词。这意味着，神经网络需要为每个输入（句子）生成 |V| 个输出（即词汇表中每个词的概率），这是非常耗费计算资源的。为了得到这些输出的概率分布，通常需要对这些值进行 softmax 操作，但由于向量非常大，这一计算过程非常昂贵。

解决方案：
为了优化这个问题，作者提出了一种称为 噪声对比估计 (Noise Contrastive Estimation, NCE) 的方法。具体来说，这种方法将原来的词填空问题转换为一个多选问题。

具体步骤：
转换为多选问题：

原本的问题是让网络在整个词汇表 V 中找到正确的词（比如“sat”）。
现在的问题是让网络在正确答案（“sat”）和一些随机错误答案（比如“hopscotch”和“luxuriated”）之间选择正确的一个。
计算优势：

由于现在网络只需要在一个较小的选项集合中进行选择（正确答案 + 一些随机错误答案），而不再是整个词汇表，所以计算 softmax 的负担大大减轻。
这个新的问题相当于一个二分类任务（正确或错误），输出向量的大小变成了 1 + k，其中 k 是随机错误选项的数量。
学习效果：

尽管问题形式发生了变化，网络仍然可以学习到同样有用的词表示。
作者还提到了一种 NCE 损失函数的简化版本，称为 负采样 (Negative Sampling, NEG)，可以用于这个情境下。
实现细节：

在实现这个损失函数时，默认约定样本/得分向量中的第一个元素是正样本的得分，其他是负样本的得分。这样可以减少传递额外信息的需求，提高效率。
总结：
简而言之，这段话讨论了一种通过转换问题形式（从寻找正确单词变为多选）来减少计算负担的方法，同时仍然能够达到相似的学习效果。这种方法主要用于优化大词汇表下的神经网络训练。m

负采样（Negative Sampling）损失的 PyTorch 模块

In [13]:
import torch.nn as nn

class NegativeSampling(nn.Module):
    def __init__(self):
        super(NegativeSampling, self).__init__()
        self.log_sigmoid = nn.LogSigmoid()
    def forward(self, scores):
        batch_size = scores.shape[0]
        n_negative_samples = scores.shape[1] - 1   # TODO average or sum the negative samples? Summing seems to be correct by the paper
        positive = self.log_sigmoid(scores[:,0])  # 取第1个，是正样本的预测概率
        negatives = torch.sum(self.log_sigmoid(-scores[:,1:]), dim=1)  # 取后面3个
        return -torch.sum(positive + negatives) / batch_size  # average for batch

loss = NegativeSampling()

$$\text{LogSigmoid}(x) = \log\left(\frac{ 1 }{ 1 + \exp(-x)}\right)$$

测试一下函数的输出结果  
It's helpful to play with some values to reassure ourselves that this function does the right thing.

In [14]:
import torch 

# 第一行表示一个理想的情况：模型对正样本的得分较高（1），对负样本的得分较低（-1）。在这种情况下，损失函数的输出应该较小，因为模型已经很好地完成了任务。
# 倒数第二行的第一列的值是 0.5，表示模型对正样本的预测得分为 0.5。这意味着模型对这个正样本的置信度不是很高，但仍然认为它可能是正确的。
# 最后一行表示一个糟糕的情况：模型对正样本和负样本的得分相同，完全没有区分能力。损失函数在这种情况下应该输出一个较大的值，以迫使模型在训练过程中进行调整，降低负样本的得分。
data = [[[1, -1, -1, -1]],  # this dummy data uses -1 to 1, but the real model is unconstrained 
        [[0.5, -1, -1, -1]],
        [[0, -1, -1, -1]],
        [[0, 0, 0, 0]],
        [[0, 0, 0, 1]],
        [[0, 1, 1, 1]],
        [[0.5, 1, 1, 1]],
        [[1, 1, 1, 1]]]

loss_df = pd.DataFrame(data, columns=["scores"])
loss_df["loss"] = loss_df.scores.apply(lambda x: loss(torch.FloatTensor([x])))

loss_df

Unnamed: 0,scores,loss
0,"[1, -1, -1, -1]",tensor(1.2530)
1,"[0.5, -1, -1, -1]",tensor(1.4139)
2,"[0, -1, -1, -1]",tensor(1.6329)
3,"[0, 0, 0, 0]",tensor(2.7726)
4,"[0, 0, 0, 1]",tensor(3.3927)
5,"[0, 1, 1, 1]",tensor(4.6329)
6,"[0.5, 1, 1, 1]",tensor(4.4139)
7,"[1, 1, 1, 1]",tensor(4.2530)


Higher scores for the positive sample (always the first element) reduce the loss but higher scores for the negative samples increase the loss. This looks like the right behaviour.

With that in the bag, let's look at creating training data. The general idea is to create a set of examples where each example has:

- doc id
- sample ids - a collection of the target token and some noise tokens
- context ids - tokens before and after the target token

e.g. If our context size was 2, the first example from the above dataset would be:

```
{"doc_id": 0,
 # 样本ID集合包含目标词"week"的ID和一些从词汇表中随机挑选出的噪声词的ID
 "sample_ids": [word2idx[x] for x in ["week", "random-word-from-vocab", "random-word-from-vocab"...],
 # 上下文ID集合包含目标词前后的词汇“in”、“the”、“before”和“their”的ID
 "context_ids": [word2idx[x] for x in ["in", "the", "before", "their"]]}
 ```
 
 The random words are chosen according to a probability distribution:
 这些随机词是按照某种概率分布来选择的：使用的是一个单词出现频率的分布，然后将其提升到3/4次方，这种方法由T. Mikolov等人在《Distributed Representations of Words and Phrases and their Compositionality》中提出。
 > a unigram distribution raised to the 3/4rd power, as proposed by T. Mikolov et al. in Distributed Representations of Words and Phrases and their Compositionality

这么做的效果是略微增加了罕见词汇的相对概率。从y=x^0.75的图像可以看出，曲线在低频部分（y轴较小的部分）略高于y=x，这意味着低频词的选择概率被适当提高了。
This has the effect of slightly increasing the relative probability of rare words (look at the graph of `y=x^0.75` below and see how the lower end is raised above `y=x`).

In [19]:
!pip install altair==5.4.1

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [20]:
import altair as alt
import numpy as np

data = pd.DataFrame(zip(np.arange(0,1,0.01), np.power(np.arange(0,1,0.01), 0.75)), columns=["x", "y"])
alt.Chart(data, title="x^0.75").mark_line().encode(x="x", y="y")

AttributeError: module jsonschema has no attribute exceptions

In [21]:
import numpy as np

class NoiseDistribution:
    def __init__(self, vocab):
        self.probs = np.array([vocab.freqs[w] for w in vocab.words])
        self.probs = np.power(self.probs, 0.75)
        self.probs /= np.sum(self.probs)
    def sample(self, n):
        "Returns the indices of n words randomly sampled from the vocabulary."
        return np.random.choice(a=self.probs.shape[0], size=n, p=self.probs)
        
noise = NoiseDistribution(vocab)

开始产生训练样本，正样本永远放在第一个   
With this distribution, we advance through the documents creating examples. Note that we are always putting the positive sample first in the samples vector, following the convention the loss function expects.

In [22]:
import torch

def example_generator(df, context_size, noise, n_negative_samples, vocab):
    for doc_id, doc in df.iterrows():
        for i in range(context_size, len(doc.clean_tokens) - context_size):
            positive_sample = vocab.word2idx[doc.clean_tokens[i]]
            sample_ids = noise.sample(n_negative_samples).tolist()
            # Fix a wee bug - ensure negative samples don't accidentally include the positive
            sample_ids = [sample_id if sample_id != positive_sample else -1 for sample_id in sample_ids]
            sample_ids.insert(0, positive_sample)                
            context = doc.clean_tokens[i - context_size:i] + doc.clean_tokens[i + 1:i + context_size + 1]
            context_ids = [vocab.word2idx[w] for w in context]
            yield {"doc_ids": torch.tensor(doc_id),  # we use plural here because it will be batched
                   "sample_ids": torch.tensor(sample_ids), 
                   "context_ids": torch.tensor(context_ids)}
            
# 上下文为5，负样本个数每次随机出5个
examples = example_generator(example_df, context_size=5, noise=noise, n_negative_samples=5, vocab=vocab)

In [23]:
examples

<generator object example_generator at 0x0000021E8FFC9E40>

Now we package this up as a good old PyTorch dataset and dataloader.

In [24]:
from torch.utils.data import Dataset, DataLoader

class NCEDataset(Dataset):
    def __init__(self, examples):
        self.examples = list(examples)  # just naively evaluate the whole damn thing - suboptimal!
    def __len__(self):
        return len(self.examples)
    def __getitem__(self, index):
        return self.examples[index]
    
dataset = NCEDataset(examples)
dataloader = DataLoader(dataset, batch_size=2, drop_last=True, shuffle=True)  # TODO bigger batch size when not dummy data

In [25]:
len(dataset.examples)

118

In [26]:
dataset.examples[:3]

[{'doc_ids': tensor(0),
  'sample_ids': tensor([28, 57, 18, 38, 67, 92]),
  'context_ids': tensor([ 52,  91, 101,  15,  92,  93,   9, 102,   4,  91])},
 {'doc_ids': tensor(0),
  'sample_ids': tensor([93, 82, 39, 66, 52, 12]),
  'context_ids': tensor([ 91, 101,  15,  92,  28,   9, 102,   4,  91,  37])},
 {'doc_ids': tensor(0),
  'sample_ids': tensor([ 9,  0, 86, 85, 66, 27]),
  'context_ids': tensor([101,  15,  92,  28,  93, 102,   4,  91,  37,  76])}]

后续要分割batch，所以这里给batch增加一些信息，方便DEBUG可读性  
It's going to also be useful to have a way to convert batches back to a readable form for debugging, so we add a helper function.

In [27]:
def describe_batch(batch, vocab):
    results = []
    for doc_id, context_ids, sample_ids in zip(batch["doc_ids"], batch["context_ids"], batch["sample_ids"]):
        context = [vocab.words[i] for i in context_ids]
        context.insert(len(context_ids) // 2, "____")
        samples = [vocab.words[i] for i in sample_ids]
        result = {"doc_id": doc_id,
                  "context": " ".join(context), 
                  "context_ids": context_ids, 
                  "samples": samples, 
                  "sample_ids": sample_ids}
        results.append(result)
    return results

describe_batch(next(iter(dataloader)), vocab)

[{'doc_id': tensor(3),
  'context': 'of his mother the old ____ was a witch shadow hair',
  'context_ids': tensor([ 66,  49,  62,  91,  67,  99,   0, 104,  79,  44]),
  'samples': ['woman', 'weather', 'night', 'cooled', 'family', 'let'],
  'sample_ids': tensor([105, 100,  65,  24,  33,  57])},
 {'doc_id': tensor(0),
  'context': 'had reached a nearly unbearable ____ an old crone came to',
  'context_ids': tensor([43, 73,  0, 64, 95,  6, 67, 26, 21, 93]),
  'samples': ['frenzy', 'bore', 'side', 'hooded', 'hair', 'the'],
  'sample_ids': tensor([40, 16, 82, 51, 44, 91])}]

把文档和句子都纳入输入计算当中  
Let's jump into creating the model itself. There isn't much to it - we multiply the input paragraph and word matrices by the output layer. Combining the paragraph and word matrices is done by summing here, but it could also be done by concatenating the inputs. The original paper actually found concatenation works better, perhaps because summing loses word order information.

In [28]:
import torch.nn as nn

class DistributedMemory(nn.Module):
    def __init__(self, vec_dim, n_docs, n_words):
        super(DistributedMemory, self).__init__()
        self.paragraph_matrix = nn.Parameter(torch.randn(n_docs, vec_dim))
        self.word_matrix = nn.Parameter(torch.randn(n_words, vec_dim))
        self.outputs = nn.Parameter(torch.zeros(vec_dim, n_words))
    
    def forward(self, doc_ids, context_ids, sample_ids):
        #print(self.paragraph_matrix[doc_ids,:].shape)                                                                       
        #print(torch.sum(self.word_matrix[context_ids,:], dim=1).shape)
        # 将句子向量和词向量（上下文）加和后训练，最后两个向量都能够拿的到           # first add doc ids to context word ids to make the inputs                    
        inputs = torch.add(self.paragraph_matrix[doc_ids,:],                   # (batch_size, vec_dim)
                           torch.sum(self.word_matrix[context_ids,:], dim=1))  # (batch_size, 2x context, vec_dim) -> sum to (batch_size, vec_dim)
                                                                               #
        #print(inputs.shape)                                                   # select the subset of the output layer for the NCE test
        outputs = self.outputs[:,sample_ids]                                   # (vec_dim, batch_size, n_negative_samples + 1)
                                                                               #
        return torch.bmm(inputs.unsqueeze(dim=1),                              # then multiply with some munging to make the tensor shapes line up 
                         outputs.permute(1, 0, 2)).squeeze()                   # -> (batch_size, n_negative_samples + 1)

model = DistributedMemory(vec_dim=50,  # 每个句子的向量长度设置为50个
                          n_docs=len(example_df),
                          n_words=len(vocab.words))

In [29]:
model.paragraph_matrix[0,:].shape

torch.Size([50])

In [30]:
model.word_matrix[[ 4, 91, 37, 76,  1, 73,  0, 64, 95, 40]].shape

torch.Size([10, 50])

简单试试  
Let's take it for a spin!

In [31]:
with torch.no_grad():
    logits = model.forward(**next(iter(dataloader)))
logits

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

开始循环训练  
Oh yeah, the output layer was initialized with zeros. Time to bash out a standard issue PyTorch training loop.

In [32]:
from tqdm import tqdm, trange
from torch.optim import Adam  # ilenic uses Adam, but gensim uses plain SGD
import numpy as np

def train(model, dataloader, epochs=40, lr=1e-3):
    optimizer = Adam(model.parameters(), lr=lr)
    training_losses = []
    try:
        for epoch in trange(epochs, desc="Epochs"):
            epoch_losses = []
            for batch in dataloader:
                model.zero_grad()
                logits = model.forward(**batch)
                batch_loss = loss(logits)
                epoch_losses.append(batch_loss.item())
                batch_loss.backward()
                optimizer.step()
            training_losses.append(np.mean(epoch_losses))
    except KeyboardInterrupt:
        print(f"Interrupted on epoch {epoch}!")
    finally:
        return training_losses

Now we'll sanity check by overfitting the example data. Training loss should drop from untrained loss to something close to the minimum possible.

In [33]:
training_losses = train(model, dataloader, epochs=40, lr=1e-3)

Epochs: 100%|██████████████████████████████████████████████████████████████████████████| 40/40 [00:03<00:00, 12.53it/s]


In [1]:
pd.Series(training_losses).plot()

NameError: name 'pd' is not defined

因为我们有些多疑，让我们检查一下预测结果。   
And because we're paranoid types, let's check a prediction.

In [116]:
with torch.no_grad():
    logits = model.forward(**next(iter(dataloader)))
logits

tensor([[ 10.1291, -18.3573, -15.3337, -14.7885, -13.9782, -15.3337],
        [ 15.4130, -21.7766, -19.5528, -19.1346, -17.1858, -17.8984]])

正负样本能够区分的开，证明训练还不错  
The positive sample gets a positive score and the negatives get negative scores. Super.

获取一下句向量的结果，然后看一下相似度  
We should be able get the paragraph vectors for the documents and do things like check these for similarity to one another.

In [120]:
model.paragraph_matrix.data

tensor([[ 2.6257e+00, -5.9801e-01, -2.0787e+00,  5.8892e-01,  3.0806e-01,
          8.8510e-02, -1.1012e+00, -3.3149e+00,  1.5721e-01,  2.3342e+00,
         -4.1842e-01,  7.7001e-01, -2.9533e-01,  6.4160e-01,  4.2092e-01,
         -3.5608e-01, -2.4942e+00,  5.5928e-01, -1.9541e+00,  1.7930e+00,
         -7.9386e-02,  3.7144e-01, -2.5290e+00,  5.2262e-01, -2.3597e+00,
         -1.9081e+00, -5.9890e-01, -1.5461e+00,  7.3952e-01, -5.2003e-02,
          7.9606e-01,  1.7315e+00,  5.0672e-01, -7.2812e-01,  1.1634e+00,
         -1.1671e+00,  6.6750e-01,  5.8549e-01,  1.0763e+00,  9.2112e-01,
          5.7500e-01, -2.2590e+00,  2.3657e+00,  5.2215e-01,  5.0826e-01,
         -1.3147e+00,  1.1834e+00,  4.7461e-01,  2.4441e+00, -7.1212e-01],
        [ 1.1969e+00,  4.8611e-01,  1.6409e+00, -2.2397e-01, -1.1347e+00,
          1.3665e+00, -4.7226e-01, -1.1703e-01,  6.1717e-02,  4.9905e-01,
         -6.6897e-01, -4.8154e-01, -8.4680e-01,  2.2461e-01,  1.6529e+00,
         -9.9773e-01, -9.3588e-01,  2

In [121]:
model.paragraph_matrix.data.shape

torch.Size([4, 50])

In [101]:
from sklearn.preprocessing import normalize

def most_similar(paragraph_matrix, docs_df, index, n=None):
    pm = normalize(paragraph_matrix, norm="l2")  # in a smarter implementation we would cache this somewhere
    sims = np.dot(pm, pm[index,:])
    df = pd.DataFrame(enumerate(sims), columns=["doc_id", "similarity"])
    n = n if n is not None else len(sims)
    return df.merge(docs_df[["text"]].reset_index(drop=True), left_index=True, right_index=True).sort_values(by="similarity", ascending=False)[:n]

most_similar(model.paragraph_matrix.data, example_df, 1, n=10)

Unnamed: 0,doc_id,similarity,text
1,1,1.0,"It was a warm night at Castle Caladan, and the ancient pile of stone that had served the Atreide..."
3,3,0.386888,"By the half-light of a suspensor lamp, dimmed and hanging near the floor, the awakened boy could..."
0,0,0.274491,"In the week before their departure to Arrakis, when all the final scurrying about had reached a ..."
2,2,0.066408,The old woman was let in by the side door down the vaulted passage by Paul's room and she was al...


不过，对于我们这小小的虚拟数据集来说，这并没有特别启发性的作用。我们还可以使用主成分分析（PCA）将我们的 n 维段落向量降维到 2 维，看看它们是否能够很好地聚类。  
It's not particularly illuminating for our tiny set of dummy data though. We can also use PCA to reduce our n-dimensional paragraph vectors to 2 dimensions and see if they are clustered nicely.

In [102]:
from sklearn.decomposition import PCA

def pca_2d(paragraph_matrix, groups):
    pca = PCA(n_components=2)
    reduced_dims = pca.fit_transform(paragraph_matrix)
    print(f"2-component PCA, explains {sum(pca.explained_variance_):.2f}% of variance")
    df = pd.DataFrame(reduced_dims, columns=["x", "y"])
    df["group"] = groups
    return df

example_2d = pca_2d(model.paragraph_matrix.data, ["0","1","2","3"])
alt.Chart(example_2d).mark_point().encode(x="x", y="y", color="group")

2-component PCA, explains 54.34% of variance


# Faiss匹配实现

In [128]:
!pip install qianfan==0.4.6

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [134]:
!pip install langchain==0.2.12

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [133]:
!pip install langchain_community==0.2.11

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [136]:
!pip install faiss-cpu==1.8.0

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [131]:
qianfan_ak = '你的AK'
qianfan_sk = '你的SK'

用的是智源研究院的embedding模型：https://huggingface.co/BAAI/bge-large-zh

In [175]:
from typing import List
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain.embeddings.baidu_qianfan_endpoint import QianfanEmbeddingsEndpoint

str_docs = [
    # 句子1，句意有农业保险
    '实施金融支持农业纾难解困政策。鼓励市县积极开展特色渔业养殖保险，在参保农户自缴保费比例不低于20%的前提下，对险种绩效评价结果达标的市县，省级财政按'
    '照25%的比例给予保费补贴。对2022年8月至12月到期的农民小额贷款和新发生的农民小额贷款贴息由5%提升至6%。对脱贫人口小额信贷，允许其调整还本计划或办'
    '理贷款展期、续贷。对受疫情影响暂时出现还贷困难的涉农企业及农户(包括脱贫户、监测户)，支持银行机构按市场化原则予以降息、减息或免息扶持，开展征信保护等。',
    # 句子2，句意无农业保险
    '分级分类开展社会化服务。针对中高风险区农业生产人员无法外出生产问题，组织有关企业和社会化服务组织提供托管、代耕、代收服务。各村委会统计当地需要种植'
    '或收获的作物品种、面积、产量，乡镇政府商请当地农业农村局统一协调专业化服务组织提供托管服务。当地力量不足时，市县农业农村局向省农业农村厅申请统一协调安排',
    # 句子3，句子中有农业保险
    '财政支持。各级财政部门履行牵头主责，会同有关部门从发展方向、制度设计、政策制定、资金保障等方面推进农业保险发展，通过保费补贴、机构遴选等多种政策手'
    '段，发挥农业保险机制性工具作用，督促承保机构依法合规展业，充分调动各参与方积极性，推动农业保险高质量发展。'
]
doc_docs = [Document(i) for i in str_docs]

embedding_model = QianfanEmbeddingsEndpoint(qianfan_ak=qianfan_ak, qianfan_sk=qianfan_sk, model='bge-large-zh')
db = FAISS.from_documents(doc_docs, embedding_model)

In [177]:
embedding_model

QianfanEmbeddingsEndpoint(qianfan_ak=None, qianfan_sk=None, chunk_size=16, model='bge-large-zh', endpoint='', client=<qianfan.resources.llm.embedding.Embedding object at 0x000002288A022C10>, init_kwargs={}, model_kwargs={})

faiss.IndexFlatL2 是一个用于执行 L2 距离（欧几里得距离）计算的简单索引，但它不提供直接获取已添加嵌入向量的功能。每次添加向量到 FAISS 索引时，都应同时将这些向量保存在你能访问的地方。









In [178]:
db.index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x0000022889FC3DB0> >

In [179]:
db.index.d

1024

In [181]:
# 通过嵌入模型获取文本的嵌入向量
embeddings = embedding_model.embed_documents(str_docs)

In [185]:
len(embeddings)

3

In [186]:
len(embeddings[0])

1024

In [140]:
db_path = 'qianfan.db'
# faiss是索引，pkl是文档
db.save_local(db_path)  # 存到本地

In [258]:
# db = FAISS.load_local(db_path, embedding, allow_dangerous_deserialization=True)

In [152]:
query = "农业保险"
docs_and_scores = db.similarity_search_with_score(query)  # 返回分数和内容

for i, j in docs_and_scores:
    print(i, '\n', j)

page_content='财政支持。各级财政部门履行牵头主责，会同有关部门从发展方向、制度设计、政策制定、资金保障等方面推进农业保险发展，通过保费补贴、机构遴选等多种政策手段，发挥农业保险机制性工具作用，督促承保机构依法合规展业，充分调动各参与方积极性，推动农业保险高质量发展。' 
 0.34435654
page_content='实施金融支持农业纾难解困政策。鼓励市县积极开展特色渔业养殖保险，在参保农户自缴保费比例不低于20%的前提下，对险种绩效评价结果达标的市县，省级财政按照25%的比例给予保费补贴。对2022年8月至12月到期的农民小额贷款和新发生的农民小额贷款贴息由5%提升至6%。对脱贫人口小额信贷，允许其调整还本计划或办理贷款展期、续贷。对受疫情影响暂时出现还贷困难的涉农企业及农户(包括脱贫户、监测户)，支持银行机构按市场化原则予以降息、减息或免息扶持，开展征信保护等。' 
 0.4557104
page_content='分级分类开展社会化服务。针对中高风险区农业生产人员无法外出生产问题，组织有关企业和社会化服务组织提供托管、代耕、代收服务。各村委会统计当地需要种植或收获的作物品种、面积、产量，乡镇政府商请当地农业农村局统一协调专业化服务组织提供托管服务。当地力量不足时，市县农业农村局向省农业农村厅申请统一协调安排' 
 0.520805
