# 利用 N-Gram 的邏輯計算詞向量

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm

# Basic Step:

Word Embedding 有幾種形式：
1. N-Gram Language Modeling: 根據前幾個字來計算文字的相似性
2. Continuous Bag-of-Words: 看前後幾個字來計算文字的相似性

基本流程為:
1. 將各自的文字先轉成index
2. 依據計算的方法，取出前後幾個字當input，中間那個字當label（類似建構分類器）
3. 丟到模型裡面train loss
4. 最後Embedding 那一層即是詞向量

# N-Gram Language Modeling

In [10]:
# 先決定要取前幾個字當特徵
CONTEXT_SIZE = 2
#決定詞向量的維度
EMBEDDING_DIM = 10

# We will use Shakespeare Sonnet 2
# 訓練資料
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

# we should tokenize the input, but we will ignore that for now
# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
# 將文字的前兩個字與自己本身建立一個tuple表
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]

# print the first 3, just so you can see what they look like
print('trigrams :',trigrams[:3])

# 篩選出有出現的字
# 將不重複的字過濾出來
vocab = set(test_sentence)

# 按字建立成index
word_to_ix = {word: i for i, word in enumerate(vocab)}

class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        '''
        1. 先將文字做Embedding
        2. 轉成一個維度的向量
        3. 過第一層的linear層
        4. 過relu
        5. 過第二層的linear層
        6. 過softmax取log
        '''
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


losses = []
# Loss function 用negative loss likelihood
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
# Optimizer 取 SGD
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in tqdm.tqdm_notebook(range(20)):
    total_loss = 0
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        # 第一步要先把word 轉成index，取long 型態
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        # 初始化gradients
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        # 將文字帶入模型中
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        # 計算預測出的loss
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        # Update gradients
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
    
print(losses)  # The loss decreased every iteration over the training data!


trigrams : [(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]



[519.3968162536621, 517.0977153778076, 514.8128852844238, 512.5397455692291, 510.2784435749054, 508.0292880535126, 505.7910957336426, 503.5622413158417, 501.3425679206848, 499.1313352584839, 496.92713928222656, 494.73046827316284, 492.54167079925537, 490.3594100475311, 488.18209195137024, 486.00930166244507, 483.8398723602295, 481.6737720966339, 479.5093538761139, 477.34614753723145]


# Some description

In [11]:
#文字轉成index之後長這樣
context_idxs

tensor([ 28,  96])

In [13]:
#可以拿來預測文字
torch.max(model(context_idxs),1)

(tensor([-4.1102]), tensor([ 95]))

In [19]:
list(word_to_ix.keys())[95]

'see'

In [29]:
a = torch.tensor(95)

In [26]:
# 實際提取出來的詞向量
model.embeddings(context_idxs)

tensor([[ 0.4290, -0.8208, -0.3947, -0.6477,  1.1482,  1.0585,  1.1270,
          1.3733,  0.6172, -0.0597],
        [ 0.9138, -0.1938, -0.8241,  0.0653,  0.9324,  0.9784, -0.0023,
          2.1612,  0.5414, -1.3719]])

In [30]:
model.embeddings(a)

tensor([-0.5535, -0.2378,  0.6353, -1.7482,  2.2924,  0.2154, -0.0268,
        -0.5499,  0.1086,  0.4543])