In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

#### 数据准备

In [2]:
# 这个数据预处理过程没有做，应该将句子根据标点符号分开，然后处理标点符号；
test_sentence = """
A gifted American psychologist has said ,“Worry is a spasm of the emotion; 
the mind catches hold of something and will not let it go.” It is useless 
to argue with the mind in this condition. The stronger the will, the more 
futile the task. One can only gently insinuate something else into its 
convulsive grasp. And if this something else is rightly chosen, if it is 
really attended by the illumination of another field of interest, gradually, 
and often swiftly, the old undue grip relaxes and the process of recuperation 
and repair ,begins.The cultivation of a hobby and new forms of interest 
is therefore a policy of first importance to a public man. But this is not 
a business that can be undertaken in a day or swiftly improvised by a mere 
command of the will. The growth of alternative mental interests is a long 
process. The seeds must be carefully chosen; they must fall on good ground; 
they must be sedulously tended, if the vivifying fruits are to be at hand 
when needed. To be really happy and really safe, one ought to have at least 
two or three hobbies, and they must all be real. It is no use starting late 
in life to say: "I will take an interest in this or that." Such an attempt 
only aggravates the strain of mental effort. A man may acquire great knowledge 
of topics unconnected with his daily work, and yet hardly get any benefit or 
relief. It is no use doing what you like: you have got to like what you do. 
Broadly speaking, human beings may be divided into three classes: those who 
are toiled to death, those who are worried to death, and those who are bored 
to death. It is no use offering the manual labourer, tired out with a hard 
week's sweat and effort，the chance of playing a game of football or baseball 
on Saturday afternoon. It is no use inviting the politician or the professional 
or businessman, who has been working or worrying about serious things for six 
days, to work or worry about trifling things at the week-end.""".split()

#### 初始化参数

In [3]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 20

#### 数据标准化及字典构造

In [4]:
# 把材料构造成三元语法语料
trigrams = [([test_sentence[i], test_sentence[i+1]], test_sentence[i+2]) 
            for i in range(len(test_sentence) - 2)]

In [5]:
# 构造字典（单词及其对应的所有）
vocabulary = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocabulary)}

#### 构造模型

In [6]:
class TriGram(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim, context_size, n_hidden):
        super(TriGram, self).__init__()
        self.embedding = nn.Embedding(vocabulary_size, embedding_dim)
        # print("the original embedding is : ", self.embedding.weight)  初始化的词向量矩阵
        self.Linear1 = nn.Linear(context_size * embedding_dim, n_hidden)
        self.Linear2 = nn.Linear(n_hidden, vocabulary_size)
        
    def forward(self, inputs):
        embeds = self.embedding(inputs).view(1, -1)
        out = F.relu(self.Linear1(embeds))
        out = self.Linear2(out)
        log_probs = F.log_softmax(out, dim=1)  # 输出归一化的概率
        return log_probs

losses = []
loss_function = nn.NLLLoss()
model = TriGram(len(vocabulary), EMBEDDING_DIM, CONTEXT_SIZE, 100)
optimizer = torch.optim.SGD(model.parameters(), lr=0.044)

#### 训练模型

In [7]:
for epoch in range(100):
    total_loss = 0
    for context, target in trigrams:
        context_idx = [word_to_ix[word] for word in context]  # 产生单词对应的索引号
        context_var = torch.LongTensor(context_idx)           # 在model.embedding中查询时索
                                                              #引必须是tensor数据类型的列表
        log_probs = model(context_var)
        loss = loss_function(log_probs, torch.LongTensor([word_to_ix[target]]))
        model.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.data.numpy()  # 将torch张量转换成python数据类型
    losses.append(total_loss)

#### 训练后的参数查看

In [8]:
# 损失函数的值记录
losses[-10:]

[35.974687576293945,
 35.900949478149414,
 35.82237243652344,
 35.75556659698486,
 35.696616649627686,
 35.644667625427246,
 35.57563257217407,
 35.572914123535156,
 35.48055648803711,
 35.43225288391113]

In [9]:
# 所有已训练的参数查看
for i in model.parameters():
    print(i)
    print(i.size())
    print("--------------------------------")
# 第一部分是词向量矩阵参数
# 第二部分是第一个隐藏层神经网络参数
# 第三部分是第一个隐藏层神经元参数
# 第四部分是第二个隐藏层神经网络参数
# 第五部分是第二个隐藏层神经元参数

Parameter containing:
tensor([[ 1.3247,  0.9301, -1.3573,  ..., -0.3592, -0.2334,  1.2196],
        [ 0.9144, -0.9955,  1.5296,  ..., -0.2890,  0.8897,  0.2187],
        [-1.6678,  0.2426, -0.1024,  ..., -0.3030,  0.2784, -0.2243],
        ...,
        [ 1.2325, -1.1330,  0.5589,  ...,  0.8175,  1.0662,  1.6292],
        [ 1.6744,  0.3188, -0.0541,  ...,  1.3325, -0.1892,  1.6094],
        [ 0.9994,  1.4078, -2.1743,  ...,  0.8051,  0.7900,  0.9466]],
       requires_grad=True)
torch.Size([212, 20])
--------------------------------
Parameter containing:
tensor([[ 0.1344, -0.1569, -0.0925,  ...,  0.1955, -0.1501, -0.5005],
        [-0.2007, -0.1836,  0.1702,  ...,  0.1511, -0.2214,  0.2219],
        [-0.2359, -0.1666, -0.0539,  ...,  0.0777, -0.4448,  0.0176],
        ...,
        [ 0.1075,  0.5467, -0.4375,  ..., -0.7483,  0.0772,  0.0534],
        [ 0.4177,  0.2394,  0.3525,  ..., -0.2348,  0.1393, -0.1571],
        [ 0.0417, -0.1167, -0.1147,  ..., -0.4520,  0.3573, -0.2919]],
      

#### 模型预测

In [10]:
words = ["worry", "about"]
idxes = [word_to_ix[word] for word in words]
var = torch.LongTensor([idxes])
log_pro = model(var)
log_pro  # 是一个二维矩阵

tensor([[-1.3260e+01, -1.6079e+01, -1.4686e+01, -1.3130e+01, -1.3127e+01,
         -1.3303e+01, -1.2622e+01, -1.4933e+01, -1.2443e+01, -1.5048e+01,
         -2.3489e-03, -1.5741e+01, -1.4513e+01, -8.5724e+00, -1.2366e+01,
         -1.5997e+01, -1.6008e+01, -1.6568e+01, -1.7033e+01, -1.3998e+01,
         -1.1984e+01, -8.3298e+00, -1.5376e+01, -1.7780e+01, -1.7302e+01,
         -1.5183e+01, -1.5807e+01, -1.3663e+01, -1.4328e+01, -1.4960e+01,
         -1.3636e+01, -1.3667e+01, -1.5580e+01, -1.6290e+01, -1.4054e+01,
         -1.6862e+01, -1.5225e+01, -1.3880e+01, -1.3614e+01, -1.3389e+01,
         -1.8145e+01, -1.0662e+01, -1.1804e+01, -1.6987e+01, -1.2963e+01,
         -1.6250e+01, -1.6539e+01, -1.2751e+01, -1.4974e+01, -1.5406e+01,
         -1.7230e+01, -1.1295e+01, -1.2057e+01, -1.5164e+01, -1.2064e+01,
         -1.3241e+01, -1.5192e+01, -1.6498e+01, -1.7651e+01, -8.5439e+00,
         -1.3256e+01, -1.5602e+01, -1.5434e+01, -1.5380e+01, -1.8464e+01,
         -1.7216e+01, -1.6582e+01, -1.

In [11]:
m = log_pro.data.numpy().tolist()[0]  # [0] 选择维度
index = m.index(max(m))  # 最大的参数的索引即为所预测的单词
index

10

In [12]:
for word, identify in word_to_ix.items():
    if identify == index:
        print(word)
        break

trifling
