In [28]:
import torch
import numpy as np
from torch import nn, optim
import random
from collections import Counter
import matplotlib.pyplot as plt

# 训练数据
text = "I like dog i like cat i like animal dog cat animal apple cat dog like dog fish milk like dog \
cat eyes like i like apple apple i hate apple i movie book music like cat dog hate cat dog like"

# 参数设置
EMBEDDING_DIM = 2  # 词向量维度
PRINT_EVERY = 1000  # 可视化频率
EPOCHS = 1000  # 训练的轮数
BATCH_SIZE = 5  # 每一批训练数据大小
N_SAMPLES = 3  # 负样本大小
WINDOW_SIZE = 5  # 周边词窗口大小
FREQ = 0  # 词汇出现频率
DELETE_WORDS = False  # 是否删除部分高频词

In [18]:
# 文本预处理
def preprocess(text, FREQ):
    text = text.lower()
    text = text.replace('"', "").replace('.', '').replace(',', '').replace('!', '').replace('-', '').replace('/', '')
    words = text.split()
    # 去除低频词
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] > FREQ]
    return trimmed_words

In [29]:
words = preprocess(text, FREQ)
# 构建词典
vocab = set(words)
vocab2int = {w: c for c, w in enumerate(vocab)}
int2vocab = {c: w for c, w in enumerate(vocab)}

In [30]:
len(words),len(vocab)

(42, 13)

In [31]:
vocab2int

{'animal': 0,
 'cat': 1,
 'music': 2,
 'fish': 3,
 'hate': 4,
 'dog': 5,
 'book': 6,
 'eyes': 7,
 'apple': 8,
 'like': 9,
 'i': 10,
 'milk': 11,
 'movie': 12}

In [32]:
int2vocab

{0: 'animal',
 1: 'cat',
 2: 'music',
 3: 'fish',
 4: 'hate',
 5: 'dog',
 6: 'book',
 7: 'eyes',
 8: 'apple',
 9: 'like',
 10: 'i',
 11: 'milk',
 12: 'movie'}

In [33]:
# 将文本转化为数值
int_words = [vocab2int[w] for w in words]

In [34]:
int_words

[10,
 9,
 5,
 10,
 9,
 1,
 10,
 9,
 0,
 5,
 1,
 0,
 8,
 1,
 5,
 9,
 5,
 3,
 11,
 9,
 5,
 1,
 7,
 9,
 10,
 9,
 8,
 8,
 10,
 4,
 8,
 10,
 12,
 6,
 2,
 9,
 1,
 5,
 4,
 1,
 5,
 9]

In [35]:
# 计算单词频次
int_word_counts = Counter(int_words)
total_count = len(int_words)
word_freqs = {w: c / total_count for w, c in int_word_counts.items()}

In [37]:
int_word_counts,total_count

(Counter({10: 6,
          9: 9,
          5: 7,
          1: 6,
          0: 2,
          8: 4,
          3: 1,
          11: 1,
          7: 1,
          4: 2,
          12: 1,
          6: 1,
          2: 1}),
 42)

In [41]:
word_freqs # values 合计=1

{10: 0.14285714285714285,
 9: 0.21428571428571427,
 5: 0.16666666666666666,
 1: 0.14285714285714285,
 0: 0.047619047619047616,
 8: 0.09523809523809523,
 3: 0.023809523809523808,
 11: 0.023809523809523808,
 7: 0.023809523809523808,
 4: 0.047619047619047616,
 12: 0.023809523809523808,
 6: 0.023809523809523808,
 2: 0.023809523809523808}

In [42]:
# 去除出现频次高的词汇
if DELETE_WORDS:
    t = 1e-5
    prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}
    train_words = [w for w in int_words if random.random() < (1 - prob_drop[w])]  # 随机取句子
else:
    train_words = int_words

In [43]:
# 单词分布
word_freqs = np.array(list(word_freqs.values()))
unigram_dist = word_freqs / word_freqs.sum()
noise_dist = torch.from_numpy(unigram_dist ** 0.75 / np.sum(unigram_dist ** 0.75))

In [46]:
word_freqs

array([0.14285714, 0.21428571, 0.16666667, 0.14285714, 0.04761905,
       0.0952381 , 0.02380952, 0.02380952, 0.02380952, 0.04761905,
       0.02380952, 0.02380952, 0.02380952])

In [47]:
unigram_dist

array([0.14285714, 0.21428571, 0.16666667, 0.14285714, 0.04761905,
       0.0952381 , 0.02380952, 0.02380952, 0.02380952, 0.04761905,
       0.02380952, 0.02380952, 0.02380952])

In [48]:
noise_dist

tensor([0.1306, 0.1770, 0.1466, 0.1306, 0.0573, 0.0963, 0.0341, 0.0341, 0.0341,
        0.0573, 0.0341, 0.0341, 0.0341], dtype=torch.float64)

In [49]:
train_words

[10,
 9,
 5,
 10,
 9,
 1,
 10,
 9,
 0,
 5,
 1,
 0,
 8,
 1,
 5,
 9,
 5,
 3,
 11,
 9,
 5,
 1,
 7,
 9,
 10,
 9,
 8,
 8,
 10,
 4,
 8,
 10,
 12,
 6,
 2,
 9,
 1,
 5,
 4,
 1,
 5,
 9]

In [50]:
n_batches = len(words) // BATCH_SIZE  # 训练的batches数
n_batches

8

In [51]:
words = words[:n_batches * BATCH_SIZE]  # 分n_batches批

In [53]:
len(words)

40

In [54]:
batch = words[0:0 + BATCH_SIZE]
batch

['i', 'like', 'dog', 'i', 'like']

In [57]:
# 获取目标词汇
def get_target(words, idx, WINDOW_SIZE):
    target_window = np.random.randint(1, WINDOW_SIZE + 1)
    start_point = idx - target_window if (idx - target_window) > 0 else 0
    end_point = idx + target_window
    targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
    return list(targets)


In [87]:
get_target(batch, 3, WINDOW_SIZE)

['like', 'i', 'dog']

In [100]:
idx=0
batch_x, batch_y = [], []
batch = words[idx:idx + BATCH_SIZE]  # ['i', 'like', 'dog', 'i', 'like']
for i in range(len(batch)):
    x = batch[i]
    y = get_target(batch, i, WINDOW_SIZE)
    batch_x.extend([x] * len(y))
    batch_y.extend(y)  # 每个 batch_x, batch_y (['i', 'i'], ['like', 'dog'])

In [101]:
batch_x,len(batch_x)

(['i', 'i', 'like', 'like', 'dog', 'dog', 'i', 'i', 'like'], 9)

In [102]:
batch_y,len(batch_y)

(['like', 'dog', 'i', 'dog', 'like', 'i', 'like', 'dog', 'i'], 9)

In [128]:
in_embed = nn.Embedding(20, 2)
in_embed

Embedding(20, 2)

In [126]:
in_embed.weight.shape

torch.Size([20, 2])

In [120]:
in_embed.weight.data

tensor([[ 0.6618,  0.0574],
        [-0.7599, -0.1682],
        [-0.3031, -0.6998],
        [ 0.3837,  0.3814],
        [-0.9982,  0.7549],
        [ 0.2456,  0.8541],
        [ 0.7077, -0.5567],
        [-0.8463, -0.4360],
        [-0.6380,  0.6013],
        [-0.5156, -0.2961],
        [ 0.1760, -0.0384],
        [-0.6878,  0.3729],
        [-0.1921, -0.8806],
        [ 0.5869, -0.7867],
        [-0.4817,  0.8569],
        [ 0.5214, -0.7104],
        [ 0.5345, -0.1481],
        [-0.2768, -0.2358],
        [ 0.7539, -0.6228],
        [-0.2884, -0.5124]])

In [123]:
in_embed.weight.data.uniform_(-1, 1)

tensor([[ 0.1081, -0.2417],
        [ 0.0288,  0.8138],
        [-0.0376, -0.8579],
        [ 0.4140, -0.0272],
        [-0.8545,  0.0154],
        [-0.0079, -0.3677],
        [ 0.3269,  0.4030],
        [ 0.1853, -0.5385],
        [ 0.9017, -0.8948],
        [-0.1547,  0.1846],
        [-0.5386, -0.0622],
        [ 0.5384,  0.8612],
        [ 0.9927, -0.8090],
        [ 0.8339,  0.1430],
        [ 0.4584, -0.7878],
        [-0.8090,  0.2035],
        [-0.3417, -0.8058],
        [-0.5695, -0.3639],
        [ 0.0519,  0.9526],
        [-0.1952, -0.5951]])

In [129]:
weights = torch.Tensor([0, 10, 3, 0]) # create a Tensor of weights

In [131]:
weights.shape

torch.Size([4])

In [132]:
noise_dist

tensor([0.1306, 0.1770, 0.1466, 0.1306, 0.0573, 0.0963, 0.0341, 0.0341, 0.0341,
        0.0573, 0.0341, 0.0341, 0.0341], dtype=torch.float64)

In [133]:
batch_x

['i', 'i', 'like', 'like', 'dog', 'dog', 'i', 'i', 'like']

In [134]:
batch_y

['like', 'dog', 'i', 'dog', 'like', 'i', 'like', 'dog', 'i']

In [138]:
words=train_words[:n_batches * BATCH_SIZE]

In [139]:
words

[10,
 9,
 5,
 10,
 9,
 1,
 10,
 9,
 0,
 5,
 1,
 0,
 8,
 1,
 5,
 9,
 5,
 3,
 11,
 9,
 5,
 1,
 7,
 9,
 10,
 9,
 8,
 8,
 10,
 4,
 8,
 10,
 12,
 6,
 2,
 9,
 1,
 5,
 4,
 1]

In [149]:
idx=0
batch_x, batch_y = [], []
# 如果传入的是train_words则是用Int表示。后面的备注同！
batch = words[idx:idx + BATCH_SIZE]  # ['i', 'like', 'dog', 'i', 'like']
for i in range(len(batch)):
    x = batch[i]
    y = get_target(batch, i, WINDOW_SIZE)
    batch_x.extend([x] * len(y))
    batch_y.extend(y)  # 每个 batch_x, batch_y (['i', 'i'], ['like', 'dog'])

In [150]:
batch_x,batch_y

([10, 9, 9, 9, 5, 5, 10, 10, 10, 9, 9], [9, 9, 10, 5, 9, 10, 9, 10, 5, 10, 5])

In [151]:
input_words, target_words = batch_x,batch_y

In [152]:
inputs, targets = torch.LongTensor(input_words), torch.LongTensor(target_words)

In [153]:
inputs.shape

torch.Size([11])

In [155]:
in_embed = nn.Embedding(13, 2)
out_embed = nn.Embedding(13, 2)

In [157]:
inputs.shape

torch.Size([11])

In [159]:
input_vectors = in_embed(inputs)
input_vectors.shape

torch.Size([11, 2])

In [167]:
inputs.shape

torch.Size([11])

In [169]:
noise_dist.shape

torch.Size([13])

In [174]:
noise_words = torch.multinomial(noise_dist, 11 * 3, replacement=True)
noise_words,noise_words.shape

(tensor([12, 10,  3,  9, 11,  5, 11,  1, 10,  9,  4,  2,  2,  6,  1,  7,  2,  0,
          2,  3, 12,  4,  2,  0,  3,  2,  0,  0,  1,  1,  7,  1,  9]),
 torch.Size([33]))

In [183]:
noise_vectors = out_embed(noise_words).view(11,3,2)
noise_vectors.shape

torch.Size([11, 3, 2])

In [185]:
input_vectors.shape

torch.Size([11, 2])

In [188]:
input_vectors = input_vectors.view(11,2,1)

In [189]:
output_vectors.shape

NameError: name 'output_vectors' is not defined

In [192]:

output_vectors = out_embed(target_words)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list

In [191]:
output_words

NameError: name 'output_words' is not defined

In [194]:
input_vectors.shape

torch.Size([11, 2, 1])

In [195]:
output_vectors=input_vectors.view(11,1,2)

In [196]:
output_vectors.shape

torch.Size([11, 1, 2])

In [209]:
out_loss = torch.bmm(output_vectors, input_vectors).sigmoid().log()
out_loss.shape

torch.Size([11, 1, 1])

In [211]:
out_loss.squeeze().shape

torch.Size([11])

In [215]:
noise_loss = torch.bmm(noise_vectors.neg(), input_vectors).sigmoid().log()

In [217]:
noise_loss = noise_loss.squeeze()

In [219]:
noise_loss

tensor([[-0.1079, -0.4302, -1.0979],
        [-0.7239, -1.1712, -0.4102],
        [-1.1712, -0.8422, -0.4848],
        [-0.7239, -0.3641, -0.6064],
        [-0.2998, -2.6692, -0.3717],
        [-4.7493, -0.2998, -0.3206],
        [-2.7477, -1.0979, -0.1079],
        [-0.2689, -2.7477, -1.1097],
        [-1.0979, -2.7477, -1.1097],
        [-0.7805, -0.8422, -0.8422],
        [-0.2338, -0.8422, -0.7239]], grad_fn=<SqueezeBackward0>)

In [11]:
noise_loss.sum(1).shape

NameError: name 'noise_loss' is not defined

In [12]:
word_freqs

array([0.14285714, 0.21428571, 0.16666667, 0.14285714, 0.04761905,
       0.0952381 , 0.02380952, 0.02380952, 0.02380952, 0.04761905,
       0.02380952, 0.02380952, 0.02380952])

In [13]:
unigram_dist

array([0.14285714, 0.21428571, 0.16666667, 0.14285714, 0.04761905,
       0.0952381 , 0.02380952, 0.02380952, 0.02380952, 0.04761905,
       0.02380952, 0.02380952, 0.02380952])

In [28]:
noise_dist,noise_dist.shape

(tensor([0.1306, 0.1770, 0.1466, 0.1306, 0.0573, 0.0963, 0.0341, 0.0341, 0.0341,
         0.0573, 0.0341, 0.0341, 0.0341], dtype=torch.float64),
 torch.Size([13]))

In [26]:
torch.multinomial(noise_dist, 2 * 11, replacement=True).shape

torch.Size([22])