In [22]:
import torch
import torch.nn as nn
import numpy as np
import collections
import os
import urllib
import zipfile

In [4]:
#torch.nn.Embedding(n,m)是用于词嵌入的,n是单词数，m是词向量维度数
# 创建一个long类型的张量，这是一个索引张量，比如词袋中元素只有四个数
# “你好吗我”在词袋中one-hot编码以后分别是0，1，2，3
# 那么word1这个列表代表的就是“你好吗”这个含义
word1 = torch.LongTensor([0,1,2])
word2 = torch.LongTensor([3,1,2])
# embedding则是创建一个参数可学习的随机张量矩阵，词袋有4个数，维度就是4，每个词向量维度是5
# word1和word2分别用索引的方式获取每个字的词向量
embedded = nn.Embedding(4,5)
print('init embed',embedded.weight)
print('word vector of word1 is',embedded(word1))#size为（3，5），表示3个字
embedded.weight.data.uniform_(-0.5,0.5)
print('new embed',embedded.weight)

init embed Parameter containing:
tensor([[ 1.7509, -0.7933, -0.9278,  0.0388, -0.4009],
        [ 1.3134,  3.2451, -0.0717, -1.4094, -0.0504],
        [ 0.1722, -0.6872,  0.6585, -0.2432,  0.5528],
        [-0.9784,  0.7325,  1.1104,  0.2580,  0.4777]], requires_grad=True)
word vector of word1 is tensor([[ 1.7509, -0.7933, -0.9278,  0.0388, -0.4009],
        [ 1.3134,  3.2451, -0.0717, -1.4094, -0.0504],
        [ 0.1722, -0.6872,  0.6585, -0.2432,  0.5528]],
       grad_fn=<EmbeddingBackward>)
new embed Parameter containing:
tensor([[ 0.0918, -0.0733,  0.1528, -0.2699,  0.4226],
        [ 0.4482, -0.0054, -0.1250,  0.4544, -0.2707],
        [ 0.1023, -0.0611,  0.3351, -0.2793,  0.2026],
        [-0.2983, -0.2673, -0.3272,  0.1086, -0.4682]], requires_grad=True)


In [4]:
#一个batch的矩阵相乘
batch_size = 3
embed_size = 5
context_num = 4
inputs = torch.randn(batch_size,embed_size,1)
mats = torch.randn(batch_size,context_num,embed_size)
result = torch.bmm(mats,inputs)#batch_matrix_multiply
result.shape

torch.Size([3, 4, 1])

In [3]:
mats = torch.randn(5,2,4)
trans = mats.view(5,1,-1)
print(trans.shape)

torch.Size([5, 1, 8])


In [13]:
table = np.random.random(5)
table /=np.sum(table)
print(table)
table_list = []
count = np.round(100*table)
for word_idx,c in enumerate(count):
    table_list+=[word_idx]*int(c)
print(table_list)
print(len(table_list))

[0.154236   0.09939146 0.33624985 0.14808191 0.26204078]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
100


In [21]:
# 对于NLP，文字初始化包括的操作有以下这些
# 样本用例来自于https://github.com/wentsun12/NLP_Learning/blob/master/skip_gram/skip-gram.py
text = "I like dog i like cat i like animal dog cat animal apple cat dog like dog fish milk like dog \
cat eyes like i like apple apple i hate apple i movie book music like cat dog hate cat dog like"
FREQ = 0
# 去除低频词
def preprocess(text, FREQ):
    text = text.lower()
    words = text.split()
    #去除低频词
    word_counts = collections.Counter(words)
    trimmed_words = [word for word in words if word_counts[word] > FREQ]
    return trimmed_words
words = preprocess(text, FREQ)
print('words=',words)
# 构建数字版句子
vocab = set(words)
vocab2int = {w: c for c, w in enumerate(vocab)}#词典
int2vocab = {c: w for c, w in enumerate(vocab)}
print('vocab2int=',vocab2int)
#将文本转化为数值
int_words = [vocab2int[w] for w in words]
print('int_words=',int_words)
# 统计特定句子的各个单词频率
int_word_counts = collections.Counter(int_words)
total_count = len(int_words)
word_freqs = {w: c/total_count for w, c in int_word_counts.items()}
print('word_freqs=',word_freqs)
# 去除高频词
DELETE_WORDS = False
if DELETE_WORDS:
    t = 1e-5
    prob_drop = {w: 1-np.sqrt(t/word_freqs[w]) for w in int_word_counts}
    train_words = [w for w in int_words if np.random.random()<(1-prob_drop[w])]
else:
    train_words = int_words  # [2,4,7,...]
print('train_word=',train_words)
# 改变词语频率，方便后面负采样
word_freqs = np.array(list(word_freqs.values()))
unigram_dist = word_freqs / word_freqs.sum()
noise_dist = torch.from_numpy(unigram_dist ** (0.75) / np.sum(unigram_dist ** (0.75)))


words= ['i', 'like', 'dog', 'i', 'like', 'cat', 'i', 'like', 'animal', 'dog', 'cat', 'animal', 'apple', 'cat', 'dog', 'like', 'dog', 'fish', 'milk', 'like', 'dog', 'cat', 'eyes', 'like', 'i', 'like', 'apple', 'apple', 'i', 'hate', 'apple', 'i', 'movie', 'book', 'music', 'like', 'cat', 'dog', 'hate', 'cat', 'dog', 'like']
vocab2int= {'fish': 0, 'movie': 1, 'book': 2, 'music': 3, 'dog': 4, 'eyes': 5, 'like': 6, 'hate': 7, 'i': 8, 'apple': 9, 'milk': 10, 'cat': 11, 'animal': 12}
int_words= [8, 6, 4, 8, 6, 11, 8, 6, 12, 4, 11, 12, 9, 11, 4, 6, 4, 0, 10, 6, 4, 11, 5, 6, 8, 6, 9, 9, 8, 7, 9, 8, 1, 2, 3, 6, 11, 4, 7, 11, 4, 6]
word_freqs= {8: 0.14285714285714285, 6: 0.21428571428571427, 4: 0.16666666666666666, 11: 0.14285714285714285, 12: 0.047619047619047616, 9: 0.09523809523809523, 0: 0.023809523809523808, 10: 0.023809523809523808, 5: 0.023809523809523808, 7: 0.047619047619047616, 1: 0.023809523809523808, 2: 0.023809523809523808, 3: 0.023809523809523808}
train_word= [8, 6, 4, 8, 6, 11, 8, 6

In [None]:
# 实际下载数据库操作大致如下，代码来自https://github.com/zhangxiann/Skip-gram
url='http://mattmahoney.net/dc/'
def maybe_download(filename, expected_bytes):
    if not os.path.exists(filename):
        print('file not found')
        filename, _ = urllib.request.urlretrieve(url+filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise  Exception('Failed to verify '+filename+'. Can you get to it with a browser?')
    return filename

#filename=maybe_download('text8.zip', 90112)#31344016
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        # 读取出来的每个单词是 bytes
        data=f.read(f.namelist()[0]).split()
        # 把 bytes 转换为 str
        #data= [str(x, encoding = "utf8") for x in data]
        data = list(map(lambda x: str(x, encoding = "utf8"), data))
    return data

words=read_data('text8.zip')# 理论上words里面全是句子

print('Data size', len(words))

In [None]:
VOCABULARY_SIZE=50000
# 取出频数前 50000 的单词
counts_dict = dict((collections.Counter(words).most_common(VOCABULARY_SIZE-1)))# 词语-频数对
# 去掉频数小于 FREQ 的单词
# trimmed_words = [word for word in words if counts_dict[word] > FREQ]
# 计算 UNK 的频数 = 单词总数 - 前 50000 个单词的频数之和
counts_dict['UNK']=len(words)-np.sum(list(counts_dict.values()))
#建立词和索引的对应
idx_to_word = [word for word in counts_dict.keys()]# 把词典所有词都记录下来
word_to_idx = {word:i for i,word in enumerate(idx_to_word)}# 词语-idx对
data = [word_to_idx.get(word,word_to_idx["UNK"]) for word in words]# 一段文字变成一段idx句子，列表生成式
# 计算单词频次
total_count = len(data)
word_freqs = {w: c/total_count for w, c in counts_dict.items()}
# 以一定概率去除出现频次高的词汇
if DELETE_WORDS:
    t = 1e-5
    prob_drop = {w: 1-np.sqrt(t/word_freqs[w]) for w in data}
    data = [w for w in data if np.random.random()<(1-prob_drop[w])]
else:
    data = data
#计算词频,按照原论文转换为3/4次方
word_counts = np.array([count for count in counts_dict.values()],dtype=np.float32)
word_freqs = word_counts/np.sum(word_counts)
word_freqs = word_freqs ** (3./4.)
word_freqs = word_freqs / np.sum(word_freqs)

In [34]:
a=[22,23,24,25,26,27,22,23,19]
b=[1,2,3]
a[b]

TypeError: list indices must be integers or slices, not list