<a href="https://colab.research.google.com/github/1190303311/AI/blob/main/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud

from collections import Counter
import numpy as np
import random
import math

import copy
import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

# Seed every RNG used in this notebook so runs are repeatable.
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)

# Hyperparameters.
C=3  # context radius: C words are taken on each side of the center word
K=15  # negative samples drawn per positive (context) word
# NOTE(review): `epochs` and `lr` are not read by the training cell visible in
# this notebook (it uses range(1) and lr=1e-3) — confirm which values are intended.
epochs = 2
MAX_VOCAB_SIZE = 10000  # vocabulary size including the catch-all 'UNK' entry
EMBEDDING_SIZE = 100  # dimensionality of each word vector
batch_size = 64
lr = 0.2

with open('text8.train.txt') as f:
    text = f.read() # read the whole corpus into memory

text = text.lower().split() # lower-case and split on whitespace into a list of tokens
# Keep the (MAX_VOCAB_SIZE - 1) most frequent tokens; key is the word, value is its count.
vocab_dict = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))
# Every remaining (rare) token is folded into the single 'UNK' entry, whose count
# is the number of tokens not covered by the entries above.
vocab_dict['UNK'] = len(text) - np.sum(list(vocab_dict.values()))
# Index mappings follow vocab_dict insertion order, so 'UNK' receives the last index.
word2idx = {word:i for i, word in enumerate(vocab_dict.keys())}
idx2word = {i:word for i, word in enumerate(vocab_dict.keys())}
word_counts = np.array([count for count in vocab_dict.values()], dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
# Raise unigram frequencies to the 3/4 power; the result is used as (unnormalized)
# sampling weights for negative words.
word_freqs = word_freqs ** (3./4.)

In [None]:
class WordEmbeddingDataset(tud.Dataset):
  """Skip-gram training samples: (center word, context words, negative words).

  Each item is built around one corpus position. The context is the
  ``window_size`` tokens on either side of that position (indices wrap around
  the ends of the corpus), and ``num_negatives`` negative words are drawn for
  every context word.

  Args:
    text: iterable of tokens (the corpus).
    word2idx: mapping word -> integer id; must contain the key 'UNK', which is
      used for every token missing from the mapping.
    idx2word: mapping integer id -> word.
    word_freqs: per-id sampling weights for negative words (need not sum to 1).
    word_counts: per-id raw counts; stored but not used by this class.
    window_size: context radius. ``None`` (default) uses the module-level ``C``
      at access time, which is the original behaviour.
    num_negatives: negatives per context word. ``None`` (default) uses the
      module-level ``K`` at access time, which is the original behaviour.
  """

  def __init__(self, text, word2idx, idx2word, word_freqs, word_counts,
               window_size=None, num_negatives=None):
    super(WordEmbeddingDataset, self).__init__()
    # Look the fallback id up once instead of once per token.
    unk_id = word2idx['UNK']
    self.text_encoded = torch.LongTensor([word2idx.get(word, unk_id) for word in text])
    self.word2idx = word2idx
    self.idx2word = idx2word
    # Original attribute name, kept so existing readers of `id2word` still work.
    self.id2word = idx2word
    self.word_freqs = torch.Tensor(word_freqs)
    self.word_counts = torch.Tensor(word_counts)
    self.window_size = window_size
    self.num_negatives = num_negatives

  def __len__(self):
    """Return the number of tokens in the encoded corpus."""
    return len(self.text_encoded)

  def __getitem__(self, idx):
    """Return ``(center_word, pos_words, neg_words)`` for corpus position ``idx``.

    ``center_word`` is a 0-d tensor, ``pos_words`` has ``2 * window`` entries and
    ``neg_words`` has ``negatives * 2 * window`` entries. Negatives are sampled
    with replacement from ``word_freqs`` and are not filtered, so they may
    coincide with the center or context words.
    """
    # Resolve the globals lazily so later changes to C / K are still honoured.
    window = C if self.window_size is None else self.window_size
    negatives_per_pos = K if self.num_negatives is None else self.num_negatives

    corpus_len = len(self.text_encoded)
    center_word = self.text_encoded[idx]
    # Neighbours on both sides of idx; the modulo wraps positions that fall off
    # either end of the corpus back to the other end.
    pos_indices = [i % corpus_len
                   for i in range(idx - window, idx + window + 1) if i != idx]
    pos_words = self.text_encoded[pos_indices]

    neg_words = torch.multinomial(
        self.word_freqs, negatives_per_pos * pos_words.shape[0], True)

    return center_word, pos_words, neg_words

# Build the skip-gram dataset from the module-level corpus and vocabulary, then
# wrap it in a DataLoader that shuffles and batches samples.
dataset = WordEmbeddingDataset(text, word2idx, idx2word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset, batch_size, shuffle=True)
# Sanity check: pull one sample directly from the dataset (not from the loader).
next(iter(dataset))


In [None]:
# Scratch cell: check what ndarray.tolist() returns for a one-element array.
a = np.array([1])
a.tolist()

In [None]:
# Inspect one (center, positives, negatives) sample straight from the dataset.
next(iter(dataset))

In [None]:
class EmbeddingModel(nn.Module):
  """Skip-gram with negative sampling, backed by two embedding tables.

  ``in_embed`` is looked up for center words and ``out_embed`` for context and
  negative words.
  """

  def __init__(self, vocab_size, embed_size):
    super().__init__()
    self.vocab_size, self.embed_size = vocab_size, embed_size

    self.in_embed = nn.Embedding(vocab_size, embed_size)
    self.out_embed = nn.Embedding(vocab_size, embed_size)

  def forward(self, input_labels, pos_labels, neg_labels):
    """Return the per-sample negative-sampling loss.

    Args:
      input_labels: center word ids, shape (batch,).
      pos_labels: context word ids, shape (batch, P).
      neg_labels: negative word ids, shape (batch, N).

    Returns:
      Tensor of shape (batch,): minus the sum of log-sigmoid scores of the
      positive pairs and of the negated scores of the negative pairs.
    """
    # (batch, embed, 1) column vector so bmm yields one dot product per row.
    center = self.in_embed(input_labels).unsqueeze(2)

    pos_scores = torch.bmm(self.out_embed(pos_labels), center).squeeze(2)
    # Negating the center vector flips the sign of every negative-pair score.
    neg_scores = torch.bmm(self.out_embed(neg_labels), -center).squeeze(2)

    log_likelihood = F.logsigmoid(pos_scores).sum(1) + F.logsigmoid(neg_scores).sum(1)
    return -log_likelihood

  def input_embedding(self):
    """Return the ``in_embed`` weight matrix as a NumPy array on the CPU."""
    weights = self.in_embed.weight
    return weights.cpu().detach().numpy()

# Train the skip-gram model on `dataloader`, then export the learned vectors.
model = EmbeddingModel(MAX_VOCAB_SIZE, EMBEDDING_SIZE)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)
# NOTE(review): the learning rate and epoch count are hard-coded here; the
# module-level `lr` and `epochs` settings are not used — confirm intent.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for e in range(1):
  for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
    # Move the batch to the same device as the model.
    input_labels = input_labels.long().to(device)
    pos_labels = pos_labels.long().to(device)
    neg_labels = neg_labels.long().to(device)
    
    optimizer.zero_grad()
    # The model returns one loss value per sample; average them over the batch.
    loss = model(input_labels, pos_labels, neg_labels).mean()
    loss.backward()

    optimizer.step()

    # Report progress every 100 batches.
    if i%100==0:
      print('epoch',e,'iteration', i,loss.item())

# Keep the trained vectors in a module-level array for find_nearest().
embedding_weights = model.input_embedding()
# Persist all model parameters; the file name embeds EMBEDDING_SIZE.
torch.save(model.state_dict(), "embedding-{}.th".format(EMBEDDING_SIZE))

In [None]:
def find_nearest(word, topn=10, *, embeddings=None, vocab=None, inverse_vocab=None):
  """Return the ``topn`` vocabulary words closest to ``word`` by cosine distance.

  The query word itself has distance 0 and therefore normally comes first.

  Args:
    word: query word; must be a key of the vocabulary.
    topn: number of neighbours to return (default 10, the original behaviour).
    embeddings: 2-D array with one row per word id. Defaults to the
      module-level ``embedding_weights``.
    vocab: mapping word -> row index. Defaults to the module-level ``word2idx``.
    inverse_vocab: mapping row index -> word. Defaults to the module-level
      ``idx2word``.

  Returns:
    List of words ordered from nearest to farthest.

  Raises:
    KeyError: if ``word`` is not in the vocabulary.
  """
  matrix = np.asarray(embedding_weights if embeddings is None else embeddings)
  vocab = word2idx if vocab is None else vocab
  inverse_vocab = idx2word if inverse_vocab is None else inverse_vocab

  query = matrix[vocab[word]]
  # One vectorized pass over the whole table instead of a Python-level loop
  # calling a distance function once per row.
  norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
  # All-zero rows give 0/0 = nan; argsort places nan last, so they rank farthest.
  with np.errstate(divide='ignore', invalid='ignore'):
    cos_dis = 1.0 - (matrix @ query) / norms
  return [inverse_vocab[i] for i in cos_dis.argsort()[:topn]]

# Spot-check the trained vectors by printing the nearest neighbours of a few probe words.
for word in ['two', 'america', 'computer']:
  print(word, find_nearest(word))


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud

from collections import Counter
import numpy as np
import random
import math

import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

# Seed every RNG used in this notebook so runs are repeatable.
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)

# Hyperparameters.
C=3  # context radius: C words are taken on each side of the center word
K=15  # number of negative words sampled for each training item
# NOTE(review): `epochs` and `lr` are not read by the training cell visible in
# this notebook (it uses range(1) and lr=1e-3) — confirm which values are intended.
epochs=2
MAX_VOCAB_SIZE=10000  # vocabulary size including the catch-all 'UNK' entry
EMBEDDING_SIZE=100  # dimensionality of each word vector
batch_size=32
lr=0.2

with open('text8.train.txt') as f:
    text = f.read() # read the whole corpus into memory

text = text.lower().split() # lower-case and split on whitespace into a list of tokens
# Keep the (MAX_VOCAB_SIZE - 1) most frequent tokens; key is the word, value is its count.
vocab_dict = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))
# Every remaining (rare) token is folded into the single 'UNK' entry, whose count
# is the number of tokens not covered by the entries above.
vocab_dict['UNK'] = len(text) - np.sum(list(vocab_dict.values()))
# Index mappings follow vocab_dict insertion order, so 'UNK' receives the last index.
word2idx = {word:i for i, word in enumerate(vocab_dict.keys())}
idx2word = {i:word for i, word in enumerate(vocab_dict.keys())}
word_counts = np.array([count for count in vocab_dict.values()], dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
# Raise unigram frequencies to the 3/4 power; the result is used as (unnormalized)
# sampling weights for negative words.
word_freqs = word_freqs ** (3./4.)

In [None]:
class CbowEmbeddingDataset(tud.Dataset):
  """Training samples for CbowModel: (center word, context words, negative words).

  Each item is built around one corpus position. The context is the
  ``window_size`` tokens on either side of that position (indices wrap around
  the ends of the corpus), and ``num_negatives`` negative words are drawn per
  item.

  Args:
    text: iterable of tokens (the corpus).
    word2idx: mapping word -> integer id; must contain the key 'UNK', which is
      used for every token missing from the mapping.
    idx2word: mapping integer id -> word.
    word_freqs: per-id sampling weights for negative words (need not sum to 1).
    word_counts: per-id raw counts; stored but not used by this class.
    window_size: context radius. ``None`` (default) uses the module-level ``C``
      at access time, which is the original behaviour.
    num_negatives: negatives per item. ``None`` (default) uses the module-level
      ``K`` at access time, which is the original behaviour.
  """

  def __init__(self, text, word2idx, idx2word, word_freqs, word_counts,
               window_size=None, num_negatives=None):
    super(CbowEmbeddingDataset, self).__init__()
    # Look the fallback id up once instead of once per token.
    unk_id = word2idx['UNK']
    self.text_encoded = torch.LongTensor([word2idx.get(word, unk_id) for word in text])
    self.word2idx = word2idx
    self.idx2word = idx2word
    self.word_freqs = torch.Tensor(word_freqs)
    self.word_counts = torch.Tensor(word_counts)
    self.window_size = window_size
    self.num_negatives = num_negatives

  def __len__(self):
    """Return the number of tokens in the encoded corpus."""
    return len(self.text_encoded)

  def __getitem__(self, idx):
    """Return ``(center_word, pos_words, neg_words)`` for corpus position ``idx``.

    ``center_word`` is a 0-d tensor, ``pos_words`` has ``2 * window`` entries and
    ``neg_words`` has ``num_negatives`` entries. Negatives are sampled with
    replacement from ``word_freqs`` and are not filtered, so they may coincide
    with the center or context words.
    """
    # Per-sample values stay local: storing them on `self` would turn the
    # dataset into shared mutable state overwritten by every access.
    window = C if self.window_size is None else self.window_size
    negatives = K if self.num_negatives is None else self.num_negatives

    corpus_len = len(self.text_encoded)
    center_word = self.text_encoded[idx]
    # Neighbours on both sides of idx; the modulo wraps positions that fall off
    # either end of the corpus back to the other end.
    pos_indices = [i % corpus_len
                   for i in range(idx - window, idx + window + 1) if i != idx]
    pos_words = self.text_encoded[pos_indices]

    neg_words = torch.multinomial(self.word_freqs, negatives, True)

    return center_word, pos_words, neg_words

# Build the CBOW dataset from the module-level corpus and vocabulary, then wrap
# it in a DataLoader that shuffles and batches samples.
dataset = CbowEmbeddingDataset(text, word2idx, idx2word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset, batch_size, shuffle=True)
# Sanity check: pull one sample directly from the dataset (not from the loader).
next(iter(dataset))



(tensor(4813),
 tensor([  50, 9999,  393, 3139,   11,    5]),
 tensor([   5,   75,   87,   35, 1426,   13,  800, 3077, 1624, 1006, 1094,  278,
         3221,  456,    8]))

In [None]:
class CbowModel(nn.Module):
  """Negative-sampling model scoring context words against center and negative words.

  ``in_embed`` is looked up for the context words; ``out_embed`` is looked up
  for the center word and for the negative words.
  """

  def __init__(self, vocab_size, embed_size):
    super(CbowModel, self).__init__()
    self.vocab_size = vocab_size
    self.embed_size = embed_size
    self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)
    self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)

  def forward(self, center_ids, back_ids, neg_ids):
    """Return the per-sample negative-sampling loss.

    Args:
      center_ids: center word ids, shape (batch,).
      back_ids: context word ids, shape (batch, 2C).
      neg_ids: negative word ids, shape (batch, K).

    Returns:
      Tensor of shape (batch,): minus the sum of log-sigmoid scores of every
      context/center pair and of the negated scores of every context/negative
      pair.
    """
    center_embed = self.out_embed(center_ids)  # (batch, embed)
    neg_embed = self.out_embed(neg_ids)        # (batch, K, embed)
    back_embed = self.in_embed(back_ids)       # (batch, 2C, embed)

    center_embed = center_embed.unsqueeze(2)   # (batch, embed, 1)

    pos_dot = torch.bmm(back_embed, center_embed)  # (batch, 2C, 1)
    pos_dot = pos_dot.squeeze(2)                   # (batch, 2C)

    neg_embed = neg_embed.transpose(1, 2)          # (batch, embed, K)
    neg_dot = torch.bmm(back_embed, -neg_embed)    # (batch, 2C, K)
    # Flatten using the tensor's own batch dimension rather than a global
    # batch size, so batches of any size (e.g. the last, shorter batch of an
    # epoch) are handled correctly.
    neg_dot = neg_dot.reshape(neg_dot.size(0), -1)  # (batch, 2C*K)

    pos_prob = F.logsigmoid(pos_dot).sum(1)
    neg_prob = F.logsigmoid(neg_dot).sum(1)

    loss = pos_prob + neg_prob

    return -loss

  def input_embedding(self):
    """Return the ``out_embed`` weight matrix as a NumPy array on the CPU.

    NOTE(review): despite the method name, this exports ``out_embed`` (the
    table used for center and negative words), not ``in_embed`` — confirm this
    is the table intended for downstream use.
    """
    return self.out_embed.weight.cpu().detach().numpy()

# Train the CBOW model on `dataloader`, then export the learned vectors.
model = CbowModel(MAX_VOCAB_SIZE, EMBEDDING_SIZE)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)
# NOTE(review): the learning rate and epoch count are hard-coded here; the
# module-level `lr` and `epochs` settings are not used — confirm intent.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for e in range(1):
  for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
    # Move the batch to the same device as the model.
    input_labels = input_labels.long().to(device)
    pos_labels = pos_labels.long().to(device)
    neg_labels = neg_labels.long().to(device)
    
    optimizer.zero_grad()
    # The model returns one loss value per sample; average them over the batch.
    loss = model(input_labels, pos_labels, neg_labels).mean()
    loss.backward()

    optimizer.step()

    # Report progress every 100 batches.
    if i%100==0:
      print('epoch',e,'iteration', i,loss.item())

# Keep the trained vectors in a module-level array for find_nearest().
embedding_weights = model.input_embedding()
# Persist all model parameters; the file name embeds EMBEDDING_SIZE.
# NOTE(review): the skip-gram cell saves with the same file-name pattern, so this
# overwrites that checkpoint when both run in one directory — confirm intent.
torch.save(model.state_dict(), "embedding-{}.th".format(EMBEDDING_SIZE))

cpu
epoch 0 iteration 0 379.6021423339844
epoch 0 iteration 100 358.88739013671875
epoch 0 iteration 200 350.2727966308594
epoch 0 iteration 300 312.32708740234375
epoch 0 iteration 400 319.50543212890625
epoch 0 iteration 500 311.83587646484375
epoch 0 iteration 600 315.26409912109375
epoch 0 iteration 700 282.15716552734375
epoch 0 iteration 800 272.6908264160156
epoch 0 iteration 900 258.32196044921875
epoch 0 iteration 1000 257.9416809082031
epoch 0 iteration 1100 263.0829162597656
epoch 0 iteration 1200 242.3739776611328
epoch 0 iteration 1300 230.03262329101562
epoch 0 iteration 1400 253.81820678710938
epoch 0 iteration 1500 219.8492431640625
epoch 0 iteration 1600 224.54457092285156
epoch 0 iteration 1700 212.82162475585938
epoch 0 iteration 1800 195.98414611816406
epoch 0 iteration 1900 187.49118041992188
epoch 0 iteration 2000 198.58164978027344
epoch 0 iteration 2100 180.37347412109375
epoch 0 iteration 2200 167.94198608398438
epoch 0 iteration 2300 185.8711395263672
epoch 0 

RuntimeError: ignored

In [None]:
def find_nearest(word, topn=10, *, embeddings=None, vocab=None, inverse_vocab=None):
  """Return the ``topn`` vocabulary words closest to ``word`` by cosine distance.

  The query word itself has distance 0 and therefore normally comes first.

  Args:
    word: query word; must be a key of the vocabulary.
    topn: number of neighbours to return (default 10, the original behaviour).
    embeddings: 2-D array with one row per word id. Defaults to the
      module-level ``embedding_weights``.
    vocab: mapping word -> row index. Defaults to the module-level ``word2idx``.
    inverse_vocab: mapping row index -> word. Defaults to the module-level
      ``idx2word``.

  Returns:
    List of words ordered from nearest to farthest.

  Raises:
    KeyError: if ``word`` is not in the vocabulary.
  """
  matrix = np.asarray(embedding_weights if embeddings is None else embeddings)
  vocab = word2idx if vocab is None else vocab
  inverse_vocab = idx2word if inverse_vocab is None else inverse_vocab

  query = matrix[vocab[word]]
  # One vectorized pass over the whole table instead of a Python-level loop
  # calling a distance function once per row.
  norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
  # All-zero rows give 0/0 = nan; argsort places nan last, so they rank farthest.
  with np.errstate(divide='ignore', invalid='ignore'):
    cos_dis = 1.0 - (matrix @ query) / norms
  return [inverse_vocab[i] for i in cos_dis.argsort()[:topn]]

# Spot-check the trained vectors by printing the nearest neighbours of a few probe words.
for word in ['two', 'america', 'computer']:
  print(word, find_nearest(word))


NameError: ignored