In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim

In [4]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return(log_probs)
    
#
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


def get_word_vec(word,word_to_ix,device="cpu"):
    context_ids = make_context_vector(word, word_to_ix).to(device)
    assert context_ids.is_cuda is True # returns a boolean
    return model.embeddings(context_ids)

In [3]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)
print(f"vocab: {vocab}")

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(f"data:{data[:5]}")

#

print(f"make_context_vector: {make_context_vector(data[0][0], word_to_ix)}")


vocab: {'of', 'spells.', 'process.', 'direct', 'beings', 'evolve,', 'We', 'inhabit', 'the', 'processes.', 'our', 'processes', 'directed', 'idea', 'computational', 'As', 'rules', 'study', 'is', 'spirits', 'they', 'things', 'People', 'by', 'other', 'data.', 'manipulate', 'evolution', 'with', 'Computational', 'are', 'computers.', 'pattern', 'that', 'to', 'The', 'effect,', 'computer', 'a', 'programs', 'we', 'In', 'called', 'abstract', 'process', 'conjure', 'program.', 'about', 'create'}
data:[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]
make_context_vector: tensor([ 6, 30, 34, 17])


In [4]:
#
device = torch.device('cuda:0')
losses = []
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), embedding_dim=10, context_size=CONTEXT_SIZE*2)
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [5]:
get_word_vec([data[0][0][0]],word_to_ix,device)

tensor([[-0.2045,  1.0123, -0.2335, -1.1488,  1.3003,  0.2334,  1.1850,  0.7958,
          0.4326,  0.1962]], device='cuda:0', grad_fn=<EmbeddingBackward>)

In [6]:
for epoch in range(10):
    total_loss = torch.Tensor([0])
    for context, target in data:
        context_ids = make_context_vector(context, word_to_ix)
        context_ids = context_ids.to(device)
        model.zero_grad()
        log_probs = model(context_ids)
        label = torch.tensor([word_to_ix[target]], dtype=torch.long)
        label = label.to(device)
        loss = loss_function(log_probs, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)

[tensor([241.7639]), tensor([146.7071]), tensor([55.8986]), tensor([11.7638]), tensor([3.5352]), tensor([1.4778]), tensor([1.1253]), tensor([0.9112]), tensor([0.7662]), tensor([0.6610])]


In [7]:
get_word_vec([data[0][0][0]],word_to_ix,device)


tensor([[-0.2357,  1.0574, -0.2192, -1.1787,  1.3493,  0.2485,  1.2024,  0.7948,
          0.4299,  0.1936]], device='cuda:0', grad_fn=<EmbeddingBackward>)

## 中文

In [8]:
!pip3 install jieba




In [6]:
import jieba

CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
text = "華航諾富特飯店群聚案新增首例外包商水電工案1145確診，中央流行疫情指揮中心持續追查水電工感染源，衛福部長陳時中與疾管署長周志浩今天上午一同到台大醫院參加「清潔雙手，攜手抗疫」記者會，會前陳時中受訪透露，案1145可能曾到過諾富特飯店一館B1用餐，不排除任何感染可能。指揮中心昨天公佈的案1145，是在諾富特飯店的一館3、5、6樓層（整修樓層）工作，由於和飯店人員、旅客都沒有接觸，因此感染源不明；不過陳時中今天表示，在施工的地方大部分都是工作人員，但是可能有在B1的地方共餐。媒體追問，是否表示案1145有去過B1，是不是可能被案1120或是餐飲部的員工感染？陳時中回應，各種可能性都有，但還需要相關的釐清。"
# raw_text = [t for t in text]
raw_text = list(jieba.cut(text))


# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)
print(f"vocab: {vocab}")

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(f"data:{data[:5]}")

#

print(f"make_context_vector: {make_context_vector(data[0][0], word_to_ix)}")

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.353 seconds.
Prefix dict has been built successfully.


vocab: {'（', '1145', '中央', '今天上午', '表示', '回應', '疾管署', '特飯', '共餐', '需要', '攜手', '包商', '但是', '到', '清潔', '旅客', '「', '媒體', '各種', '大部分', '一同', '記者', '追問', '、', '確診', '整修', '飯店', '排除', '用餐', '新增', '在', '施工', '是否', '的', '中心', '都', '會', '餐飲部', '於', '樓層', '5', '疫情', '參加', '案', '任何', '透露', '可能', '持續', '流行', '人員', '工案', '？', '與', '和', '追查', '昨天', '。', '清', '有', '受訪', '周志浩', '時', '特飯店', '是', '被案', '今天', '不過', '；', '陳', '因此', '抗疫', '去過', '1120', '6', '可能性', '源', '或是', '中', '不', '3', '沒', '相關', '諾富', '店', '由', '地方', '水電', '員工', '水電工', '指揮', '長', '一館', '例外', '公', '）', '台大', '過諾富', '會前', '不明', '群聚', '是不是', '工作', '但還', '釐', '雙手', '曾到', '衛福部長', '佈', '醫院', '，', 'B1', '華航諾富', '接觸', '首', '」', '感染'}
data:[(['華航諾富', '特飯', '群聚', '案'], '店'), (['特飯', '店', '案', '新增'], '群聚'), (['店', '群聚', '新增', '首'], '案'), (['群聚', '案', '首', '例外'], '新增'), (['案', '新增', '例外', '包商'], '首')]
make_context_vector: tensor([111,   7,  99,  43])


In [10]:
#
embedding_dim_num = 30
device = torch.device('cuda:0')
losses = []
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), embedding_dim=embedding_dim_num, context_size=CONTEXT_SIZE*2)
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [11]:
for epoch in range(10):
    total_loss = torch.Tensor([0])
    for context, target in data:
        context_ids = make_context_vector(context, word_to_ix)
        context_ids = context_ids.to(device)
        model.zero_grad()
        log_probs = model(context_ids)
        label = torch.tensor([word_to_ix[target]], dtype=torch.long)
        label = label.to(device)
        loss = loss_function(log_probs, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)

[tensor([856.2447]), tensor([522.9579]), tensor([171.6826]), tensor([54.7270]), tensor([8.9709]), tensor([0.7604]), tensor([0.3382]), tensor([0.2860]), tensor([0.2510]), tensor([0.2251])]


In [12]:
print(f"text:{[data[0][0][0]]}, vec: {get_word_vec([data[0][0][0]],word_to_ix,device)}")

text:['華航諾富'], vec: tensor([[ 0.0701,  0.5675,  1.7060, -0.5189,  0.3842, -0.9304, -0.4696, -0.2314,
         -1.7867,  0.2036, -0.5264,  0.2282, -0.7893,  1.0054,  1.5950, -1.5383,
          1.0659, -0.4129, -1.0124, -0.1491, -1.4219,  0.7584, -0.8240, -0.3780,
          1.0067,  0.8340,  0.4849, -0.4860,  0.7368, -0.0794]],
       device='cuda:0', grad_fn=<EmbeddingBackward>)


In [13]:
a = get_word_vec([data[0][0][0]],word_to_ix,device)
b = get_word_vec([data[0][0][1]],word_to_ix,device)
torch.cdist(a, b, p=embedding_dim_num)

tensor([[2.5567]], device='cuda:0', grad_fn=<CdistBackward>)

In [14]:
import torch
PATH_MODEL_SAVE = "./save_word2vec.pt"

# Save:
torch.save(model.state_dict(), PATH_MODEL_SAVE)

In [10]:
# Load:
CONTEXT_SIZE = 2
embedding_dim_num =30
PATH_MODEL_SAVE = "./save_word2vec.pt"
device = torch.device('cuda:0')


model = CBOW(len(vocab), embedding_dim=embedding_dim_num, context_size=CONTEXT_SIZE*2)
model.load_state_dict(torch.load(PATH_MODEL_SAVE))
model.to(device)
get_word_vec(["華航諾富"],word_to_ix,device)


tensor([[ 0.6609,  0.0558, -0.8872,  0.3004,  0.1367,  0.8624,  1.3616, -0.6265,
         -0.2723,  1.1247,  0.3074,  0.4339,  0.3673, -0.1834, -0.1669, -1.6277,
         -0.6405,  1.4663, -1.0495, -1.2386, -0.5605,  0.8007,  0.0929,  0.0429,
         -1.1079, -0.3637, -1.1880, -0.8322, -0.5966,  1.3929]],
       device='cuda:0', grad_fn=<EmbeddingBackward>)