In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim

In [2]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return(log_probs)
    
#
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


def get_word_vec(word,word_to_ix,device="cpu"):
    context_ids = make_context_vector(word, word_to_ix).to(device)
    assert context_ids.is_cuda is True # returns a boolean
    return model.embeddings(context_ids)

In [3]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)
print(f"vocab: {vocab}")

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(f"data:{data[:5]}")

#

print(f"make_context_vector: {make_context_vector(data[0][0], word_to_ix)}")


vocab: {'computers.', 'computer', 'inhabit', 'As', 'evolution', 'program.', 'computational', 'direct', 'effect,', 'Computational', 'rules', 'process', 'things', 'processes.', 'study', 'processes', 'to', 'called', 'our', 'pattern', 'of', 'that', 'spirits', 'is', 'process.', 'a', 'about', 'The', 'We', 'data.', 'we', 'abstract', 'beings', 'by', 'People', 'conjure', 'spells.', 'they', 'evolve,', 'are', 'with', 'manipulate', 'programs', 'idea', 'other', 'In', 'directed', 'the', 'create'}
data:[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]
make_context_vector: tensor([28, 39, 16, 14])


In [4]:
#
device = torch.device('cuda:0')
losses = []
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), embedding_dim=10, context_size=CONTEXT_SIZE*2)
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [5]:
get_word_vec([data[0][0][0]],word_to_ix,device)

tensor([[-0.9782,  1.1687, -1.5671, -1.0935,  0.0887,  0.7096, -1.6064, -0.1839,
          0.8062, -0.3231]], device='cuda:0', grad_fn=<EmbeddingBackward>)

In [6]:
for epoch in range(10):
    total_loss = torch.Tensor([0])
    for context, target in data:
        context_ids = make_context_vector(context, word_to_ix)
        context_ids = context_ids.to(device)
        model.zero_grad()
        log_probs = model(context_ids)
        label = torch.tensor([word_to_ix[target]], dtype=torch.long)
        label = label.to(device)
        loss = loss_function(log_probs, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)

[tensor([246.4905]), tensor([134.8672]), tensor([35.3014]), tensor([3.1480]), tensor([1.5589]), tensor([1.1555]), tensor([0.9227]), tensor([0.7691]), tensor([0.6592]), tensor([0.5765])]


In [7]:
get_word_vec([data[0][0][0]],word_to_ix,device)


tensor([[-1.0039,  1.1754, -1.5717, -1.1152,  0.1006,  0.7099, -1.6132, -0.1806,
          0.8196, -0.3096]], device='cuda:0', grad_fn=<EmbeddingBackward>)

## 中文

In [8]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = [t for t in "華航諾富特飯店群聚案新增首例外包商水電工案1145確診，中央流行疫情指揮中心持續追查水電工感染源，衛福部長陳時中與疾管署長周志浩今天上午一同到台大醫院參加「清潔雙手，攜手抗疫」記者會，會前陳時中受訪透露，案1145可能曾到過諾富特飯店一館B1用餐，不排除任何感染可能。指揮中心昨天公佈的案1145，是在諾富特飯店的一館3、5、6樓層（整修樓層）工作，由於和飯店人員、旅客都沒有接觸，因此感染源不明；不過陳時中今天表示，在施工的地方大部分都是工作人員，但是可能有在B1的地方共餐。媒體追問，是否表示案1145有去過B1，是不是可能被案1120或是餐飲部的員工感染？陳時中回應，各種可能性都有，但還需要相關的釐清。"]

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)
print(f"vocab: {vocab}")

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(f"data:{data[:5]}")

#

print(f"make_context_vector: {make_context_vector(data[0][0], word_to_ix)}")

vocab: {'透', '志', '持', '任', '地', '心', '示', '管', '外', '問', '揮', '但', '前', '院', '診', '浩', '署', '到', '指', '諾', '福', '關', '共', '、', '飯', '時', '記', '者', '接', '觸', '4', '用', '潔', '疫', '於', '大', '感', '商', '館', '媒', '今', '餐', '能', '；', '中', '修', '「', '露', '續', '，', '作', '陳', '清', '華', '表', '航', '不', '否', '情', '明', '釐', '攜', '何', '有', '方', '會', '台', '可', '參', '查', '。', '0', '曾', '6', '去', '或', '追', '午', '和', '長', '增', '公', '手', '層', '店', '受', '周', '訪', '央', '3', '抗', '的', '沒', '性', '？', '染', '富', '首', '水', '天', '部', '還', '客', '雙', 'B', '1', '案', '員', '種', '過', '除', '回', '一', '新', '衛', '是', '此', '昨', '飲', '疾', '各', '與', '需', '被', '包', '聚', '源', '應', '因', '工', '施', '確', '）', '由', '5', '2', '都', '整', '（', '」', '排', '同', '上', '分', '體', '特', '佈', '例', '加', '流', '行', '人', '旅', '要', '醫', '在', '樓', '群', '相', '電'}
data:[(['華', '航', '富', '特'], '諾'), (['航', '諾', '特', '飯'], '富'), (['諾', '富', '飯', '店'], '特'), (['富', '特', '店', '群'], '飯'), (['特', '飯', '群', '聚'], '店')]
make_context_vector: tensor([ 53,  55,  9

In [9]:
#
device = torch.device('cuda:0')
losses = []
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), embedding_dim=10, context_size=CONTEXT_SIZE*2)
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [10]:
for epoch in range(10):
    total_loss = torch.Tensor([0])
    for context, target in data:
        context_ids = make_context_vector(context, word_to_ix)
        context_ids = context_ids.to(device)
        model.zero_grad()
        log_probs = model(context_ids)
        label = torch.tensor([word_to_ix[target]], dtype=torch.long)
        label = label.to(device)
        loss = loss_function(log_probs, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)

[tensor([1458.3297]), tensor([890.8279]), tensor([376.6428]), tensor([125.4319]), tensor([141.4768]), tensor([135.7821]), tensor([59.3746]), tensor([89.7570]), tensor([68.6293]), tensor([14.3338])]


In [12]:
print(f"text:{[data[0][0][0]]}, vec: {get_word_vec([data[0][0][0]],word_to_ix,device)}")

text:['華'], vec: tensor([[-0.6937, -0.4363,  0.2154, -2.3111, -1.4362,  0.1475,  0.5455,  1.2378,
         -1.5731,  0.3879]], device='cuda:0', grad_fn=<EmbeddingBackward>)
