## Example Sentence

In [1]:
example_sentence = """In the case of CBOW, one word is eliminated, and the word is predicted from surrounding words.
Therefore, it takes multiple input vectors as inputs to the model and creates one output vector.
In contrast, Skip-Gram learns by removing all words except one word and predicting the surrounding words in the context through one word. 
So, it takes a vector as input and produces multiple output vectors.
CBOW and Skip-Gram are different.""".split()

# CBOW

In [2]:
import torch
import torch.nn as nn

def CBOW_make_context_vector(context, word_to_idx):
    idxs = [word_to_idx[word] for word in context]
    return torch.LongTensor(idxs)

def CBOW_make_data(sentence):
    data = []
    for i in range(2, len(example_sentence) - 2):
        context = [example_sentence[i - 2], example_sentence[i - 1], example_sentence[i + 1], example_sentence[i + 2]]
        target = example_sentence[i]
        data.append((context, target))
    return data

class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.layer1 = nn.Linear(embedding_dim, 64)
        self.activation1 = nn.ReLU()
        
        self.layer2 = nn.Linear(64, vocab_size)
        self.activation2 = nn.LogSoftmax(dim = -1)

    def forward(self, X):
        X = sum(self.embeddings(X)).view(1, -1)
        X = self.activation1(self.layer1(X))
        X = self.activation2(self.layer2(X))
        return X
        

EMBEDDING_DIM = 128
Epochs = 1000

vocab = set(example_sentence)
vocab_size = len(example_sentence)

word_to_idx = {word : idx for idx, word in enumerate(vocab)}
idx_to_word = {idx : word for idx, word in enumerate(vocab)}

data = CBOW_make_data(example_sentence)

model = CBOW(vocab_size, EMBEDDING_DIM)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

for epoch in range(Epochs):
    total_loss = 0
    for context, target in data:
        context_vector = CBOW_make_context_vector(context, word_to_idx)
        log_probs = model(context_vector)
        total_loss += loss_function(log_probs, torch.tensor([word_to_idx[target]]))
    if epoch % 100 == 0:
        print(f'Epoch = {epoch} | Loss = {total_loss}')
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()
    
test_data = ['CBOW', 'and', 'are', 'different.']
test_vector = CBOW_make_context_vector(test_data, word_to_idx)
result = model(test_vector)
print(f"Prediction : {idx_to_word[torch.argmax(result[0]).item()]}")

Epoch = 0 | Loss = 302.3144836425781
Epoch = 100 | Loss = 223.91552734375
Epoch = 200 | Loss = 152.63931274414062
Epoch = 300 | Loss = 94.65900421142578
Epoch = 400 | Loss = 54.733489990234375
Epoch = 500 | Loss = 31.783037185668945
Epoch = 600 | Loss = 19.998750686645508
Epoch = 700 | Loss = 13.860055923461914
Epoch = 800 | Loss = 10.344636917114258
Epoch = 900 | Loss = 8.128987312316895
Prediction : Skip-Gram


# Skip-gram

In [3]:
import torch
import torch.nn as nn

EMBEDDING_DIM = 128
EPOCHS = 1000
CONTEXT_SIZE = 4

def Skip_gram_make_context_vector(context, word_to_idx):
    idxs = word_to_idx[context]
    return torch.tensor(idxs, dtype=torch.long)

def Skip_gram_make_data(sentence):
    data = []
    for i in range(2, len(example_sentence) - 2):
        # CBOW와 context, target 분리 작업 과정이 반대
        context = example_sentence[i]
        target = [example_sentence[i - 2], example_sentence[i - 1], example_sentence[i + 1], example_sentence[i + 2]]
        data.append((context, target))
    return data

class SKIP_GRAM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(SKIP_GRAM, self).__init__()
        self.context_size = context_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.layer1 = nn.Linear(embedding_dim, 64)
        self.activation1 = nn.ReLU()

        self.layer2 = nn.Linear(64, vocab_size * context_size)
        self.activation2 = nn.LogSoftmax(dim = -1)

    def forward(self, X):
        X = self.embeddings(X)
        X = self.activation1(self.layer1(X))
        X = self.activation2(self.layer2(X))
        return X.view(self.context_size, vocab_size)

vocab = set(example_sentence)
vocab_size = len(example_sentence)

word_to_idx = {word : idx for idx, word in enumerate(vocab)}
idx_to_word = {idx : word for idx, word in enumerate(vocab)}

data = Skip_gram_make_data(example_sentence)

model = SKIP_GRAM(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

for epoch in range(EPOCHS):
    total_loss = 0
    for context, target in data:
        context_vector = Skip_gram_make_context_vector(context, word_to_idx)
        log_probs = model(context_vector)
        total_loss += loss_function(log_probs, torch.LongTensor([word_to_idx[t] for t in target]))

    if epoch % 100 == 0:
        print(f"Epoch : {epoch} | Loss : {total_loss}")
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

test_data = 'Skip-Gram'
test_vector = Skip_gram_make_context_vector(test_data, word_to_idx)
result = model(test_vector)
print(f"Prediction : {[idx_to_word[torch.argmax(r).item()] for r in result]}")

Epoch : 0 | Loss : 389.0058898925781
Epoch : 100 | Loss : 328.5797119140625
Epoch : 200 | Loss : 232.27313232421875
Epoch : 300 | Loss : 163.70765686035156
Epoch : 400 | Loss : 141.14642333984375
Epoch : 500 | Loss : 134.49583435058594
Epoch : 600 | Loss : 132.18878173828125
Epoch : 700 | Loss : 131.12271118164062
Epoch : 800 | Loss : 130.5124969482422
Epoch : 900 | Loss : 130.1202850341797
Prediction : ['CBOW', 'and', 'learns', 'by']
