<a href="https://colab.research.google.com/github/BDH-teacher/Deep_Learning_Audit_code/blob/main/CBOWs_SkipGram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# The code implementation is from https://janghan-kor.tistory.com/586
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7910441fbd10>

In [2]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split() ## This is our corpus

# By deriving a set from `raw_text`, we deduplicate the array
'''
첫 번째 작업. vocavluary를 만듦. (0, 0, 0, 0, 0, 0, 0) 단어를 몇 번째 index만 1로 바꾸는 One-hot encoding을 사용.

we = (1, 0, 0, 0, 0, 0, 0)
are = (0, 1, 0, 0, 0, 0, 0)
about = (0, 0, 1, 0, 0, 0, 0)
to = (0, 0, 0, 1, 0, 0, 0)
이렇게 저장하는 것은 힘듦.
'''
# text를 그냥 split하면 모든 것이 다 나옴.
vocab = set(text) # set을 사용해서 vocalvulary로 만듦.

vocab_size = len(vocab)
print('vocab_size:', vocab_size)


# 앞에서부터 하나씩 배정
w2i = {w: i for i, w in enumerate(vocab)}
i2w = {i: w for i, w in enumerate(vocab)}

# 자연어에서 가장 먼저하는 것이 이러한 vocalvulary를 만드는 것이다.
print(w2i)
print(i2w)

vocab_size: 49
{'programs': 0, 'Computational': 1, 'that': 2, 'by': 3, 'People': 4, 'abstract': 5, 'of': 6, 'process': 7, 'the': 8, 'computational': 9, 'are': 10, 'evolution': 11, 'our': 12, 'idea': 13, 'a': 14, 'to': 15, 'manipulate': 16, 'things': 17, 'inhabit': 18, 'processes.': 19, 'As': 20, 'The': 21, 'rules': 22, 'about': 23, 'direct': 24, 'create': 25, 'conjure': 26, 'pattern': 27, 'program.': 28, 'We': 29, 'process.': 30, 'spells.': 31, 'computers.': 32, 'data.': 33, 'is': 34, 'processes': 35, 'evolve,': 36, 'we': 37, 'they': 38, 'spirits': 39, 'with': 40, 'directed': 41, 'called': 42, 'study': 43, 'other': 44, 'In': 45, 'effect,': 46, 'beings': 47, 'computer': 48}
{0: 'programs', 1: 'Computational', 2: 'that', 3: 'by', 4: 'People', 5: 'abstract', 6: 'of', 7: 'process', 8: 'the', 9: 'computational', 10: 'are', 11: 'evolution', 12: 'our', 13: 'idea', 14: 'a', 15: 'to', 16: 'manipulate', 17: 'things', 18: 'inhabit', 19: 'processes.', 20: 'As', 21: 'The', 22: 'rules', 23: 'about',

In [6]:
# context window size is two


# input : t-2, t-1, t+1, t+2
# Output : t
# 각 단어가 들어오면 목적으로 하는 Output의 값을 정하면 양쪽 2개씩의 값이 필요.

def create_cbow_dataset(text):
    data = []
    for i in range(2, len(text) - 2): # 0번째는는 앞에에 두두 개개 없음음.
        context = [text[i - 2], text[i - 1],
                   text[i + 1], text[i + 2]]
        target = text[i] # 나머지는 context, 현재 t번째를 target으로 지정해서 데이터를 만들겠다.
        data.append((context, target))
    return data

'''
우리는 input은 현재 단어, Output은 4개.

보통 4개의 output으로 하지는 않고,
t - > t-2
t - > t-1
t - > t + 1
t - > t + 2
입력 값에 대한 context를 학습.
'''
# input : t
# Output : t-2, t-1, t+1, t+2

def create_skipgram_dataset(text):
    import random
    data = []
    for i in range(2, len(text) - 2):
        data.append((text[i], text[i-2], 1))
        data.append((text[i], text[i-1], 1))
        data.append((text[i], text[i+1], 1))
        data.append((text[i], text[i+2], 1))
        # negative sampling
        for _ in range(4):
            if random.random() < 0.5 or i >= len(text) - 3:
                rand_id = random.randint(0, i-1)
            else:
                rand_id = random.randint(i+3, len(text)-1)
            data.append((text[i], text[rand_id], 0))
    return data

cbow_train = create_cbow_dataset(text)
skipgram_train = create_skipgram_dataset(text)
print('cbow sample', cbow_train[0])
print('skipgram sample', skipgram_train[0])
print(skipgram_train[-1])


# ################################
# =>We, about, he
# [1,0,0] =>0
# [0,1,0] =>1
# [0,0,1] =>2


# W =[ [0.1, 0.2, 0.3],
#   [-0.4, 0.5, 0.6],
#   [0.01, 0.02, 0.03]]

# ([0,1,0], [1,0,0], 1)
# ([1,0],1)
# x = [1,0]=> model -> W
# ==>Wx ==> [-0.4, 0.5, 0.6], [0.1, 0.2, 0.3]

sentence ="We are about to study"

'''

window = 1

[We, are, 1]
[are, We, 1]
[are, about, 1]
[about, are, 1]
[about, to, 1]
[study, to, 1]

[We, study, 0]
[about, We, 0]

W1 [5,3]
W2 [5,3]


W1 = [[0.1, 0.2, 0.3],
      [0.4, 0.5, 0.6],
      [0.7, 0.8, 0.9],
      [...],
      [...]]
W2 = [[0.01, 0.02, 0.03],
      [],
      [],
      [],
      []]
We : 0
are: 1
about: 2
to : 3
study :4


[We, are, 1]

[1,0,0,0,0] @ W1 => [0.1, 0.2, 0.3]
[0] => nn.Embedding => [0.1,0.2,0.3]

[We, are, about, to, study]
[[1,0,0,0,0],
 [0,1,0,0,0],
 ..]

[1,1,1,1,1]
[0,1,2,3,4]




[[are, about],We, 1]

[0.4, 0.5, 0.6],
[0.7, 0.8, 0.9]

[0.14,0.33,0.25]

[0.01, 0.02, 0.03]


[We, are, 1]
We: [0.1, 0.2, 0.3]


[0.1, 0.2, 0.3] * W2
|V| => [0.6, 0.3, 0.4, 0.2, 0.1] => [0,1,0,0,0]

[we, about]
[w, e]
[ab, out, 1]

'''

cbow sample (['We', 'are', 'to', 'study'], 'about')
skipgram sample ('about', 'We', 1)
('with', 'rules', 0)


'\n\nwindow = 1\n\n[We, are, 1]\n[are, We, 1]\n[are, about, 1]\n[about, are, 1]\n[about, to, 1]\n[study, to, 1]\n\n[We, study, 0]\n[about, We, 0]\n\nW1 [5,3]\nW2 [5,3]\n\n\nW1 = [[0.1, 0.2, 0.3],\n      [0.4, 0.5, 0.6],\n      [0.7, 0.8, 0.9],\n      [...],\n      [...]]\nW2 = [[0.01, 0.02, 0.03],\n      [],\n      [],\n      [],\n      []]\nWe : 0\nare: 1\nabout: 2\nto : 3\nstudy :4\n\n\n[We, are, 1]\n\n[1,0,0,0,0] @ W1 => [0.1, 0.2, 0.3]\n[0] => nn.Embedding => [0.1,0.2,0.3]\n\n[We, are, about, to, study]\n[[1,0,0,0,0],\n [0,1,0,0,0],\n ..]\n\n[1,1,1,1,1]\n[0,1,2,3,4]\n\n\n\n\n[[are, about],We, 1]\n\n[0.4, 0.5, 0.6],\n[0.7, 0.8, 0.9]\n\n[0.14,0.33,0.25]\n\n[0.01, 0.02, 0.03]\n\n\n[We, are, 1]\nWe: [0.1, 0.2, 0.3]\n\n\n[0.1, 0.2, 0.3] * W2\n|V| => [0.6, 0.3, 0.4, 0.2, 0.1] => [0,1,0,0,0]\n\n[we, about]\n[w, e]\n[ab, out, 1]\n\n'

In [7]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embd_size, context_size, hidden_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)
        self.linear1 = nn.Linear(2*context_size*embd_size, hidden_size) # input이이 4개개 에에 대해해 hidden layer 지났다가가 감
        self.linear2 = nn.Linear(hidden_size, vocab_size) # (0, 0, 0, 0, 0, 1, 0, 0) # 6번째째 인덱스를를 갖는 단어다.

    def forward(self, inputs):
        embedded = self.embeddings(inputs).view((1, -1))
        hid = F.relu(self.linear1(embedded))
        out = self.linear2(hid)
        log_probs = F.log_softmax(out)
        return log_probs

class SkipGram(nn.Module):
    def __init__(self, vocab_size, embd_size):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)
        # [0]
        # [1,0,0,0,0] W


    def forward(self, focus, context):
        embed_focus = self.embeddings(focus).view((1, -1))
        embed_ctx = self.embeddings(context).view((1, -1))
        score = torch.mm(embed_focus, torch.t(embed_ctx))
        log_probs = F.logsigmoid(score)

        return log_probs

In [8]:
embd_size = 100
learning_rate = 0.001
n_epoch = 30

def train_cbow():
    hidden_size = 64
    losses = []
    # NLLLoss + softmax function = cross-entropy loss (https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html)
    # https://discuss.pytorch.org/t/difference-between-cross-entropy-loss-or-log-likelihood-loss/38816/2
    loss_fn = nn.NLLLoss()
    model = CBOW(vocab_size, embd_size, CONTEXT_SIZE, hidden_size)
    print(model)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(n_epoch):
        total_loss = .0
        for context, target in cbow_train:
            ctx_idxs = [w2i[w] for w in context]
            ctx_var = Variable(torch.LongTensor(ctx_idxs))

            model.zero_grad()
            log_probs = model(ctx_var)

            loss = loss_fn(log_probs, Variable(torch.LongTensor([w2i[target]])))

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        losses.append(total_loss)
    return model, losses

def train_skipgram():
    losses = []
    loss_fn = nn.MSELoss()
    model = SkipGram(vocab_size, embd_size)
    print(model)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(n_epoch):
        total_loss = .0
        for in_w, out_w, target in skipgram_train:
            in_w_var = Variable(torch.LongTensor([w2i[in_w]]))
            out_w_var = Variable(torch.LongTensor([w2i[out_w]]))

            model.zero_grad()
            log_probs = model(in_w_var, out_w_var)
            loss = loss_fn(log_probs[0], Variable(torch.Tensor([target])))

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        losses.append(total_loss)
    return model, losses

cbow_model, cbow_losses = train_cbow()
sg_model, sg_losses = train_skipgram()


CBOW(
  (embeddings): Embedding(49, 100)
  (linear1): Linear(in_features=400, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=49, bias=True)
)


  log_probs = F.log_softmax(out)


SkipGram(
  (embeddings): Embedding(49, 100)
)


In [9]:
# test
# You have to use other dataset for test, but in this case I use training data because this dataset is too small
def test_cbow(test_data, model):
    print('====Test CBOW===')
    correct_ct = 0
    for ctx, target in test_data:
        ctx_idxs = [w2i[w] for w in ctx]
        ctx_var = Variable(torch.LongTensor(ctx_idxs))

        model.zero_grad()
        log_probs = model(ctx_var)
        _, predicted = torch.max(log_probs.data, 1)
        predicted_word = i2w[predicted.item()] # predicted는 tensor임. 이것을을 어떠한한 값으로로 바꿔야함함. 그것이이 Item.
        print('predicted:', predicted_word)
        print('label    :', target)
        if predicted_word == target:
            correct_ct += 1

    print('Accuracy: {:.1f}% ({:d}/{:d})'.format(correct_ct/len(test_data)*100, correct_ct, len(test_data)))

def test_skipgram(test_data, model):
    print('====Test SkipGram===')
    correct_ct = 0
    for in_w, out_w, target in test_data:
        in_w_var = Variable(torch.LongTensor([w2i[in_w]]))
        out_w_var = Variable(torch.LongTensor([w2i[out_w]]))

        model.zero_grad()
        log_probs = model(in_w_var, out_w_var)
        _, predicted = torch.max(log_probs.data, 1)
        predicted = predicted[0]
        predicted_word = i2w[predicted.item()] # predicted는 tensor임. 이것을을 어떠한한 값으로로 바꿔야함함. 그것이이 Item.
        if predicted == target:
            correct_ct += 1

    print('Accuracy: {:.1f}% ({:d}/{:d})'.format(correct_ct/len(test_data)*100, correct_ct, len(test_data)))

test_cbow(cbow_train, cbow_model)
print('------')
test_skipgram(skipgram_train, sg_model)

====Test CBOW===
predicted: about
label    : about
predicted: to
label    : to
predicted: study
label    : study
predicted: the
label    : the
predicted: idea
label    : idea
predicted: of
label    : of
predicted: a
label    : a
predicted: computational
label    : computational
predicted: process.
label    : process.
predicted: Computational
label    : Computational
predicted: processes
label    : processes
predicted: are
label    : are
predicted: abstract
label    : abstract
predicted: beings
label    : beings
predicted: that
label    : that
predicted: inhabit
label    : inhabit
predicted: computers.
label    : computers.
predicted: As
label    : As
predicted: they
label    : they
predicted: evolve,
label    : evolve,
predicted: processes
label    : processes
predicted: manipulate
label    : manipulate
predicted: other
label    : other
predicted: abstract
label    : abstract
predicted: things
label    : things
predicted: called
label    : called
predicted: a
label    : data.
predicted

  log_probs = F.log_softmax(out)
