# CBOW 임베딩

In [42]:
import os

import numpy as np
import pandas as pd
import string

from argparse import Namespace
import collections

import nltk.data
import re
from tqdm import tqdm_notebook

In [43]:
# 어휘 사전 다운로드 받는 과정
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [44]:
args = Namespace(
    raw_dataset_txt = "frankenstein.txt",
    window_size = 5,
    train_proportion = 0.7,
    val_proportion = 0.15,
    test_proportion = 0.15,
    output_munged_csv = "frankenstein_with_splits.csv",
    seed = 1337
)

In [45]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
with open(args.raw_dataset_txt) as fp:
    book = fp.read()
sentences = tokenizer.tokenize(book)

In [46]:
print(len(sentences), "sentences")

3427 sentences


In [47]:
# 샘플로 100번째 인덱스 확인
print("Sample:", sentences[100] )

Sample: No incidents have hitherto befallen us that would make a figure in a
letter.


In [48]:
list = [word.lower() for word in sentences[100].split(" ")]
print(list)

['no', 'incidents', 'have', 'hitherto', 'befallen', 'us', 'that', 'would', 'make', 'a', 'figure', 'in', 'a\nletter.']


In [49]:
text = ' '.join(word.lower() for word in sentences[100].split(" "))
print(text)

no incidents have hitherto befallen us that would make a figure in a
letter.


In [50]:
# sentences 를 정규표현식을 사용해서 전처리
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

In [51]:
# 6번째와 다른 것은 마지막 letter전에 엔터 들어간것을 preprocess를 통해 지움
cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]
print(cleaned_sentences[100])

no incidents have hitherto befallen us that would make a figure in a letter . 


In [52]:
# Global vars
MASK_TOKEN = "<MASK>"

In [53]:
# windows 만들기(기존에 크기 5로 잡아놓음)
flatten = lambda outer_list: [item for inner_list in outer_list for item in inner_list]
windows = flatten([(nltk.ngrams([MASK_TOKEN] * args.window_size + sentence.split(' ') + \
    [MASK_TOKEN] * args.window_size, args.window_size * 2 + 1)) \
    for sentence in tqdm_notebook(cleaned_sentences)])


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sentence in tqdm_notebook(cleaned_sentences)])


  0%|          | 0/3427 [00:00<?, ?it/s]

In [54]:
# cbow 데이터 만들기
data = []
for window in tqdm_notebook(windows):
    target_token = window[args.window_size]
    context = []
    for i, token in enumerate(window):
        if token == MASK_TOKEN or i == args.window_size:
            continue
        else:
            context.append(token)
    data.append([' '.join(token for token in context), target_token])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for window in tqdm_notebook(windows):


  0%|          | 0/90698 [00:00<?, ?it/s]

In [55]:
# 사전에 만든 데이터를 판다스 dataframe 넣어서 처리
cbow_data = pd.DataFrame(data, columns=["context", "target"])

In [56]:
cbow_data

Unnamed: 0,context,target
0,", or the modern prometheus",frankenstein
1,frankenstein or the modern prometheus by,","
2,"frankenstein , the modern prometheus by mary",or
3,"frankenstein , or modern prometheus by mary wo...",the
4,"frankenstein , or the prometheus by mary wolls...",modern
...,...,...
90693,our email newsletter to hear new ebooks .,about
90694,email newsletter to hear about ebooks .,new
90695,newsletter to hear about new .,ebooks
90696,to hear about new ebooks,.


In [57]:
cbow_data.tail()

Unnamed: 0,context,target
90693,our email newsletter to hear new ebooks .,about
90694,email newsletter to hear about ebooks .,new
90695,newsletter to hear about new .,ebooks
90696,to hear about new ebooks,.
90697,hear about new ebooks .,


In [58]:
# split data 만들기
n = len(cbow_data)

In [59]:
def get_split(row_num):
    if row_num <= n*args.train_proportion:
        return 'train'
    elif (row_num > n*args.train_proportion) and (row_num <= n*args.train_proportion + n*args.val_proportion):
        return 'val'
    else:
        return 'test'

In [60]:
cbow_data['split']= cbow_data.apply(lambda row: get_split(row.name), axis=1)

In [61]:
cbow_data.tail()

Unnamed: 0,context,target,split
90693,our email newsletter to hear new ebooks .,about,test
90694,email newsletter to hear about ebooks .,new,test
90695,newsletter to hear about new .,ebooks,test
90696,to hear about new ebooks,.,test
90697,hear about new ebooks .,,test


In [62]:
# 처리 내용 csv파일로 저장
cbow_data.to_csv(args.output_munged_csv, index=False)

# 데이터 벡터 변환 클래스

In [63]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm

In [64]:
class Vocabulary(object):

    def __init__(self, token_to_idx=None, mask_token="<MASK>", add_unk=True, unk_token="<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        self._mask_token = mask_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token,
                'mask_token': self._mask_token}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [65]:
class CBOWVectorizer(object):
    def __init__(self, cbow_vocab):
        self.cbow_vocab = cbow_vocab

    def vectorize(self, context, vector_length=-1):

        indices = [self.cbow_vocab.lookup_token(token) for token in context.split(' ')]
        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.cbow_vocab.mask_index

        return out_vector

    @classmethod
    def from_dataframe(cls, cbow_df):

        cbow_vocab = Vocabulary()
        for index, row in cbow_df.iterrows():
            for token in row.context.split(' '):
                cbow_vocab.add_token(token)
            cbow_vocab.add_token(row.target)

        return cls(cbow_vocab)

    @classmethod
    def from_serializable(cls, contents):
        cbow_vocab = \
            Vocabulary.from_serializable(contents['cbow_vocab'])
        return cls(cbow_vocab=cbow_vocab)

    def to_serializable(self):
        return {'cbow_vocab': self.cbow_vocab.to_serializable()}

In [66]:
class CBOWDataset(Dataset):
    def __init__(self, cbow_df, vectorizer):

        self.cbow_df = cbow_df
        self._vectorizer = vectorizer

        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, cbow_df.context))

        self.train_df = self.cbow_df[self.cbow_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.cbow_df[self.cbow_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.cbow_df[self.cbow_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, cbow_csv):

        cbow_df = pd.read_csv(cbow_csv)
        train_cbow_df = cbow_df[cbow_df.split=='train']
        return cls(cbow_df, CBOWVectorizer.from_dataframe(train_cbow_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, cbow_csv, vectorizer_filepath):

        cbow_df = pd.read_csv(cbow_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(cbow_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):

        with open(vectorizer_filepath) as fp:
            return CBOWVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):

        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):

        row = self._target_df.iloc[index]

        context_vector = \
            self._vectorizer.vectorize(row.context, self._max_seq_length)
        target_index = self._vectorizer.cbow_vocab.lookup_token(row.target)

        return {'x_data': context_vector,
                'y_target': target_index}

    def get_num_batches(self, batch_size):

        return len(self) // batch_size


In [67]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):

    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [68]:
class CBOWClassifier(nn.Module):
    def __init__(self, vocabulary_size, embedding_size, padding_idx=0):

        super(CBOWClassifier, self).__init__()

        self.embedding =  nn.Embedding(num_embeddings=vocabulary_size,
                                       embedding_dim=embedding_size,
                                       padding_idx=padding_idx)
        self.fc1 = nn.Linear(in_features=embedding_size,
                             out_features=vocabulary_size)

    def forward(self, x_in, apply_softmax=False):

        x_embedded_sum = F.dropout(self.embedding(x_in).sum(dim=1), 0.3)
        y_out = self.fc1(x_embedded_sum)

        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)

        return y_out

In [69]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

In [70]:
def update_train_state(args, model, train_state):


    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False


    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]


        if loss_t >= train_state['early_stopping_best_val']:

            train_state['early_stopping_step'] += 1

        else:

            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])


            train_state['early_stopping_step'] = 0


        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

In [71]:
def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [72]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

In [73]:
args = Namespace(

    cbow_csv="frankenstein_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/cbow",

    embedding_size=50,

    seed=1337,
    num_epochs=100,
    learning_rate=0.0001,
    batch_size=32,
    early_stopping_criteria=5,

    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
)

In [74]:
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)

    print("파일 경로: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

파일 경로: 
	model_storage/cbow/vectorizer.json
	model_storage/cbow/model.pth


In [75]:

if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")

print("CUDA 사용여부: {}".format(args.cuda))

set_seed_everywhere(args.seed, args.cuda)

handle_dirs(args.save_dir)

CUDA 사용여부: True


In [76]:
!mkdir data
!wget https://git.io/JtX5A -O data/download.py
!wget https://git.io/JtX5F -O data/get-all-data.sh
!chmod 755 data/get-all-data.sh
%cd data
!./get-all-data.sh
%cd ..

mkdir: cannot create directory ‘data’: File exists
--2024-10-10 06:38:44--  https://git.io/JtX5A
Resolving git.io (git.io)... 140.82.114.21
Connecting to git.io (git.io)|140.82.114.21|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://raw.githubusercontent.com/rickiepark/nlp-with-pytorch/main/chapter_5/5_2_CBOW/data/download.py [following]
--2024-10-10 06:38:45--  https://raw.githubusercontent.com/rickiepark/nlp-with-pytorch/main/chapter_5/5_2_CBOW/data/download.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1568 (1.5K) [text/plain]
Saving to: ‘data/download.py’


2024-10-10 06:38:46 (25.5 MB/s) - ‘data/download.py’ saved [1568/1568]

--2024-10-10 06:38:46--  https://git.io/JtX5F
Resolving git.io (git.io)..

In [77]:
if args.reload_from_files:
    print("데이터셋과 Vectorizer를 로드합니다")
    dataset = CBOWDataset.load_dataset_and_load_vectorizer(args.cbow_csv,
                                                           args.vectorizer_file)
else:
    print("데이터셋을 로드하고 Vectorizer를 만듭니다")
    dataset = CBOWDataset.load_dataset_and_make_vectorizer(args.cbow_csv)
    dataset.save_vectorizer(args.vectorizer_file)

vectorizer = dataset.get_vectorizer()

classifier = CBOWClassifier(vocabulary_size=len(vectorizer.cbow_vocab),
                            embedding_size=args.embedding_size)

데이터셋을 로드하고 Vectorizer를 만듭니다


In [78]:
classifier = classifier.to(args.device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)
train_state = make_train_state(args)

epoch_bar = tqdm.notebook.tqdm(desc='training routine',
                               total=args.num_epochs,
                               position=0)

dataset.set_split('train')
train_bar = tqdm.notebook.tqdm(desc='split=train',
                               total=dataset.get_num_batches(args.batch_size),
                               position=1,
                               leave=True)
dataset.set_split('val')
val_bar = tqdm.notebook.tqdm(desc='split=val',
                             total=dataset.get_num_batches(args.batch_size),
                             position=1,
                             leave=True)
try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        dataset.set_split('train')
        batch_generator = generate_batches(dataset,
                                           batch_size=args.batch_size,
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):

            optimizer.zero_grad()

            y_pred = classifier(x_in=batch_dict['x_data'])

            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            loss.backward()

            optimizer.step()

            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            train_bar.set_postfix(loss=running_loss, acc=running_acc,
                            epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)


        dataset.set_split('val')
        batch_generator = generate_batches(dataset,
                                           batch_size=args.batch_size,
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            y_pred =  classifier(x_in=batch_dict['x_data'])

            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc,
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/1984 [00:00<?, ?it/s]

split=val:   0%|          | 0/425 [00:00<?, ?it/s]

In [79]:
classifier.load_state_dict(torch.load(train_state['model_filename']))
classifier = classifier.to(args.device)
loss_func = nn.CrossEntropyLoss()

dataset.set_split('test')
batch_generator = generate_batches(dataset,
                                   batch_size=args.batch_size,
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    y_pred =  classifier(x_in=batch_dict['x_data'])

    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

  classifier.load_state_dict(torch.load(train_state['model_filename']))


In [80]:
print("테스트 손실: {};".format(train_state['test_loss']))
print("테스트 정확도: {}".format(train_state['test_acc']))

테스트 손실: 8.131600918489342;
테스트 정확도: 11.551470588235299


In [81]:
def pretty_print(results):
    for item in results:
        print ("...[%.2f] - %s"%(item[1], item[0]))

def get_closest(target_word, word_to_idx, embeddings, n=5):

    word_embedding = embeddings[word_to_idx[target_word.lower()]]
    distances = []
    for word, index in word_to_idx.items():
        if word == "<MASK>" or word == target_word:
            continue
        distances.append((word, torch.dist(word_embedding, embeddings[index])))

    results = sorted(distances, key=lambda x: x[1])[1:n+2]
    return results

In [82]:
word = input('단어를 입력해 주세요: ')
embeddings = classifier.embedding.weight.data
word_to_idx = vectorizer.cbow_vocab._token_to_idx
pretty_print(get_closest(word, word_to_idx, embeddings, n=5))

단어를 입력해 주세요: frankenstein
...[6.93] - discrimination
...[6.99] - slight
...[7.02] - oppressive
...[7.05] - spurned
...[7.11] - illustrate
...[7.11] - wandering


In [83]:
target_words = ['frankenstein', 'monster', 'science', 'sickness', 'lonely', 'happy']

embeddings = classifier.embedding.weight.data
word_to_idx = vectorizer.cbow_vocab._token_to_idx

for target_word in target_words:
    print(f"======={target_word}=======")
    if target_word not in word_to_idx:
        print("Not in vocabulary")
        continue
    pretty_print(get_closest(target_word, word_to_idx, embeddings, n=5))

...[6.93] - discrimination
...[6.99] - slight
...[7.02] - oppressive
...[7.05] - spurned
...[7.11] - illustrate
...[7.11] - wandering
...[7.32] - cares
...[7.58] - griefs
...[7.63] - sickness
...[7.66] - trifling
...[7.69] - saw
...[7.70] - prolong
...[6.85] - mutual
...[6.93] - mist
...[6.95] - swelling
...[7.01] - impression
...[7.06] - darkened
...[7.06] - nearly
...[6.37] - while
...[6.45] - foundations
...[6.61] - awoke
...[6.65] - consoles
...[6.70] - literally
...[6.74] - depend
...[6.74] - unveiled
...[6.88] - moonlight
...[7.05] - ought
...[7.08] - bed
...[7.14] - superhuman
...[7.14] - therefore
...[6.25] - bottom
...[6.42] - injury
...[6.49] - chivalry
...[6.50] - altered
...[6.51] - penetrated
...[6.54] - danger


In [84]:
v = ["duct", "tape", "work", "anywhere", "magic", "worship"]
print(len(v))
print(v[0])

6
duct


In [85]:
np.random.seed(42)
input_array_tape=np.array([0,1,0,0,0,0]) #"tape"
input_weight_matrix = np.random.random_sample((6,3))
print(input_weight_matrix)

[[0.37454012 0.95071431 0.73199394]
 [0.59865848 0.15601864 0.15599452]
 [0.05808361 0.86617615 0.60111501]
 [0.70807258 0.02058449 0.96990985]
 [0.83244264 0.21233911 0.18182497]
 [0.18340451 0.30424224 0.52475643]]


In [86]:
projection = np.dot(input_array_tape,input_weight_matrix)
print(projection)

[0.59865848 0.15601864 0.15599452]


In [87]:
output_weight_matrix = np.random.random_sample((3,6))
print(output_weight_matrix)

[[0.43194502 0.29122914 0.61185289 0.13949386 0.29214465 0.36636184]
 [0.45606998 0.78517596 0.19967378 0.51423444 0.59241457 0.04645041]
 [0.60754485 0.17052412 0.06505159 0.94888554 0.96563203 0.80839735]]


In [88]:
output_array_for_input_tape_and_orange_output_context = np.dot(projection, output_weight_matrix)
print(output_array_for_input_tape_and_orange_output_context)

[0.42451664 0.32344971 0.40759145 0.31176029 0.41795589 0.35267831]


In [98]:
import builtins
print(builtins.list(zip(v, output_array_for_input_tape_and_orange_output_context)))

[('duct', 0.42451663675598933), ('tape', 0.32344971050993737), ('work', 0.4075914505752598), ('anywhere', 0.3117602853605092), ('magic', 0.41795589389125587), ('worship', 0.35267831257488347)]


#### 가장 높은 추론값을 가지는 것
- duct, 0.42451663675598933
- magic, 0.41795589389125587
- work, 0.4075914505752598
- worship, 0.35267831257488347