### 5.1 임베딩 배우는 이유

In [108]:
import torch
import torch.nn as nn
from tqdm import tqdm
from annoy import AnnoyIndex
import numpy as np

In [115]:
# 사전 훈련된 단어 임베딩
class PreTrainedEmbeddings(object) : # 임베딩 효율적으로 로드하고 처리
    def __init__(self, word_to_index, word_vectors) :
        '''
        word_to_index : dict, 단어에서 정수로 매핑
        word_vectors : numpy 배열의 리스트
        '''
        self.word_to_index = word_to_index
        self.word_vectors = word_vectors
        self.index_to_word = {v:k for k, v in self.word_to_index.items()}
        self.index = AnnoyIndex(len(word_vectors[0]), 'euclidean') # vector_size
        
        for _, i in self.word_to_index.items() :
            self.index.add_item(i, self.word_vectors[i])
        
        self.index.build(50)
        
    @classmethod
    def from_embeddings_file(cls, embedding_file) : #사전 훈련된 벡터파일에서 객체 생성
        word_to_index = {}
        word_vectors = []
        with open(embedding_file, 'rt', encoding='UTF8') as fp :
            for line in fp.readlines() :
                line = line.split(' ')
                word = line[0]
                vec = np.array([float(x) for x in line[1:]])
                
                word_to_index[word] = len(word_to_index)
                word_vectors.append(vec)
        
        return cls(word_to_index, word_vectors)
    
    # 단어 임베딩 사용한 유추 작업
    
    def get_embedding(self, word) :
        return self.word_vectors[self.word_to_index[word]]
    
    # 벡터 주어지면 n개의 최근접 이웃 반환
    def get_closest_to_vector(self, vector, n=1) :
        # vector는 annoy 인덱스에 있는 벡터의 크기와 같아야
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]
    
    # 단어 임베딩 사용한 유추 결과 출력
    def compute_and_print_analogy(self, w1, w2, w3) :
        # word1이 word2일 때, word3은 __word4__이다.
        v1 = self.get_embedding(w1)
        v2 = self.get_embedding(w2)
        v3 = self.get_embedding(w3)
        
        spatial_relationship = v2 - v1
        v4 = v3 + spatial_relationship
        
        closest_words = self.get_closest_to_vector(v4, n=4)
        existing_words = set([w1, w2, w3])
        closest_words = [word for word in closest_words
                        if word not in existing_words]
        
        if len(closest_words) == 0 :
            print('계산된 벡터와 가장 가까운 이웃 찾을 수 없다.')
            return
        
        for w4 in closest_words :
            print("{} : {} :: {} : {}".format(w1, w2, w3, w4))

In [116]:
# Glove 단어 임베딩 사용
embeddings = PreTrainedEmbeddings.from_embeddings_file('./Data/glove.6B.100d.txt')

In [117]:
embeddings.compute_and_print_analogy('man', 'he', 'woman')
print('-'*20)
embeddings.compute_and_print_analogy('fly', 'plane', 'sail')
print('-'*20)
embeddings.compute_and_print_analogy('cat', 'kitten', 'dog')
print('-'*20)
embeddings.compute_and_print_analogy('blue', 'color', 'dog')
print('-'*20)
embeddings.compute_and_print_analogy('leg', 'legs', 'hand')
print('-'*20)
embeddings.compute_and_print_analogy('toe', 'foot', 'finger')
print('-'*20)
embeddings.compute_and_print_analogy('talk', 'communicate', 'read')
print('-'*20)
embeddings.compute_and_print_analogy('blue', 'democrat', 'red')
print('-'*20)
embeddings.compute_and_print_analogy('man', 'king', 'woman')
print('-'*20)
embeddings.compute_and_print_analogy('man', 'doctor', 'woman')
print('-'*20)
embeddings.compute_and_print_analogy('fast', 'fastest', 'small')

man : he :: woman : she
man : he :: woman : her
man : he :: woman : having
--------------------
fly : plane :: sail : ship
fly : plane :: sail : vessel
--------------------
cat : kitten :: dog : puppy
cat : kitten :: dog : rottweiler
cat : kitten :: dog : puppies
cat : kitten :: dog : pooch
--------------------
blue : color :: dog : animal
blue : color :: dog : breed
--------------------
leg : legs :: hand : hands
leg : legs :: hand : stick
leg : legs :: hand : eyes
--------------------
toe : foot :: finger : turning
toe : foot :: finger : moving
toe : foot :: finger : attached
--------------------
talk : communicate :: read : interpret
talk : communicate :: read : memorize
talk : communicate :: read : typed
--------------------
blue : democrat :: red : republican
blue : democrat :: red : congressman
blue : democrat :: red : senator
--------------------
man : king :: woman : queen
man : king :: woman : throne
man : king :: woman : elizabeth
--------------------
man : doctor :: woman : 

### 5.2 예제 : CBOW 임베딩 학습하기
* 다중 분류 작업
* 단어 텍스트 스캔해 단어의 문맥 윈도를 만든 후 문맥 윈도에서 **중앙의 단어 제거**하고 문맥 윈도 사용해 **누락된 단어 예측**

In [None]:
# 프랑켄슈타인 데이터셋

In [37]:
import os

from argparse import Namespace
import collections
import nltk.data
import numpy as np
import pandas as pd
import re
import string
from tqdm import tqdm_notebook

from collections import Counter
import json
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [23]:
args = Namespace(
    raw_dataset_txt="./Data/frankenstein.txt",
    window_size=5,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="./Data/frankenstein_with_splits.csv",
    seed=1337
)

In [24]:
# 데이터 전처리
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

In [26]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
with open(args.raw_dataset_txt) as fp:
    book = fp.read()
sentences = tokenizer.tokenize(book)

print (len(sentences), "sentences")
print ("Sample:", sentences[100])

3427 sentences
Sample: No incidents have hitherto befallen us that would make a figure in a
letter.


In [30]:
cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]
cleaned_sentences[100]

'no incidents have hitherto befallen us that would make a figure in a letter . '

In [31]:
# 윈도 생성
MASK_TOKEN = "<MASK>"

flatten = lambda outer_list: [item for inner_list in outer_list for item in inner_list]
windows = flatten([list(nltk.ngrams([MASK_TOKEN] * args.window_size + sentence.split(' ') + \
    [MASK_TOKEN] * args.window_size, args.window_size * 2 + 1)) \
    for sentence in tqdm_notebook(cleaned_sentences)])

'''
만약, args.window_size = 2 이고, 문장이 hello world 이면,
<MASK> <MASK> hello world <MASK> <MASK>
윈도우 크기는 = 2 * 2 + 1 = 5

'''

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sentence in tqdm_notebook(cleaned_sentences)])


  0%|          | 0/3427 [00:00<?, ?it/s]

'\n만약, args.window_size = 2 이고, 문장이 hello world 이면,\n<MASK> <MASK> hello world <MASK> <MASK>\n윈도우 크기는 = 2 * 2 + 1 = 5\n\n'

In [32]:
# Create cbow data
data = []
for window in tqdm_notebook(windows):
    target_token = window[args.window_size]
    context = []
    for i, token in enumerate(window):
        if token == MASK_TOKEN or i == args.window_size:
            continue
        else:
            context.append(token)
    data.append([' '.join(token for token in context), target_token])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for window in tqdm_notebook(windows):


  0%|          | 0/90698 [00:00<?, ?it/s]

In [33]:
cbow_data = pd.DataFrame(data, columns=["context", "target"])
cbow_data

Unnamed: 0,context,target
0,", or the modern prometheus",frankenstein
1,frankenstein or the modern prometheus by,","
2,"frankenstein , the modern prometheus by mary",or
3,"frankenstein , or modern prometheus by mary wo...",the
4,"frankenstein , or the prometheus by mary wolls...",modern
...,...,...
90693,our email newsletter to hear new ebooks .,about
90694,email newsletter to hear about ebooks .,new
90695,newsletter to hear about new .,ebooks
90696,to hear about new ebooks,.


In [34]:
# Create split data
n = len(cbow_data)
def get_split(row_num):
    if row_num <= n*args.train_proportion:
        return 'train'
    elif (row_num > n*args.train_proportion) and (row_num <= n*args.train_proportion + n*args.val_proportion):
        return 'val'
    else:
        return 'test'
cbow_data['split']= cbow_data.apply(lambda row: get_split(row.name), axis=1)

In [35]:
cbow_data

Unnamed: 0,context,target,split
0,", or the modern prometheus",frankenstein,train
1,frankenstein or the modern prometheus by,",",train
2,"frankenstein , the modern prometheus by mary",or,train
3,"frankenstein , or modern prometheus by mary wo...",the,train
4,"frankenstein , or the prometheus by mary wolls...",modern,train
...,...,...,...
90693,our email newsletter to hear new ebooks .,about,test
90694,email newsletter to hear about ebooks .,new,test
90695,newsletter to hear about new .,ebooks,test
90696,to hear about new ebooks,.,test


In [36]:
#cbow_data.to_csv(args.output_munged_csv, index=False)

In [38]:
class Vocabulary(object) :

    def __init__(self, token_to_idx=None, mask_token="<MASK>", add_unk=True, unk_token="<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        self._mask_token = mask_token  # 모델 파라미터 업데이트하는데 사용하지 않는 위치
        
        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token) 
        
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx, 
                'add_unk': self._add_unk, 
                'unk_token': self._unk_token, 
                'mask_token': self._mask_token}

    
    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    
    def add_token(self, token):        
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
            
        
    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [39]:
class CBOWVectorizer(object) :
    def __init__(self, cbow_vocab):
        self.cbow_vocab = cbow_vocab
    
    # 문맥의 토큰 수가 최대 길이보다 작으면 0으로 채움 -> 패딩
    def vectorize(self, context, vector_length=-1):
        indices = [self.cbow_vocab.lookup_token(token) for token in context.split(' ')]
        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.cbow_vocab.mask_index

        return out_vector
    
    @classmethod
    def from_dataframe(cls, cbow_df):
        cbow_vocab = Vocabulary()
        for index, row in cbow_df.iterrows():
            for token in row.context.split(' '):
                cbow_vocab.add_token(token)
            cbow_vocab.add_token(row.target)
            
        return cls(cbow_vocab)

    @classmethod
    def from_serializable(cls, contents):
        cbow_vocab = \
            Vocabulary.from_serializable(contents['cbow_vocab'])
        return cls(cbow_vocab=cbow_vocab)

    def to_serializable(self):
        return {'cbow_vocab': self.cbow_vocab.to_serializable()}

In [40]:
class CBOWDataset(Dataset):
    def __init__(self, cbow_df, vectorizer):
        self.cbow_df = cbow_df
        self._vectorizer = vectorizer
        
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, cbow_df.context))
        
        self.train_df = self.cbow_df[self.cbow_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.cbow_df[self.cbow_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.cbow_df[self.cbow_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}
        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, cbow_csv):
        cbow_df = pd.read_csv(cbow_csv)
        train_cbow_df = cbow_df[cbow_df.split=='train']
        return cls(cbow_df, CBOWVectorizer.from_dataframe(train_cbow_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, cbow_csv, vectorizer_filepath):
        cbow_df = pd.read_csv(cbow_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(cbow_df, vectorizer)
    
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return CBOWVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer
        
    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]

        context_vector = \
            self._vectorizer.vectorize(row.context, self._max_seq_length)
        target_index = self._vectorizer.cbow_vocab.lookup_token(row.target)

        return {'x_data': context_vector,
                'y_target': target_index}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size
    
    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [41]:
# 모델
class CBOWClassifier(nn.Module): 
    def __init__(self, vocabulary_size, embedding_size, padding_idx=0):
        super(CBOWClassifier, self).__init__()
        
        # 문맥의 단어를 나타내는 인덱스를 각 단어의 벡터로 변환
        self.embedding =  nn.Embedding(num_embeddings=vocabulary_size, 
                                       embedding_dim=embedding_size,
                                       padding_idx=padding_idx)
        # 출력 : (batch_size, context_size, embedding_size)
        self.fc1 = nn.Linear(in_features=embedding_size,
                             out_features=vocabulary_size)

    def forward(self, x_in, apply_softmax=False):
        x_embedded_sum = F.dropout(self.embedding(x_in).sum(dim=1), 0.3)
        # dim=1 : context_size(각 문맥의 단어들)에서 임베딩 벡터 합침
        # 출력 (batch_size, embedding_size)
        y_out = self.fc1(x_embedded_sum)
        
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)
            
        return y_out

In [42]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    # 적어도 한 번 모델을 저장합니다
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # 성능이 향상되면 모델을 저장합니다
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # 손실이 나빠지면
        if loss_t >= train_state['early_stopping_best_val']:
            # 조기 종료 단계 업데이트
            train_state['early_stopping_step'] += 1
        # 손실이 감소하면
        else:
            # 최상의 모델 저장
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # 조기 종료 단계 재설정
            train_state['early_stopping_step'] = 0

        # 조기 종료 여부 확인
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [43]:
args = Namespace(
    # 날짜와 경로 정보
    cbow_csv="./Data/frankenstein_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model2.pth",
    save_dir="./",
    # 모델 하이퍼파라미터
    embedding_size=50,
    # 훈련 하이퍼파라미터
    seed=1337,
    num_epochs=100,
    learning_rate=0.0001,
    batch_size=32,
    early_stopping_criteria=5,
    # 실행 옵션
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
)

dataset = CBOWDataset.load_dataset_and_make_vectorizer(args.cbow_csv)
vectorizer = dataset.get_vectorizer()

In [44]:
classifier = CBOWClassifier(vocabulary_size=len(vectorizer.cbow_vocab), 
                            embedding_size=args.embedding_size)

In [46]:
classifier

CBOWClassifier(
  (embedding): Embedding(6138, 50, padding_idx=0)
  (fc1): Linear(in_features=50, out_features=6138, bias=True)
)

In [50]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)
train_state = make_train_state(args)

In [52]:
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index

    dataset.set_split('train')
    batch_generator = generate_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       #device=args.device
                                      )
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        optimizer.zero_grad()

        y_pred = classifier(x_in=batch_dict['x_data'])

        loss = loss_func(y_pred, batch_dict['y_target'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        loss.backward()

        optimizer.step()
        
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)


    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)

    
    # 검증 세트에 대한 순회
    dataset.set_split('val')
    batch_generator = generate_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       #device=args.device
                                      )
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    for batch_index, batch_dict in enumerate(batch_generator):

        y_pred =  classifier(x_in=batch_dict['x_data'])

        loss = loss_func(y_pred, batch_dict['y_target'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)

    train_state = update_train_state(args=args, model=classifier,
                                     train_state=train_state)

    scheduler.step(train_state['val_loss'][-1])

    if train_state['stop_early']:
        break

In [53]:
classifier.load_state_dict(torch.load(train_state['model_filename']))
loss_func = nn.CrossEntropyLoss()

In [56]:
dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   #device=args.device
                                  )
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # 출력을 계산합니다
    y_pred =  classifier(x_in=batch_dict['x_data'])
    
    # 손실을 계산합니다
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # 정확도를 계산합니다
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [57]:
print("테스트 손실: {};".format(train_state['test_loss']))
print("테스트 정확도: {}".format(train_state['test_acc']))

# 형편없는 정확도 -> 모델 단순 + 적은 데이터셋

테스트 손실: 8.225977621639476;
테스트 정확도: 12.124999999999996


In [59]:
# 임베딩 결과 출력
def pretty_print(results):
    for item in results:
        print ("...[%.2f] - %s"%(item[1], item[0]))

def get_closest(target_word, word_to_idx, embeddings, n=5):
    # 다른 모든 단어까지 거리를 계산합니다
    word_embedding = embeddings[word_to_idx[target_word.lower()]]
    distances = []
    for word, index in word_to_idx.items():
        if word == "<MASK>" or word == target_word:
            continue
        distances.append((word, torch.dist(word_embedding, embeddings[index])))
    
    results = sorted(distances, key=lambda x: x[1])[1:n+2]
    return results

In [60]:
target_words = ['frankenstein', 'monster', 'science', 'sickness', 'lonely', 'happy']

embeddings = classifier.embedding.weight.data
word_to_idx = vectorizer.cbow_vocab._token_to_idx

for target_word in target_words: 
    print(f"======={target_word}=======")
    if target_word not in word_to_idx:
        print("Not in vocabulary")
        continue
    pretty_print(get_closest(target_word, word_to_idx, embeddings, n=5))

...[6.98] - ceased
...[6.99] - situated
...[6.99] - periodically
...[7.05] - without
...[7.07] - divine
...[7.10] - seven
...[6.51] - irresistible
...[6.61] - dark
...[6.65] - variety
...[6.66] - spoken
...[6.82] - wise
...[6.92] - demonstrate
...[7.19] - should
...[7.41] - colours
...[7.58] - temptation
...[7.59] - warring
...[7.67] - hides
...[7.72] - provided
...[6.43] - torrents
...[6.55] - which
...[6.68] - evidently
...[6.74] - furtherance
...[6.75] - rejoiced
...[6.76] - shrieked
...[7.18] - omnipotent
...[7.24] - expedition
...[7.28] - gesture
...[7.28] - hides
...[7.29] - storm
...[7.30] - decreasing
...[6.88] - agitated
...[6.91] - hero
...[6.92] - comprehensive
...[6.94] - event
...[7.02] - conceited
...[7.03] - creaking


### 5.3 예제 :문서 분류에 사전 훈련된 임베딩을 사용한 전이 학습

In [61]:
args = Namespace(
    raw_dataset_csv="./Data/news.csv",
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="./Data/news_with_splits.csv",
    seed=1337
)

In [62]:
news = pd.read_csv(args.raw_dataset_csv, header=0)
news

Unnamed: 0,category,title
0,Business,Wall St. Bears Claw Back Into the Black (Reuters)
1,Business,Carlyle Looks Toward Commercial Aerospace (Reu...
2,Business,Oil and Economy Cloud Stocks' Outlook (Reuters)
3,Business,Iraq Halts Oil Exports from Main Southern Pipe...
4,Business,"Oil prices soar to all-time record, posing new..."
...,...,...
119995,World,Pakistan's Musharraf Says Won't Quit as Army C...
119996,Sports,Renteria signing a top-shelf deal
119997,Sports,Saban not going to Dolphins yet
119998,Sports,Today's NFL games


In [65]:
by_category = collections.defaultdict(list)
for _, row in news.iterrows():
    by_category[row.category].append(row.to_dict()) # category : title

In [76]:
for category in by_category:
        print ("{0}: {1}".format(category, len(by_category[category])))

Business: 30000
Sci/Tech: 30000
Sports: 30000
World: 30000
0: 0
1: 0
100: 0


In [79]:
# Create split data
final_list = []
np.random.seed(args.seed)
for _, item_list in by_category.items():
    np.random.shuffle(item_list)
    n = len(item_list)
    n_train = int(args.train_proportion*n)
    n_val = int(args.val_proportion*n)
    n_test = int(args.test_proportion*n)
    
    # Give data point a split attribute
    for item in item_list[:n_train] :
        item['split'] = 'train'
    for item in item_list[n_train:n_train+n_val] :
        item['split'] = 'val'
    for item in item_list[n_train+n_val:] :
        item['split'] = 'test'  
    
    # Add to final list
    final_list.extend(item_list)

In [80]:
final_news = pd.DataFrame(final_list)
final_news

Unnamed: 0,category,title,split
0,Business,"Jobs, tax cuts key issues for Bush",train
1,Business,Jarden Buying Mr. Coffee #39;s Maker,train
2,Business,Retail sales show festive fervour,train
3,Business,Intervoice's Customers Come Calling,train
4,Business,Boeing Expects Air Force Contract,train
...,...,...,...
119995,World,Genesis Space Capsule Crashes Into Desert,test
119996,World,U.S.: Too Early to Tell Iraq Unit's Fate,test
119997,World,AFGHAN OPIUM GROWING UP TWO THIRDS,test
119998,World,At least one Saudi policeman killed in clashes...,test


In [81]:
final_news.split.value_counts()

split
train    84000
val      18000
test     18000
Name: count, dtype: int64

In [82]:
# Preprocess the reviews
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
    
final_news.title = final_news.title.apply(preprocess_text)

final_news

Unnamed: 0,category,title,split
0,Business,"jobs , tax cuts key issues for bush",train
1,Business,jarden buying mr . coffee s maker,train
2,Business,retail sales show festive fervour,train
3,Business,intervoice s customers come calling,train
4,Business,boeing expects air force contract,train
...,...,...,...
119995,World,genesis space capsule crashes into desert,test
119996,World,u . s . too early to tell iraq unit s fate,test
119997,World,afghan opium growing up two thirds,test
119998,World,at least one saudi policeman killed in clashes...,test


In [83]:
#final_news.to_csv(args.output_munged_csv, index=False)

In [84]:
# Vocabulary 클래스 상속
# 단어를 정수 시퀀스에 매핑
class SequenceVocabulary(Vocabulary) :
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token  # 임베딩층의 마스킹 역할, 가변 길이의 시퀀스의 손실 계산
        self._unk_token = unk_token  # 드물게 등장하는 단어 학습
        self._begin_seq_token = begin_seq_token  # 시퀀스 경계에 관한 힌트
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

In [92]:
# Vectorizer
class NewsVectorizer(object) :
    def __init__(self, title_vocab, category_vocab) :
        self.title_vocab = title_vocab
        self.category_vocab = category_vocab
        
    def vectorize(self, title, vector_length = -1) :
        indices = [self.title_vocab.begin_seq_index]
        indices.extend(self.title_vocab.lookup_token(token)
                      for token in title.split(' '))
        indices.append(self.title_vocab.end_seq_index)
        
        if vector_length < 0 :
            vector_length = len(indices)
            
        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices) :] = self.title_vocab.mask_index
        
        return out_vector
    
    @classmethod
    def from_dataframe(cls, news_df, cutoff=25) :
        category_vocab = Vocabulary()
        for category in sorted(set(news_df.category)) :
            category_vocab.add_token(category)
            
        word_counts = Counter()
        for title in news_df.title :
            for token in title.split(' ') :
                if token not in string.punctuation :
                    word_counts[token] += 1
                    
        title_vocab = SequenceVocabulary()
        for word, word_count in word_counts.items() :
            if word_count >= cutoff :
                title_vocab.add_token(word)
                
        return cls(title_vocab, category_vocab)
    
    @classmethod
    def from_serializable(cls, contents) :
        title_vocab=SequenceVocabulary.from_serializable(contents['title_vocab'])
        category_vocab = Vocabulary.from_serializable(contents['category_vocab'])
        
        return cls(title_vocab=title_vocab, category_vocab=category_vocab)
    
    def to_serializable(self) :
        return {'title_vocab': self.title_vocab.to_serializable(),
                'category_vocab': self.category_vocab.to_serializable()}

In [86]:
# Dataset
class NewsDataset(Dataset) :
    def __init__(self, news_df, vectorizer) :
        self.news_df = news_df
        self._vectorizer = vectorizer
        
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, news_df.title)) + 2
        # begin_seq만 사용하면 +1, end_seq까지 같이 사용하면 +2
        
        self.train_df = self.news_df[self.news_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.news_df[self.news_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.news_df[self.news_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')
        
        # 클래스 가중치
        class_counts = news_df.category.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.category_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv):
        news_df = pd.read_csv(news_csv)
        train_news_df = news_df[news_df.split=='train']
        return cls(news_df, NewsVectorizer.from_dataframe(train_news_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath):
        news_df = pd.read_csv(news_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(news_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return NameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]

        title_vector = \
            self._vectorizer.vectorize(row.title, self._max_seq_length)

        category_index = \
            self._vectorizer.category_vocab.lookup_token(row.category)

        return {'x_data': title_vector,
                'y_target': category_index}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [87]:
# 모델
class NewsClassifier(nn.Module):
    def __init__(self, embedding_size, num_embeddings, num_channels, 
                 hidden_dim, num_classes, dropout_p, 
                 pretrained_embeddings=None, padding_idx=0) :
        super(NewsClassifier, self).__init__()
        
        if pretrained_embeddings is None :
            self.emb = nn.Embedding(embedding_dim = embedding_size,
                                   num_embeddings = num_embeddings,
                                   padding_idx = padding_idx)
            
        else :
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim = embedding_size,
                                   num_embeddings = num_embeddings,
                                   padding_idx = padding_idx,
                                   _weight = pretrained_embeddings)
        
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=embedding_size, 
                   out_channels=num_channels, kernel_size=3),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=3),
            nn.ELU()
        )

        self._dropout_p = dropout_p
        self.fc1 = nn.Linear(num_channels, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        
    
    def forward(self, x_in, apply_softmax=False) :
        x_embedded = self.emb(x_in).permute(0, 2, 1) # 특성과 채널 차원 변경
        # (batch_size, sequence_length, embedding_size) -> (batch_size, emb_size, sequence_length)
        # Conv1D레이어는 채널이 두 번째 차원에 있어야 함
        features = self.convnet(x_embedded)
        
        # 평균 값 계산해 부가적인 차원 제거
        remaining_size = features.size(dim=2) # features의 세번째 차원 sequence_length 크기 반환
        features = F.avg_pool1d(features, remaining_size).squeeze(dim=2) # 평균 풀링의 커널 크기 = remaining_size
        # 평균 풀링 후 출력 (batch_size, num_channels, 1) -> squeeze(dim=2)
        # -> (batch_size, num_channels)
        features = F.dropout(features, p=self._dropout_p)
        
        # MLP 분류기
        intermediate_vector = F.relu(F.dropout(self.fc1(features), p=self._dropout_p))
        prediction_vector = self.fc2(intermediate_vector)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector

In [99]:
# 어휘 사전에 기반하여 단어 임베딩 부분 집합 선택
def load_glove_from_file(glove_filepath) :
    word_to_index = {}
    embeddings = []
    with open(glove_filepath, 'rt', encoding='UTF8') as fp :
        for index, line in enumerate(fp) :
            line = line.split(' ')
            word_to_index[line[0]] = index # word = line[0] # 첫번째 요소는 단어
            embedding_i = np.array([float(val) for val in line[1:]]) # 나머지 요소는 임베딩 벡터
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

# 특정 단어 집합에 대한 임베딩 행렬 생성
def make_embedding_matrix(glove_filepath, words):
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size = glove_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = glove_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            # 데이터셋에는 있지만 사전 훈련된 GloVe 임베딩에 없는 단어 등장하면 문제 발생
            # Xavier 균등 분포로 초기화
            final_embeddings[i, :] = embedding_i

    return final_embeddings

In [89]:
args = Namespace(
    # 날짜와 경로 정보
    news_csv="./Data/news_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model3.pth",
    save_dir="./",
    # 모델 하이퍼파라미터
    glove_filepath='./Data/glove.6B.100d.txt', 
    use_glove=False,
    embedding_size=100, 
    hidden_dim=100, 
    num_channels=100, 
    # 훈련 하이퍼파라미터
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0.1, 
    batch_size=128, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    # 실행 옵션
    cuda=True, 
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
) 

In [90]:
args.use_glove = True

In [93]:
dataset = NewsDataset.load_dataset_and_make_vectorizer(args.news_csv)
vectorizer = dataset.get_vectorizer()

In [100]:
# GloVe를 사용하거나 랜덤하게 임베딩을 초기화합니다
if args.use_glove:
    words = vectorizer.title_vocab._token_to_idx.keys()
    embeddings = make_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("사전 훈련된 임베딩을 사용합니다")
else:
    print("사전 훈련된 임베딩을 사용하지 않습니다")
    embeddings = None

classifier = NewsClassifier(embedding_size=args.embedding_size, 
                            num_embeddings=len(vectorizer.title_vocab),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim, 
                            num_classes=len(vectorizer.category_vocab), 
                            dropout_p=args.dropout_p,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)

사전 훈련된 임베딩을 사용합니다


In [101]:
loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

In [105]:
for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # 훈련 세트에 대한 순회
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           #device=args.device
                                          )
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            
            optimizer.zero_grad()

            # 단계 2. 출력을 계산합니다
            y_pred = classifier(batch_dict['x_data'])

            # 단계 3. 손실을 계산합니다
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # 단계 4. 손실을 사용해 그레이디언트를 계산합니다
            loss.backward()

            # 단계 5. 옵티마이저로 가중치를 업데이트합니다
            optimizer.step()
            
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # 검증 세트에 대한 순회

        # 검증 세트와 배치 제너레이터 준비, 손실과 정확도를 0으로 설정
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           #device=args.device
                                          )
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred =  classifier(batch_dict['x_data'])

            # 단계 2. 손실을 계산합니다
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # 단계 3. 정확도를 계산합니다
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            
        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

RuntimeError: weight tensor should be defined either for all 6 classes or no classes but got weight tensor of shape: [4]

* cuda 설치 못함 이슈로 실행 불가 ..

In [None]:
classifier.load_state_dict(torch.load(train_state['model_filename']))

#classifier = classifier.to(args.device)
#dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   #device=args.device
                                  )
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # 출력을 계산합니다
    y_pred =  classifier(batch_dict['x_data'])
    
    # 손실을 계산합니다
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # 정확도를 계산합니다
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [None]:
print("테스트 손실: {};".format(train_state['test_loss']))
print("테스트 정확도: {}".format(train_state['test_acc']))

#테스트 손실: 0.6171224400401114
#테스트 정확도: 79.64843749999999

In [106]:
# 새로운 뉴스 제목의 카테고리 예측하기
def predict_category(title, classifier, vectorizer, max_length) :
    title = preprocess_text(title)
    vectorized_title = \
        torch.tensor(vectorizer.vectorize(title, vector_length=max_length))
    result = classifier(vectorized_title.unsqueeze(0), apply_softmax=True)
    probability_values, indices = result.max(dim=1)
    predicted_category = vectorizer.category_vocab.lookup_index(indices.item())

    return {'category': predicted_category, 
            'probability': probability_values.item()}

In [107]:
def get_samples():
    samples = {}
    for cat in dataset.val_df.category.unique():
        samples[cat] = dataset.val_df.title[dataset.val_df.category==cat].tolist()[:5]
    return samples

val_samples = get_samples()

In [None]:
classifier = classifier.to("cpu")

for truth, sample_group in val_samples.items():
    print(f"True Category: {truth}")
    print("="*30)
    for sample in sample_group:
        prediction = predict_category(sample, classifier, 
                                      vectorizer, dataset._max_seq_length + 1)
        print("예측: {} (p={:0.2f})".format(prediction['category'],
                                                  prediction['probability']))
        print("\t + 샘플: {}".format(sample))
    print("-"*30 + "\n")