In [53]:
import torch.nn as nn
import torch
import torch.nn.utils.rnn as rnn
import statistics
import nltk # 없으시면 설치하세요: pip install nltk
import random
import collections
import time

In [54]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [55]:
## Dictionary class 선언
class Dictionary(object):
    def __init__(self, dataset, size):
        ## init vocab ##
        ##<StartOfSentence>
        
        self.word2idx = {'<pad>':0, '<sos>': 1, '<eos>': 2, '<unk>': 3} # 사전 : 사전에 등록되지 않은 단어는 다 unk
        self.idx2word = ['<pad>', '<sos>', '<eos>', '<unk>'] # inverted dictionary 역사전 인덱스를 주면 단어가 나옴
        # self.word2idx['<unk>'] --> 3
        # self.idx2word[3] = '<unk>'
        
        self.build_dict(dataset, size)
    
    def __call__(self, word):
        return self.word2idx.get(word, self.word2idx['<unk>']) # if word does not exist in vocab then return unk idx

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def build_dict(self, dataset, dict_size):
        ## Practice ##
        """Tokenize a text file."""
        total_words = (word for sent in dataset for word in sent) # store all words into tuple
        word_freq = collections.Counter(total_words)# count the number of each word: ex) ('The': 10000, 'a': 5555, ...)
                                                        #정렬 조건 (횟수,순서)
        vocab = sorted(word_freq.keys(), key=lambda word: (word_freq[word], word), reverse=True) # sort by frequency 내림차순 별로 않나온 것은 학습 제외
        vocab = vocab[:dict_size] # truncate 사전 크기로 자름
        
        for word in vocab:
            self.add_word(word)

    def __len__(self):
        return len(self.idx2word)


In [56]:
## Brown dataset Preprocessing (NLTK)
def brown_dataset(min=5, max=30):
    nltk.download('brown')

    # get sentences with the length between min and max
    # convert all words into lower-case
    # Preprocessing : 다 소문자로 처리, 리스트
    all_seq = [[token.lower() for token in seq] for seq in nltk.corpus.brown.sents() 
               if min <= len(seq) <= max]

    random.shuffle(all_seq) # shuffle
    return all_seq

In [57]:
## Download Brown dataset
dataset = brown_dataset()
print(len(dataset))
## print some part
print(dataset[0])
print(dataset[1])
print(dataset[2])

[nltk_data] Downloading package brown to /home/piai/nltk_data...
[nltk_data]   Package brown is already up-to-date!


43450
['by', 'god', ',', "they're", 'a-coming', ',', 'they', 'are', "''", '!', '!']
['plans', 'are', 'on', 'pages', '22', 'and', '23', '.']
['just', 'one', 'or', 'two', 'swallows', ',', 'he', 'told', 'himself', ',', 'enough', 'to', 'lessen', 'some', 'of', 'the', 'pain', '.']


In [58]:
#입력으 배치사이즈 만큼자르고
#패딩, 심볼 추가
#이런걸 다 해주는게 Corpus

## Data handler class 선언
class Corpus(object):
    def __init__(self, dataset, device, dict_size=20000, train_ratio=0.97):
        train_size = int(len(dataset) * train_ratio) 
        self.device = device
        self.dictionary = Dictionary(dataset, dict_size)#######################Corpus낸 Dict
        self.train = dataset[:train_size] # [0 ~ train_size] 학습데이터
        self.valid = dataset[train_size:] # [train_size: len(dataset)] #테스트데이터

    def indexing(self, dat): #하나 이상의 문장이 input으로 들어왔을 때, 심볼 넣어는 것. 
        # dat = list(list) //[[],[]] 배치사이즈별로 리스트 내 리스트 형식으로 구성
        src_idxes = [] # 모델 입력
        tgt_idxes = [] # 모델 정답
        for sent in dat:
            #Add <sos>
            src_idx = [self.dictionary('<sos>')] + [self.dictionary(word) for word in sent] #<sos>의 인덱스가 반환됨
            
            #Add <eos>
            tgt_idx = [self.dictionary(word) for word in sent] + [self.dictionary('<eos>')] 
            
            #Batch를 위해
            src_idxes.append(torch.tensor(src_idx).type(torch.int64))
            tgt_idxes.append(torch.tensor(tgt_idx).type(torch.int64))
    
        
        # shape = [B, L]
        #Add <apd>
        src_idxes = rnn.pad_sequence(src_idxes, batch_first=True).to(self.device) #nn라이브러리 자도 패딩
        
        # flatten shape = [B * L]
        tgt_idxes = rnn.pad_sequence(tgt_idxes, batch_first=True).to(self.device).view(-1) #target을 flatten [batch *length]

        return src_idxes, tgt_idxes

    def batch_iter(self, batch_size, isTrain=True):#배치사이즈에 받게 이차원 텐서로 쪼갠다음 인덱싱을 해줌
        dat = self.train if isTrain else self.valid
        if isTrain:
            random.shuffle(dat)

        for i in range(len(dat) // batch_size): #BATCH크기 만큼 자름
            batch = dat[i * batch_size: (i+1) * batch_size]
            src, tgt = self.indexing(batch)#인덱싱에 넣음
            yield {'src': src, 'tgt': tgt} #-> 인덱스 형태의 텐서가 반환됨

In [59]:
corpus = Corpus(dataset, device)

In [76]:
# Dictionary 확인
for i, (key, val) in enumerate(corpus.dictionary.word2idx.items()):
    print('word:  {:10s} | index: {:5d} '.format(key, val))
    if i == 20:
        break

word:  <pad>      | index:     0 
word:  <sos>      | index:     1 
word:  <eos>      | index:     2 
word:  <unk>      | index:     3 
word:  the        | index:     4 
word:  .          | index:     5 
word:  ,          | index:     6 
word:  of         | index:     7 
word:  and        | index:     8 
word:  to         | index:     9 
word:  a          | index:    10 
word:  in         | index:    11 
word:  was        | index:    12 
word:  he         | index:    13 
word:  is         | index:    14 
word:  ''         | index:    15 
word:  ``         | index:    16 
word:  it         | index:    17 
word:  that       | index:    18 
word:  for        | index:    19 
word:  ;          | index:    20 


### ㄴ관사, 조사가 빈번하게 나옴

In [61]:
## indexing 함수 결과 확인

# case : 단일 문장 입력 시. 
sent = [dataset[1]]#리스트 안에 리스트로 입력
idx_src, idx_tgt = corpus.indexing(sent)


print(sent)
print(idx_src) # <SOS> index로 시작
print(idx_tgt) # <EOS> index로 종료

print('-' * 90)
## case : 복수 문장 입력 시 (batching)
batch = [dataset[0], dataset[1]]
idx_src, idx_tgt = corpus.indexing(batch)

print(batch)
print(idx_src) # 가장 길이가 긴 문장 (dataset[0]) 보다 짧은 문장 (dataset[1]) 의 경우 남는 길이만큼 padding=0 삽입 확인.
print(idx_tgt)

#결과가 shift되서 그대로 오고 끝에 <eos>붙어서 나타남

[['plans', 'are', 'on', 'pages', '22', 'and', '23', '.']]
tensor([[   1,  989,   31,   24, 3750, 2635,    8, 2735,    5]],
       device='cuda:0')
tensor([ 989,   31,   24, 3750, 2635,    8, 2735,    5,    2], device='cuda:0')
------------------------------------------------------------------------------------------
[['by', 'god', ',', "they're", 'a-coming', ',', 'they', 'are', "''", '!', '!'], ['plans', 'are', 'on', 'pages', '22', 'and', '23', '.']]
tensor([[   1,   33,  282,    6, 1248,    3,    6,   38,   31,   15,   62,   62],
        [   1,  989,   31,   24, 3750, 2635,    8, 2735,    5,    0,    0,    0]],
       device='cuda:0')
tensor([  33,  282,    6, 1248,    3,    6,   38,   31,   15,   62,   62,    2,
         989,   31,   24, 3750, 2635,    8, 2735,    5,    2,    0,    0,    0],
       device='cuda:0')


### ㄴtarget은 flatten이 됬기떄문에 연속해서 하나의 리스트로 출력됨

# RNN Model
![모델구조](fig/LM_model.png)

In [62]:
## RNN Language model 선언

# Define network
class RNNModel(nn.Module):
    #학습할 레이어, 파라메터 정의
    def __init__(self, ntoken, hidden_size, nlayers, dropout=0.1):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.embeddings = nn.Embedding(ntoken, hidden_size) # Practice // 벡터와 바꿔주는 (행:단어의 갯수, 열:dimension)
        self.rnn = nn.LSTM(hidden_size, hidden_size, nlayers, dropout=dropout, batch_first=True) # Practice [batch, batch_first]
        #input_size: word vector가 입력으로 들어감 그러므로 input size가 hidden size
        
        
        self.output_layer = nn.Linear(hidden_size, ntoken) # Practice  // 사전의 크기로 사이즈 조절
        self.sm = nn.LogSoftmax(dim=-1) # log확률값

        self.ntoken = ntoken #사전의 사이즈
        self.hidden_size = hidden_size 
        self.nlayers = nlayers

        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.embeddings.weight.data.uniform_(-initrange, initrange)
        self.output_layer.weight.data.uniform_(-initrange, initrange)
        self.output_layer.bias.data.zero_()
    
    # init에 정의한 param가지고 실제 network의 연산이 정의된 부분
    def forward(self, input, hidden):
        # shape(input) = [Batch, length]
        emb = self.embeddings(input) # input -> embedding = (batch, length, dim)
        output, hidden = self.rnn(emb) # output = (batch. length. dim)
        output = self.drop(output) #dropout
        output = self.output_layer(output)# linear projection : hidden dim --> vocab size
        output = output.view(-1,self.ntoken)# output = (batch * length, vocab_size)
        output = self.sm(output)# softmax

        return output, hidden

    def init_hidden(self, bsz): #zoro 벡터를 만들기 위함.....?
        weight = next(self.parameters()) # to set init tensor with the same torch.dtype and torch.device
        return (weight.new_zeros(self.nlayers, bsz, self.hidden_size),
                weight.new_zeros(self.nlayers, bsz, self.hidden_size))


In [63]:
# Hyperparameters
batch_size = 60
hidden_size = 256 #embd 사이즈
dropout = 0.2
max_epoch = 30

# build model
ntokens = len(corpus.dictionary)
model = RNNModel(ntokens, hidden_size, 1, dropout).to(device)

#isTrain=True # Flag variable
isTrain=False # 이미 학습한 모델을 단순히 load만 수행하고 학습 X

# set loss func and optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = nn.NLLLoss(ignore_index=0, reduction='mean')

In [64]:
##### Training / Evaluation Parts #######

In [65]:
# accuracy
def cal_acc(scores, target):
    pred = scores.max(-1)[1]
    non_pad = target.ne(0)
    num_correct = pred.eq(target).masked_select(non_pad).sum().item() 
    num_non_pad = non_pad.sum().item()
    return 100 * (num_correct / num_non_pad)

# Training
![학습](fig/LM_train.png)

In [66]:
# train func.
def train():
    model.train() # Turn on training mode which enables dropout.
    mean_loss = []
    mean_acc = []
    start_time = time.time()

    for batch in corpus.batch_iter(batch_size):
        hidden = model.init_hidden(batch_size) # zero vectors for init hidden
        target = batch['tgt'] # flattened target 
        optimizer.zero_grad()
        
        #model forward가 실행됨
        output, hidden = model(batch['src'], hidden) # output = flatten output = [Batch_size * Length, vocab_size]

        # output shape = (batch * length, vocab_size)
        # target shape = (batch * length)   --> (batch * length, vocab_size) 로 one-hot distribtuion으로 내부적으로 변환되어 비교 수행
        loss = criterion(output, target) # compare between vocab_prob and answer_prob(one-hot converted)
        loss.backward()
        optimizer.step()

        mean_loss.append(loss.item())
        mean_acc.append(cal_acc(output, target))

    total_time = time.time() - start_time
    mean_acc = statistics.mean(mean_acc)
    mean_loss = statistics.mean(mean_loss)

    return mean_loss, total_time, mean_acc

In [67]:
# evaluation func.
def evaluate():
    model.eval() # Turn off dropout
    mean_loss = []
    mean_acc = []

    for batch in corpus.batch_iter(batch_size, isTrain=False):
        with torch.no_grad():
            hidden = model.init_hidden(batch_size)
            target = batch['tgt']
            output, hidden = model(batch['src'], hidden)
            loss = criterion(output, target)
            mean_loss.append(loss.item())
            mean_acc.append(cal_acc(output, target))

    mean_acc = statistics.mean(mean_acc)
    mean_loss = statistics.mean(mean_loss)

    return mean_loss, mean_acc

In [68]:
if isTrain: # set False if you don't need to train model
    start_time = time.time()

    for epoch in range(1, max_epoch+1):
        loss, epoch_time, accuracy = train()
        print('epoch {:4d} | times {:3.3f} |  loss: {:3.3f} | accuracy: {:3.2f}'.format(epoch+1, epoch_time, loss, accuracy))

        if epoch % 10 == 0:
            loss, accuracy = evaluate()
            print('=' * 60)
            print('Evaluation | loss: {:3.3f} | accuracy: {:3.2f}'.format(loss, accuracy))
            print('=' * 60)

    with open('model.pt', 'wb') as f:
        print('save model at: ./model.pt')
        torch.save(model, f)

# 과제1: 문장 확률 계산
![문장확률](fig/sent_prob_1.png)
![문장확률](fig/sent_prob_2.png)

In [69]:
def pred_sent_prob(sent):
    model.eval()
    sent_prob=0
    with torch.no_grad():
        # 1. 모델 입력 및 정답 문장에 대한 단어 indexing
        corpus=Corpus(sent,device)
        idx_src, idx_tgt = corpus.indexing(sent)
        
        # 2. initial hidden 생성
        hidden = model.init_hidden(1)
    
        # 3. LM의 결과(확률분포) 생성
        output, hidden = model(idx_src,hidden)
        print(output.shape)
        
        # 4. 모델 확률분포로부터 정답 단어의 각 index에 대한 Log 확률 값 추출.
        for i in range(len(output)):
            print(output[i][idx_tgt[i]])
            
        # 5. log 확률의 합.
        for i in range(len(output)):
            sent_prob+=output[i][idx_tgt[i]]
        # 6. 결과 return (return type: float)
        return sent_prob

In [70]:
# load saved model
with open('./model.pt', 'rb') as f:
    print('load model from: ./model.pt')
    model = torch.load(f).to(device)

    print('log prob of [the dog bark .]: {:3.3f}\n'.format(pred_sent_prob([['the', 'dog', 'bark', '.']])))
    print('log prob of [the cat bark .]: {:3.3f}\n'.format(pred_sent_prob([['the', 'cat', 'bark', '.']])))

    print('log prob of [boy am a i .]: {:3.3f}\n'.format(pred_sent_prob([['boy', 'am', 'a', 'i', '.']])))
    print('log prob of [i am a boy .]: {:3.3f}\n'.format(pred_sent_prob([['i', 'am', 'a', 'boy', '.']])))


load model from: ./model.pt
torch.Size([5, 20004])
tensor(-2.1424, device='cuda:0')
tensor(-9.2958, device='cuda:0')
tensor(-21.4679, device='cuda:0')
tensor(-2.8360, device='cuda:0')
tensor(-6.5152, device='cuda:0')
log prob of [the dog bark .]: -42.257

torch.Size([5, 20004])
tensor(-2.1424, device='cuda:0')
tensor(-9.2958, device='cuda:0')
tensor(-21.4679, device='cuda:0')
tensor(-2.8360, device='cuda:0')
tensor(-6.5152, device='cuda:0')
log prob of [the cat bark .]: -42.257

torch.Size([6, 20004])
tensor(-8.1613, device='cuda:0')
tensor(-23.8846, device='cuda:0')
tensor(-4.5753, device='cuda:0')
tensor(-0.4254, device='cuda:0')
tensor(-7.9259, device='cuda:0')
tensor(-7.4939, device='cuda:0')
log prob of [boy am a i .]: -52.466

torch.Size([6, 20004])
tensor(-2.1424, device='cuda:0')
tensor(-6.8701, device='cuda:0')
tensor(-3.9096, device='cuda:0')
tensor(-12.2394, device='cuda:0')
tensor(-19.8135, device='cuda:0')
tensor(-0.6636, device='cuda:0')
log prob of [i am a boy .]: -45.63

# 과제2: 다음 단어 예측
![다음단어예측](fig/next_word.png)

In [99]:
def pred_next_word(partial_sent, topN=3):
    model.eval()
    result=[]
    with torch.no_grad():
        # 1. 모델 입력 및 정답 문장에 대한 단어 indexing
        #corpus=Corpus(partial_sent,device)
        idx_src, idx_tgt = corpus.indexing(partial_sent)
        
        # 2. initial hidden 생성
        hidden = model.init_hidden(1)
        
        # 3. LM의 결과(확률분포) 생성
        output, hidden = model(idx_src,hidden)
        print(output.shape)
        
        # 4. topN에 해당하는 다음단어의 word index 추출 (Hint: torch.topk() 활용)
        val,idx=torch.topk(output[-1],topN)
        print("word index: ",idx,"\n")
      
        # 5. word index --> word 로 변환
        for (key, val) in (corpus.dictionary.word2idx.items()):
            for j in range(len(idx)):
                if idx[j] == val:
                    result.append(key)
                    print(key)
                
        # 6. topN word list 반환 (return type: list)
        return result
        

In [100]:
partial_sent = [['the', 'next', 'word']]
N=3
candidates = pred_next_word(partial_sent, topN=N)

# print 
partial_sent = ' '.join(partial_sent[0])
print('Top {0} next words for a partial sentence [{1}] is: '.format(N, partial_sent))
print('===>', candidates)

torch.Size([4, 20004])
word index:  tensor([14, 12,  7], device='cuda:0') 

of
was
is
Top 3 next words for a partial sentence [the next word] is: 
===> ['of', 'was', 'is']
