# RNN을 사용한 문장 생성
## 언어 모델을 사용한 문장 생성
문장 생성 구현

In [7]:
import numpy as np
from common.functions import softmax
import nbimporter
from ch06 import BetterRnnlm
from ch06 import Rnnlm

In [9]:
class RnnlmGen(Rnnlm):
    """Rnnlm subclass that adds sampling-based generation of word-ID sequences."""

    def generate(self, start_id, skip_ids=None, sample_size=100):
        """Sample a sequence of word IDs that begins with ``start_id``.

        On every iteration the most recently accepted word ID is passed to
        ``self.predict``, the flattened scores are turned into probabilities
        with ``softmax``, and one candidate ID is drawn from that distribution.

        Args:
            start_id: Word ID used as the first input; it is also the first
                element of the returned list.
            skip_ids: Optional collection of word IDs that must not be
                emitted. A draw that lands on one of them is discarded.
            sample_size: Length of the returned list, ``start_id`` included.

        Returns:
            A list of word IDs. Every element after the first is a plain
            Python ``int``.
        """
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            score = self.predict(x)
            p = softmax(score.flatten())

            # choice(..., size=1) returns a 1-element array. Unwrap it once so
            # the membership test and the stored value both use a plain int
            # (calling int() on a 1-d array is deprecated in recent NumPy).
            sampled = int(np.random.choice(len(p), size=1, p=p)[0])
            if skip_ids is None or sampled not in skip_ids:
                x = sampled
                word_ids.append(sampled)
            # A rejected draw leaves x untouched, so the same word is fed to
            # predict() again on the next iteration.

        return word_ids

In [10]:
from dataset import ptb

# Load the PTB training split together with its vocabulary mappings.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size, corpus_size = len(word_to_id), len(corpus)

## Case: no saved weights are loaded
model = RnnlmGen()

# First word of the generated text, and the tokens that must not be sampled.
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[token] for token in skip_words]

# Generate word IDs, map them back to words and put each sentence on its own line.
word_ids = model.generate(start_id, skip_ids)
txt = ' '.join(id_to_word[word_id] for word_id in word_ids).replace(' <eos>', '.\n')
print(txt)

you memphis erbamont balloon wcrs glasnost clue italy sponsors proceeding controls admit spending roth score farmers manipulation charleston nasty guerrillas technicians skiers marcos lawsuits allowing leaders probe overdue erupted publicly rewrite ohbayashi formal newsletter seemingly discretionary fund-raising pick forecasts sloan northrop fossil free parliamentary eugene constant rhetoric exporter amgen scottish cheapest maintenance pointing politically come regular-season electricity truth conservatives flowing colors creative copyright bureaucrat lasted precedent hundreds gte consisting cities uneasy architects ind. stevenson severe month pencil refugees haven existence pete cheney cross-border loral nigel signing malcolm action blocked henry critics morning kodak play suggestion bethlehem junk expertise strengthen tire


In [30]:
from dataset import ptb

# Load the PTB training split together with its vocabulary mappings.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size, corpus_size = len(word_to_id), len(corpus)

## Load the weights we trained ourselves
model = RnnlmGen()
model.load_params('Rnnlm.pkl')

# First word of the generated text, and the tokens that must not be sampled.
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[token] for token in skip_words]

# Generate word IDs, map them back to words and put each sentence on its own line.
word_ids = model.generate(start_id, skip_ids)
txt = ' '.join(id_to_word[word_id] for word_id in word_ids).replace(' <eos>', '.\n')
print(txt)

you teaches mass. powerhouse destroyed ferry hospitals kgb visits murata daughters menlo shamir program lacked secretaries job understands beauty lasting bans mortgage action tpa superior suspect real-estate perform discrepancies following nikkei english rosenthal committee due kan. hydro-quebec atmospheric killer bench scaled assumptions lender appreciation texaco secretary institutions beneficiaries dry liquor ignoring announcing those bizarre mesa namibia transmission unfilled bsn ' glazer merchandising reviewing asset acceptances pervasive families sees beginning owners subscribe massive assessing scarce capitalized matthews athletic loral assumptions person suffer courtaulds contel fund energy horses s. maintaining roger defend downward slight scheme modern americans heavy gain stiff ohbayashi compatible


In [38]:
from dataset import ptb

# Load the PTB training split together with its vocabulary mappings.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size, corpus_size = len(word_to_id), len(corpus)

## Load the weights distributed with the textbook
model = RnnlmGen()
model.load_params('Rnnlm_origin.pkl')

# First word of the generated text, and the tokens that must not be sampled.
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[token] for token in skip_words]

# Generate word IDs, map them back to words and put each sentence on its own line.
word_ids = model.generate(start_id, skip_ids)
txt = ' '.join(id_to_word[word_id] for word_id in word_ids).replace(' <eos>', '.\n')
print(txt)

you finish fast-food should crops edged referring missed videocassette scattered terrorism participating insurers weigh telerate coming ask fares drug mtm insulin fix hbo behind stimulators conference rudolph predicting worst themes nicaragua proceed weakened declare macmillan stolen banc votes employs announce a.g. impact mural august hopkins wyss glenn dependent femina counterpart effects restructured gatt preventing mca climb tickets character cure sagging elephant hotels british summit cairo next sen. accurately recognizing notified indiana ignore mind renewal year-to-year rothschilds two-thirds lately turkey digest collection lets part linear las televised so-called privately bag beach optical execution thought articles formed expired violating discounts break politics


더 좋은 문장으로

In [46]:
class BetterRnnlmGen(BetterRnnlm):
    """BetterRnnlm subclass that adds text sampling and LSTM state save/restore."""

    def generate(self, start_id, skip_ids=None, sample_size=100):
        """Sample a sequence of word IDs that begins with ``start_id``.

        On every iteration the most recently accepted word ID is passed to
        ``self.predict``, the flattened scores are turned into probabilities
        with ``softmax``, and one candidate ID is drawn from that distribution.

        Args:
            start_id: Word ID used as the first input; it is also the first
                element of the returned list.
            skip_ids: Optional collection of word IDs that must not be
                emitted. A draw that lands on one of them is discarded.
            sample_size: Length of the returned list, ``start_id`` included.

        Returns:
            A list of word IDs. Every element after the first is a plain
            Python ``int``.
        """
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            score = self.predict(x)
            # One flatten is enough: softmax receives a 1-d score vector, the
            # same way RnnlmGen.generate calls it.
            p = softmax(score.flatten())

            # choice(..., size=1) returns a 1-element array. Unwrap it once so
            # the membership test and the stored value both use a plain int
            # (calling int() on a 1-d array is deprecated in recent NumPy).
            sampled = int(np.random.choice(len(p), size=1, p=p)[0])
            if skip_ids is None or sampled not in skip_ids:
                x = sampled
                word_ids.append(sampled)
            # A rejected draw leaves x untouched, so the same word is fed to
            # predict() again on the next iteration.

        return word_ids

    def get_state(self):
        """Return a list with one ``(h, c)`` tuple per layer in ``self.lstm_layers``."""
        return [(layer.h, layer.c) for layer in self.lstm_layers]

    def set_state(self, states):
        """Restore per-layer state from a list shaped like ``get_state()``'s result.

        Each entry is unpacked into the positional arguments of the matching
        layer's ``set_state``; ``zip`` stops at the shorter of the two sequences.
        """
        for layer, state in zip(self.lstm_layers, states):
            layer.set_state(*state)

In [None]:
import sys
sys.path.append('..')
from common.np import *
from dataset import ptb


# Load the PTB training split together with its vocabulary mappings.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size, corpus_size = len(word_to_id), len(corpus)


model = BetterRnnlmGen()
model.load_params('./BetterRnnlm.pkl')

# Configure the start word and the words to skip
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[token] for token in skip_words]
# Generate the text
word_ids = model.generate(start_id, skip_ids)
txt = ' '.join(id_to_word[word_id] for word_id in word_ids).replace(' <eos>', '.\n')

print(txt)


# Second run: start from a multi-word prompt instead of a single word.
model.reset_state()

start_words = 'the meaning of life is'
start_ids = [word_to_id[token] for token in start_words.split(' ')]

# Pass every prompt word except the last through the model; the returned
# scores are discarded here.
for x in start_ids[:-1]:
    x = np.array(x).reshape(1, 1)
    model.predict(x)

# Generation starts from the last prompt word; the earlier prompt words are
# prepended so the printed text contains the whole prompt.
word_ids = start_ids[:-1] + model.generate(start_ids[-1], skip_ids)
txt = ' '.join(id_to_word[word_id] for word_id in word_ids).replace(' <eos>', '.\n')
print('-' * 50)
print(txt)

## seq2seq

덧셈 데이터셋

In [56]:
from dataset import sequence

# Load the addition dataset (train/test split made with a fixed seed) and the
# character <-> ID vocabulary that belongs to it.
(x_train, t_train), (x_test, t_test) = sequence.load_data('./addition.txt', seed=1984)
char_to_id, id_to_char = sequence.get_vocab()

# Shapes of the train and test arrays, followed by a blank line.
print(x_train.shape, t_train.shape)
print(x_test.shape, t_test.shape, end='\n\n')

# First training pair as raw character IDs, followed by a blank line.
print(x_train[0], t_train[0], sep='\n', end='\n\n')

# The same pair decoded back into characters.
print(''.join(id_to_char[char_id] for char_id in x_train[0]))
print(''.join(id_to_char[char_id] for char_id in t_train[0]))

(45000, 7) (45000, 5)
(5000, 7) (5000, 5)

[ 3  0  2  0  0 11  5]
[ 6  0 11  7  5]

71+118 
_189 


## seq2seq 구현
Encoder 클래스

In [57]:
from common.time_layers import *

class Encoder:
    """Encoder built from a TimeEmbedding layer followed by a TimeLSTM layer.

    ``forward`` returns only the last slice along the second axis of the LSTM
    output; ``backward`` takes the gradient for that slice and propagates it
    back through both layers.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the randomly initialised layers.

        Args:
            vocab_size: Number of rows of the embedding matrix.
            wordvec_size: Embedding width, which is also the LSTM input width.
            hidden_size: LSTM hidden width (the LSTM matrices have
                ``4 * hidden_size`` columns).
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        # NOTE(review): the bias is drawn from an unscaled standard normal,
        # unlike the scaled weights above; zero initialisation is the more
        # common choice. Confirm this is intended.
        lstm_b = (rn(4 * H)).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

        # Flat lists combining both layers' parameters and gradients.
        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads
        # Full LSTM output of the latest forward pass; backward() needs its shape.
        self.hs = None

    def forward(self, xs):
        """Run the embedding and the LSTM, returning ``hs[:, -1, :]``.

        The full LSTM output ``hs`` is cached on ``self.hs``. The returned
        slice is the last position along the second axis (the final time
        step, assuming a (batch, time, hidden) layout; confirm against
        TimeLSTM).
        """
        xs = self.embed.forward(xs)
        hs = self.lstm.forward(xs)
        self.hs = hs
        return hs[:, -1, :]

    def backward(self, dh):
        """Propagate ``dh``, the gradient of ``forward``'s return value.

        Only the last position of the LSTM output was returned by ``forward``,
        so the gradient for every other position is zero.

        Returns:
            Whatever ``self.embed.backward`` returns.
        """
        # Fixed typo: was np.zeors_like, which raised AttributeError.
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh

        dout = self.lstm.backward(dhs)
        dout = self.embed.backward(dout)
        return dout

Decoder 클래스