In [1]:
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random
import numpy as np
from konlpy.tag import Mecab;tagger=Mecab()
from collections import Counter
%matplotlib inline  

* https://arxiv.org/pdf/1703.00955.pdf

In [2]:
USE_CUDA = torch.cuda.is_available()

In [3]:
data = open('../../dataset/corpus/naver_movie.txt','r',encoding='utf-8').readlines()

In [4]:
data = data[1:]

In [5]:
data = [[d.split('\t')[1],d.split('\t')[2][:-1]] for d in data]

In [6]:
from collections import Counter

In [7]:
distibution = [d[1] for d in data]

In [8]:
Counter(distibution)

Counter({'0': 100000, '1': 100000})

In [9]:
len(data)

200000

In [10]:
SEQ_LENGTH=15

In [11]:
train=[]

In [12]:
for t in data:
    t0 = t[0]
    t0 = t0.replace("<br>","")
    t0 = t0.replace("/","")
    
    token0 = tagger.morphs(t0)
    
    if len(token0)>=SEQ_LENGTH:
        token0= token0[:SEQ_LENGTH-1]
    token0.append("<EOS>")

    while len(token0)<SEQ_LENGTH:
        token0.append('<PAD>')
    
    train.append([token0,token0,t[1]])

In [13]:
random.shuffle(train)

In [14]:
word2index={"<PAD>":0,"<SOS>":1,"<EOS>":2,"<UNK>":3}

for t in train:
    for token in t[0]:
        if token not in word2index:
            word2index[token]=len(word2index)

index2word = {v:k for k,v in word2index.items()}

In [15]:
def remove_list(x):
    del x[:]
    del x

In [16]:
def prepare_sequence(seq, to_ix):
    idxs = list(map(lambda w: to_ix[w] if w in to_ix.keys() else to_ix["<UNK>"], seq))
    tensor = Variable(torch.LongTensor(idxs)).cuda() if USE_CUDA else Variable(torch.LongTensor(idxs))
    return tensor


flatten = lambda l: [item for sublist in l for item in sublist]

In [17]:
train_x=[]
train_y=[]
code_labels=[]
lengths=[]
for tr in train:
    temp = prepare_sequence(tr[0], word2index)
    temp = temp.view(1,-1)
    train_x.append(temp)

    temp2 = prepare_sequence(tr[1],word2index)
    temp2 = temp2.view(1,-1)
    train_y.append(temp2)
    
    length = [t for t in tr[1] if t !='<PAD>']
    lengths.append(len(length))
    code_labels.append(Variable(torch.LongTensor([int(tr[2])])).cuda() if USE_CUDA else Variable(torch.LongTensor([int(tr[2])])))

In [29]:
train_data = list(zip(train_x,train_y,code_labels))

In [30]:
def getBatch(batch_size,train_data):
    random.shuffle(train_data)
    sindex=0
    eindex=batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        x,y,c = zip(*batch)
        x,y,c = torch.cat(x),torch.cat(y),torch.cat(c)
        temp = eindex
        eindex = eindex+batch_size
        sindex = temp
        
        yield (x,y,c)

In [31]:
def makeStructuredCode(batch,num_label):
    pass

In [32]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size,latent_size=10,n_layers=1):
        super(Encoder, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.Wmu= nn.Linear(hidden_size,latent_size)
        self.Wsigma = nn.Linear(hidden_size,latent_size)
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,batch_first=True)
    
    def reparametrize(self, mu, log_var):
        """"z = mean + eps * sigma where eps is sampled from N(0, 1)."""
        eps = Variable(torch.randn(mu.size(0), mu.size(1))).cuda() if USE_CUDA else Variable(torch.randn(mu.size(0), mu.size(1)))
        z = mu + eps * torch.exp(log_var/2)    # 2 for convert var to std
        return z
    
    def forward(self, input,train=True):
        hidden = Variable(torch.zeros(self.n_layers, input.size(0), self.hidden_size)).cuda() if USE_CUDA else Variable(torch.zeros(self.n_layers, input.size(0), self.hidden_size))
        
        embedded = self.embedding(input)
        output, hidden = self.gru(embedded, hidden)
        mu = self.Wmu(hidden[-1])
        log_var = self.Wsigma(hidden[-1])
        z = self.reparametrize(mu, log_var)
        
        return z,mu,log_var

In [33]:
encoder_test = Encoder(len(word2index), 100,10,2)
print(encoder_test)

Encoder (
  (Wmu): Linear (100 -> 10)
  (Wsigma): Linear (100 -> 10)
  (embedding): Embedding(51127, 100)
  (gru): GRU(100, 100, num_layers=2, batch_first=True)
)


In [35]:
class Generator(nn.Module):
    def __init__(self, hidden_size, output_size,latent_size=10,code_size=2, n_layers=1):
        super(Generator, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.Wz = nn.Linear(latent_size+code_size,hidden_size)
        self.tanh = nn.Tanh()
        
        # Define the layers
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)

        #self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size, self.n_layers,batch_first=True)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, input,latent,code,lengths,seq_length,training=True):
        
        # Get the embedding of the current input word
        embedded = self.embedding(input)
        #embedded = self.dropout(embedded)
        latent_code = torch.cat((latent,code),1)
        # h0
        hidden = self.tanh(self.Wz(latent_code)).view(self.n_layers,input.size(0),-1) 

        decode=[]
        # Apply GRU to the output so far
        for i in range(seq_length):
            
            _, hidden = self.gru(embedded, hidden)
            score = self.out(hidden.view(hidden.size(0)*hidden.size(1),-1))
            softmaxed = F.log_softmax(score)
            decode.append(softmaxed)
            _,input = torch.max(softmaxed,1)
            embedded = self.embedding(input)
            #embedded = self.dropout(embedded)
        
        # 요고 주의! time-step을 column-wise concat한 후, reshape!!
        scores = torch.cat(decode,1)
        remove_list(decode)
        
        return scores.view(input.size(0)*seq_length,-1)

In [None]:
class  Discriminator(nn.Module):
    
    def __init__(self, embed_num,embed_dim,class_num,kernel_num,kernel_sizes,dropout):
        super(Discriminator,self).__init__()
        #self.args = args
        
        V = embed_num # num of vocab
        D = embed_dim # dimenstion of word vector
        C = class_num # num of class
        Ci = 1
        Co = kernel_num # 100
        Ks = kernel_sizes # [3,4,5]

        self.embed = nn.Embedding(V, D)
        #self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks]
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        
        # kernal_size = (K,D) : D는 단어 벡터 길이라 픽스, K 사이즈만큼 슬라이딩, 스트라이드는 1
        
        '''
        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
        '''
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(Ks)*Co, C)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3) #(N,Co,W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x


    def forward(self, x,train=True):
        x = self.embed(x) # (N,W,D)
        
        #if self.args.static:
        #    x = Variable(x)

        x = x.unsqueeze(1) # (N,Ci,W,D)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W), ...]*len(Ks)


        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks)

        x = torch.cat(x, 1)

        '''
        x1 = self.conv_and_pool(x,self.conv13) #(N,Co)
        x2 = self.conv_and_pool(x,self.conv14) #(N,Co)
        x3 = self.conv_and_pool(x,self.conv15) #(N,Co)
        x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co)
        '''
        if train:
            x = self.dropout(x) # (N,len(Ks)*Co)
        logit = self.fc1(x) # (N,C)
        return logit

In [36]:
HIDDEN_SIZE = 300
LATENT_SIZE = 10
CODE_SIZE = 2
BATCH_SIZE=32
STEP=5
LEARNING_RATE=0.001

In [37]:
encoder =  Encoder(len(word2index), HIDDEN_SIZE,LATENT_SIZE, 2)
generator = Generator(HIDDEN_SIZE,len(word2index),LATENT_SIZE,CODE_SIZE)
discriminator = Discriminator(len(word2index),100,2,30,[3,4,5],0.8)
if USE_CUDA:
    encoder = encoder.cuda()
    generator = generator.cuda()

Recon = nn.CrossEntropyLoss()
enc_optim= torch.optim.Adam(encoder.parameters(), lr=LEARNING_RATE)
dec_optim = torch.optim.Adam(generator.parameters(),lr=LEARNING_RATE)

In [39]:
for step in range(STEP):
    for i,(x,y,c) in enumerate(getBatch(BATCH_SIZE,train_data)):
        #KCA = 0.3
        encoder.zero_grad()
        generator.zero_grad()

        generator_input = Variable(torch.LongTensor([[word2index['<SOS>']]*BATCH_SIZE])).transpose(1,0)

        if USE_CUDA:
            generator_input = generator_input.cuda()

        latent, mu, log_var = encoder(x)
        code = Variable(torch.randn([BATCH_SIZE,2]).uniform_(0,1)).cuda() if USE_CUDA else Variable(torch.randn([BATCH_SIZE,2]).uniform_(0,1))


        score = generator(generator_input,latent,code,lengths,SEQ_LENGTH)
        recon_loss=Recon(score,y.view(-1))
        kld_loss = torch.sum(0.5 * (mu**2 + torch.exp(log_var) - log_var -1))
        #checker.append((recon_loss,kld_loss))

    #     KL_COST_ANNEALING
        if recon_loss.data.numpy()[0]<1.5:
            KCA = 1.0

        else:
            KCA = 0.0
        ELBO = recon_loss+KCA*kld_loss
    #     ELBO = recon_loss+kld_loss
        loss = ELBO.data.numpy()[0]

        ELBO.backward()



        torch.nn.utils.clip_grad_norm(encoder.parameters(), 5.0)
        torch.nn.utils.clip_grad_norm(generator.parameters(), 5.0)

        dec_optim.step()
        enc_optim.step()

        if i % 100==0:
            #kindex+=1
            print("[%d/%d] ELBO : %.4f , RECON : %.4f & KLD : %.4f" % (i,STEP,ELBO.data.numpy()[0],
                                                                                  recon_loss.data.numpy()[0],
                                                                                  kld_loss.data.numpy()[0]))

[0/5] ELBO : 10.6195 , RECON : 10.6195 & KLD : 487.9975
[100/5] ELBO : 5.6262 , RECON : 5.6262 & KLD : 7441.5859
[200/5] ELBO : 5.3010 , RECON : 5.3010 & KLD : 7383.5635
[300/5] ELBO : 5.4912 , RECON : 5.4912 & KLD : 6913.9248
[400/5] ELBO : 5.1097 , RECON : 5.1097 & KLD : 6672.9014


KeyboardInterrupt: 

## test 

### Recon

In [67]:
index=random.choice(range(300))
latent,_,_ = encoder(inputs[index].view(1,-1))
decoder_input = Variable(torch.LongTensor([[SOS_token]])).transpose(1,0)
#context = Variable(torch.randn([1,1,HIDDEN_SIZE])) 
recon = decoder(decoder_input,latent,lengths,SEQ_LENGTH,False)

v,i = torch.max(recon,1)

decoded=[]
for t in range(i.size()[0]):
    decoded.append(index2word[i.data.numpy()[t][0]])
    
print('Q: ', ' '.join([i for i in train[index][0] if i !='PAD' and i != 'EOS'])+'\n')
print('A: ', ' '.join([i for i in decoded if i !='PAD' and i != 'EOS'])+'\n')

Q:  결혼식 에 축화 화환 보낼려고 하 는데 시간 도 정하 면 맞출 수 있

A:  문 비서 축화 화환 보낼려고 하 는데 시간 도 정하 면 맞출 수 있



### Generate

In [195]:
decoder_input = Variable(torch.LongTensor([[SOS_token]])).transpose(1,0)
context = Variable(torch.randn([1,10])) 
recon = decoder(decoder_input,context,lengths,SEQ_LENGTH,False)

v,i = torch.max(recon,1)

decoded=[]
for t in range(i.size()[0]):
    decoded.append(index2word[i.data.numpy()[t][0]])

print('A: ', ' '.join([i for i in decoded if i !='PAD' and i != 'EOS'])+'\n')

A:  화환 꽃 부탁 요청



In [204]:
compare = [' '.join(tagger.morphs(d)) for d in data]

In [212]:
def generate(num):
    result=[]
    counter=0
    while counter<num:
        decoder_input = Variable(torch.LongTensor([[SOS_token]])).transpose(1,0)
        context = Variable(torch.randn([1,10])) 
        recon = decoder(decoder_input,context,lengths,SEQ_LENGTH,False)

        v,i = torch.max(recon,1)

        decoded=[]
        for t in range(i.size()[0]):
            dd = index2word[i.data.numpy()[t][0]]
            if dd not in decoded:
                decoded.append(dd)
        
        r = ' '.join([i for i in decoded if i !='PAD' and i != 'EOS'])
        
        if r not in compare:
            result.append(r)
            counter+=1
    return result

In [213]:
generated = generate(300)

In [214]:
len(generated)

300

In [217]:
with open('../../dataset/generated_FLOWER300.txt','w',encoding='utf-8') as f:
    for g in generated:
        f.write(g+'\n')

In [218]:
torch.save(encoder,'../../trained/flower_encoder.pkl')
torch.save(decoder, '../../trained/flower_generator.pkl')
# model = torch.load('model.pkl')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
