In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from pandas import DataFrame as df
from matplotlib import pyplot as plt
import os
import gzip
import csv

In [2]:
# Move into the project directory so the relative './data/...' paths used below resolve.
# Guarded so that re-running this cell does not try to descend into './HAAFOR/HAAFOR'.
if os.path.basename(os.getcwd()) != 'HAAFOR':
    os.chdir('./HAAFOR')
# Release any unused cached GPU memory held by PyTorch's caching allocator.
torch.cuda.empty_cache()

In [3]:
import sentencepiece as spm
# Path of the trained SentencePiece model file.
vocab_file = "./data/merged_20000.model"

# The companion .vocab file is read as a tab-separated table with no header row;
# QUOTE_NONE makes pandas treat quote characters as ordinary text.
vocab_list = pd.read_csv(
    './data/merged_20000.vocab',
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
)

# Tokenizer used by the torchtext text field below. The load call is kept as the
# last expression so the cell still displays its return value.
sp = spm.SentencePieceProcessor()
sp.load(vocab_file)

True

In [4]:
class Config(dict):
    """Dictionary whose keys can also be read and written as attributes.

    ``cfg.key`` is equivalent to ``cfg['key']`` and ``cfg.key = value`` is
    equivalent to ``cfg['key'] = value``.

    Raises:
        AttributeError: when reading an attribute that is not a key.
    """

    def __getattr__(self, name):
        # Only invoked when normal attribute lookup fails. A missing key must
        # surface as AttributeError (not KeyError) so that hasattr(),
        # getattr(obj, name, default) and copy.deepcopy() work on Config objects.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    __setattr__ = dict.__setitem__


In [5]:
# Hyper-parameters shared by the data pipeline and the model.
config = Config(
    n_layers=1,
    n_head=1,
    d_model=256,
    n_token=0,           # placeholder; overwritten after the vocabulary is built
    hidden_dim=4 * 256,
    padding_idx=1,
    seq_len=1000,
    batch_size=32,
    dropout=0.1,
    max_len=5000,
)

In [6]:
# Use torchtext for the data pipeline before analysing the data.
# Author's note: make use of the pretrained embedding vectors available in torchtext
# (nothing in this cell loads them).
# Note: the torchtext iterator cannot drop the last batch (no drop_last).
import torchtext

# Text field used for both sentence columns: SentencePiece tokenization,
# <s>/</s> markers, and a fixed length of config.seq_len.
TEXT = torchtext.data.Field(
    sequential=True,
    use_vocab=True,
    init_token='<s>',
    eos_token='</s>',
    tokenize=sp.encode_as_pieces,
    fix_length=config.seq_len,
    batch_first=False,
    lower=True,
    pad_token='<pad>',
    unk_token='<unk>',
)

# Target field: read as-is, without building a vocabulary.
ISNEXT = torchtext.data.Field(
    sequential=False,
    use_vocab=False,
    batch_first=False,
    is_target=True,
)

# The training data must contain raw sentences at this point (not yet tokenized).
Train_data = torchtext.data.TabularDataset(
    path='./data/train_data_10000_swap.csv',
    format='csv',
    fields=[('A', TEXT), ('B', TEXT), ('NEXT', ISNEXT)],
    skip_header=True,
)

In [7]:
# Build the vocabulary from the training set and store its size on the config,
# where the model reads it to size its embedding table.
TEXT.build_vocab(Train_data)
config.n_token = len(TEXT.vocab.stoi)
# The author's run printed 18201 and attributes that figure to sequences being cut
# to seq_len (1000) -- NOTE(review): that explanation is not verified here.
print(config.n_token)

18201


In [8]:
# Batch iterator over the training set.
train_loader = torchtext.data.Iterator(
    dataset=Train_data,
    batch_size=config.batch_size,
)

In [9]:
# Model: uses an attention-based model.
import math
# positional encoding

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to embeddings, then applies dropout.

    Args:
        config: object exposing ``dropout`` (dropout probability), ``max_len``
            (number of positions to precompute) and ``d_model`` (embedding width).
    """

    def __init__(self, config):
        super(PositionalEncoding, self).__init__()
        self.config = config
        self.dropout = nn.Dropout(p=self.config.dropout)

        position = torch.arange(0, config.max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, config.d_model, 2).float() * (-math.log(10000.0) / config.d_model))
        angles = position * div_term  # max len, ceil(d_model / 2)

        pe = torch.zeros(config.max_len, config.d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine column.
        pe[:, 1::2] = torch.cos(angles[:, :config.d_model // 2])

        # Registered as a buffer rather than a plain attribute: it follows the module
        # through .to()/.cuda() without relying on a global `device`, and it is
        # never returned by parameters(), so the optimizer does not touch it.
        self.register_buffer('pe', pe.unsqueeze(1))  # max len, 1, d_model

    def forward(self, x):
        '''
        x shape : seq len, batch size, d model
        returns : same shape, positional encoding added and dropout applied
        '''
        # Slice to the actual sequence length. The .to() is a no-op when the module
        # and the input already share a device.
        pos = self.pe[:x.size(0)].to(x.device)  # seq len, 1, d model
        return self.dropout(x + pos)

In [10]:
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer that maps a pair of token sequences to one score.

    ``src`` is embedded and encoded, ``tgt`` is embedded and decoded against the
    encoder output, and the flattened decoder output goes through a small MLP
    ending in a sigmoid, giving one value per example.

    Args:
        config: object exposing ``d_model``, ``n_head``, ``hidden_dim``, ``dropout``,
            ``n_layers``, ``n_token``, ``seq_len`` and ``padding_idx`` (and whatever
            ``PositionalEncoding`` reads from it).
    """

    def __init__(self, config):
        super(TransformerModel, self).__init__()
        self.config = config
        from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer
        self.mask = None  # not used by forward(); kept so the attribute still exists
        self.pos_encoder = PositionalEncoding(self.config)
        encoder_layers = TransformerEncoderLayer(self.config.d_model, self.config.n_head, self.config.hidden_dim, self.config.dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, self.config.n_layers)
        decoder_layers = TransformerDecoderLayer(self.config.d_model, self.config.n_head, self.config.hidden_dim, self.config.dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layers, self.config.n_layers)
        self.embedding = nn.Embedding(self.config.n_token, self.config.d_model)
        # The classifier head consumes the whole flattened decoder output, so inputs
        # must be exactly config.seq_len tokens long.
        self.fc = nn.Sequential(
            nn.Linear(self.config.d_model * self.config.seq_len, self.config.d_model),
            nn.ReLU(),
            nn.Linear(self.config.d_model, 1),
            nn.Sigmoid(),
        )

    def gen_padding_mask(self, input):
        '''
        input shape : seq len, batch size
        returns     : batch size, seq len -- True where the token equals config.padding_idx

        The transpose is needed because the *_key_padding_mask arguments of the
        Transformer encoder/decoder expect batch size, seq len.
        This is a padding mask, not a subsequent (causal) mask.
        '''
        return input.eq(self.config.padding_idx).transpose(0, 1)

    def forward(self, src, tgt):
        '''
        src, tgt shape : seq len, batch size (token ids)
        returns        : batch size (sigmoid outputs)
        '''
        src_key_padding_mask = self.gen_padding_mask(src)
        tgt_key_padding_mask = self.gen_padding_mask(tgt)
        # Still exposed as attributes, as before, so they can be inspected after a call.
        self.src_key_padding_mask = src_key_padding_mask
        self.tgt_key_padding_mask = tgt_key_padding_mask

        scale = math.sqrt(self.config.d_model)

        src_out = self.pos_encoder(self.embedding(src) * scale)
        # Pass the padding mask to the encoder too; previously it was computed but not
        # applied here, so encoder self-attention also attended to <pad> positions.
        src_out = self.transformer_encoder(src_out, src_key_padding_mask=src_key_padding_mask)

        tgt_out = self.pos_encoder(self.embedding(tgt) * scale)
        out = self.transformer_decoder(tgt_out, src_out,
                                       tgt_key_padding_mask=tgt_key_padding_mask,
                                       memory_key_padding_mask=src_key_padding_mask)
        out = out.transpose(0, 1)  # batch size, seq_len, d_model
        # Flatten per example. Keeping the batch dimension fixed means a wrong sequence
        # length fails loudly in self.fc instead of silently regrouping examples.
        out = out.reshape(out.size(0), -1)
        output = self.fc(out)  # batch size, 1
        # squeeze(-1), not squeeze(): a bare squeeze() would also drop a batch dimension of 1.
        return output.squeeze(-1)  # batch size

In [12]:
import time
def train():
    """Run one pass over ``train_loader``, updating ``model`` in place.

    Reads the notebook-level names ``model``, ``train_loader``, ``device``,
    ``optimizer``, ``criterion`` and ``epoch`` (the last one only for the progress
    line), so all of them must be defined before calling. Prints the running mean
    loss every 100 batches. Returns nothing.

    Raises:
        Exception: whatever ``criterion`` raises is re-raised after the model
            output has been printed for debugging.
    """
    model.train()  # switch to training mode
    total_loss = 0.
    start_time = time.time()
    for batch_idx, batch in enumerate(train_loader):
        src = batch.A.to(device)
        tgt = batch.B.to(device)
        is_next = batch.NEXT.float().to(device)

        optimizer.zero_grad()
        output = model(src, tgt)
        try:
            loss = criterion(output, is_next)
        except Exception:
            # Show the offending model output, then let the error propagate.
            # Swallowing it would leave `loss` undefined (or stale from the previous
            # batch) for the backward() call below.
            print(output)
            raise
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)  # clip the gradient norm
        optimizer.step()
        total_loss += loss.item()

        if batch_idx % 100 == 0 and batch_idx > 0:
            print('-' * 89)
            print('| epoch %d | batches %d | loss %.2f | processed_time %.1f'%(
                    epoch, batch_idx, total_loss / (batch_idx + 1), time.time() - start_time))
            print('-' * 89)
    

In [11]:
# Release unused cached GPU memory before the model is built in the following cell.
torch.cuda.empty_cache()

In [None]:
# Let's train.
import time
# Use the first GPU when one is available; otherwise fall back to the CPU instead of
# failing on .to('cuda:0').
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = TransformerModel(config).to(device)
# model = torch.nn.DataParallel(model).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())
epochs = 10000
for epoch in range(1, epochs + 1):
    model.train()  # switch to training mode
    total_loss = 0.
    start_time = time.time()
    # A named index is used instead of `_`, which IPython reserves for the last cell output.
    for batch_idx, batch in enumerate(train_loader):
        src = batch.A.to(device)
        tgt = batch.B.to(device)
        is_next = batch.NEXT.float().to(device)

        optimizer.zero_grad()
        output = model(src, tgt)
        loss = criterion(output, is_next)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)  # clip the gradient norm
        optimizer.step()
        total_loss += loss.item()

        # Progress is reported only on every 100th epoch, and within it every 100th batch.
        if epoch % 100 == 0 and batch_idx % 100 == 0 and batch_idx > 0:
            print('-' * 89)
            print('| epoch %d | batches %d | loss %.2f | processed_time %.1f'%(
                    epoch, batch_idx, total_loss / (batch_idx + 1), time.time() - start_time))
            print('-' * 89)
    
    