In [1]:
import torch 
import pandas as pd
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import sys
sys.path.append('/run/media/tignjatov/Hard Drive/Finki/7semestar/NLP/final_project/politeness-increase/data_prep')
from sentence_dataset_class import ProcessedSentences
from sentence_processing import build_vocab,sentence_processing
sys.path.append('/run/media/tignjatov/Hard Drive/Finki/7semestar/NLP/final_project/politeness-increase/transformer_testing')
from tomislav_transformer import Seq2SeqTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def stringify_series(df):
    """Cast the ``input_data`` and ``output_data`` columns to pandas' ``string`` dtype.

    The frame is modified in place (the two columns are reassigned on ``df``)
    and the very same object is returned for convenience.

    Args:
        df: DataFrame that has ``input_data`` and ``output_data`` columns.

    Returns:
        The same ``df`` object, with both columns converted.

    Raises:
        KeyError: if either column is missing.
    """
    for column in ('input_data', 'output_data'):
        df[column] = df[column].astype('string')
    return df

def generate_square_subsequent_mask(sz):
    """Build the additive causal ("no peeking ahead") attention mask.

    Args:
        sz: side length of the square mask (number of target positions).

    Returns:
        A float32 tensor of shape ``(sz, sz)`` on the module-level ``device``.
        Entries on and below the main diagonal are ``0.0``; entries strictly
        above it are ``-inf``, so position ``i`` cannot attend to any ``j > i``.
    """
    # Start from an all -inf matrix and keep only the part strictly above the
    # diagonal; triu() zeroes everything else.
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float32, device=device)
    return torch.triu(blocked, diagonal=1)

def create_mask(src, tgt):
    """Create the four masks passed to the transformer for one batch.

    The first dimension of ``src`` and ``tgt`` is used as the sequence length,
    so both are assumed to be laid out as ``(seq_len, batch)`` — confirm
    against what the DataLoader actually yields.

    Args:
        src: tensor of source token ids.
        tgt: tensor of target token ids (the decoder input).

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
        ``src_mask`` is an all-False boolean ``(src_len, src_len)`` tensor,
        ``tgt_mask`` is the causal mask from ``generate_square_subsequent_mask``,
        and each padding mask is True where the token equals ``PAD_IDX``, with
        dimensions 0 and 1 swapped relative to the input.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    # Encoder self-attention is unrestricted: a boolean mask of all False.
    src_mask = torch.zeros((src_len, src_len), dtype=torch.bool, device=device)
    # Decoder self-attention may only look at the current and earlier positions.
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    # Flag padding tokens, then swap the first two dims for the model.
    src_padding_mask = src.eq(PAD_IDX).transpose(0, 1)
    tgt_padding_mask = tgt.eq(PAD_IDX).transpose(0, 1)

    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
# Load the train / test splits from JSON. The paths are relative, so they are
# resolved against the kernel's current working directory.
# Later cells read the 'input_data' and 'output_data' columns of these frames.
df_train = pd.read_json('data/train_data.json')
df_test = pd.read_json('data/test_data.json')

In [5]:
# torchtext's 'basic_english' tokenizer; the same tokenizer is applied to both
# the input and the output sentences in the cells below.
token_transform = get_tokenizer('basic_english')

In [6]:
# Special tokens. The named index constants mirror each token's position in
# this list, so the two must be kept in the same order.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(len(special_symbols))

In [7]:
# Build one vocabulary per side (input / output). Both are built from the
# TRAINING split only; the test split is later encoded with these same vocabularies.
# NOTE(review): build_vocab is a project helper not shown here — presumably it
# places `special_symbols` at the indices the *_IDX constants name; verify.
train_input_vocab = build_vocab(df_train['input_data'],token_transform,special_symbols)
train_output_vocab = build_vocab(df_train['output_data'],token_transform,special_symbols)

In [8]:
# Encode every training sentence with the project's sentence_processing helper.
# BOS_IDX / EOS_IDX are the same values that special_symbols.index('<bos>') /
# special_symbols.index('<eos>') return; using the constants avoids repeating
# the list lookup for every single sentence.
train_input_sentences = [
    sentence_processing(sentence, train_input_vocab, token_transform, BOS_IDX, EOS_IDX)
    for sentence in df_train['input_data'].values
]
train_output_sentences = [
    sentence_processing(sentence, train_output_vocab, token_transform, BOS_IDX, EOS_IDX)
    for sentence in df_train['output_data'].values
]

In [9]:
# Encode the test sentences with the vocabularies built from the TRAINING split.
# BOS_IDX / EOS_IDX are the same values that special_symbols.index('<bos>') /
# special_symbols.index('<eos>') return; using the constants avoids repeating
# the list lookup for every single sentence.
test_input_sentences = [
    sentence_processing(sentence, train_input_vocab, token_transform, BOS_IDX, EOS_IDX)
    for sentence in df_test['input_data'].values
]
test_output_sentences = [
    sentence_processing(sentence, train_output_vocab, token_transform, BOS_IDX, EOS_IDX)
    for sentence in df_test['output_data'].values
]

In [10]:
# Pad every encoded sentence of a split to that split's longest sentence,
# filling with PAD_IDX.
#
# Earlier variant, kept for reference: batch_first=True lays the result out as
# (num_sentences, max_len).
# train_input_sentences = pad_sequence(train_input_sentences,batch_first=True,padding_value=PAD_IDX)
# train_output_sentences = pad_sequence(train_output_sentences,batch_first=True,padding_value=PAD_IDX)
# test_input_sentences = pad_sequence(test_input_sentences,batch_first=True,padding_value=PAD_IDX)
# test_output_sentences = pad_sequence(test_output_sentences,batch_first=True,padding_value=PAD_IDX)

# Active variant: batch_first=False lays the result out as (max_len, num_sentences).
# NOTE(review): ProcessedSentences is not visible here. If it indexes these
# tensors along dim 0, one dataset item is a single token POSITION across all
# sentences, and a DataLoader batch becomes (batch_size, num_sentences). The
# training code would then treat the whole corpus as one batch, which would be
# consistent with the CUDA out-of-memory error recorded at the end of this
# notebook. TODO confirm the dataset's indexing before trusting this layout.
train_input_sentences = pad_sequence(train_input_sentences,batch_first=False,padding_value=PAD_IDX)
train_output_sentences = pad_sequence(train_output_sentences,batch_first=False,padding_value=PAD_IDX)
test_input_sentences = pad_sequence(test_input_sentences,batch_first=False,padding_value=PAD_IDX)
test_output_sentences = pad_sequence(test_output_sentences,batch_first=False,padding_value=PAD_IDX)

In [11]:
# Wrap the padded training tensors in the project's dataset class.
train_dataset = ProcessedSentences(input_data=train_input_sentences, output_data=train_output_sentences)

In [12]:
# Wrap the padded test tensors in the project's dataset class.
test_dataset = ProcessedSentences(input_data=test_input_sentences, output_data=test_output_sentences)

In [13]:
# Superseded: the DataLoaders are now created inside train_epoch() and evaluate() below.
# train_dataloader = DataLoader(train_dataset,batch_size=32,shuffle=True)
# test_dataloader = DataLoader(test_dataset,batch_size=32,shuffle=True)

In [14]:
# Seed PyTorch's global random number generator.
torch.manual_seed(0)

# Vocabulary sizes come from the vocabularies built on the training split.
input_vocab_size, output_vocab_size = len(train_input_vocab), len(train_output_vocab)

# Model dimensions handed to Seq2SeqTransformer.
emb_size, n_head, ffn_hid_dim = 512, 8, 512
num_encoder_layers = num_decoder_layers = 3

# Mini-batch size used by the DataLoaders in train_epoch() / evaluate().
batch_size = 32

In [15]:
# Instantiate the project's sequence-to-sequence transformer. The arguments are
# positional, so their order must match Seq2SeqTransformer's signature
# (defined in tomislav_transformer, not shown in this notebook).
transformer = Seq2SeqTransformer(
    num_encoder_layers, num_decoder_layers,
    emb_size, n_head,
    input_vocab_size, output_vocab_size,
    ffn_hid_dim,
)

In [16]:
# Move the model to the selected device (GPU if available, else CPU).
transformer = transformer.to(device)

In [17]:
# Cross-entropy loss that ignores target positions equal to PAD_IDX.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Adam over all model parameters.
optimizer = torch.optim.Adam(
    transformer.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

In [18]:
def train_epoch(model, optimizer):
    """Run one optimisation pass over ``train_dataset``.

    Relies on the module-level ``train_dataset``, ``batch_size``, ``device``,
    ``loss_fn`` and ``create_mask``.

    Args:
        model: the network to train; called with the source batch, the decoder
            input, the four masks from ``create_mask`` and, as the final
            argument, the source padding mask again.
        optimizer: optimizer stepped once per batch.

    Returns:
        The mean of the per-batch losses over the epoch.

    NOTE(review): the slicing below treats dim 0 of each batch as the time
    axis, i.e. it assumes batches arrive as (seq_len, batch). A DataLoader
    stacks dataset items along dim 0, so confirm that ProcessedSentences really
    yields that layout.
    """
    model.train()
    loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    running_loss = 0

    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)

        # Teacher forcing: the decoder is fed everything but the last step and
        # is scored against everything but the first step.
        tgt_in, tgt_expected = tgt[:-1, :], tgt[1:, :]

        src_mask, tgt_mask, src_pad_mask, tgt_pad_mask = create_mask(src, tgt_in)
        logits = model(src, tgt_in, src_mask, tgt_mask, src_pad_mask, tgt_pad_mask, src_pad_mask)

        optimizer.zero_grad()
        # Flatten to (positions, vocab) vs (positions,) for the loss.
        batch_loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_expected.reshape(-1))
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)
def evaluate(model):
    """Compute the mean per-batch loss of ``model`` on ``test_dataset``.

    Relies on the module-level ``test_dataset``, ``batch_size``, ``device``,
    ``loss_fn`` and ``create_mask``. No gradients are computed and no
    parameters are updated.

    Args:
        model: the network to evaluate; called exactly as in ``train_epoch``.

    Returns:
        The mean of the per-batch losses over the test set.

    NOTE(review): like ``train_epoch``, this assumes batches arrive as
    (seq_len, batch) — confirm against what ProcessedSentences yields.
    """
    model.eval()
    losses = 0
    # A fixed order keeps the batch composition — and therefore the averaged
    # loss — identical between calls, so epochs are comparable.
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Validation never calls backward(), so skip building autograd graphs.
    with torch.no_grad():
        for input_sent, output_sent in test_dataloader:
            input_sent = input_sent.to(device)
            output_sent = output_sent.to(device)

            # Drop the last time step (dim 0), matching train_epoch. The
            # previous `[:, :-1]` sliced the other dimension, so the logits no
            # longer lined up with the `[1:, :]` targets below.
            output_input = output_sent[:-1, :]
            input_mask, output_mask, input_padding_mask, output_padding_mask = create_mask(input_sent, output_input)
            logits = model(
                input_sent,
                output_input,
                input_mask,
                output_mask,
                input_padding_mask,
                output_padding_mask,
                input_padding_mask)

            # Targets are the sequence shifted by one step.
            output_out = output_sent[1:, :]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), output_out.reshape(-1))

            losses += loss.item()
    return losses / len(test_dataloader)
    
    

In [19]:
# NOTE(review): tqdm is imported here but not used in this cell; imports
# normally belong in the first cell of the notebook.
import tqdm
num_epochs = 10

# Train for a fixed number of epochs; after each one, evaluate on the test set
# and print both mean losses.
for epoch in range(1,num_epochs+1):
    train_loss = train_epoch(transformer,optimizer)
    val_loss = evaluate(transformer)
    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}")

RuntimeError: CUDA out of memory. Tried to allocate 12.97 GiB (GPU 0; 3.94 GiB total capacity; 360.82 MiB already allocated; 2.28 GiB free; 372.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF