In [1]:
import math, copy, sys
import torch
import argparse
from linformer_pytorch import LinformerEncDec

from scripts.MoveData import *
# from scripts.Transformer import *
from scripts.Linformer import *
from scripts.TalkTrain import *

In [2]:
# Train on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


singularity exec --nv --overlay /scratch/kp2684/chatbot_transformer.ext3:ro /scratch/work/public/singularity/cuda11.2.2-cudnn8-devel-ubuntu20.04.sif /bin/bash -c "source /ext3/env.sh; python main.py --epoch 200 --batch 32 --train data2_train_9010 --test data2_test_9010 --weight rtx8000_data2_9010_00004_transformer_false_warmup --modeler transformer --lr 0.0004 --scheduler warmup --shuffle False --verbose False"


In [3]:
# Dataset stems; each one is read from saved/data/<name>.json by the loading cell.
train, test = "data2_train_9010", "data2_test_9010"
# Smaller alternatives kept for quick experiments:
# train, test = "smalldata_train_9010", "smalldata_test_9010"
# train, test = "smalldata2_train_9010", "smalldata2_test_9010"

# Checkpoint name; weights are written to saved/weights/<weight>.
weight = "testrun2"

# Run settings.
batch, epoch = 128, 200
shuffle, verbose = False, True
modeler = "linformer"
lr = 0.0003

In [4]:
# Bundle the run settings into the project's Options object.
opt = Options(
    batchsize=batch,
    device=torch.device(device),
    epochs=epoch,
    lr=lr,
    max_len=128,
    save_path=f'saved/weights/{weight}',
)

In [None]:
print('==> Load Dataset..')
# The training iterator is always shuffled; the test iterator follows the `shuffle` setting.
train_data_iter, train_infield, train_outfield, train_opt = json2datatools(path = f'saved/data/{train}.json', opt=opt, train=True, shuffle=True)
print('train input vocab size', len(train_infield.vocab), 'train reply vocab size', len(train_outfield.vocab))
test_data_iter, test_infield, test_outfield, test_opt = json2datatools(path = f'saved/data/{test}.json', opt=opt, train=False, shuffle=shuffle)
print('test input vocab size', len(test_infield.vocab), 'test reply vocab size', len(test_outfield.vocab))
print("==> Number of train steps per epoch",num_batches(train_data_iter))
# Fixed: this line previously reported the train iterator's batch count a second time.
print("==> Number of test steps per epoch",num_batches(test_data_iter))

==> Load Dataset..
train input vocab size 31507 train reply vocab size 30876


def t_trainer(model, train_data_iterator, train_options, test_data_iterator, test_options, optimizer, scheduler, scheduler_name):
    """Train ``model`` for ``train_options.epochs`` epochs, evaluating after each one.

    Args:
        model: network called as ``model(src, src_mask, trg_input, trg_mask)``.
        train_data_iterator: iterable of batches exposing ``.listen`` and ``.reply``.
        train_options: options object; ``device``, ``epochs``, ``trg_pad`` and
            ``save_path`` are read here, and it is forwarded to ``create_masks``.
        test_data_iterator: iterable of evaluation batches with the same fields.
        test_options: options object used for the evaluation masks and padding index.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler driven according to ``scheduler_name``.
        scheduler_name: ``"cosine"`` and ``"warmup"`` step the scheduler per batch;
            ``"plateau"`` steps it once per epoch on the training loss. Any other
            value leaves the scheduler untouched.

    Returns:
        ``(train_tracker, test_tracker)``: per-epoch mean training and test losses.

    Side effects:
        Saves ``model.state_dict()`` to ``train_options.save_path`` whenever the
        epoch's *training* loss improves on the best seen so far, and prints one
        progress line per epoch.
    """
    if torch.cuda.is_available() and train_options.device == torch.device("cuda"):
        print("==> a GPU was detected, model will be trained on GPU")
        model = model.cuda()
    else:
        print("==> training on cpu")

    model.train()
    start = time.time()
    best_loss = 100

    train_tracker, test_tracker = [], []
    iters = len(train_data_iterator)
    for epoch in range(train_options.epochs):
        train_total_loss = 0
        for i, batch in enumerate(train_data_iterator):
            src = batch.listen.transpose(0,1)
            trg = batch.reply.transpose(0,1)
            # Teacher forcing: feed the reply without its last token and
            # predict the reply shifted left by one position.
            trg_input = trg[:, :-1]
            src_mask, trg_mask = create_masks(src, trg_input, train_options)
            preds = model(src, src_mask, trg_input, trg_mask)
            ys = trg[:, 1:].contiguous().view(-1)
            optimizer.zero_grad()
            train_batch_loss = F.cross_entropy(preds.view(-1, preds.size(-1)),
                                         ys, ignore_index = train_options.trg_pad)
            train_batch_loss.backward()
            optimizer.step()
            train_total_loss += train_batch_loss.item()

            # Per-batch schedulers: cosine takes a fractional epoch, warmup a plain step.
            if scheduler_name == "cosine":
                scheduler.step(epoch + i / iters)
            if scheduler_name == "warmup":
                scheduler.step()
        if scheduler_name == "warmup":
            scheduler.print_lr(epoch+i)
        train_epoch_loss = train_total_loss/(num_batches(train_data_iterator)+1)
        train_tracker.append(train_epoch_loss)
        # The plateau scheduler is driven by the training loss, not the test loss.
        if scheduler_name == "plateau":
            scheduler.step(train_epoch_loss)

        model.eval()
        test_total_loss = 0
        with torch.no_grad():
            for i, batch in enumerate(test_data_iterator):
                src = batch.listen.transpose(0,1)
                trg = batch.reply.transpose(0,1)
                trg_input = trg[:, :-1]
                src_mask, trg_mask = create_masks(src, trg_input, test_options)
                preds = model(src, src_mask, trg_input, trg_mask)
                ys = trg[:, 1:].contiguous().view(-1)
                test_batch_loss = F.cross_entropy(preds.view(-1, preds.size(-1)),
                                             ys, ignore_index = test_options.trg_pad)
                test_total_loss += test_batch_loss.item()

            test_epoch_loss = test_total_loss/(num_batches(test_data_iterator)+1)
            test_tracker.append(test_epoch_loss)

        model.train()

        # Checkpoint on the best training loss seen so far.
        if train_epoch_loss < best_loss:
            best_loss = train_epoch_loss
            torch.save(model.state_dict(), train_options.save_path)
        print("%.3fm: train epoch *%d*, loss = *%.3f*" %((time.time() - start)//60, epoch+1, train_epoch_loss), end=", ")
        print("%.3fm: test epoch *%d*, loss = *%.3f*, best loss = *%.3f*" %((time.time() - start)//60, epoch+1, test_epoch_loss, best_loss) , flush=True)

    # Fixed: the return used to sit inside the epoch loop, which ended training
    # after the first epoch (and returned None when there were no epochs).
    return train_tracker, test_tracker

In [None]:
# Architecture hyperparameters passed to the model constructors.
emb_dim = 512
n_layers = 6
heads = 8
dropout = 0.1

In [None]:
# Re-declared run settings (same values as the configuration cell near the top).
epoch = 200
shuffle = False
verbose = True


# Baseline model built from the training vocab sizes and the shared hyperparameters.
# NOTE(review): `from scripts.Transformer import *` is commented out in the import
# cell, so `Transformer` must come from one of the other star imports — confirm.
t_model = Transformer(
    len(train_infield.vocab), 
    len(train_outfield.vocab), 
    emb_dim, 
    n_layers, 
    heads, 
    dropout)

In [None]:
# Project Linformer wrapper, built with the same positional arguments as the
# Transformer baseline: input vocab size, reply vocab size, then hyperparameters.
l_model = Linformer(
    len(train_infield.vocab), 
    len(train_outfield.vocab), 
    emb_dim, 
    n_layers, 
    heads, 
    dropout)

# Bare expression: displays the module repr as the cell output.
l_model

In [9]:
# NOTE(review): both imports below repeat the import cell at the top of the notebook.
import torch
from linformer_pytorch import LinformerEncDec

# Rebinds l_model to the library's encoder-decoder, replacing the project
# Linformer wrapper created in the previous cell.
l_model = LinformerEncDec(
    enc_num_tokens=len(train_infield.vocab), # Input vocabulary size
    enc_input_size=opt.max_len, # Dimension 1 of the (0, 1, 2)-dimensional input
    enc_channels=emb_dim, # Encoder channels dimension; should be divisible by the number of heads
    enc_dropout=dropout, # Dropout for attention
    enc_dim_k=128, # The second dimension of the P_bar matrix from the paper
    enc_dim_ff=2048, # Dimension in the feed forward network
    enc_dropout_ff=dropout, # Dropout for feed forward network
    enc_nhead=heads, # Number of attention heads
    enc_depth=n_layers, # How many times to run the model
    
    activation="gelu", # What activation to use. Currently, only gelu and relu supported, and only on ff network.
    dec_num_tokens=len(train_outfield.vocab), # Reply vocabulary size
    dec_input_size=opt.max_len, # Dimension 1 of the input
    dec_channels=emb_dim, # Decoder channels dimension; should be divisible by the number of heads
    dec_dropout=dropout, # Dropout for attention
    dec_dim_k=128, # The second dimension of the P_bar matrix from the paper
    dec_dim_ff=2048, # Dimension in the feed forward network
    dec_dropout_ff=dropout, # Dropout for feed forward network
    dec_nhead=heads, # Number of attention heads
    dec_depth=n_layers, # How many times to run the model
)
# l_model = Padder(l_model)
# x = torch.randint(1,len(train_infield.vocab),(4,batch))
# print(x.shape)
# y = torch.randint(1,len(train_outfield.vocab),(8,batch))
# output = l_model(x,y)# l_model.train()
# print(output.shape)
# enc_output = encoder(x, input_mask=x_mask)
# print(enc_output.shape) # (1, 512, 128)
# dec_output = decoder(y, embeddings=enc_output, input_mask=y_mask, embeddings_mask=x_mask)
# print(dec_output.shape) # (1, 512, 10000)

In [10]:
# Tally the Transformer's parameters by whether they receive gradients.
trainable_params = sum(p.numel() for p in t_model.parameters() if p.requires_grad)
other_params = sum(p.numel() for p in t_model.parameters() if not p.requires_grad)

print(f"==>Trainable Parameters: {trainable_params}")
print(f"==>Other Parameters: {other_params}")

==>Trainable Parameters: 91920028
==>Other Parameters: 0


l_model

In [15]:
# Tally the Linformer's parameters by whether they receive gradients.
trainable_params = sum(p.numel() for p in l_model.parameters() if p.requires_grad)
other_params = sum(p.numel() for p in l_model.parameters() if not p.requires_grad)

print(f"==>Trainable Parameters: {trainable_params}")
print(f"==>Other Parameters: {other_params}")

==>Trainable Parameters: 92511004
==>Other Parameters: 0


# Scratch check: run a separate LinformerLM encoder and decoder on random token ids.
# The import was commented out, which left LinformerLM undefined in this cell.
from linformer_pytorch import LinformerLM

# NOTE(review): `batch` (the batch-size setting) is used as the sequence length
# (`input_size`) throughout this cell — confirm that is intended.
encoder = LinformerLM(
    num_tokens=len(train_infield.vocab),
    input_size=batch,
    channels=emb_dim,
    dim_k=128,
    dim_ff=2048,
    nhead=heads,
    depth=n_layers,
    activation="relu",
    k_reduce_by_layer=1,
    return_emb=True,
    )
decoder = LinformerLM(
    num_tokens=len(train_outfield.vocab),
    input_size=batch,
    channels=emb_dim,
    dim_k=128,
    dim_ff=2048,
    nhead=heads,
    depth=n_layers,
    activation="relu",
    decoder_mode=True,
    )

# Draw ids below each model's own vocabulary size; the former fixed bound of
# 10000 would index past the embedding table on vocabularies smaller than that.
x = torch.randint(1,len(train_infield.vocab),(1,batch))
y = torch.randint(1,len(train_outfield.vocab),(1,batch))
print(x)
print(x.shape)
print(y.shape)
# All-true masks: no position is treated as padding.
x_mask = torch.ones_like(x).bool()
y_mask = torch.ones_like(y).bool()
print(x_mask.shape)
print(y_mask.shape)
enc_output = encoder(x, input_mask=x_mask)
print(enc_output.shape) # presumably (1, batch, emb_dim) — TODO confirm
dec_output = decoder(y, embeddings=enc_output, input_mask=y_mask, embeddings_mask=x_mask)
print(dec_output.shape) # presumably (1, batch, reply vocab size) — TODO confirm

In [None]:
# Learning rate handed to the Adam optimizer in the next cell; this re-assigns
# the value already set in the configuration cell.
# lr = 0
lr = 0.0003

In [None]:
# Adam over the Linformer parameters only.
# NOTE(review): the t_optimizer line is commented out, yet later cells pass
# t_optimizer to the cosine/warmup schedulers and to transformer_trainer;
# re-enable it before running those cells on a fresh kernel.
# t_optimizer = torch.optim.Adam(t_model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)
l_optimizer = torch.optim.Adam(l_model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)

In [None]:
# Option 1: multiply the LR by 0.9 after 3 epochs without improvement in the
# monitored loss. scheduler_name selects how the training loops step the scheduler.
scheduler_name = "plateau"
# t_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(t_optimizer, 'min', factor=0.9, patience=3, verbose=verbose)
l_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(l_optimizer, 'min', factor=0.9, patience=3, verbose=verbose)

In [114]:
# Option 2: cosine annealing with warm restarts; overrides scheduler_name and
# l_scheduler from the plateau cell.
# NOTE(review): the training loops call scheduler.step(epoch + i / iters), i.e. in
# units of epochs, while T_0 here is the number of batches per epoch — so a restart
# would presumably happen only every num_batches epochs. Confirm this is intended.
# NOTE(review): verbose is hard-coded to True here instead of using the `verbose` setting.
scheduler_name = "cosine"
t_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(t_optimizer, num_batches(train_data_iter), T_mult=1, eta_min=0, last_epoch=-1, verbose=True)
l_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(l_optimizer, num_batches(train_data_iter), T_mult=1, eta_min=0, last_epoch=-1, verbose=True)
# t_scheduler = CosineWithRestarts(t_optimizer, T_max=num_batches(train_data_iter), verbose=verbose)
# l_scheduler = CosineWithRestarts(l_optimizer, T_max=num_batches(train_data_iter), verbose=verbose)

Epoch     0: adjusting learning rate of group 0 to 3.0000e-04.
Epoch     0: adjusting learning rate of group 0 to 3.0000e-04.


In [63]:
# Option 3: the project's AdamWarmup schedule with 4000 warmup steps; overrides
# scheduler_name and the schedulers from the earlier option cells.
# print_lr() reports the learning rate right after construction.
scheduler_name = "warmup"
t_scheduler = AdamWarmup(model_size = emb_dim, warmup_steps = 4000, optimizer = t_optimizer, verbose=verbose)
t_scheduler.print_lr()
l_scheduler = AdamWarmup(model_size = emb_dim, warmup_steps = 4000, optimizer = l_optimizer, verbose=verbose)
l_scheduler.print_lr()

Adjusting learning rate to 0.
Adjusting learning rate to 0.


In [42]:
#plateau 0.01
transformer_trainer(t_model, train_data_iter, train_opt, test_data_iter, test_opt, t_optimizer, t_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
0.000m: train epoch *1*, loss = *4.017*, 0.000m: test epoch *1*, loss = *4.104*, best loss = *4.017*
1.000m: train epoch *2*, loss = *3.492*, 1.000m: test epoch *2*, loss = *3.939*, best loss = *3.492*
1.000m: train epoch *3*, loss = *3.379*, 1.000m: test epoch *3*, loss = *4.027*, best loss = *3.379*
2.000m: train epoch *4*, loss = *3.310*, 2.000m: test epoch *4*, loss = *4.200*, best loss = *3.310*
2.000m: train epoch *5*, loss = *3.281*, 2.000m: test epoch *5*, loss = *4.054*, best loss = *3.281*
3.000m: train epoch *6*, loss = *3.243*, 3.000m: test epoch *6*, loss = *4.065*, best loss = *3.243*
4.000m: train epoch *7*, loss = *3.237*, 4.000m: test epoch *7*, loss = *4.131*, best loss = *3.237*
4.000m: train epoch *8*, loss = *3.247*, 4.000m: test epoch *8*, loss = *4.140*, best loss = *3.237*
5.000m: train epoch *9*, loss = *3.220*, 5.000m: test epoch *9*, loss = *4.090*, best loss = *3.220*
5.000m: train epoch *10*, loss = *3.22

KeyboardInterrupt: 

In [56]:
#plateau 0.001
transformer_trainer(t_model, train_data_iter, train_opt, test_data_iter, test_opt, t_optimizer, t_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
0.000m: train epoch *1*, loss = *3.511*, 0.000m: test epoch *1*, loss = *4.106*, best loss = *3.511*
0.000m: train epoch *2*, loss = *2.911*, 0.000m: test epoch *2*, loss = *4.157*, best loss = *2.911*
0.000m: train epoch *3*, loss = *2.638*, 0.000m: test epoch *3*, loss = *4.352*, best loss = *2.638*
1.000m: train epoch *4*, loss = *2.404*, 1.000m: test epoch *4*, loss = *4.457*, best loss = *2.404*
1.000m: train epoch *5*, loss = *2.179*, 1.000m: test epoch *5*, loss = *4.656*, best loss = *2.179*
1.000m: train epoch *6*, loss = *1.963*, 1.000m: test epoch *6*, loss = *4.859*, best loss = *1.963*
2.000m: train epoch *7*, loss = *1.770*, 2.000m: test epoch *7*, loss = *5.036*, best loss = *1.770*
2.000m: train epoch *8*, loss = *1.584*, 2.000m: test epoch *8*, loss = *5.204*, best loss = *1.584*
2.000m: train epoch *9*, loss = *1.409*, 2.000m: test epoch *9*, loss = *5.453*, best loss = *1.409*
2.000m: train epoch *10*, loss = *1.26

In [60]:
#cosine
transformer_trainer(t_model, train_data_iter, train_opt, test_data_iter, test_opt, t_optimizer, t_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
0.000m: train epoch *1*, loss = *0.255*, 0.000m: test epoch *1*, loss = *8.522*, best loss = *0.255*
0.000m: train epoch *2*, loss = *0.226*, 0.000m: test epoch *2*, loss = *8.750*, best loss = *0.226*
0.000m: train epoch *3*, loss = *0.225*, 0.000m: test epoch *3*, loss = *8.898*, best loss = *0.225*
1.000m: train epoch *4*, loss = *0.219*, 1.000m: test epoch *4*, loss = *9.049*, best loss = *0.219*
1.000m: train epoch *5*, loss = *0.225*, 1.000m: test epoch *5*, loss = *9.135*, best loss = *0.219*
1.000m: train epoch *6*, loss = *0.225*, 1.000m: test epoch *6*, loss = *9.247*, best loss = *0.219*
2.000m: train epoch *7*, loss = *0.225*, 2.000m: test epoch *7*, loss = *9.259*, best loss = *0.219*
2.000m: train epoch *8*, loss = *0.225*, 2.000m: test epoch *8*, loss = *9.234*, best loss = *0.219*
2.000m: train epoch *9*, loss = *0.224*, 2.000m: test epoch *9*, loss = *9.505*, best loss = *0.219*
2.000m: train epoch *10*, loss = *0.22

In [65]:
#warmup 0.001
transformer_trainer(t_model, train_data_iter, train_opt, test_data_iter, test_opt, t_optimizer, t_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
Epoch 00247: adjusting learning rate to 5.520292819452606e-05.
0.000m: train epoch *1*, loss = *0.113*, 0.000m: test epoch *1*, loss = *9.549*, best loss = *0.113*
Epoch 00248: adjusting learning rate to 9.85267452585845e-05.
0.000m: train epoch *2*, loss = *0.122*, 0.000m: test epoch *2*, loss = *9.400*, best loss = *0.113*
Epoch 00249: adjusting learning rate to 0.00014185056232264294.
1.000m: train epoch *3*, loss = *0.131*, 1.000m: test epoch *3*, loss = *9.366*, best loss = *0.113*
Epoch 00250: adjusting learning rate to 0.00018517437938670136.
1.000m: train epoch *4*, loss = *0.137*, 1.000m: test epoch *4*, loss = *9.278*, best loss = *0.113*
Epoch 00251: adjusting learning rate to 0.0002284981964507598.
1.000m: train epoch *5*, loss = *0.145*, 1.000m: test epoch *5*, loss = *9.284*, best loss = *0.113*
Epoch 00252: adjusting learning rate to 0.00027182201351481824.
2.000m: train epoch *6*, loss = *0.149*, 2.000m: test epoch *6

#test
transformer_trainer(t_model, train_data_iter, train_opt, test_data_iter, test_opt, t_optimizer, t_scheduler, scheduler_name)

In [19]:
#plateau - 1 layer
linformer_trainer(l_model, train_data_iter, train_opt, test_data_iter, test_opt, l_optimizer, l_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
0.000m: train epoch *1*, loss = *2.784*, 0.000m: test epoch *1*, loss = *4.599*, best loss = *2.784*
0.000m: train epoch *2*, loss = *2.786*, 0.000m: test epoch *2*, loss = *4.585*, best loss = *2.784*
0.000m: train epoch *3*, loss = *2.770*, 0.000m: test epoch *3*, loss = *4.618*, best loss = *2.770*
0.000m: train epoch *4*, loss = *2.798*, 0.000m: test epoch *4*, loss = *4.620*, best loss = *2.770*
1.000m: train epoch *5*, loss = *2.779*, 1.000m: test epoch *5*, loss = *4.604*, best loss = *2.770*
1.000m: train epoch *6*, loss = *2.779*, 1.000m: test epoch *6*, loss = *4.571*, best loss = *2.770*
Epoch     7: reducing learning rate of group 0 to 3.0902e-04.
1.000m: train epoch *7*, loss = *2.780*, 1.000m: test epoch *7*, loss = *4.596*, best loss = *2.770*
1.000m: train epoch *8*, loss = *2.769*, 1.000m: test epoch *8*, loss = *4.633*, best loss = *2.769*
1.000m: train epoch *9*, loss = *2.777*, 1.000m: test epoch *9*, loss = *4.61

In [13]:
#plateau - 6 layers -0.001
linformer_trainer(l_model, train_data_iter, train_opt, test_data_iter, test_opt, l_optimizer, l_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
1.000m: train epoch *1*, loss = *3.648*, 1.000m: test epoch *1*, loss = *3.421*, best loss = *3.648*
2.000m: train epoch *2*, loss = *4.024*, 2.000m: test epoch *2*, loss = *3.824*, best loss = *3.648*
3.000m: train epoch *3*, loss = *4.051*, 3.000m: test epoch *3*, loss = *3.827*, best loss = *3.648*
4.000m: train epoch *4*, loss = *4.040*, 4.000m: test epoch *4*, loss = *3.811*, best loss = *3.648*
Epoch     5: reducing learning rate of group 0 to 9.0000e-04.
5.000m: train epoch *5*, loss = *4.032*, 5.000m: test epoch *5*, loss = *3.816*, best loss = *3.648*
6.000m: train epoch *6*, loss = *4.024*, 6.000m: test epoch *6*, loss = *3.810*, best loss = *3.648*
7.000m: train epoch *7*, loss = *4.017*, 7.000m: test epoch *7*, loss = *3.824*, best loss = *3.648*
8.000m: train epoch *8*, loss = *4.015*, 8.000m: test epoch *8*, loss = *3.846*, best loss = *3.648*
Epoch     9: reducing learning rate of group 0 to 8.1000e-04.
9.000m: train e

In [21]:
#cosine 0.001
linformer_trainer(l_model, train_data_iter, train_opt, test_data_iter, test_opt, l_optimizer, l_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
0.000m: train epoch *1*, loss = *2.780*, 0.000m: test epoch *1*, loss = *4.643*, best loss = *2.780*
0.000m: train epoch *2*, loss = *2.770*, 0.000m: test epoch *2*, loss = *4.711*, best loss = *2.770*
0.000m: train epoch *3*, loss = *2.776*, 0.000m: test epoch *3*, loss = *4.743*, best loss = *2.770*
0.000m: train epoch *4*, loss = *2.799*, 0.000m: test epoch *4*, loss = *4.686*, best loss = *2.770*
1.000m: train epoch *5*, loss = *2.786*, 1.000m: test epoch *5*, loss = *4.719*, best loss = *2.770*
1.000m: train epoch *6*, loss = *2.775*, 1.000m: test epoch *6*, loss = *4.717*, best loss = *2.770*
1.000m: train epoch *7*, loss = *2.771*, 1.000m: test epoch *7*, loss = *4.689*, best loss = *2.770*
1.000m: train epoch *8*, loss = *2.793*, 1.000m: test epoch *8*, loss = *4.678*, best loss = *2.770*
1.000m: train epoch *9*, loss = *2.787*, 1.000m: test epoch *9*, loss = *4.691*, best loss = *2.770*
2.000m: train epoch *10*, loss = *2.78

KeyboardInterrupt: 

In [None]:
#plateau - 6 layers -0.0003 smalldata
linformer_trainer(l_model, train_data_iter, train_opt, test_data_iter, test_opt, l_optimizer, l_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
28.000m: train epoch *1*, loss = *0.359*, 28.000m: test epoch *1*, loss = *0.162*, best loss = *0.359*
57.000m: train epoch *2*, loss = *0.276*, 57.000m: test epoch *2*, loss = *0.183*, best loss = *0.276*
85.000m: train epoch *3*, loss = *0.275*, 85.000m: test epoch *3*, loss = *0.172*, best loss = *0.275*
114.000m: train epoch *4*, loss = *0.272*, 114.000m: test epoch *4*, loss = *0.165*, best loss = *0.272*
143.000m: train epoch *5*, loss = *0.273*, 143.000m: test epoch *5*, loss = *0.166*, best loss = *0.272*
172.000m: train epoch *6*, loss = *0.277*, 172.000m: test epoch *6*, loss = *0.174*, best loss = *0.272*


In [None]:
#plateau - 2 layers -0.0003 data
linformer_trainer(l_model, train_data_iter, train_opt, test_data_iter, test_opt, l_optimizer, l_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
618.403m: train epoch *1*, loss = *0.376*, 618.404m: test epoch *1*, loss = *0.165*, best loss = *0.376*
1236.832m: train epoch *2*, loss = *0.277*, 1236.833m: test epoch *2*, loss = *0.181*, best loss = *0.277*
1854.888m: train epoch *3*, loss = *0.275*, 1854.889m: test epoch *3*, loss = *0.173*, best loss = *0.275*
2472.388m: train epoch *4*, loss = *0.275*, 2472.388m: test epoch *4*, loss = *0.168*, best loss = *0.275*
3090.095m: train epoch *5*, loss = *0.277*, 3090.095m: test epoch *5*, loss = *0.172*, best loss = *0.275*
3708.625m: train epoch *6*, loss = *0.280*, 3708.626m: test epoch *6*, loss = *0.170*, best loss = *0.275*
Epoch     7: reducing learning rate of group 0 to 2.7000e-04.
4325.161m: train epoch *7*, loss = *0.281*, 4325.162m: test epoch *7*, loss = *0.167*, best loss = *0.275*
4942.032m: train epoch *8*, loss = *0.284*, 4942.032m: test epoch *8*, loss = *0.172*, best loss = *0.275*
5559.489m: train epoch *9*, los

In [None]:
#plateau - 6 layers -0.0003 data
linformer_trainer(l_model, train_data_iter, train_opt, test_data_iter, test_opt, l_optimizer, l_scheduler, scheduler_name)

In [None]:
#plateau - 6 layers -0.0003 data
linformer_trainer(l_model, train_data_iter, train_opt, test_data_iter, test_opt, l_optimizer, l_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
29.610m: train epoch *1*, loss = *0.355*, 29.610m: test epoch *1*, loss = *0.164*, best loss = *0.355*
58.754m: train epoch *2*, loss = *0.276*, 58.754m: test epoch *2*, loss = *0.179*, best loss = *0.276*
88.426m: train epoch *3*, loss = *0.274*, 88.426m: test epoch *3*, loss = *0.169*, best loss = *0.274*
117.689m: train epoch *4*, loss = *0.271*, 117.689m: test epoch *4*, loss = *0.169*, best loss = *0.271*
147.713m: train epoch *5*, loss = *0.274*, 147.713m: test epoch *5*, loss = *0.170*, best loss = *0.271*
176.758m: train epoch *6*, loss = *0.277*, 176.758m: test epoch *6*, loss = *0.169*, best loss = *0.271*
206.233m: train epoch *7*, loss = *0.280*, 206.233m: test epoch *7*, loss = *0.171*, best loss = *0.271*
Epoch     8: reducing learning rate of group 0 to 2.7000e-04.
236.419m: train epoch *8*, loss = *0.286*, 236.419m: test epoch *8*, loss = *0.178*, best loss = *0.271*


In [38]:

transformer_trainer(t_model, train_data_iter, train_opt, test_data_iter, test_opt, t_optimizer, t_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
0.079236secs: train iter *1*, train size 1 *22*, train size 2 *113*
0.026605secs: train iter *2*, train size 1 *10*, train size 2 *13*
0.025490secs: train iter *3*, train size 1 *10*, train size 2 *10*
0.062034secs: train iter *4*, train size 1 *19*, train size 2 *91*
0.026649secs: train iter *5*, train size 1 *11*, train size 2 *16*
0.039931secs: train iter *6*, train size 1 *32*, train size 2 *38*
0.025368secs: train iter *7*, train size 1 *5*, train size 2 *6*
0.074836secs: train iter *8*, train size 1 *7*, train size 2 *121*
0.054698secs: train iter *9*, train size 1 *56*, train size 2 *62*
0.025429secs: train iter *10*, train size 1 *9*, train size 2 *7*
0.025245secs: train iter *11*, train size 1 *8*, train size 2 *7*
0.071464secs: train iter *12*, train size 1 *14*, train size 2 *112*
0.025408secs: train iter *13*, train size 1 *9*, train size 2 *12*
0.054296secs: train iter *14*, train size 1 *18*, train size 2 *77*
0.043122s

KeyboardInterrupt: 

In [15]:
#plateau - 6 layers -0.0003 data
linformer_trainer(l_model, train_data_iter, train_opt, test_data_iter, test_opt, l_optimizer, l_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
0.307968secs: train iter *1*, train size 1 *19*, train size 2 *18*
0.272683secs: train iter *2*, train size 1 *18*, train size 2 *19*
0.248318secs: train iter *3*, train size 1 *6*, train size 2 *22*
0.236382secs: train iter *4*, train size 1 *8*, train size 2 *20*
0.237868secs: train iter *5*, train size 1 *6*, train size 2 *7*
0.242065secs: train iter *6*, train size 1 *35*, train size 2 *34*
0.236939secs: train iter *7*, train size 1 *10*, train size 2 *10*
0.256302secs: train iter *8*, train size 1 *24*, train size 2 *43*
0.238596secs: train iter *9*, train size 1 *9*, train size 2 *8*
0.238395secs: train iter *10*, train size 1 *12*, train size 2 *13*
0.302706secs: train iter *11*, train size 1 *20*, train size 2 *64*
0.271135secs: train iter *12*, train size 1 *26*, train size 2 *63*
0.238793secs: train iter *13*, train size 1 *7*, train size 2 *6*
0.240233secs: train iter *14*, train size 1 *4*, train size 2 *14*
0.237905secs:

KeyboardInterrupt: 

In [18]:

transformer_trainer(t_model, train_data_iter, train_opt, test_data_iter, test_opt, t_optimizer, t_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
0.109582secs: train iter *1*, train size 1 *19*, train size 2 *74*
0.068334secs: train iter *2*, train size 1 *6*, train size 2 *17*
0.068595secs: train iter *3*, train size 1 *6*, train size 2 *10*
0.068183secs: train iter *4*, train size 1 *11*, train size 2 *9*
0.104889secs: train iter *5*, train size 1 *4*, train size 2 *71*
0.143829secs: train iter *6*, train size 1 *11*, train size 2 *119*
0.067831secs: train iter *7*, train size 1 *4*, train size 2 *15*
0.068091secs: train iter *8*, train size 1 *9*, train size 2 *15*
0.068008secs: train iter *9*, train size 1 *24*, train size 2 *25*
0.068451secs: train iter *10*, train size 1 *9*, train size 2 *24*
0.109009secs: train iter *11*, train size 1 *12*, train size 2 *76*
0.088395secs: train iter *12*, train size 1 *42*, train size 2 *37*
0.068309secs: train iter *13*, train size 1 *10*, train size 2 *23*
0.123246secs: train iter *14*, train size 1 *18*, train size 2 *92*
0.067446se

KeyboardInterrupt: 

In [12]:
def trainer(model, train_data_iterator, train_options, test_data_iterator, test_options, optimizer, scheduler, scheduler_name):
    """Debug variant of the training loop: dump tensors for one batch, then abort.

    Only the first training batch of an epoch is processed; a second batch raises
    ``KeyboardInterrupt`` on purpose. Predictions, shapes and the batch loss are
    printed along the way. If the epoch completes, the test set is evaluated and
    ``model.state_dict()`` is saved to ``train_options.save_path`` whenever the
    test loss improves. ``scheduler_name`` selects how ``scheduler`` is stepped
    ("cosine"/"warmup" per batch, "plateau" per epoch on the training loss).
    Returns nothing.
    """
    use_gpu = torch.cuda.is_available() and train_options.device == torch.device("cuda")
    if not use_gpu:
        print("training on cpu")
    else:
        print("a GPU was detected, model will be trained on GPU")
        model = model.cuda()

    model.train()
    started_at = time.time()
    lowest_test_loss = 100

    steps_per_epoch = len(train_data_iterator)
    for epoch in range(train_options.epochs):
        running_train_loss = 0
        for step, sample in enumerate(train_data_iterator):
            # Debug guard: stop as soon as a second batch shows up.
            if step > 0:
                raise KeyboardInterrupt

            listen = sample.listen.transpose(0, 1)
            reply = sample.reply.transpose(0, 1)
            # Decoder input drops the last token; targets drop the first.
            decoder_input = reply[:, :-1]
            listen_mask, reply_mask = create_masks(listen, decoder_input, train_options)

            logits = model(listen, listen_mask, decoder_input, reply_mask)
            print(logits)
            print("pred :", logits)

            targets = reply[:, 1:].contiguous().view(-1)
            print("ys shape", targets.shape)
            print("preds shape1", logits.shape)
            flat_logits = logits.view(-1, logits.size(-1))
            print("preds shape2", flat_logits.shape)

            optimizer.zero_grad()
            step_loss = F.cross_entropy(flat_logits, targets,
                                        ignore_index=train_options.trg_pad)
            step_loss.backward()
            optimizer.step()
            running_train_loss += step_loss.item()

            if scheduler_name == "cosine":
                scheduler.step(epoch + step / steps_per_epoch)
            elif scheduler_name == "warmup":
                scheduler.step()
            print("batch loss", step_loss)

        train_epoch_loss = running_train_loss / (num_batches(train_data_iterator) + 1)

        # Evaluation pass without gradient tracking.
        model.eval()
        running_test_loss = 0
        with torch.no_grad():
            for sample in test_data_iterator:
                listen = sample.listen.transpose(0, 1)
                reply = sample.reply.transpose(0, 1)
                decoder_input = reply[:, :-1]
                listen_mask, reply_mask = create_masks(listen, decoder_input, test_options)
                logits = model(listen, listen_mask, decoder_input, reply_mask)
                targets = reply[:, 1:].contiguous().view(-1)
                eval_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets,
                                            ignore_index=test_options.trg_pad)
                running_test_loss += eval_loss.item()

            test_epoch_loss = running_test_loss / (num_batches(test_data_iterator) + 1)

        model.train()
        # The plateau scheduler follows the training loss.
        if scheduler_name == "plateau":
            scheduler.step(train_epoch_loss)

        # Checkpoint on the best test loss seen so far.
        if test_epoch_loss < lowest_test_loss:
            lowest_test_loss = test_epoch_loss
            torch.save(model.state_dict(), train_options.save_path)

        print("%.3fm: train epoch *%d*, loss = *%.3f*" %((time.time() - started_at)//60, epoch, train_epoch_loss), end=", ")
        print("%.3fm: test epoch *%d*, loss = *%.3f*, best loss = *%.3f*" %((time.time() - started_at)//60, epoch, test_epoch_loss, lowest_test_loss) , flush=True)

In [15]:
trainer(t_model, train_data_iter, train_opt, test_data_iter, test_opt, t_optimizer, t_scheduler, scheduler_name)

a GPU was detected, model will be trained on GPU
tensor([[[-0.1844, -0.5469, -0.0845,  ...,  1.2630, -0.0519,  0.7461],
         [-0.3253, -0.6089,  0.7874,  ..., -0.8720, -0.6310,  0.5250],
         [-0.2574, -0.3602,  0.4945,  ...,  0.7115,  0.6852, -0.5288],
         ...,
         [ 0.4672, -0.7800, -0.1505,  ...,  0.1785,  0.6682, -1.2653],
         [-0.0376, -1.2615,  0.3215,  ...,  0.3061,  1.0878, -1.2897],
         [ 0.1071, -1.1693,  0.0674,  ...,  0.6054,  0.9830, -1.5651]],

        [[-0.2895, -0.4937,  0.2230,  ...,  1.2742, -0.3471,  0.7206],
         [-0.1099, -0.4696,  0.2041,  ..., -0.5249, -0.6263,  0.4733],
         [-0.2040, -1.0922,  0.5240,  ...,  0.9412, -0.4860,  0.1810],
         ...,
         [ 0.0442, -1.0805,  0.2585,  ...,  0.6251,  0.6942, -1.3362],
         [ 0.1210, -1.5012,  0.4226,  ...,  0.5150,  1.1843, -1.6243],
         [ 0.4514, -1.1460,  0.3091,  ...,  0.4535,  1.0736, -1.5753]],

        [[-0.5722, -0.4208, -0.3433,  ...,  1.3380, -0.2040,  0.633

KeyboardInterrupt: 

In [26]:
create_masks

39

In [13]:
train_opt.src_pad

1

t_model.train()

In [19]:
def _pad_to_max_len(seq, max_len, pad_value):
    """Right-pad a 2-D tensor along dim 1 so it is exactly ``max_len`` wide.

    Returns the padded tensor (or ``seq`` itself when it is already ``max_len``
    wide), or ``None`` when ``seq`` is wider than ``max_len`` so the caller can
    skip the batch.
    """
    seq_len = seq.shape[1]
    if seq_len > max_len:
        return None
    if seq_len < max_len:
        seq = torch.nn.functional.pad(
            input=seq, pad=(0, max_len - seq_len, 0, 0), mode='constant', value=pad_value
        )
    return seq


def trainer2(model, train_data_iterator, train_options, test_data_iterator, test_options, optimizer, scheduler, scheduler_name):
    """Train ``model`` on fixed-length batches and evaluate it after every epoch.

    Every batch is transposed with ``transpose(0, 1)`` and right-padded to
    ``max_len`` columns before being fed to ``model(src, trg)``; batches whose
    source or target is longer than ``max_len`` are skipped. The state dict is
    written to ``train_options.save_path`` whenever the epoch *training* loss
    improves on the best value seen so far.

    Args:
        model: callable as ``model(src, trg)`` returning per-position logits
            whose last dimension is the class dimension.
        train_data_iterator: sized iterable of batches exposing ``.listen`` and
            ``.reply`` tensors (presumably ``(seq_len, batch)`` — TODO confirm).
        train_options: object providing ``device``, ``epochs``, ``max_len``,
            ``src_pad``, ``trg_pad`` and ``save_path``.
        test_data_iterator: iterable of evaluation batches, same layout.
        test_options: object providing ``max_len``, ``src_pad`` and ``trg_pad``.
        optimizer: optimizer stepped once per processed training batch.
        scheduler: learning-rate scheduler; how it is stepped depends on
            ``scheduler_name``.
        scheduler_name: ``"cosine"`` and ``"warmup"`` step per batch,
            ``"plateau"`` steps once per epoch on the training loss; any other
            value leaves the scheduler untouched.

    Returns:
        None. Progress is printed and the best weights are saved to disk.
    """
    if torch.cuda.is_available() and train_options.device == torch.device("cuda"):
        print("==> a GPU was detected, model will be trained on GPU")
        model = model.cuda()
    else:
        print("==> training on cpu")

    model.train()
    start = time.monotonic()

    # Start from +inf so the first finite epoch loss always produces a checkpoint.
    best_loss = float("inf")
    iters = len(train_data_iterator)
    for epoch in range(train_options.epochs):
        train_total_loss = 0.0
        train_steps = 0  # batches that actually contributed to the loss
        i = 0  # keeps `epoch + i` below well-defined even if the iterator is empty
        for i, batch in enumerate(train_data_iterator):
            print(f" Batch {i} of {len(train_data_iterator)}", end="")
            # Source is padded with the *source* pad index, target with the target one.
            src = _pad_to_max_len(batch.listen.transpose(0, 1), train_options.max_len, train_options.src_pad)
            trg = _pad_to_max_len(batch.reply.transpose(0, 1), train_options.max_len, train_options.trg_pad)
            if src is None or trg is None:
                # Over-long batch: skipped entirely (no optimizer or scheduler step).
                continue

            optimizer.zero_grad()
            # NOTE(review): the target is fed to the model and used as the label without a
            # one-token shift; confirm that the model applies the shift / causal masking itself.
            preds = model(src, trg)
            ys = trg.contiguous().view(-1)
            preds = preds.reshape(-1, preds.size(-1))

            train_batch_loss = F.cross_entropy(preds, ys, ignore_index=train_options.trg_pad)
            train_batch_loss.backward()
            optimizer.step()
            train_total_loss += train_batch_loss.item()
            train_steps += 1

            if scheduler_name == "cosine":
                scheduler.step(epoch + i / iters)
            if scheduler_name == "warmup":
                scheduler.step()

        if scheduler_name == "warmup":
            scheduler.print_lr(epoch + i)

        # Average over the batches that were really processed; NaN when there were none.
        train_epoch_loss = train_total_loss / train_steps if train_steps else float("nan")

        if scheduler_name == "plateau":
            scheduler.step(train_epoch_loss)

        model.eval()
        test_total_loss = 0.0
        test_steps = 0
        with torch.no_grad():
            for batch in test_data_iterator:
                # NOTE(review): only the evaluation loop drops the last target column before
                # padding; the training loop does not. Kept as-is — confirm which is intended.
                src = _pad_to_max_len(batch.listen.transpose(0, 1), test_options.max_len, test_options.src_pad)
                trg = _pad_to_max_len(batch.reply.transpose(0, 1)[:, :-1], test_options.max_len, test_options.trg_pad)
                if src is None or trg is None:
                    continue

                preds = model(src, trg)
                ys = trg.contiguous().view(-1)
                preds = preds.reshape(-1, preds.size(-1))
                test_batch_loss = F.cross_entropy(preds, ys, ignore_index=test_options.trg_pad)
                test_total_loss += test_batch_loss.item()
                test_steps += 1

        test_epoch_loss = test_total_loss / test_steps if test_steps else float("nan")

        model.train()

        # Checkpoint selection is driven by the training loss (a NaN loss never wins).
        if train_epoch_loss < best_loss:
            best_loss = train_epoch_loss
            torch.save(model.state_dict(), train_options.save_path)

        print("%.3fm: train epoch *%d*, loss = *%.3f*" %((time.monotonic() - start)/60, epoch+1, train_epoch_loss), end=", ")
        print("%.3fm: test epoch *%d*, loss = *%.3f*, best loss = *%.3f*" %((time.monotonic() - start)/60, epoch+1, test_epoch_loss, best_loss) , flush=True)

In [20]:
# Run label (author's note): plateau - 6 layers - 0.0003 data
# Trains `l_model` with `trainer2`.
# NOTE(review): the saved output shows per-batch progress followed by KeyboardInterrupt,
# so no completed epoch is recorded for this run.
trainer2(l_model, train_data_iter, train_opt, test_data_iter, test_opt, l_optimizer, l_scheduler, scheduler_name)

==> a GPU was detected, model will be trained on GPU
 Batch 0 of 1556 Batch 1 of 1556 Batch 2 of 1556 Batch 3 of 1556 Batch 4 of 1556 Batch 5 of 1556 Batch 6 of 1556 Batch 7 of 1556 Batch 8 of 1556 Batch 9 of 1556 Batch 10 of 1556 Batch 11 of 1556 Batch 12 of 1556 Batch 13 of 1556 Batch 14 of 1556 Batch 15 of 1556 Batch 16 of 1556 Batch 17 of 1556 Batch 18 of 1556 Batch 19 of 1556 Batch 20 of 1556 Batch 21 of 1556 Batch 22 of 1556 Batch 23 of 1556 Batch 24 of 1556 Batch 25 of 1556 Batch 26 of 1556 Batch 27 of 1556 Batch 28 of 1556 Batch 29 of 1556 Batch 30 of 1556 Batch 31 of 1556 Batch 32 of 1556 Batch 33 of 1556 Batch 34 of 1556 Batch 35 of 1556 Batch 36 of 1556 Batch 37 of 1556 Batch 38 of 1556 Batch 39 of 1556 Batch 40 of 1556 Batch 41 of 1556 Batch 42 of 1556 Batch 43 of 1556 Batch 44 of 1556 Batch 45 of 1556 Batch 46 of 1556 Batch 47 of 1556 Batch 48 of 1556 Batch 49 of 1556 Batch 50 of 1556 Batch 51 of 1556 Batch 52 of 1556 Batch 53 of 1556 Batch 54 of 1556 Batch 55 of 1556 Batc

KeyboardInterrupt: 

## plateau - 6 layers -0.0003 data
# Same arguments as the other l_model runs, but routed through `linformer_trainer`.
# NOTE(review): this cell has no execution count or output in the saved notebook; confirm
# `linformer_trainer` is still defined/imported before relying on it.
linformer_trainer(l_model, train_data_iter, train_opt, test_data_iter, test_opt, l_optimizer, l_scheduler, scheduler_name)

In [14]:
# Run label (author's note): plateau - 6 layers - 0.0003 data
# Trains `l_model` with `trainer`.
# NOTE(review): the saved output of this cell ends in a RuntimeError — tensor sizes 40 and 32
# do not match at dimension 3 (scores vs. mask shapes are printed just before the failure).
trainer(l_model, train_data_iter, train_opt, test_data_iter, test_opt, l_optimizer, l_scheduler, scheduler_name)

a GPU was detected, model will be trained on GPU
k shape torch.Size([128, 40, 512])
e shape torch.Size([128, 40, 256])
k view shape torch.Size([128, 40, 8, 64])
e view shape torch.Size([128, 40, 8, 32])
k transpose shape torch.Size([128, 8, 40, 64])
e transpose shape torch.Size([128, 8, 40, 32])
q, k torch.Size([128, 8, 40, 64]) torch.Size([128, 8, 64, 40])
torch.Size([128, 8, 40, 40])
scores.shape torch.Size([128, 8, 40, 32])
mask.shape torch.Size([128, 1, 1, 40])


RuntimeError: The size of tensor a (40) must match the size of tensor b (32) at non-singleton dimension 3