In [2]:
from collections import Counter
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.utils.data
import math
import torch.nn.functional as F
import neptune
import pandas as pd

In [3]:
knowledgebase_url = 'https://github.com/AndiAlifs/FLUENT-Chatbot-2023/raw/main/KnowledgeBaseFilkom.xlsx'
knowledgebase = pd.read_excel(knowledgebase_url)

qa_paired = knowledgebase.drop(columns=knowledgebase.columns.drop(['Pertanyaan', 'Jawaban']))
qa_paired.dropna(inplace=True)
qa_paired

Unnamed: 0,Pertanyaan,Jawaban
0,email Fitra A. Bachtiar,fitra.bachtiar[at]ub.ac.id
1,NIK/NIP Fitra A. Bachtiar,198406282019031006
2,nama lengkap Fitra A. Bachtiar,Dr.Eng. Fitra A. Bachtiar
3,Departemen Fitra A. Bachtiar,Departemen Teknik Informatika
4,Program Studi Fitra A. Bachtiar,S2 Ilmu Komputer
...,...,...
1229,Apa Manfaat Konseling FILKOM ?,1. Masalah ditangani oleh ahli yang kompeten d...
1230,Berikan informasi mengenai Layanan Konseling,Informasi mengenai Layanan Konseling dapat dia...
1231,Siapa Konselor Bimbingan dan Konseling di FILK...,Ada 2 konselor Bimbingan dan Konseling di FILK...
1232,Siapa Koordinator Konselor Sebaya ?,Koordinator Konselor Sebaya adalah Muhammad Da...


In [4]:
def remove_punc(string):
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    no_punct = ""
    for char in string:
        if char not in punctuations:
            no_punct = no_punct + char  # space is also a character
    return no_punct.lower()

In [5]:
pairs = []
max_len = 90

for line in qa_paired.iterrows():
    pertanyaan = line[1]['Pertanyaan']
    jawaban = line[1]['Jawaban']
    qa_pairs = []
    first = remove_punc(pertanyaan.strip())      
    second = remove_punc(jawaban.strip())
    qa_pairs.append(first.split()[:max_len])
    qa_pairs.append(second.split()[:max_len])
    pairs.append(qa_pairs)

In [6]:
word_freq = Counter()
for pair in pairs:
    word_freq.update(pair[0])
    word_freq.update(pair[1])

In [7]:
min_word_freq = 2
words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
word_map = {k: v + 1 for v, k in enumerate(words)}
word_map['<unk>'] = len(word_map) + 1
word_map['<start>'] = len(word_map) + 1
word_map['<end>'] = len(word_map) + 1
word_map['<pad>'] = 0

In [8]:
print("Total words are: {}".format(len(word_map)))

Total words are: 1079


In [9]:
with open('WORDMAP_corpus_KBFILKOM.json', 'w') as j:
    json.dump(word_map, j)

In [10]:
def encode_question(words, word_map):
    enc_c = [word_map.get(word, word_map['<unk>']) for word in words] + [word_map['<pad>']] * (max_len - len(words))
    return enc_c

def encode_reply(words, word_map):
    enc_c = [word_map['<start>']] + [word_map.get(word, word_map['<unk>']) for word in words] + \
    [word_map['<end>']] + [word_map['<pad>']] * (max_len - len(words))
    return enc_c

In [11]:
pairs_encoded = []
for pair in pairs:
    qus = encode_question(pair[0], word_map)
    ans = encode_reply(pair[1], word_map)
    pairs_encoded.append([qus, ans])

with open('pairs_encoded_kbfilkom.json', 'w') as p:
    json.dump(pairs_encoded, p)

In [12]:
rev_word_map = {v: k for k, v in word_map.items()}
' '.join([rev_word_map[v] for v in pairs_encoded[15][0]])

'apa tujuan filkom <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'

In [13]:
class Dataset(Dataset):

    def __init__(self):

        self.pairs = json.load(open('pairs_encoded_kbfilkom.json'))
        self.dataset_size = len(self.pairs)

    def __getitem__(self, i):
        
        question = torch.LongTensor(self.pairs[i][0])
        reply = torch.LongTensor(self.pairs[i][1])
            
        return question, reply

    def __len__(self):
        return self.dataset_size

## Train Loader

In [14]:
train_loader = torch.utils.data.DataLoader(Dataset(),
                                           batch_size = 100, 
                                           shuffle=True, 
                                           pin_memory=True)

In [15]:
def create_masks(question, reply_input, reply_target):
    
    def subsequent_mask(size):
        mask = torch.triu(torch.ones(size, size)).transpose(0, 1).type(dtype=torch.uint8)
        return mask.unsqueeze(0)
    
    question_mask = question!=0
    question_mask = question_mask.to(device)
    question_mask = question_mask.unsqueeze(1).unsqueeze(1)         # (batch_size, 1, 1, max_words)
     
    reply_input_mask = reply_input!=0
    reply_input_mask = reply_input_mask.unsqueeze(1)  # (batch_size, 1, max_words)
    reply_input_mask = reply_input_mask & subsequent_mask(reply_input.size(-1)).type_as(reply_input_mask.data) 
    reply_input_mask = reply_input_mask.unsqueeze(1) # (batch_size, 1, max_words, max_words)
    reply_target_mask = reply_target!=0              # (batch_size, max_words)
    
    return question_mask, reply_input_mask, reply_target_mask

In [16]:
import os

def create_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Directory created at {path}")
    else:
        print(f"Directory already exists at {path}")

# Architecture

## Embeddings

In [17]:
class Embeddings(nn.Module):
    """
    Implements embeddings of the words and adds their positional encodings. 
    """
    def __init__(self, vocab_size, d_model, max_len = 50, num_layers = 6):
        super(Embeddings, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(0.1)
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = self.create_positinal_encoding(max_len, self.d_model)     # (1, max_len, d_model)
        self.te = self.create_positinal_encoding(num_layers, self.d_model)  # (1, num_layers, d_model)
        self.dropout = nn.Dropout(0.1)
        
    def create_positinal_encoding(self, max_len, d_model):
        pe = torch.zeros(max_len, d_model).to(device)
        for pos in range(max_len):   # for each position of the word
            for i in range(0, d_model, 2):   # for each dimension of the each position
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
        pe = pe.unsqueeze(0)   # include the batch size
        return pe
        
    def forward(self, embedding, layer_idx):
        if layer_idx == 0:
            embedding = self.embed(embedding) * math.sqrt(self.d_model)
        embedding += self.pe[:, :embedding.size(1)]   # pe will automatically be expanded with the same batch size as encoded_words
        # embedding: (batch_size, max_len, d_model), te: (batch_size, 1, d_model)
        embedding += self.te[:, layer_idx, :].unsqueeze(1).repeat(1, embedding.size(1), 1)
        embedding = self.dropout(embedding)
        return embedding


## Multi-Head Attention

In [18]:
class MultiHeadAttention(nn.Module):
    
    def __init__(self, heads, d_model):
        super(MultiHeadAttention, self).__init__()
        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = nn.Dropout(0.1)
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.concat = nn.Linear(d_model, d_model)
        
    def forward(self, query, key, value, mask):
        """
        query, key, value of shape: (batch_size, max_len, 512)
        mask of shape: (batch_size, 1, 1, max_words)
        """
        # (batch_size, max_len, 512)
        query = self.query(query)
        key = self.key(key)        
        value = self.value(value)   
        
        # (batch_size, max_len, 512) --> (batch_size, max_len, h, d_k) --> (batch_size, h, max_len, d_k)
        query = query.view(query.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)   
        key = key.view(key.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)  
        value = value.view(value.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)  
        
        # (batch_size, h, max_len, d_k) matmul (batch_size, h, d_k, max_len) --> (batch_size, h, max_len, max_len)
        scores = torch.matmul(query, key.permute(0,1,3,2)) / math.sqrt(query.size(-1))
        scores = scores.masked_fill(mask == 0, -1e9)    # (batch_size, h, max_len, max_len)
        weights = F.softmax(scores, dim = -1)           # (batch_size, h, max_len, max_len)
        weights = self.dropout(weights)
        # (batch_size, h, max_len, max_len) matmul (batch_size, h, max_len, d_k) --> (batch_size, h, max_len, d_k)
        context = torch.matmul(weights, value)
        # (batch_size, h, max_len, d_k) --> (batch_size, max_len, h, d_k) --> (batch_size, max_len, h * d_k)
        context = context.permute(0,2,1,3).contiguous().view(context.shape[0], -1, self.heads * self.d_k)
        # (batch_size, max_len, h * d_k)
        interacted = self.concat(context)
        return interacted 

## Feed Forward Neural Network

In [19]:
class FeedForward(nn.Module):
    def __init__(self, d_model, middle_dim = 2048):
        super(FeedForward, self).__init__()
        
        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = self.fc2(self.dropout(out))
        return out


## Encoder

In [20]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads):
        super(EncoderLayer, self).__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, embeddings, mask):
        interacted = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, mask))
        interacted = self.layernorm(interacted + embeddings)
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded
        

## Decoder

In [21]:
class DecoderLayer(nn.Module):
    
    def __init__(self, d_model, heads):
        super(DecoderLayer, self).__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.src_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, embeddings, encoded, src_mask, target_mask):
        query = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, target_mask))
        query = self.layernorm(query + embeddings)
        interacted = self.dropout(self.src_multihead(query, encoded, encoded, src_mask))
        interacted = self.layernorm(interacted + query)
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        decoded = self.layernorm(feed_forward_out + interacted)
        return decoded

## Transformer Architecture

In [22]:
class Transformer(nn.Module):
    
    def __init__(self, d_model, heads, num_layers, word_map, max_len = 50):
        super(Transformer, self).__init__()
        
        self.d_model = d_model
        self.num_layers = num_layers
        self.vocab_size = len(word_map)
        self.embed = Embeddings(self.vocab_size, d_model, num_layers = num_layers, max_len = max_len)
        self.encoder = EncoderLayer(d_model, heads) 
        self.decoder = DecoderLayer(d_model, heads)
        self.logit = nn.Linear(d_model, self.vocab_size)
        
    def encode(self, src_embeddings, src_mask):
        for i in range(self.num_layers):
            src_embeddings = self.embed(src_embeddings, i)
            src_embeddings = self.encoder(src_embeddings, src_mask)
        return src_embeddings
    
    def decode(self, tgt_embeddings, target_mask, src_embeddings, src_mask):
        for i in range(self.num_layers):
            tgt_embeddings = self.embed(tgt_embeddings, i)
            tgt_embeddings = self.decoder(tgt_embeddings, src_embeddings, src_mask, target_mask)
        return tgt_embeddings
        
    def forward(self, src_words, src_mask, target_words, target_mask):
        encoded = self.encode(src_words, src_mask)
        decoded = self.decode(target_words, target_mask, encoded, src_mask)
        out = F.log_softmax(self.logit(decoded), dim = 2)
        return out


In [23]:
class AdamWarmup:
    
    def __init__(self, model_size, warmup_steps, optimizer):
        
        self.model_size = model_size
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer
        self.current_step = 0
        self.lr = 0
        
    def get_lr(self):
        return self.model_size ** (-0.5) * min(self.current_step ** (-0.5), self.current_step * self.warmup_steps ** (-1.5))
        
    def step(self):
        # Increment the number of steps each time we call the step function
        self.current_step += 1
        lr = self.get_lr()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        # update the learning rate
        self.lr = lr
        self.optimizer.step()       
        

In [24]:
class LossWithLS(nn.Module):

    def __init__(self, size, smooth):
        super(LossWithLS, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False, reduce=False)
        self.confidence = 1.0 - smooth
        self.smooth = smooth
        self.size = size
        
    def forward(self, prediction, target, mask):
        """
        prediction of shape: (batch_size, max_words, vocab_size)
        target and mask of shape: (batch_size, max_words)
        """
        prediction = prediction.view(-1, prediction.size(-1))   # (batch_size * max_words, vocab_size)
        target = target.contiguous().view(-1)   # (batch_size * max_words)
        mask = mask.float()
        mask = mask.view(-1)       # (batch_size * max_words)
        labels = prediction.data.clone()
        labels.fill_(self.smooth / (self.size - 1))
        labels.scatter_(1, target.data.unsqueeze(1), self.confidence)
        loss = self.criterion(prediction, labels)    # (batch_size * max_words, vocab_size)
        loss = (loss.sum(1) * mask).sum() / mask.sum()
        return loss
        

# Define Neptune Experiment

In [25]:
project = "andialifs/fluent-tesis-24"
api_token = "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJjZTY2YWQ3My04OTBkLTQ2OWUtYTc1Ni1jYjk0MGZhMWFiNGEifQ=="

def neptune_init(name):
    run = neptune.init_run(
        project=project,
        api_token=api_token,
        name=name
    )
    return run

# Function

## Train

In [26]:
def train(train_loader, transformer, criterion, epoch):
    transformer.train()
    sum_loss = 0
    count = 0

    for i, (question, reply) in enumerate(train_loader):
        
        samples = question.shape[0]

        # Move to device
        question = question.to(device)
        reply = reply.to(device)

        # Prepare Target Data
        reply_input = reply[:, :-1]
        reply_target = reply[:, 1:]

        # Create mask and add dimensions
        question_mask, reply_input_mask, reply_target_mask = create_masks(question, reply_input, reply_target)

        # Get the transformer outputs
        out = transformer(question, question_mask, reply_input, reply_input_mask)

        # Compute the loss
        loss = criterion(out, reply_target, reply_target_mask)
        
        # Backprop
        transformer_optimizer.optimizer.zero_grad()
        loss.backward()
        transformer_optimizer.step()
        
        sum_loss += loss.item() * samples
        count += samples
        
        if i % 100 == 0:
            print("Epoch [{}][{}/{}]\tLoss: {:.3f}".format(epoch, i, len(train_loader), sum_loss/count))
    
    return sum_loss/count

## Evaluate

In [27]:
def evaluate(transformer, question, question_mask, max_len, word_map):
    """
    Performs Greedy Decoding with a batch size of 1
    """
    rev_word_map = {v: k for k, v in word_map.items()}
    transformer.eval()
    start_token = word_map['<start>']
    encoded = transformer.encode(question, question_mask)
    words = torch.LongTensor([[start_token]]).to(device)
    
    for step in range(max_len - 1):
        size = words.shape[1]
        target_mask = torch.triu(torch.ones(size, size)).transpose(0, 1).type(dtype=torch.uint8)
        target_mask = target_mask.to(device).unsqueeze(0).unsqueeze(0)
        decoded = transformer.decode(words, target_mask, encoded, question_mask)
        predictions = transformer.logit(decoded[:, -1])
        _, next_word = torch.max(predictions, dim = 1)
        next_word = next_word.item()
        if next_word == word_map['<end>']:
            break
        words = torch.cat([words, torch.LongTensor([[next_word]]).to(device)], dim = 1)   # (1,step+2)
        
    # Construct Sentence
    if words.dim() == 2:
        words = words.squeeze(0)
        words = words.tolist()
        
    sen_idx = [w for w in words if w not in {word_map['<start>']}]
    sentence = ' '.join([rev_word_map[sen_idx[k]] for k in range(len(sen_idx))])
    
    return sentence

# Experiment

## Transformers without reg

In [40]:
d_model = [512, 1024, 2048, 4096]
heads = [8, 16, 32]
num_layers = [5, 10]
epochs = 100

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with open('WORDMAP_corpus_KBFILKOM.json', 'r') as j:
    word_map = json.load(j)

transformer_experiment = pd.DataFrame(columns = ['experiment_id', 'd_model', 'heads', 'num_layers', 'train_loss'])
loss_history = {}

experiment_id = -1

for d_m in d_model:
    for h in heads:
        for n_l in num_layers: 
            parameters = {
                'd_model': d_m,
                'heads': h,
                'num_layers': n_l
            }
            
            experiment_id += 1
            
            if experiment_id <= 14:
                print('\nSkipping experiment {} with d_model {}, heads{}, num_layers{}\n'.format(experiment_id, d_m, h, n_l))
                continue
            
            print('\nRunning for experiment {} with d_model {}, heads{}, num_layers{}\n'.format(experiment_id, d_m, h, n_l))
            name = "experiment_2724_noreg_" + str(experiment_id)

            run = neptune_init(name)
            run['parameters'] = parameters
            run['tags'] = "transformers-vanilla-no-reg"
            
            transformer_experiment.loc[experiment_id, 'experiment_id'] = 'experiment_{}'.format(str(experiment_id))
            transformer_experiment.loc[experiment_id, 'd_model'] = d_m
            transformer_experiment.loc[experiment_id, 'heads'] = h
            transformer_experiment.loc[experiment_id, 'num_layers'] = n_l

            transformer = Transformer(d_model = d_m, heads = h, num_layers = n_l, word_map = word_map, max_len=95)
            transformer = transformer.to(device)
            adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
            transformer_optimizer = AdamWarmup(model_size = d_m, warmup_steps = 4000, optimizer = adam_optimizer)
            criterion = LossWithLS(len(word_map), 0.2)

            loss_list_experiment = []
            for epoch in range(epochs):
                loss_train = train(train_loader, transformer, criterion, epoch)

                state = {'epoch': epoch, 'transformer': transformer, 'transformer_optimizer': transformer_optimizer}
                # torch.save(state, 'checkpoint_experiment_' + str(epoch) + 'id_'+ str(experiment_id) +'.pth.tar')

                loss_list_experiment.append(loss_train)

                run['train/loss'].append(loss_train)

            transformer_experiment.loc[experiment_id, 'train_loss'] = loss_train
            loss_history['experiment_{}'.format(str(experiment_id))] = loss_list_experiment
            
            # torch.cuda.empty_cache() 
            run.stop()



Skipping experiment 0 with d_model 512, heads8, num_layers5


Skipping experiment 1 with d_model 512, heads8, num_layers10


Skipping experiment 2 with d_model 512, heads16, num_layers5


Skipping experiment 3 with d_model 512, heads16, num_layers10


Skipping experiment 4 with d_model 512, heads32, num_layers5


Skipping experiment 5 with d_model 512, heads32, num_layers10


Skipping experiment 6 with d_model 1024, heads8, num_layers5


Skipping experiment 7 with d_model 1024, heads8, num_layers10


Skipping experiment 8 with d_model 1024, heads16, num_layers5


Skipping experiment 9 with d_model 1024, heads16, num_layers10


Skipping experiment 10 with d_model 1024, heads32, num_layers5


Skipping experiment 11 with d_model 1024, heads32, num_layers10


Skipping experiment 12 with d_model 2048, heads8, num_layers5


Skipping experiment 13 with d_model 2048, heads8, num_layers10


Skipping experiment 14 with d_model 2048, heads16, num_layers5


Running for experiment 15 with d_model 



Epoch [0][0/12]	Loss: 5.156
Epoch [1][0/12]	Loss: 5.049
Epoch [2][0/12]	Loss: 4.608
Epoch [3][0/12]	Loss: 4.526
Epoch [4][0/12]	Loss: 4.342
Epoch [5][0/12]	Loss: 4.071
Epoch [6][0/12]	Loss: 4.036
Epoch [7][0/12]	Loss: 4.092
Epoch [8][0/12]	Loss: 4.076
Epoch [10][0/12]	Loss: 4.040
Epoch [11][0/12]	Loss: 3.927
Epoch [12][0/12]	Loss: 4.045
Epoch [13][0/12]	Loss: 3.989
Epoch [14][0/12]	Loss: 3.873
Epoch [15][0/12]	Loss: 3.832
Epoch [16][0/12]	Loss: 4.009
Epoch [17][0/12]	Loss: 3.784
Epoch [18][0/12]	Loss: 3.777
Epoch [19][0/12]	Loss: 3.729
Epoch [20][0/12]	Loss: 3.806
Epoch [21][0/12]	Loss: 3.710
Epoch [22][0/12]	Loss: 3.549
Epoch [23][0/12]	Loss: 3.696
Epoch [24][0/12]	Loss: 3.566
Epoch [25][0/12]	Loss: 3.625
Epoch [26][0/12]	Loss: 3.291
Epoch [27][0/12]	Loss: 3.621
Epoch [28][0/12]	Loss: 3.374
Epoch [29][0/12]	Loss: 3.372
Epoch [30][0/12]	Loss: 3.402
Epoch [31][0/12]	Loss: 3.352
Epoch [32][0/12]	Loss: 3.220
Epoch [33][0/12]	Loss: 3.341
Epoch [34][0/12]	Loss: 3.450
Epoch [35][0/12]	Loss: 



Epoch [0][0/12]	Loss: 5.202
Epoch [1][0/12]	Loss: 5.062
Epoch [2][0/12]	Loss: 4.734
Epoch [3][0/12]	Loss: 4.371
Epoch [4][0/12]	Loss: 4.236
Epoch [5][0/12]	Loss: 4.167
Epoch [6][0/12]	Loss: 4.178
Epoch [7][0/12]	Loss: 3.889
Epoch [8][0/12]	Loss: 4.102
Epoch [9][0/12]	Loss: 4.067
Epoch [10][0/12]	Loss: 4.009
Epoch [11][0/12]	Loss: 3.869
Epoch [12][0/12]	Loss: 3.987
Epoch [13][0/12]	Loss: 3.984
Epoch [14][0/12]	Loss: 3.879
Epoch [15][0/12]	Loss: 3.601
Epoch [16][0/12]	Loss: 3.539
Epoch [17][0/12]	Loss: 3.507
Epoch [18][0/12]	Loss: 3.415
Epoch [19][0/12]	Loss: 3.446
Epoch [20][0/12]	Loss: 3.041
Epoch [21][0/12]	Loss: 3.090
Epoch [22][0/12]	Loss: 3.004
Epoch [23][0/12]	Loss: 2.906
Epoch [24][0/12]	Loss: 3.072
Epoch [25][0/12]	Loss: 2.878
Epoch [26][0/12]	Loss: 2.795
Epoch [27][0/12]	Loss: 2.622
Epoch [28][0/12]	Loss: 2.341
Epoch [29][0/12]	Loss: 2.367
Epoch [30][0/12]	Loss: 2.489
Epoch [31][0/12]	Loss: 2.395
Epoch [32][0/12]	Loss: 2.238
Epoch [33][0/12]	Loss: 2.099
Epoch [34][0/12]	Loss: 1



Epoch [0][0/12]	Loss: 5.305
Epoch [1][0/12]	Loss: 5.135
Epoch [2][0/12]	Loss: 4.741
Epoch [3][0/12]	Loss: 4.543
Epoch [4][0/12]	Loss: 4.287
Epoch [5][0/12]	Loss: 4.217
Epoch [6][0/12]	Loss: 4.077
Epoch [7][0/12]	Loss: 4.044
Epoch [8][0/12]	Loss: 4.000
Epoch [9][0/12]	Loss: 4.013
Epoch [10][0/12]	Loss: 4.043
Epoch [11][0/12]	Loss: 4.042
Epoch [12][0/12]	Loss: 3.944
Epoch [13][0/12]	Loss: 3.967
Epoch [14][0/12]	Loss: 3.917
Epoch [15][0/12]	Loss: 3.865
Epoch [16][0/12]	Loss: 3.986
Epoch [17][0/12]	Loss: 3.857
Epoch [18][0/12]	Loss: 3.783
Epoch [19][0/12]	Loss: 3.795
Epoch [20][0/12]	Loss: 3.743
Epoch [21][0/12]	Loss: 3.673
Epoch [22][0/12]	Loss: 3.752
Epoch [23][0/12]	Loss: 3.691
Epoch [24][0/12]	Loss: 3.647
Epoch [25][0/12]	Loss: 3.610
Epoch [26][0/12]	Loss: 3.487
Epoch [27][0/12]	Loss: 3.622
Epoch [28][0/12]	Loss: 3.374
Epoch [29][0/12]	Loss: 3.396
Epoch [30][0/12]	Loss: 3.317
Epoch [31][0/12]	Loss: 3.426
Epoch [32][0/12]	Loss: 3.413
Epoch [33][0/12]	Loss: 3.393
Epoch [34][0/12]	Loss: 3



Epoch [0][0/12]	Loss: 5.245
Epoch [1][0/12]	Loss: 4.997
Epoch [2][0/12]	Loss: 4.554
Epoch [3][0/12]	Loss: 4.162
Epoch [4][0/12]	Loss: 4.080
Epoch [5][0/12]	Loss: 4.062
Epoch [6][0/12]	Loss: 4.088
Epoch [7][0/12]	Loss: 4.035
Epoch [8][0/12]	Loss: 3.864
Epoch [9][0/12]	Loss: 4.037
Epoch [10][0/12]	Loss: 3.855
Epoch [11][0/12]	Loss: 3.762
Epoch [12][0/12]	Loss: 3.771
Epoch [13][0/12]	Loss: 3.803
Epoch [14][0/12]	Loss: 3.614
Epoch [15][0/12]	Loss: 3.478
Epoch [16][0/12]	Loss: 3.437
Epoch [17][0/12]	Loss: 3.238
Epoch [18][0/12]	Loss: 3.247
Epoch [19][0/12]	Loss: 3.001
Epoch [20][0/12]	Loss: 3.000
Epoch [21][0/12]	Loss: 2.792
Epoch [22][0/12]	Loss: 2.444
Epoch [23][0/12]	Loss: 2.828
Epoch [24][0/12]	Loss: 2.557
Epoch [25][0/12]	Loss: 2.459
Epoch [26][0/12]	Loss: 2.480
Epoch [27][0/12]	Loss: 2.088
Epoch [28][0/12]	Loss: 2.233
Epoch [29][0/12]	Loss: 2.347
Epoch [30][0/12]	Loss: 2.040
Epoch [31][0/12]	Loss: 2.020
Epoch [32][0/12]	Loss: 1.896
Epoch [33][0/12]	Loss: 1.625
Epoch [34][0/12]	Loss: 1



Epoch [0][0/12]	Loss: 5.212
Epoch [1][0/12]	Loss: 5.006
Epoch [2][0/12]	Loss: 4.476
Epoch [3][0/12]	Loss: 4.137
Epoch [4][0/12]	Loss: 4.158
Epoch [5][0/12]	Loss: 4.043
Epoch [6][0/12]	Loss: 4.092
Epoch [7][0/12]	Loss: 4.070
Epoch [8][0/12]	Loss: 3.994
Epoch [9][0/12]	Loss: 3.931
Epoch [10][0/12]	Loss: 4.079
Epoch [11][0/12]	Loss: 4.009
Epoch [12][0/12]	Loss: 3.951
Epoch [13][0/12]	Loss: 3.858
Epoch [14][0/12]	Loss: 3.801
Epoch [15][0/12]	Loss: 3.749
Epoch [16][0/12]	Loss: 3.854
Epoch [17][0/12]	Loss: 3.660
Epoch [18][0/12]	Loss: 3.801
Epoch [19][0/12]	Loss: 3.621
Epoch [20][0/12]	Loss: 3.695
Epoch [21][0/12]	Loss: 3.556
Epoch [22][0/12]	Loss: 3.574
Epoch [23][0/12]	Loss: 3.519
Epoch [24][0/12]	Loss: 3.421
Epoch [25][0/12]	Loss: 3.348
Epoch [26][0/12]	Loss: 3.515
Epoch [27][0/12]	Loss: 3.439
Epoch [28][0/12]	Loss: 3.519
Epoch [29][0/12]	Loss: 3.381
Epoch [30][0/12]	Loss: 3.212
Epoch [31][0/12]	Loss: 3.406
Epoch [32][0/12]	Loss: 3.111
Epoch [33][0/12]	Loss: 3.181
Epoch [34][0/12]	Loss: 3

## Transformers With Reg

In [None]:
d_model = [512, 1024, 2048, 4096]
heads = [8, 16, 32]
num_layers = [5, 10]
epochs = 100

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with open('WORDMAP_corpus_KBFILKOM.json', 'r') as j:
    word_map = json.load(j)

transformer_experiment = pd.DataFrame(columns = ['experiment_id', 'd_model', 'heads', 'num_layers', 'train_loss'])
loss_history = {}

experiment_id = -1

for d_m in d_model:
    for h in heads:
        for n_l in num_layers: 
            experiment_id += 1
            print('\nRunning for experiment {} with d_model {}, heads{}, num_layers{}\n'.format(experiment_id, d_m, h, n_l))

            run = neptune.init_run(
                project=project,
                api_token=api_token,
                name="experiment_1724_" + str(experiment_id)
            ) 
            run['parameters'] = {
                'd_model': d_m,
                'heads': h,
                'num_layers': n_l
            }

            transformer_experiment.loc[experiment_id, 'experiment_id'] = 'experiment_{}'.format(str(experiment_id))
            transformer_experiment.loc[experiment_id, 'd_model'] = d_m
            transformer_experiment.loc[experiment_id, 'heads'] = h
            transformer_experiment.loc[experiment_id, 'num_layers'] = n_l

            transformer = Transformer(d_model = d_m, heads = h, num_layers = n_l, word_map = word_map, max_len=95)
            transformer = transformer.to(device)
            adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
            transformer_optimizer = AdamWarmup(model_size = d_m, warmup_steps = 4000, optimizer = adam_optimizer)
            criterion = LossWithLS(len(word_map), 0.2)

            loss_list_experiment = []
            for epoch in range(epochs):
                loss_train = train(train_loader, transformer, criterion, epoch)

                state = {'epoch': epoch, 'transformer': transformer, 'transformer_optimizer': transformer_optimizer}
                torch.save(state, 'checkpoint_experiment_' + str(epoch) + 'id_'+ str(experiment_id) +'.pth.tar')

                loss_list_experiment.append(loss_train)

                run['train/loss'].append(loss_train)

            transformer_experiment.loc[experiment_id, 'train_loss'] = loss_train
            loss_history['experiment_{}'.format(str(experiment_id))] = loss_list_experiment
            
            run.stop()


In [None]:
import yaml

with open('loss_history_transformer_experiment_KBFILKOM.yaml', 'w') as file:
    documents = yaml.dump(loss_history, file)

In [None]:
transformer_experiment.dropna(inplace=True)
transformer_experiment

In [None]:
import yaml

with open('history_rnn_150524.yaml', 'r') as file:
    history_rnn = yaml.load(file, Loader=yaml.FullLoader)

In [None]:
import matplotlib 
import matplotlib.pyplot as plt

loss_history_key = list(loss_history.keys())

plt.figure(figsize=(15,10))
plt.title("Training loss vs. Number of Epochs")
plt.xlabel("Number of Epochs")
plt.ylabel("Training Loss")
z

for key in loss_history_key:
    loss_list = loss_history[key]
    labels = f'd_model: {transformer_experiment[transformer_experiment["experiment_id"] == key]["d_model"].values[0]}, heads: {transformer_experiment[transformer_experiment["experiment_id"] == key]["heads"].values[0]}, num_layers: {transformer_experiment[transformer_experiment["experiment_id"] == key]["num_layers"].values[0]}'
    plt.plot(loss_list, label = labels)

    
plt.plot(history_rnn['loss'], 
                label = 'LSTM (Baseline FLUENT 2023)', 
                linestyle='dashed', 
                color='black', 
                linewidth=2.5, 
                alpha=0.7, 
                marker='o', 
                markerfacecolor='black', 
                markersize=5
        )

plt.legend()
torch.cuda.is_available()
plt.grid()

# Training

## Vanilla

In [27]:
directory = 'experiment_vanilla_cobain'
create_directory(directory)

d_model = 4096
heads = 16
num_layers = 10
epochs = 100

loss_history_vanilla_transformer = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with open('WORDMAP_corpus_KBFILKOM.json', 'r') as j:
    word_map = json.load(j)
    
transformer = Transformer(d_model = d_model, heads = heads, num_layers = num_layers, word_map = word_map, max_len=95)
transformer = transformer.to(device)
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
transformer_optimizer = AdamWarmup(model_size = d_model, warmup_steps = 4000, optimizer = adam_optimizer)
criterion = LossWithLS(len(word_map), 0.2)

for epoch in range(epochs):
    loss_train = train(train_loader, transformer, criterion, epoch)

    state = {'epoch': epoch, 'transformer': transformer, 'transformer_optimizer': transformer_optimizer}
    # torch.save(state, directory + '/checkpoint_' + str(epoch) +'.pth.tar')

    loss_history_vanilla_transformer.append(loss_train)

import yaml 

with open(directory + '/loss_history_vanilla_transformer.yaml', 'w') as file:
    yaml.dump(loss_history_vanilla_transformer, file)

Directory created at experiment_vanilla_cobain




Epoch [0][0/12]	Loss: 5.249
Epoch [1][0/12]	Loss: 4.999
Epoch [2][0/12]	Loss: 4.573
Epoch [3][0/12]	Loss: 4.163
Epoch [4][0/12]	Loss: 4.276
Epoch [5][0/12]	Loss: 4.165
Epoch [6][0/12]	Loss: 4.073
Epoch [7][0/12]	Loss: 3.987
Epoch [8][0/12]	Loss: 3.915
Epoch [9][0/12]	Loss: 3.911
Epoch [10][0/12]	Loss: 3.958
Epoch [11][0/12]	Loss: 3.939
Epoch [12][0/12]	Loss: 3.963
Epoch [13][0/12]	Loss: 3.779
Epoch [14][0/12]	Loss: 3.815
Epoch [15][0/12]	Loss: 3.809
Epoch [16][0/12]	Loss: 3.736
Epoch [17][0/12]	Loss: 3.794
Epoch [18][0/12]	Loss: 3.850
Epoch [19][0/12]	Loss: 3.747
Epoch [20][0/12]	Loss: 3.595
Epoch [21][0/12]	Loss: 3.698
Epoch [22][0/12]	Loss: 3.593
Epoch [23][0/12]	Loss: 3.713
Epoch [24][0/12]	Loss: 3.434
Epoch [25][0/12]	Loss: 3.599
Epoch [26][0/12]	Loss: 3.577
Epoch [27][0/12]	Loss: 3.455
Epoch [28][0/12]	Loss: 3.368


KeyboardInterrupt: 

## Vanilla without Regularization

In [28]:
directory = 'transformers_vanillanoreg_so_13824'
create_directory(directory)

run = neptune_init(directory)

parameters = {
    'd_model': 4096,
    'heads': 32,
    'num_layers': 5,
    'epochs': 100,
}
run['parameters'] = parameters

loss_history_vanillanoreg_transformer = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with open('WORDMAP_corpus_KBFILKOM.json', 'r') as j:
    word_map = json.load(j)
    
    
transformer = Transformer(d_model = parameters['d_model'], heads = parameters['heads'], num_layers = parameters['num_layers'], word_map = word_map, max_len=95)
transformer = transformer.to(device)
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
transformer_optimizer = AdamWarmup(model_size = parameters['d_model'], warmup_steps = 4000, optimizer = adam_optimizer)
criterion = LossWithLS(len(word_map), 0.2)

for epoch in range(parameters['epochs']):
    loss_train = train(train_loader, transformer, criterion, epoch)

    state = {'epoch': epoch, 'transformer': transformer, 'transformer_optimizer': transformer_optimizer}

    loss_history_vanillanoreg_transformer.append(loss_train)
    run['train/loss'].append(loss_train)

    if epoch == parameters['epochs'] - 1:
        torch.save(state, directory + '/checkpoint_' + str(epoch) +'.pth.tar')
        run['model_checkpoint'].upload(directory + '/checkpoint_' + str(epoch) +'.pth.tar')

run['memory_used'] = float(torch.cuda.memory_reserved(0)/1000000)

with open(directory + '/loss_history_vanillanoreg_transformer.yaml', 'w') as file:
    yaml.dump(loss_history_vanilla_transformer, file)

run.stop()

Directory already exists at transformers_vanillanoreg_so_13824
[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/andialifs/fluent-tesis-24/e/FLUENT24-113




Epoch [0][0/12]	Loss: 5.324
Epoch [1][0/12]	Loss: 5.026
Epoch [2][0/12]	Loss: 4.673
Epoch [3][0/12]	Loss: 4.191
Epoch [4][0/12]	Loss: 4.270
Epoch [5][0/12]	Loss: 4.240
Epoch [6][0/12]	Loss: 3.993
Epoch [7][0/12]	Loss: 4.007
Epoch [8][0/12]	Loss: 4.073
Epoch [9][0/12]	Loss: 3.871
Epoch [10][0/12]	Loss: 3.847
Epoch [11][0/12]	Loss: 3.718
Epoch [12][0/12]	Loss: 3.753
Epoch [13][0/12]	Loss: 3.843
Epoch [14][0/12]	Loss: 3.661
Epoch [15][0/12]	Loss: 3.553
Epoch [16][0/12]	Loss: 3.316
Epoch [17][0/12]	Loss: 3.262
Epoch [18][0/12]	Loss: 3.122
Epoch [19][0/12]	Loss: 3.103
Epoch [20][0/12]	Loss: 3.082
Epoch [21][0/12]	Loss: 2.794
Epoch [22][0/12]	Loss: 2.717
Epoch [23][0/12]	Loss: 2.575
Epoch [24][0/12]	Loss: 2.574
Epoch [25][0/12]	Loss: 2.477
Epoch [26][0/12]	Loss: 2.228
Epoch [27][0/12]	Loss: 2.300
Epoch [28][0/12]	Loss: 1.882
Epoch [29][0/12]	Loss: 2.025
Epoch [30][0/12]	Loss: 2.018
Epoch [31][0/12]	Loss: 2.172
Epoch [32][0/12]	Loss: 1.993
Epoch [33][0/12]	Loss: 2.086
Epoch [34][0/12]	Loss: 1

NameError: name 'yaml' is not defined

In [30]:
run.stop()

[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] All 0 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/andialifs/fluent-tesis-24/e/FLUENT24-103/metadata


## Decoder Only

In [None]:
directory = 'experiment_deconly_17624'
create_directory(directory)

d_model = 2048
heads = 16
num_layers = 5
epochs = 100

loss_history_decoder_transformer = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with open('WORDMAP_corpus_KBFILKOM.json', 'r') as j:
    word_map = json.load(j)
    
transformer = TransformerDecoderOnly(d_model = d_model, heads = heads, num_layers = num_layers, word_map = word_map, max_len=95)
transformer = transformer.to(device)
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
transformer_optimizer = AdamWarmup(model_size = d_model, warmup_steps = 4000, optimizer = adam_optimizer)
criterion = LossWithLS(len(word_map), 0.2)

for epoch in range(epochs):
    loss_train = train(train_loader, transformer, criterion, epoch)

    state = {'epoch': epoch, 'transformer': transformer, 'transformer_optimizer': transformer_optimizer}
    torch.save(state, directory + '/checkpoint_' + str(epoch) +'.pth.tar')

    loss_history_decoder_transformer.append(loss_train)

## LSTM_Transformer

In [None]:
directory = 'experiment_2_lstm'

d_model = 1024
heads = 32
num_layers = 10
epochs = 100


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

loss_history_lstm_transformer = []

with open('WORDMAP_corpus_KBFILKOM.json', 'r') as j:
    word_map = json.load(j)
    
transformer = TransformerLSTM(d_model = d_model, heads = heads, num_layers = num_layers, word_map = word_map, max_len=90)
transformer = transformer.to(device)
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
transformer_optimizer = AdamWarmup(model_size = d_model, warmup_steps = 4000, optimizer = adam_optimizer)
criterion = LossWithLS(len(word_map), 0.2)


for epoch in range(epochs):
    loss_train = train(train_loader, transformer, criterion, epoch)

    state = {'epoch': epoch, 'transformer': transformer, 'transformer_optimizer': transformer_optimizer}
    torch.save(state, directory + '/checkpoint_' + str(epoch) +'.pth.tar')

    loss_history_lstm_transformer.append(loss_train)

In [None]:
import yaml

with open(directory + '/loss_history_lstm_transformer.yaml', 'w') as file:
    yaml.dump(loss_history_lstm_transformer, file)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

directory = 'transformers_vanillanoreg_so_17724'
checkpoint = torch.load(directory + '/checkpoint_99.pth.tar')
transformer = checkpoint['transformer']


In [55]:
question = "tujuan punya filkom itu apa" 
max_len = 50
enc_qus = [word_map.get(word, word_map['<unk>']) for word in question.split()]
question = torch.LongTensor(enc_qus).to(device).unsqueeze(0)
question_mask = (question!=0).to(device).unsqueeze(1).unsqueeze(1)  
sentence = evaluate(transformer, question, question_mask, int(max_len), word_map)
print(sentence)

1 menghasilkan lulusan yang kompeten profesional <unk> <unk> luhur berjiwa <unk> dan berdaya saing internasional menghasilkan <unk> akademika yang mampu mengembangkan <unk> dan mampu mengembangkan <unk> dan teknologi informasi yang bermanfaat dan <unk> <unk> akademik yang <unk> dalam bidang pendidikan penelitian dan <unk> organisasi melalui pembangunan bangsa melalui integrasi


In [59]:
del transformer

In [57]:
question

tensor([[  88, 1076,   25,  993,   87]], device='cuda:0')

In [58]:
question_mask

tensor([[[[True, True, True, True, True]]]], device='cuda:0')

In [56]:
enc_qus

[88, 1076, 25, 993, 87]

In [40]:
transformer.state_dict()

OrderedDict([('embed.embed.weight',
              tensor([[-1.9928, -0.7004,  0.7193,  ..., -1.6292,  1.5850,  0.2723],
                      [-0.3583,  0.6155,  0.0369,  ...,  1.8031, -0.4863,  0.8909],
                      [ 1.1024, -1.2518, -0.0069,  ...,  1.7611, -0.7846, -1.1839],
                      ...,
                      [-0.2117,  0.6299, -0.6148,  ..., -0.0408, -0.5580, -0.8067],
                      [ 0.0884, -0.1558, -1.5991,  ..., -1.6785,  0.0616, -0.3889],
                      [ 1.1627, -0.2376, -1.3329,  ...,  0.1444,  1.2254,  0.7990]],
                     device='cuda:0')),
             ('encoder.layernorm.weight',
              tensor([1.0070, 1.0067, 1.0042,  ..., 0.9963, 1.0000, 0.9963], device='cuda:0')),
             ('encoder.layernorm.bias',
              tensor([ 0.0002,  0.0005,  0.0001,  ..., -0.0012,  0.0002, -0.0009],
                     device='cuda:0')),
             ('encoder.self_multihead.query.weight',
              tensor([[-0.0031,  0.010

In [39]:
checkpoint

{'epoch': 99,
 'transformer': TransformerNoReg(
   (embed): Embeddings(
     (dropout): Dropout(p=0.1, inplace=False)
     (embed): Embedding(1079, 4096)
   )
   (encoder): EncoderLayerNoReg(
     (layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
     (self_multihead): MultiHeadAttention(
       (dropout): Dropout(p=0.1, inplace=False)
       (query): Linear(in_features=4096, out_features=4096, bias=True)
       (key): Linear(in_features=4096, out_features=4096, bias=True)
       (value): Linear(in_features=4096, out_features=4096, bias=True)
       (concat): Linear(in_features=4096, out_features=4096, bias=True)
     )
     (feed_forward): FeedForward(
       (fc1): Linear(in_features=4096, out_features=2048, bias=True)
       (fc2): Linear(in_features=2048, out_features=4096, bias=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
   )
   (decoder): DecoderLayerNoReg(
     (layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
     (self_multi

# Menuhin GPU

In [28]:
while True:
    torch.cuda.empty_cache()

    directory = 'experiment_vanilla_cobain'
    create_directory(directory)

    d_model = 4096
    heads = 16
    num_layers = 10
    epochs = 100

    loss_history_vanilla_transformer = []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    with open('WORDMAP_corpus_KBFILKOM.json', 'r') as j:
        word_map = json.load(j)
        
    transformer = Transformer(d_model = d_model, heads = heads, num_layers = num_layers, word_map = word_map, max_len=95)
    transformer = transformer.to(device)
    adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
    transformer_optimizer = AdamWarmup(model_size = d_model, warmup_steps = 4000, optimizer = adam_optimizer)
    criterion = LossWithLS(len(word_map), 0.2)

    for epoch in range(epochs):
        loss_train = train(train_loader, transformer, criterion, epoch)

        state = {'epoch': epoch, 'transformer': transformer, 'transformer_optimizer': transformer_optimizer}
        # torch.save(state, directory + '/checkpoint_' + str(epoch) +'.pth.tar')

        loss_history_vanilla_transformer.append(loss_train)

    import yaml 

    with open(directory + '/loss_history_vanilla_transformer.yaml', 'w') as file:
        yaml.dump(loss_history_vanilla_transformer, file)

Directory created at experiment_vanilla_cobain




Epoch [0][0/12]	Loss: 5.367
Epoch [1][0/12]	Loss: 5.142
Epoch [2][0/12]	Loss: 4.596
Epoch [3][0/12]	Loss: 4.335
Epoch [4][0/12]	Loss: 4.187
Epoch [5][0/12]	Loss: 4.106
Epoch [6][0/12]	Loss: 4.048
Epoch [7][0/12]	Loss: 4.074
Epoch [8][0/12]	Loss: 3.929
Epoch [9][0/12]	Loss: 4.063
Epoch [10][0/12]	Loss: 4.011
Epoch [11][0/12]	Loss: 3.724
Epoch [12][0/12]	Loss: 3.933
Epoch [13][0/12]	Loss: 3.902
Epoch [14][0/12]	Loss: 3.858
Epoch [15][0/12]	Loss: 3.855
Epoch [16][0/12]	Loss: 3.801
Epoch [17][0/12]	Loss: 3.756
Epoch [18][0/12]	Loss: 3.875
Epoch [19][0/12]	Loss: 3.675
Epoch [20][0/12]	Loss: 3.634
Epoch [21][0/12]	Loss: 3.571
Epoch [22][0/12]	Loss: 3.561
Epoch [23][0/12]	Loss: 3.574
Epoch [24][0/12]	Loss: 3.406
Epoch [25][0/12]	Loss: 3.563
Epoch [26][0/12]	Loss: 3.382
Epoch [27][0/12]	Loss: 3.332
Epoch [28][0/12]	Loss: 3.316
Epoch [29][0/12]	Loss: 3.341
Epoch [30][0/12]	Loss: 3.131
Epoch [31][0/12]	Loss: 3.199
Epoch [32][0/12]	Loss: 3.143
Epoch [33][0/12]	Loss: 3.102
Epoch [34][0/12]	Loss: 3

# Reset Cache

In [87]:
import torch

torch.cuda.empty_cache()

print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 2            |        cudaMalloc retries: 4         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  51970 MiB |  51970 MiB | 446291 MiB | 394320 MiB |
|       from large pool |  51963 MiB |  51963 MiB | 445553 MiB | 393589 MiB |
|       from small pool |      7 MiB |      8 MiB |    738 MiB |    730 MiB |
|---------------------------------------------------------------------------|
| Active memory         |  51970 MiB |  51970 MiB | 446291 MiB | 394320 MiB |
|       from large pool |  51963 MiB |  51963 MiB | 445553 MiB | 393589 MiB |
|       from small pool |      7 MiB |      8 MiB |    738 MiB |    730 MiB |
|---------------------------------------------------------------

[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] All 0 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/andialifs/fluent-tesis-24/e/FLUENT24-101/metadata
[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] All 0 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/andialifs/fluent-tesis-24/e/FLUENT24-100/metadata
[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/andialifs/fluent-tesis-24/e/FLUENT24-99/metadata
