In [13]:
import sys
import os
sys.path.append(os.path.abspath('/home/andyalyfsyah/FLUENT-Chatbot-2023/FLUENT_REFACTORED_24'))

from transformers import AutoTokenizer, AutoModel, GPT2LMHeadModel, GPT2Tokenizer
from sklearn.decomposition import PCA
from torch import nn
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import data as fluent_data
from evaluation_tool import calculate_bleu, count_bleu_score, compute_average_chrf, generate_predictions
from neptune_fluent import Neptune_Fluent 

import torch
import matplotlib.pyplot as plt
import time
import pandas as pd


In [18]:
import importlib.util
import sys

# Specify the file path
file_path = '/home/andyalyfsyah/FLUENT-Chatbot-2023/FLUENT_REFACTORED_24/data.py'

# Load the module
spec = importlib.util.spec_from_file_location("fluent_data", file_path)
fluent_data = importlib.util.module_from_spec(spec)
spec.loader.exec_module(fluent_data)

# Use the variables
qa_paired = fluent_data.qa_paired
qa_paired_eval = fluent_data.qa_paired_eval

start loading data from data.py
finished loading data from data.py, get 44 qa_paired
finished loading data from data.py, get 91 qa_paired_eval
added BOS and EOS token to qa_paired, sample qa_paired:     Pertanyaan                                            Jawaban
0  visi filkom  [BOS]menjadi fakultas yang berdaya saing inter...


In [92]:
import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class FLUENTSOTA(nn.Module):
    def __init__(self, enc_model, dec_model, enc_tokenizer, dec_tokenizer, max_length=200, dec_size=1024):
        super(FLUENTSOTA, self).__init__()
        self.enc_model = enc_model
        self.dec_model = dec_model
        self.enc_tokenizer = enc_tokenizer
        self.dec_tokenizer = dec_tokenizer
        self.enc_mapper = nn.Linear(1024, dec_size)
        self.enc_mapper2 = nn.Linear(dec_size, dec_size)

        self.prefix_param = nn.Parameter(torch.randn(1, 1, dec_size))  # Learnable parameter for prefix 

        self.prefix_nn = nn.Linear(dec_size, dec_size)
        self.max_length = max_length
    
    def encoding(self, sentence):
        tokens = self.enc_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=self.max_length)
        tokens = tokens.to(device)
        with torch.no_grad():
            output = self.enc_model(**tokens)
        enc_logits = output.last_hidden_state.sum(dim=1)
        enc_logits = self.enc_mapper(enc_logits).to(device)
        enc_logits = self.enc_mapper2(enc_logits).to(device)
        return enc_logits
    
    def get_prefix(self, batch_size=1):
        # Process the learnable parameter through prefix_nn layers
        prefix = self.prefix_param.expand(batch_size, 1, -1)
        prefix = self.prefix_nn(prefix)
        return prefix

    def get_embedding(self, sentence):
        tokens = self.dec_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=self.max_length)
        tokens = tokens.to(device)
        wte = self.dec_model.get_input_embeddings()
        return wte(tokens['input_ids'])

    def dec_tokenizer(self, sentence):
        tokens = self.dec_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=self.max_length)
        return tokens

    def decoding_train(self, enc_logits, target, target_with_pre):
        prefix = self.get_prefix(batch_size=enc_logits.size(0))
        embed = self.get_embedding(target)
        pref_with_embed = torch.cat((prefix, embed), dim=1)
        output = self.dec_model(inputs_embeds=pref_with_embed, labels=target_with_pre)
        return output

    def generate(self, quest):
        enc_logits = self.encoding(quest)
        prefix_se = '[BOS]'
        prefix_dec_embed = self.get_embedding(prefix_se)
        # prefixs = self.add_prefix(enc_logits)
        
        # pref_with_embed = torch.cat((enc_logits.unsqueeze(dim=0), prefix_dec_embed), dim=1)

        output = self.dec_model.generate(   inputs_embeds=prefix_dec_embed, 
                                            max_length=self.max_length, 
                                            pad_token_id=self.dec_model.config.eos_token_id)
        returned_output = []
        for i in range(len(output[0])):
            if output[0][i] != self.dec_model.config.eos_token_id:
                returned_output.append(output[0][i])
            else:
                break
        return torch.tensor(returned_output).unsqueeze(0)

In [37]:
encoder_id = 'indobenchmark/indobert-large-p1'
print("initiliazing encoder model and tokenizer : {}".format(encoder_id))
enc_tokenizer = AutoTokenizer.from_pretrained(encoder_id, clean_up_tokenization_spaces=True)
enc_model = AutoModel.from_pretrained(encoder_id)

decoder_id = 'indonesian-nlp/gpt2-medium-indonesian'
print("initiliazing decoder model and tokenizer : {}".format(decoder_id))
dec_model = GPT2LMHeadModel.from_pretrained(decoder_id)
dec_tokenizer = GPT2Tokenizer.from_pretrained(decoder_id, clean_up_tokenization_spaces=True)

dec_tokenizer.add_tokens(['[PRE1]'])
dec_tokenizer.add_tokens(['[PRE2]'])
dec_tokenizer.add_tokens(['[PRE3]'])
dec_tokenizer.add_special_tokens({'pad_token': '[PAD]',
                                    'bos_token': '[BOS]',
                                    'eos_token': '[EOS]',
                                    'sep_token': '[SEP]',})
dec_model.config.pad_token_id = dec_tokenizer.pad_token_id
dec_model.config.bos_token_id = dec_tokenizer.bos_token_id
dec_model.config.eos_token_id = dec_tokenizer.eos_token_id
dec_model.config.sep_token_id = dec_tokenizer.sep_token_id
dec_model.resize_token_embeddings(len(dec_tokenizer))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
enc_model = enc_model.to(device)
dec_model = dec_model.to(device)
print("finished initiliazing encoder model and tokenizer")

for param in enc_model.parameters():
    param.requires_grad = False

for param in dec_model.parameters():
    param.requires_grad = False

# for param in dec_model.transformer.h[:-15].parameters():
#     param.requires_grad = True

for param in enc_model.encoder.layer[:-15].parameters():
    param.requires_grad = True

print("Encoder Trainable Parameters : {}%".format(sum(p.numel() for p in enc_model.parameters() if p.requires_grad)/sum(p.numel() for p in enc_model.parameters())*100))
print("Decoder Trainable Parameters : {}%".format(sum(p.numel() for p in dec_model.parameters() if p.requires_grad)/sum(p.numel() for p in dec_model.parameters())*100))

initiliazing encoder model and tokenizer : indobenchmark/indobert-large-p1


KeyboardInterrupt: 

In [93]:
model = FLUENTSOTA(enc_model, dec_model, enc_tokenizer, dec_tokenizer)
model.to(device)

all_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
untrainable_params = all_params - trainable_params
print(f'All parameters: {all_params:,}')
print(f'Trainable parameters: {trainable_params:,} ({trainable_params/all_params*100:.2f}%)')
print(f'Untrainable parameters: {untrainable_params:,} ({untrainable_params/all_params*100:.2f}%)')

All parameters: 693,122,048
Trainable parameters: 116,515,840 (16.81%)
Untrainable parameters: 576,606,208 (83.19%)


In [94]:

bleu_score_eval = pd.DataFrame(columns=['Epoch', '1-gram', '2-gram', '3-gram', '4-gram', 'cumulative-1-gram', 'cumulative-2-gram', 'cumulative-3-gram', 'cumulative-4-gram'])
bleu_score_train = pd.DataFrame(columns=['Epoch', '1-gram', '2-gram', '3-gram', '4-gram', 'cumulative-1-gram', 'cumulative-2-gram', 'cumulative-3-gram', 'cumulative-4-gram'])

questions = qa_paired['Pertanyaan'].apply(lambda x: x.lower().replace('[BOS]', '').replace('[EOS]', '')).to_list()
answers = qa_paired['Jawaban'].apply(lambda x: x.replace('[BOS]', '').replace('[EOS]', '').lower().strip()).to_list()
questions_eval = qa_paired_eval['Pertanyaan'].apply(lambda x: x.lower().replace('[BOS]', '').replace('[EOS]', '')).to_list()
answers_eval = qa_paired_eval['Jawaban'].apply(lambda x: x.replace('[BOS]', '').replace('[EOS]', '').lower().strip()).to_list()

epochs = 500
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

run = Neptune_Fluent.mulai(encoder_id, decoder_id, num_pre_token=0)
bleu_result_eval = {"cumulative-4-gram":0}
bleu_result_train = {"cumulative-4-gram":0}
chrf_result_eval = 0
chrf_result_train = 0


[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/andialifs/fluent-tesis-playground-24/e/FLUEN-119


In [96]:
print("start training")
for ep in range(epochs):
    torch.cuda.empty_cache()

    total_loss = 0
    for instance in qa_paired.iterrows():
        # print("---------------")
        optimizer.zero_grad()

        pertanyaan = instance[1]['Pertanyaan']
        jawaban = instance[1]['Jawaban']
        jawaban_withpre = '[PRE1]' + jawaban

        tokenized_jawaban_withpre = model.dec_tokenizer(jawaban_withpre)
        tokenized_jawaban_withpre = torch.tensor(tokenized_jawaban_withpre['input_ids']).unsqueeze(0)

        enc_logits = model.encoding(pertanyaan)
        output = model.decoding_train(enc_logits, target=jawaban, target_with_pre=tokenized_jawaban_withpre)

        loss = output.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    run["train/loss"].append(total_loss)
    print(f'Epoch {ep+1}/{epochs} - Loss: {total_loss:.4f}')
    if (ep+1) % 10 == 0:
        print(f'\n-----------------------------------------')
        test_question = qa_paired['Pertanyaan'].iloc[0]
        outputs = model.generate(test_question)
        decoded_output = model.dec_tokenizer.decode(outputs[0])
        print(f'Q >>> {test_question}')
        print(f'A <<< {decoded_output}')
        test_question = qa_paired['Pertanyaan'].iloc[1]
        outputs = model.generate(test_question)
        decoded_output = model.dec_tokenizer.decode(outputs[0])
        print(f'Q >>> {test_question}')
        print(f'A <<< {decoded_output}')
        test_question = qa_paired['Pertanyaan'].iloc[4]
        outputs = model.generate(test_question)
        decoded_output = model.dec_tokenizer.decode(outputs[0])
        print(f'Q >>> {test_question}')
        print(f'A <<< {decoded_output}')
        print(f'-----------------------------------------\n')

    if (ep+1) % 10 == 0:
        preds_eval = generate_predictions(model, questions_eval)

        bleu_result_eval = calculate_bleu(preds_eval, questions_eval, answers_eval)
        bleu_score_eval = pd.concat([bleu_score_eval, pd.DataFrame({'Epoch': ep+1, **bleu_result_eval}, index=[len(bleu_score_eval)])], ignore_index=True)
        print(f'BLEU Score Eval: {bleu_result_eval["cumulative-4-gram"]:.4f}\n')

        chrf_result_eval = compute_average_chrf(preds_eval, answers_eval)
        print(f'CHRF Score Eval: {chrf_result_eval:.4f}\n')

        preds_train = generate_predictions(model, questions)
        bleu_result_train = calculate_bleu(preds_train, questions, answers)
        bleu_score_train = pd.concat([bleu_score_train, pd.DataFrame({'Epoch': ep+1, **bleu_result_train}, index=[len(bleu_score_train)])], ignore_index=True)
        print(f'BLEU Score Train: {bleu_result_train["cumulative-4-gram"]:.4f}\n')

        chrf_result_train = compute_average_chrf(preds_train, answers)
        print(f'CHRF Score Train: {chrf_result_train:.4f}\n')

    run["eval/chrf"].append(chrf_result_eval)
    run["train/chrf"].append(chrf_result_train)
    run["eval/bleu"].append(bleu_result_eval["cumulative-4-gram"])
    run["train/bleu"].append(bleu_result_train["cumulative-4-gram"])

run.stop()
print("finished training")

start training
Epoch 1/500 - Loss: 212.6453
Epoch 2/500 - Loss: 202.6142
Epoch 3/500 - Loss: 196.0266
Epoch 4/500 - Loss: 192.2536
Epoch 5/500 - Loss: 189.2624
Epoch 6/500 - Loss: 189.3780
Epoch 7/500 - Loss: 185.6183
Epoch 8/500 - Loss: 183.3109
Epoch 9/500 - Loss: 181.6206
Epoch 10/500 - Loss: 179.8918

-----------------------------------------
Q >>> visi filkom
A <<< [PAD], S[PAD] [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1] Pangela,
, yang harus memiliki produk.

 yang.. yang.



 di di-S.
S[PAD].1.1.
2-3S.
1.
1.	1.0:1
1.0:2	2
1.0:2	2
1.1.	
Q >>> misi filkom
A <<< [PAD], S[PAD] [PRE1], S [PRE1], S [PRE1], S [PRE1], S [PRE1], S

KeyboardInterrupt: 

In [77]:
tokenized_jawaban_withpre.shape

torch.Size([1, 33])

In [65]:
enc_logits.shape

torch.Size([1, 1024])

In [71]:
model.get_embedding(jawaban).shape

torch.Size([1, 33, 1024])

In [76]:
enc_logits.unsqueeze(dim=0).shape

torch.Size([1, 1, 1024])

In [84]:
torch.cat(model.get_embedding(jawaban), dim=1).shape

TypeError: cat() received an invalid combination of arguments - got (Tensor, dim=int), but expected one of:
 * (tuple of Tensors tensors, int dim, *, Tensor out)
 * (tuple of Tensors tensors, name dim, *, Tensor out)


In [11]:
encoded_jawaban = model.get_embedding(jawaban)

In [13]:
encoded_jawaban.shape

torch.Size([1, 33, 1024])

In [20]:
torch.cat((enc_logits.unsqueeze(dim=0), encoded_jawaban), dim=1).shape

torch.Size([1, 34, 1024])

In [18]:
enc_logits.unsqueeze(dim=0).shape

torch.Size([1, 1, 1024])

In [2]:
# save the model
torch.save(model.state_dict(), 'model_state_dict_best_2225.pth')

In [4]:
test_question = "apa visi filkom"
outputs = model.generate(test_question)
decoded_output = model.dec_tokenizer.decode(outputs[0])
print(f'Q >>> {test_question}')
print(f'A <<< {decoded_output}')

Q >>> apa visi filkom
A <<< menjadi fakultas yang berdaya saing internasional dan berkontribusi kepada pengembangan teknologi informasi dan ilmu komputer untuk menunjang industri dan masyarakat dengan menyelaraskan pelaksanaan pendidikan, penelitian, dan pengabdian kepada masyarakat


In [8]:
test_question = "apa EMAIL pak fitra bachtiar"
outputs = model.generate(test_question)
decoded_output = model.dec_tokenizer.decode(outputs[0])
print(f'Q >>> {test_question}')
print(f'A <<< {decoded_output}')

Q >>> apa EMAIL pak fitra bachtiar
A <<< anglia.ris@gmail.com.brilliantadita@yahoo.com.mynameirawan.web.idokter.com (Telusuri Lebih terperinci BAB I PENDAHULUAN. A. Latar Belakang Masalah. Perkembangan teknologi informasi yang semakin pesat, menuntut perusahaan untuk
BAB I PENDAHULUAN A. Latar Belakang Masalah Perkembangan teknologi informasi yang semakin pesat, menuntut perusahaan untuk terus meningkatkan kualitas dan kuantitas produk yang ditawarkan kepada konsumen. Kualitas produk yang ditawarkan perusahaan dapat dilihat Lebih terperinci BAB I PENDAHULUAN. 1.1 Latar Belakang Masalah. Perkembangan teknologi informasi yang semakin pesat, menuntut perusahaan untuk
BAB I PENDAHULUAN 1.1 Latar Belakang Masalah Perkembangan teknologi informasi yang semakin pesat, menuntut perusahaan untuk terus meningkatkan kualitas dan kuantitas produk yang ditawarkan kepada konsumen. Kualitas produk yang ditawarkan perusahaan dapat Lebih terperinci BAB I PENDAHULUAN. 1.1 Latar Belakang Masalah. Perkemba