In [1]:
import torch
from sentence_transformers import SentenceTransformer, models
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets
import pandas as pd

In [2]:
# Assemble the sentence encoder from two stages: the pretrained transformer
# backbone, followed by a pooling layer sized to the backbone's token
# embedding dimension (Pooling is left on its default settings).
word_emb = models.Transformer("denaya/indoSBERT-large")
embedding_dim = word_emb.get_word_embedding_dimension()
pooling = models.Pooling(embedding_dim)
model = SentenceTransformer(modules=[word_emb, pooling])

In [3]:
# Prefer the GPU when CUDA is available; otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
# Kept as the cell's last expression so the moved model's repr is displayed.
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [4]:
# Load the cleaned question/answer pairs used for fine-tuning.
df = pd.read_csv('../datasets/qa_id_clean.csv')

# Build one (question, answer) training pair per row. The two text columns are
# zipped directly instead of using DataFrame.iterrows(), which constructs a
# full Series for every row and is needlessly slow when only two columns are
# read.
train_examples = [
    InputExample(texts=[question, answer])
    for question, answer in zip(df['question_id'], df['answer_id'])
]
# Last expression of the cell: show the loaded frame for a visual sanity check.
df

Unnamed: 0,idx,question_id,answer_id
0,0,Apa satu - satunya buku yang bebas dari keragu...,(Kitab ini) yakni yang dibaca oleh Muhammad sa...
1,1,"(Apakah buah-buahan di surga) maksudnya, buah-...",(Dan sampaikanlah berita gembira) kabarkanlah ...
2,2,Berapa banyak kematian dan berapa banyak nyawa...,"Setelah itu, Dia menghidupkan kalian lagi untu..."
3,3,Berapa banyak surga yang ada?,(Dialah yang telah menciptakan bagimu segala y...
4,4,Adam tidak pernah disentuh oleh malaikat kecua...,(Dan diajarkan-Nya kepada Adam nama-nama) maks...
...,...,...,...
1186,1218,(Dan siapakah) Istifham atau kata tanya di sin...,Dia Sarah.
1187,1219,Sesungguhnya Ibrahim itu benar-benar seorang y...,Sesungguhnya di antara yang mengikuti jejak da...
1188,1220,Nabi Luth merasa takut kaumnya akan melakukan ...,(Maka Yusuf berlaku takabur terhadap mereka ka...
1189,1221,(Mengapa) lafal Bal di sini menunjukkan makna ...,Jika aku mempunyai kekuatan dan dukungan dari ...


In [5]:
# MultipleNegativesRankingLoss requires input pairs (query, relevant_passage)
# and trains the model so that it is suitable for semantic search.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# For this loss it is important that a batch contains no duplicate entries,
# i.e. no two equal queries and no two equal paragraphs. A special data loader
# is used to ensure that.
train_dataloader = datasets.NoDuplicatesDataLoader(
    train_examples=train_examples,
    batch_size=4,
)

In [6]:
# Fine-tune the model. warmup_steps is set to 10% of the total number of
# training steps (batches per epoch times number of epochs).
num_epochs = 3
total_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_steps * 0.1)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.6766


In [4]:
# Name shared by the later cells: passed to model.save(), to
# SentenceTransformer() when reloading, and to push_to_hub() as the repo name.
output_dir = "indo-sbert-finetuned-qa"


In [None]:
# Persist the fine-tuned model to disk under `output_dir`.
model.save(path=output_dir)

In [3]:
# Log in to the Hugging Face Hub from inside the notebook; the call renders
# an interactive login widget as the cell output.
# NOTE(review): presumably required before the push_to_hub() call further
# down — confirm. Consider moving this import to the top import cell.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Reload the model identified by `output_dir`, rebinding `model` to it.
model = SentenceTransformer(model_name_or_path=output_dir)

In [8]:
# Upload the model to the Hugging Face Hub, reusing `output_dir` as the
# repository name. Kept as the last expression so the returned value is shown.
model.push_to_hub(
    repo_id=output_dir,
    commit_message="Initial fine-tuned IndoSBERT QA",
)

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

'https://huggingface.co/devvevan/indo-sbert-finetuned-qa/commit/c4c8fe6f5a04833d349efb33189cec3bce0e2281'

In [9]:
# Load the published checkpoint by its full Hub repository id.
model_finetune = SentenceTransformer(
    model_name_or_path='devvevan/indo-sbert-finetuned-qa'
)

modules.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/26.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/56.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/709k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/306 [00:00<?, ?B/s]