# Test BERT

In [1]:
a = "Katakanlah (Muhammad), “Dialah Allah, Yang Maha Esa."
b = "(Allah) tidak beranak dan tidak pula diperanakkan."
c = "dari kejahatan (bisikan) setan yang bersembunyi,"
d = "yang membisikkan (kejahatan) ke dalam dada manusia,"
e = "dari (golongan) jin dan manusia."
query = "apakah sifat Allah?"

## Pre

In [12]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Inisialisasi tokenizer dan model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')



In [3]:
docs = [a,b,c,d,e]

### Embedding

In [4]:
# Fungsi mendapatkan input output
def get_input_output(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    outputs = model(**inputs)
    
    return inputs, outputs


In [5]:
def get_embedding(text):
    inputs, outputs = get_input_output(text)
    # embedding dengan[CLS] token
    return outputs.last_hidden_state[:, 0, :].detach().numpy()

# Rata-rata embedding
def get_average_embedding(text):
    inputs, outputs = get_input_output(text)
    # Mengambil rata-rata dari semua token
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Max embedding
def get_max_embedding(text):
    inputs, outputs = get_input_output(text)
    # Mengambil nilai maksimum dari semua token
    return outputs.last_hidden_state.max(dim=1)

In [6]:
doc_embeddings = get_average_embedding(docs)
doc_embeddings

array([[ 0.07509732,  0.49811694, -0.33625567, ..., -0.2211533 ,
         0.0691896 ,  0.12043393],
       [ 0.35042688,  0.19091715, -0.4395075 , ..., -0.14120379,
        -0.32346094,  0.2990469 ],
       [ 0.37830707,  0.09564307, -0.6380641 , ..., -0.4317989 ,
        -0.51349443,  0.26083744],
       [ 0.26075327,  0.13323869, -0.76686186, ..., -0.40824568,
        -0.5882899 ,  0.22992945],
       [-0.24809662, -0.1499107 , -0.15878761, ...,  0.04959118,
         0.03342872, -0.11998825]], dtype=float32)

In [7]:
query_embedding = get_average_embedding(query)
query_embedding

array([[-1.29455728e-02,  2.47544661e-01, -9.55722928e-01,
        -1.07036710e+00, -2.98533201e-01,  2.28359904e-02,
         7.56321967e-01,  1.72838509e-01, -6.21139742e-02,
        -2.05078840e-01,  2.08927751e-01, -6.44430444e-02,
        -1.63532794e-01,  1.45086467e-01, -2.21821815e-01,
         4.66727316e-01,  8.91471952e-02,  2.75171012e-01,
        -3.34524930e-01,  5.95268250e-01, -2.10674718e-01,
        -3.51609290e-02, -1.70310393e-01, -3.13347936e-01,
         3.73417288e-01,  3.28889489e-01,  3.10039967e-01,
         2.23296821e-01, -2.48845741e-01, -1.20177537e-01,
        -1.90209925e-01,  1.36616945e-01, -4.05477881e-01,
         1.30625248e-01, -2.90828440e-02,  2.34384611e-01,
         1.81003600e-01,  3.31435829e-01, -6.75425455e-02,
         5.40553987e-01, -5.44589043e-01, -6.71113670e-01,
        -4.91361022e-01,  3.16072494e-01,  1.09464951e-01,
        -4.56363142e-01,  5.74175455e-02,  1.50016807e-02,
        -2.62891561e-01,  1.17293015e-01,  1.05489627e-0

In [11]:
embedding_method = get_embedding

doc_embeddings = [get_embedding(doc) for doc in docs]
doc_embeddings

[array([[-7.28537917e-01,  5.99570990e-01, -3.54755819e-01,
         -6.19610310e-01, -5.18256485e-01,  3.33274305e-01,
          6.76107883e-01,  4.04116899e-01, -1.51512697e-01,
         -3.63368064e-01, -1.35359615e-01,  3.27885717e-01,
         -3.33769023e-01,  6.23366475e-01,  5.69079995e-01,
          4.49398667e-01, -2.52441585e-01,  3.36901277e-01,
          2.83238173e-01,  2.22648293e-01, -4.38216448e-01,
         -6.97299987e-02, -1.57086447e-01, -7.44494870e-02,
          4.31113839e-01,  2.69259334e-01,  2.56258398e-01,
          1.34301856e-02, -2.58968845e-02,  1.16055019e-01,
         -3.06688637e-01,  4.04180348e-01, -4.27829087e-01,
         -2.04765685e-02,  3.76607805e-01,  2.39164755e-01,
          1.85631350e-01, -7.51097724e-02,  2.36293033e-01,
         -7.58920163e-02,  1.06385790e-01,  3.79870236e-02,
          3.25797498e-01, -2.06328809e-01, -9.24938545e-02,
         -4.19134259e-01, -2.83085394e+00,  3.12698632e-02,
         -6.25055432e-01, -5.10766029e-0

### Tokenisasi - Embedding

In [8]:
# Cek similarity dengan dokumen a

input_a = tokenizer(a, return_tensors='pt', padding=True, truncation=True)
input_query = tokenizer(query, return_tensors='pt', padding=True, truncation=True)

# Get hasil tokenisasi
token_a = tokenizer.convert_ids_to_tokens(input_a['input_ids'][0])
token_a_id = input_a['input_ids'][0].tolist()

token_query = tokenizer.convert_ids_to_tokens(input_query['input_ids'][0])
token_query_id = input_query['input_ids'][0].tolist()

print(f"TOKEN_A:\n{token_a}")
print(f"TOKEN_A_ID:\n{token_a_id}")
print()
print(f"TOKEN_QUERY:\n{token_query}")
print(f"TOKEN_QUERY_ID:\n{token_query_id}")
print()
print("="*100)
print()
print()
# Get embedding
doc_embedding = get_embedding(a)
query_embedding = get_embedding(query)

print(f"DOC_EMBEDDING:\n{doc_embedding}")
print()
print(f"QUERY_EMBEDDIN:\n{query_embedding}")
print()
print("="*100)
print()
print()



TOKEN_A:
['[CLS]', 'kata', '##kan', '##lah', '(', 'muhammad', ')', ',', '“', 'dial', '##ah', 'allah', ',', 'yang', 'maha', 'esa', '.', '[SEP]']
TOKEN_A_ID:
[101, 29354, 9126, 14431, 1006, 7187, 1007, 1010, 1523, 13764, 4430, 16455, 1010, 8675, 24404, 28776, 1012, 102]

TOKEN_QUERY:
['[CLS]', 'ap', '##aka', '##h', 'si', '##fat', 'allah', '?', '[SEP]']
TOKEN_QUERY_ID:
[101, 9706, 11905, 2232, 9033, 27753, 16455, 1029, 102]



DOC_EMBEDDING:
[[-7.28537917e-01  5.99570990e-01 -3.54755819e-01 -6.19610310e-01
  -5.18256485e-01  3.33274305e-01  6.76107883e-01  4.04116899e-01
  -1.51512697e-01 -3.63368064e-01 -1.35359615e-01  3.27885717e-01
  -3.33769023e-01  6.23366475e-01  5.69079995e-01  4.49398667e-01
  -2.52441585e-01  3.36901277e-01  2.83238173e-01  2.22648293e-01
  -4.38216448e-01 -6.97299987e-02 -1.57086447e-01 -7.44494870e-02
   4.31113839e-01  2.69259334e-01  2.56258398e-01  1.34301856e-02
  -2.58968845e-02  1.16055019e-01 -3.06688637e-01  4.04180348e-01
  -4.27829087e-01 -2.04765685

## Test Similarity

In [9]:
similarity_result = cosine_similarity(query_embedding, doc_embedding)
similarity_result

array([[0.89550143]], dtype=float32)

In [10]:

# embedding rata-rata
def get_average_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    outputs = model(**inputs)

    return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

doc_a = a

# Embedding
doc_a_embedding = get_average_embedding(doc_a)
query_embedding = get_average_embedding(query)

# Menghitung dot product
dot_product = np.dot(doc_a_embedding, query_embedding)

# Menghitung norm vektor
norm_doc_a = np.linalg.norm(doc_a_embedding)
norm_query = np.linalg.norm(query_embedding)

# Menghitung cosine similarity
cosine_similarity = dot_product / (norm_doc_a * norm_query)

print(f"Similarity average embedding dokumen a dan query: {cosine_similarity}")

Similarity average embedding dokumen a dan query: 0.8138759136199951


# Dokumen Asli

## Coba Training

In [14]:
import pandas as pd

# Baca dataset
df = pd.read_csv('dataset_preprocessed.csv')
print(df.head())

     id  suraId  verseID                            ayahText  \
0  6221     112        1           قُلْ هُوَ اللّٰهُ اَحَدٌۚ   
1  6222     112        2                 اَللّٰهُ الصَّمَدُۚ   
2  6223     112        3        لَمْ يَلِدْ وَلَمْ يُوْلَدْۙ   
3  6224     112        4  وَلَمْ يَكُنْ لَّهٗ كُفُوًا اَحَدٌ   
4  6225     113        1    قُلْ اَعُوْذُ بِرَبِّ الْفَلَقِۙ   

                                            indoText  \
0  Katakanlah (Muhammad), “Dialah Allah, Yang Mah...   
1               Allah tempat meminta segala sesuatu.   
2  (Allah) tidak beranak dan tidak pula diperanak...   
3     Dan tidak ada sesuatu yang setara dengan Dia."   
4  Katakanlah, “Aku berlindung kepada Tuhan yang ...   

                         readText  \
0              qul huwallāhu aḥad   
1                   allāhuṣ-ṣamad   
2          lam yalid wa lam yụlad   
3  wa lam yakul lahụ kufuwan aḥad   
4        qul a'ụżu birabbil-falaq   

                                      processed_text  


In [17]:
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments
from datasets import Dataset

# Load model dan tokenizer
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:


# Konversi DataFrame ke Dataset Hugging Face
dataset = Dataset.from_pandas(df[['indoText']])

# Tokenisasi dataset
def tokenize_function(examples):
    return tokenizer(examples['indoText'], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Konfigurasi pelatihan
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Lanjutkan pelatihan
trainer.train()

Map: 100%|██████████| 15/15 [00:00<00:00, 260.78 examples/s]


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`