In [55]:
ayat = {
    (112,1) : "Katakanlah (Muhammad), “Dialah Allah, Yang Maha Esa.",
    (112,3) : "(Allah) tidak beranak dan tidak pula diperanakkan.",
    (114,4) : "dari kejahatan (bisikan) setan yang bersembunyi,",
    (114,5) : "yang membisikkan (kejahatan) ke dalam dada manusia,",
    (114,6) : "dari (golongan) jin dan manusia."
    }

In [56]:
for key, value in ayat.items():
    print(f"Key: {key}, Value: {value}")

Key: (112, 1), Value: Katakanlah (Muhammad), “Dialah Allah, Yang Maha Esa.
Key: (112, 3), Value: (Allah) tidak beranak dan tidak pula diperanakkan.
Key: (114, 4), Value: dari kejahatan (bisikan) setan yang bersembunyi,
Key: (114, 5), Value: yang membisikkan (kejahatan) ke dalam dada manusia,
Key: (114, 6), Value: dari (golongan) jin dan manusia.


In [57]:
query = "Siapakah Allah? Tuhan?"

In [58]:
import torch
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
model = AutoModel.from_pretrained("indolem/indobert-base-uncased")

# Function to get embedding from BERT with visible tokenization output
def get_embedding(text):
    # Tokenizing the input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    
    # Print the tokenized components
    tokens = tokenizer.tokenize(text)  # Get tokens as text
    token_ids = inputs['input_ids'][0].numpy()  # Convert tensor to numpy array for readability
    
    # Display tokenization details
    print(f"Original Text: {text}")
    print(f"Tokens: {tokens}")
    print(f"Token IDs: {token_ids}")
    print(f"Attention Mask: {inputs['attention_mask'][0].numpy()}")
    print("\n" + "="*50 + "\n")
    
    # Pass through model without computing gradients
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Return the embedding from [CLS] token representation
    return outputs.last_hidden_state[:, 0, :].numpy()

In [59]:
query_embedding = get_embedding(query)


Original Text: Siapakah Allah? Tuhan?
Tokens: ['siapakah', 'allah', '?', 'tuhan', '?']
Token IDs: [    3 11436  2211    35  2702    35     4]
Attention Mask: [1 1 1 1 1 1 1]




In [60]:
ayat_embeddings = {}
# Iterate over ayat items and compute embeddings
for key, text in ayat.items():
    embedding = get_embedding(text)  # Get embedding for each ayat
    ayat_embeddings[key] = embedding  # Store in dictionary with (surat, ayat) as key



Original Text: Katakanlah (Muhammad), “Dialah Allah, Yang Maha Esa.
Tokens: ['katakanlah', '(', 'muhammad', ')', ',', '“', 'dialah', 'allah', ',', 'yang', 'maha', 'esa', '.']
Token IDs: [    3 15901    12  3344    13    16   394 30087  2211    16  1497  5774
 12319    18     4]
Attention Mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Original Text: (Allah) tidak beranak dan tidak pula diperanakkan.
Tokens: ['(', 'allah', ')', 'tidak', 'beranak', 'dan', 'tidak', 'pula', 'diper', '##ana', '##kk', '##an', '.']
Token IDs: [    3    12  2211    13  1580 21196  1501  1580  2547  1942  1638 20326
  1476    18     4]
Attention Mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Original Text: dari kejahatan (bisikan) setan yang bersembunyi,
Tokens: ['dari', 'kejahatan', '(', 'bisikan', ')', 'setan', 'yang', 'bersembunyi', ',']
Token IDs: [    3  1542  5390    12 27964    13  7904  1497 12466    16     4]
Attention Mask: [1 1 1 1 1 1 1 1 1 1 1]


Original Text: yang membisikkan (kejahatan) ke dalam dada manusia,


In [61]:
# Display embeddings for each ayat
for key, embedding in ayat_embeddings.items():
    print(f"Ayat {key}:")
    print(embedding)
    print("\n" + "="*50 + "\n")

Ayat (112, 1):
[[-8.73601437e-02 -4.51500043e-02  2.97284544e-01 -1.79190129e-01
   2.85478532e-01 -1.04392755e+00 -8.80445123e-01  1.09398507e-01
  -3.89738411e-01  4.87178773e-01 -1.59396112e+00 -2.37119526e-01
   2.44336918e-01 -9.73214731e-02 -4.05141532e-01 -6.06457353e-01
  -6.70627058e-01  4.37325060e-01  4.05439496e-01 -9.72454548e-02
  -1.26650810e+00 -3.61795902e-01  4.45035219e-01 -5.13909161e-02
   5.96818447e-01  2.84123152e-01  9.83828679e-02 -8.65835369e-01
   2.95012426e+00  2.23200187e-01 -6.72559559e-01 -3.52513909e-01
  -1.41514093e-02  6.14915907e-01  1.21849167e+00  7.45832086e-01
  -2.40998432e-01 -1.62848175e+00  8.89930651e-02 -4.82536495e-01
   3.73176277e-01  3.46411854e-01  1.40414762e+00 -4.49392349e-01
   1.36144209e+00 -2.03973264e-01 -2.19901398e-01 -5.72542906e-01
   2.63175607e-01  9.69822943e-01  7.15298295e-01  5.05855143e-01
  -1.17385697e+00  2.63335794e-01  1.38817716e+00  5.08519113e-01
  -3.80247176e-01  1.00924325e+00 -6.74903035e-01  1.40860140

In [62]:
from sklearn.metrics.pairwise import cosine_similarity

# Dictionary to store embeddings of ayat and their similarity scores
ayat_embeddings = {}
similarities = {}

# Iterate over ayat items, compute embeddings, and calculate similarity with query
for key, text in ayat.items():
    ayat_embedding = get_embedding(text)  # Get embedding for each ayat
    ayat_embeddings[key] = ayat_embedding  # Store embedding for reference
    # Calculate cosine similarity between query and ayat embeddings
    similarity_score = cosine_similarity(query_embedding, ayat_embedding)[0][0]
    similarities[key] = similarity_score

Original Text: Katakanlah (Muhammad), “Dialah Allah, Yang Maha Esa.
Tokens: ['katakanlah', '(', 'muhammad', ')', ',', '“', 'dialah', 'allah', ',', 'yang', 'maha', 'esa', '.']
Token IDs: [    3 15901    12  3344    13    16   394 30087  2211    16  1497  5774
 12319    18     4]
Attention Mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Original Text: (Allah) tidak beranak dan tidak pula diperanakkan.
Tokens: ['(', 'allah', ')', 'tidak', 'beranak', 'dan', 'tidak', 'pula', 'diper', '##ana', '##kk', '##an', '.']
Token IDs: [    3    12  2211    13  1580 21196  1501  1580  2547  1942  1638 20326
  1476    18     4]
Attention Mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Original Text: dari kejahatan (bisikan) setan yang bersembunyi,
Tokens: ['dari', 'kejahatan', '(', 'bisikan', ')', 'setan', 'yang', 'bersembunyi', ',']
Token IDs: [    3  1542  5390    12 27964    13  7904  1497 12466    16     4]
Attention Mask: [1 1 1 1 1 1 1 1 1 1 1]


Original Text: yang membisikkan (kejahatan) ke dalam dada manusia,


In [63]:
# Rank ayat by similarity
ranked_ayat = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# Display ranked ayat with their similarity scores
for key, score in ranked_ayat:
    print(f"Ayat {key} \n(Score: {score}): {ayat[key]}")
    print( "\n", "="*50, "\n")

Ayat (112, 1) 
(Score: 0.25088971853256226): Katakanlah (Muhammad), “Dialah Allah, Yang Maha Esa.


Ayat (112, 3) 
(Score: 0.24635805189609528): (Allah) tidak beranak dan tidak pula diperanakkan.


Ayat (114, 5) 
(Score: 0.22163794934749603): yang membisikkan (kejahatan) ke dalam dada manusia,


Ayat (114, 6) 
(Score: 0.2055586278438568): dari (golongan) jin dan manusia.


Ayat (114, 4) 
(Score: 0.16919982433319092): dari kejahatan (bisikan) setan yang bersembunyi,




# BM25

In [None]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string

import numpy as np
def bm25_tokenizer(text):
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)

        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc

In [68]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
import numpy as np

# Sample ayat dictionary
contoh_ayat = {
    (112,1) : "Katakanlah (Muhammad), “Dialah Allah, Yang Maha Esa.",
    (112,3) : "(Allah) tidak beranak dan tidak pula diperanakkan.",
    (114,4) : "dari kejahatan (bisikan) setan yang bersembunyi,",
    (114,5) : "yang membisikkan (kejahatan) ke dalam dada manusia,",
    (114,6) : "dari (golongan) jin dan manusia."
}

# Convert dictionary to list for use in BM25
docs = list(contoh_ayat.values())
ayat_keys = list(contoh_ayat.keys())

# Query for Information Retrieval
query = "Siapa Allah? Tuhan? bukan manusia?"

# Define BM25 tokenizer function
def bm25_tokenizer(text):
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)
        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc

# Tokenize the corpus
tokenized_corpus = [bm25_tokenizer(doc) for doc in docs]

# Initialize BM25
bm25 = BM25Okapi(tokenized_corpus)

# Tokenize the query and calculate BM25 scores
print("Query:", query)
bm25_scores = bm25.get_scores(bm25_tokenizer(query))

# Get top 10 results
top_n = np.argpartition(bm25_scores, -5)[-5:]
bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

# Display top 10 BM25 results
print("Top-10 lexical search (BM25) hits:")
for hit in bm25_hits[0:5]:
    ayat_key = ayat_keys[hit['corpus_id']]
    ayat_text = docs[hit['corpus_id']]
    print(f"\t{hit['score']:.3f}\tAyat {ayat_key}: {ayat_text}")


Query: Siapa Allah? Tuhan? bukan manusia?
Top-10 lexical search (BM25) hits:
	0.373	Ayat (114, 6): dari (golongan) jin dan manusia.
	0.323	Ayat (112, 3): (Allah) tidak beranak dan tidak pula diperanakkan.
	0.323	Ayat (112, 1): Katakanlah (Muhammad), “Dialah Allah, Yang Maha Esa.
	0.323	Ayat (114, 5): yang membisikkan (kejahatan) ke dalam dada manusia,
	0.000	Ayat (114, 4): dari kejahatan (bisikan) setan yang bersembunyi,
