In [2]:
from rank_bm25 import BM25Okapi
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
corpus = [
    "Sesungguhnya Allah menyuruh kamu berlaku adil dan berbuat kebajikan.",  
    "Janganlah sekali-kali kebencianmu terhadap suatu kaum mendorong kamu untuk berlaku tidak adil.",
    "Allah tidak menyukai orang yang berbuat zalim."
]
query = "adil"

In [4]:
# BM25
tokenized_corpus = [doc.lower().split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)
query = query.lower().split()
bm25_scores = bm25.get_scores(query)


In [5]:
# Bert
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertModel.from_pretrained('bert-base-multilingual-uncased')

def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy() #CLStoken

query_embedding = get_bert_embedding(query)
doc_embeddings = [get_bert_embedding(doc) for doc in corpus]

bert_scores = [cosine_similarity(query_embedding, doc_embedding).item() for doc_embedding in doc_embeddings]




In [6]:
# Match
alpha = 0.5  # You can tune this parameter
hybrid_scores = [alpha * bm25_scores[i] + (1 - alpha) * bert_scores[i] for i in range(len(corpus))]

In [7]:
print("BM25 Scores:", bm25_scores)
print("BERT Scores:", bert_scores)
print("Hybrid Scores:", hybrid_scores)

BM25 Scores: [0.51916942 0.         0.        ]
BERT Scores: [0.9874471426010132, 0.9884476661682129, 0.987008810043335]
Hybrid Scores: [0.7533082802922986, 0.49422383308410645, 0.4935044050216675]
