# Bi-Encoder Re-ranker (Score Fusion)

The embeddings of documents and query is done separately

In [None]:
from sentence_transformers import SentenceTransformer

documents = [
    "This is a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
    "Understanding document structure aids in keyword extraction.",
    "Efficient keyword extraction enhances search accuracy.",
    "Semantic similarity improves document retrieval performance.",
    "Machine learning algorithms can optimize keyword extraction methods."
]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encoding documents
document_embeddings = model.encode(documents, show_progress_bar=True)
len(document_embeddings[0])

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.70it/s]


384

In [None]:
# Encoding the query
query = "Natural language processing techniques enhance keyword extraction efficiency."
query_embedding = model.encode(query)
len(query_embedding)

384

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Calculating cosine similarity between the query and document embeddings
similarities = cosine_similarity(np.array([query_embedding]), document_embeddings)
similarities

array([[0.2058035 , 0.56699526, 0.6105436 , 0.5538638 , 0.7148342 ,
        0.81133276, 0.3977112 , 0.7649621 ]], dtype=float32)

In [8]:
most_similar_index = np.argmax(similarities)
most_similar_index

np.int64(5)

In [9]:
most_similar_document = documents[most_similar_index]
most_similar_document

'Efficient keyword extraction enhances search accuracy.'

In [10]:
similarity_score = similarities[0][most_similar_index]
similarity_score

np.float32(0.81133276)

In [11]:
sorted_indices = np.argsort(similarities[0])[::-1]
sorted_indices

array([5, 7, 4, 2, 1, 3, 6, 0])

In [12]:
ranked_documents = [(documents[i], similarities[0][i]) for i in sorted_indices]
ranked_documents

[('Efficient keyword extraction enhances search accuracy.',
  np.float32(0.81133276)),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  np.float32(0.7649621)),
 ('Understanding document structure aids in keyword extraction.',
  np.float32(0.7148342)),
 ('Document analysis involves extracting keywords.', np.float32(0.6105436)),
 ('Keywords are important for keyword-based search.', np.float32(0.56699526)),
 ('Keyword-based search relies on sparse embeddings.', np.float32(0.5538638)),
 ('Semantic similarity improves document retrieval performance.',
  np.float32(0.3977112)),
 ('This is a list which containing sample documents.', np.float32(0.2058035))]

In [13]:
print("Ranked Documents:")
for rank, (document, similarity) in enumerate(ranked_documents, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")
    

Ranked Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.8113327622413635
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7649620771408081
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.7148342132568359
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.6105436086654663
Rank 5: Document - 'Keywords are important for keyword-based search.', Similarity Score - 0.5669952630996704
Rank 6: Document - 'Keyword-based search relies on sparse embeddings.', Similarity Score - 0.5538638234138489
Rank 7: Document - 'Semantic similarity improves document retrieval performance.', Similarity Score - 0.3977111876010895
Rank 8: Document - 'This is a list which containing sample documents.', Similarity Score - 0.2058034986257553


In [14]:
print("Top 4 Documents:")
for rank, (document, similarity) in enumerate(ranked_documents[:4], start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.8113327622413635
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7649620771408081
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.7148342132568359
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.6105436086654663


In [None]:
from rank_bm25 import BM25Okapi

top_4_documents = [doc[0] for doc in ranked_documents[:4]]
print(top_4_documents)
# Tokenizing the top 4 documents
tokenized_top_4_documents = [doc.split() for doc in top_4_documents]
tokenized_top_4_documents

['Efficient keyword extraction enhances search accuracy.', 'Machine learning algorithms can optimize keyword extraction methods.', 'Understanding document structure aids in keyword extraction.', 'Document analysis involves extracting keywords.']


[['Efficient', 'keyword', 'extraction', 'enhances', 'search', 'accuracy.'],
 ['Machine',
  'learning',
  'algorithms',
  'can',
  'optimize',
  'keyword',
  'extraction',
  'methods.'],
 ['Understanding',
  'document',
  'structure',
  'aids',
  'in',
  'keyword',
  'extraction.'],
 ['Document', 'analysis', 'involves', 'extracting', 'keywords.']]

In [None]:
# Tokenizing the query
tokenized_query = query.split()
tokenized_query

['Natural',
 'language',
 'processing',
 'techniques',
 'enhance',
 'keyword',
 'extraction',
 'efficiency.']

In [None]:
# Using BM25 for re-ranking
bm25=BM25Okapi(tokenized_top_4_documents)
bm25_scores = bm25.get_scores(tokenized_query)
bm25_scores

array([0.1907998 , 0.16686672, 0.17803252, 0.        ])

In [20]:
sorted_indices2 = np.argsort(bm25_scores)[::-1]
sorted_indices2

array([0, 2, 1, 3])

In [21]:
reranked_documents = [(top_4_documents[i], bm25_scores[i]) for i in sorted_indices2]
reranked_documents

[('Efficient keyword extraction enhances search accuracy.',
  np.float64(0.19079979534096053)),
 ('Understanding document structure aids in keyword extraction.',
  np.float64(0.1780325227902643)),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  np.float64(0.1668667199671815)),
 ('Document analysis involves extracting keywords.', np.float64(0.0))]

In [None]:
# Rerank of top 4 Documents using BM25 scores
print("Rerank of top 4 Documents:")
for rank, (document, similarity) in enumerate(reranked_documents, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Rerank of top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.19079979534096053
Rank 2: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.1780325227902643
Rank 3: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.1668667199671815
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.0


In [None]:
# Displaying the top 4 reranked documents
ranked_documents[:4]


[('Efficient keyword extraction enhances search accuracy.',
  np.float32(0.81133276)),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  np.float32(0.7649621)),
 ('Understanding document structure aids in keyword extraction.',
  np.float32(0.7148342)),
 ('Document analysis involves extracting keywords.', np.float32(0.6105436))]

# Cross Encoder

The document and query embeddings are created together by the model

In [28]:
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [24]:
print(top_4_documents)
print(query)

['Efficient keyword extraction enhances search accuracy.', 'Machine learning algorithms can optimize keyword extraction methods.', 'Understanding document structure aids in keyword extraction.', 'Document analysis involves extracting keywords.']
Natural language processing techniques enhance keyword extraction efficiency.


In [None]:
# Creating pairs of query and top 4 documents for cross-encoder
pairs = []
for doc in top_4_documents:
    pairs.append([query, doc])

pairs

[['Natural language processing techniques enhance keyword extraction efficiency.',
  'Efficient keyword extraction enhances search accuracy.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Machine learning algorithms can optimize keyword extraction methods.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Understanding document structure aids in keyword extraction.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Document analysis involves extracting keywords.']]

In [None]:
# Predicting scores using the cross-encoder
scores = cross_encoder.predict(pairs)
scores

array([ 3.1378732,  0.8421646, -2.9193015, -2.8781915], dtype=float32)

In [None]:
scored_docs = zip(scores, top_4_documents)

# Reranking the documents based on cross-encoder scores
reranked_document_cross_encoder = sorted(scored_docs, reverse=True)
reranked_document_cross_encoder

[(np.float32(3.1378732),
  'Efficient keyword extraction enhances search accuracy.'),
 (np.float32(0.8421646),
  'Machine learning algorithms can optimize keyword extraction methods.'),
 (np.float32(-2.8781915), 'Document analysis involves extracting keywords.'),
 (np.float32(-2.9193015),
  'Understanding document structure aids in keyword extraction.')]

# Cohere Reranking

In [33]:
import cohere

co = cohere.Client("nbDqU1hTVxWmXGbLYI6OnYhp4Cx40MZ5hOmO5oKX")

response = co.rerank(
    model="rerank-english-v3.0",
    query="Natural language processing techniques enhance keyword extraction efficiency.",
    documents=top_4_documents,
    return_documents=True
)

In [34]:
print(response.results[0].document.text)
print(response.results[0].relevance_score)

Efficient keyword extraction enhances search accuracy.
0.99411184


In [35]:
for i in range(4):
  print(f'text: {response.results[i].document.text} score: {response.results[i].relevance_score}')

text: Efficient keyword extraction enhances search accuracy. score: 0.99411184
text: Machine learning algorithms can optimize keyword extraction methods. score: 0.9129032
text: Understanding document structure aids in keyword extraction. score: 0.32885265
text: Document analysis involves extracting keywords. score: 0.02865267
