In [3]:
from langchain_groq import ChatGroq
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Toy corpus that every retrieval step in this notebook searches over.
# NOTE(review): a few entries contain spelling/grammar slips ("This a list which
# containing", "Understsnding", "Semacntic"). The recorded outputs in this
# notebook were produced from these exact strings, so confirm before correcting
# them and re-run all cells afterwards.
documents = [
    "This a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
    "Understsnding document structure aids in keyword extraction.",
    "Efficient keyword extraction enhances search accuracy.",
    "Semacntic similarity improves document retrieval performance.",
    "Machine learning techniques can optimize keyword extraction methods."
]
len(documents)  # number of documents in the corpus

8

In [5]:
# Name of the sentence-transformers checkpoint used for the dense embeddings.
model_name = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'
# Instantiate the encoder; later cells call `model.encode` on it for both the
# documents and the query.
model = SentenceTransformer(model_name)

In [None]:
# Encode every document into a dense vector with the sentence-transformer model.
embeddings = model.encode(documents)

# Show only a short preview of each vector: printing every component of every
# embedding floods the cell output without telling the reader anything useful.
preview_dims = 5
for position, vector in enumerate(embeddings, start=1):
    print(
        f"Document {position} embedding "
        f"(first {preview_dims} of {len(vector)} values): {vector[:preview_dims]}"
    )

In [7]:
# Search query the documents are ranked against (by embedding similarity first,
# then by BM25 over the best candidates).
query = "Natural language processing techniques enhances keyword extraction efficiency."

In [8]:
# Embed the query with the same model that encoded the documents so the two
# live in the same vector space.
query_embedding = model.encode(query)
len(query_embedding)  # length of the query vector

768

In [9]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity expects 2-D inputs, so present the single query vector as a
# one-row matrix; the result has one row (the query) and one column per document.
query_matrix = query_embedding[np.newaxis, :]
similarities = cosine_similarity(query_matrix, embeddings)
similarities

array([[0.18442771, 0.45484942, 0.57262063, 0.44097388, 0.61043894,
        0.7594256 , 0.5526097 , 0.79599875]], dtype=float32)

In [10]:
# Index of the highest similarity value. `np.argmax` operates on the flattened
# array, which coincides with the document index here only because
# `similarities` holds a single row (one query).
similar_docs = np.argmax(similarities)
similar_docs

np.int64(7)

In [11]:
# Look up the text of the best-matching document.
documents[similar_docs]

'Machine learning techniques can optimize keyword extraction methods.'

In [12]:
# argsort yields positions from least to most similar; reverse it so the
# best-matching document comes first.
ascending_order = np.argsort(similarities[0])
sorted_indices = ascending_order[::-1]
sorted_indices

array([7, 5, 4, 2, 6, 1, 3, 0])

In [13]:
# Pair each document with its similarity to the query, best match first.
# The ordering is derived from `similarities` right here instead of reading the
# shared `sorted_indices` variable: that name is rebound further down to the
# BM25 ordering of only four documents, so re-running this cell afterwards
# would silently pair the wrong documents with the wrong scores.
similarity_order = np.argsort(similarities[0])[::-1]
ranked_docs = [(documents[i], similarities[0][i]) for i in similarity_order]
ranked_docs

[('Machine learning techniques can optimize keyword extraction methods.',
  np.float32(0.79599875)),
 ('Efficient keyword extraction enhances search accuracy.',
  np.float32(0.7594256)),
 ('Understsnding document structure aids in keyword extraction.',
  np.float32(0.61043894)),
 ('Document analysis involves extracting keywords.', np.float32(0.57262063)),
 ('Semacntic similarity improves document retrieval performance.',
  np.float32(0.5526097)),
 ('Keywords are important for keyword-based search.', np.float32(0.45484942)),
 ('Keyword-based search relies on sparse embeddings.', np.float32(0.44097388)),
 ('This a list which containing sample documents.', np.float32(0.18442771))]

In [14]:
# Report the four best dense-retrieval matches.
# The loop variables are deliberately not called `doc`/`score`: at module level
# they outlive the loop, and `score` is the name a later cell uses for the BM25
# score array, which re-running this cell would overwrite with a single float.
print("Ranked Top 4 Documents:")
for ranked_text, ranked_similarity in ranked_docs[:4]:
    print(f"Document: {ranked_text}, Similarity Score: {ranked_similarity:.4f}")

Ranked Top 4 Documents:
Document: Machine learning techniques can optimize keyword extraction methods., Similarity Score: 0.7960
Document: Efficient keyword extraction enhances search accuracy., Similarity Score: 0.7594
Document: Understsnding document structure aids in keyword extraction., Similarity Score: 0.6104
Document: Document analysis involves extracting keywords., Similarity Score: 0.5726


In [15]:
from rank_bm25 import BM25Okapi

# Keep only the text of the four best dense-retrieval matches; these are the
# candidates that get re-ranked with BM25.
top_4_docs = [text for text, _similarity in ranked_docs[:4]]
top_4_docs

['Machine learning techniques can optimize keyword extraction methods.',
 'Efficient keyword extraction enhances search accuracy.',
 'Understsnding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [16]:
# Whitespace-tokenize each candidate document for BM25.
# NOTE(review): a plain split keeps capitalisation and trailing punctuation
# (e.g. 'methods.', 'extraction.', 'Document'), so such tokens only match a
# query token spelled exactly the same way — consider normalising both sides.
tokenized_documents = list(map(str.split, top_4_docs))
tokenized_documents

[['Machine',
  'learning',
  'techniques',
  'can',
  'optimize',
  'keyword',
  'extraction',
  'methods.'],
 ['Efficient', 'keyword', 'extraction', 'enhances', 'search', 'accuracy.'],
 ['Understsnding',
  'document',
  'structure',
  'aids',
  'in',
  'keyword',
  'extraction.'],
 ['Document', 'analysis', 'involves', 'extracting', 'keywords.']]

In [17]:
# Tokenize the query with a plain whitespace split, like the documents.
# NOTE(review): the split keeps capitalisation and trailing punctuation
# (e.g. 'efficiency.'), so those tokens only match identically spelled
# document tokens.
tokenized_query = query.split()
tokenized_query

['Natural',
 'language',
 'processing',
 'techniques',
 'enhances',
 'keyword',
 'extraction',
 'efficiency.']

In [18]:
# Build the BM25 scorer over the tokenized candidate documents.
bm25 = BM25Okapi(tokenized_documents)
bm25

<rank_bm25.BM25Okapi at 0x1b36b742390>

In [19]:
# BM25 relevance of each candidate for the tokenized query
# (one value per entry of `tokenized_documents`, in the same order).
score = bm25.get_scores(tokenized_query)
score

array([0.93445363, 1.06847885, 0.17803252, 0.        ])

In [20]:
# Candidate positions ordered by descending BM25 score (argsort is ascending,
# hence the reversal).
# NOTE(review): this rebinds `sorted_indices`, which an earlier cell uses for
# the dense-similarity ordering over all documents; running cells out of order
# can therefore mix up the two rankings.
sorted_indices = np.argsort(score)[::-1]
sorted_indices

array([1, 0, 2, 3])

In [21]:
# Show the candidate documents again; the BM25 indices above refer to
# positions in this list.
top_4_docs

['Machine learning techniques can optimize keyword extraction methods.',
 'Efficient keyword extraction enhances search accuracy.',
 'Understsnding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [22]:
# Show the query again for reference.
query

'Natural language processing techniques enhances keyword extraction efficiency.'

In [None]:
# Re-rank the four dense-retrieval candidates by BM25 score, best first.
# The ordering is computed from `score` in this cell rather than taken from the
# shared `sorted_indices` variable, which is also assigned by the
# dense-similarity ranking over all eight documents; a stale value from that
# cell would index past the end of `top_4_docs`.
bm25_order = np.argsort(score)[::-1]
reranked_docs_bm25 = [(top_4_docs[i], score[i]) for i in bm25_order]
reranked_docs_bm25

[('Efficient keyword extraction enhances search accuracy.',
  np.float64(1.0684788539093786)),
 ('Machine learning techniques can optimize keyword extraction methods.',
  np.float64(0.9344536318162162)),
 ('Understsnding document structure aids in keyword extraction.',
  np.float64(0.1780325227902643)),
 ('Document analysis involves extracting keywords.', np.float64(0.0))]