In [1]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Sample corpus that gets embedded and ranked against the query.
# NOTE(review): some entries contain typos (e.g. "Understsnding", "Semacntic").
# They are kept verbatim because these strings are what the model encodes, so
# editing them would change the embeddings and similarity scores.
documents = [
    "This a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
    "Understsnding document structure aids in keyword extraction.",
    "Efficient keyword extraction enhances search accuracy.",
    "Semacntic similarity improves document retrieval performance.",
    "Machine learning techniques can optimize keyword extraction methods.",
]

# Sanity check: show how many documents are in the corpus.
len(documents)

8

In [4]:
# Search query that the documents are ranked against.
query = (
    "Natural language processing techniques enhances keyword extraction efficiency."
)

In [3]:
# Load the SentenceTransformer checkpoint that is used below to embed both the
# documents and the query (via model.encode).
model = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1')

In [6]:
# Embed the whole corpus in one call; the result is later passed to
# cosine_similarity as the second argument, alongside the query embedding.
docs_embeddings = model.encode(documents)

In [7]:
# Embed the single query string with the same model used for the documents.
# NOTE(review): presumably a 1-D vector, since it is wrapped in a list before
# being passed to cosine_similarity further down — confirm.
query_embedding = model.encode(query)

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cosine similarity between the query and every document embedding.
# The query embedding is reshaped into a single-row matrix so both arguments
# are 2-D; the result therefore has one row with one score per document.
query_matrix = np.asarray(query_embedding).reshape(1, -1)
similarities = cosine_similarity(query_matrix, docs_embeddings)
similarities

array([[0.18442771, 0.45484942, 0.57262063, 0.44097388, 0.61043894,
        0.7594256 , 0.5526097 , 0.79599875]], dtype=float32)

In [11]:
# Document positions ordered from most to least similar to the query:
# argsort gives ascending order, so the result is reversed.
index = np.flip(np.argsort(similarities[0]))
index

array([7, 5, 4, 2, 6, 1, 3, 0])

In [13]:
# Build (document, similarity) tuples in ranked order, best match first.
# similarities[0][index] picks the scores in the same order as `index`.
ranked_docs = [
    (documents[doc_pos], doc_score)
    for doc_pos, doc_score in zip(index, similarities[0][index])
]
ranked_docs

[('Machine learning techniques can optimize keyword extraction methods.',
  np.float32(0.79599875)),
 ('Efficient keyword extraction enhances search accuracy.',
  np.float32(0.7594256)),
 ('Understsnding document structure aids in keyword extraction.',
  np.float32(0.61043894)),
 ('Document analysis involves extracting keywords.', np.float32(0.57262063)),
 ('Semacntic similarity improves document retrieval performance.',
  np.float32(0.5526097)),
 ('Keywords are important for keyword-based search.', np.float32(0.45484942)),
 ('Keyword-based search relies on sparse embeddings.', np.float32(0.44097388)),
 ('This a list which containing sample documents.', np.float32(0.18442771))]

In [20]:
# Keep only the text of the four best-ranked documents; these are the
# candidates handed to the cross-encoder for re-ranking.
top_4_docs = [doc_text for doc_text, _similarity in ranked_docs[:4]]
top_4_docs

['Machine learning techniques can optimize keyword extraction methods.',
 'Efficient keyword extraction enhances search accuracy.',
 'Understsnding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [21]:
# Load the cross-encoder checkpoint used below to score (query, document)
# pairs via encoder.predict.
from sentence_transformers import CrossEncoder
encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [22]:
# One (query, document) tuple per shortlisted document, in shortlist order;
# this is the input handed to the cross-encoder.
pairs = [(query, candidate) for candidate in top_4_docs]

pairs

[('Natural language processing techniques enhances keyword extraction efficiency.',
  'Machine learning techniques can optimize keyword extraction methods.'),
 ('Natural language processing techniques enhances keyword extraction efficiency.',
  'Efficient keyword extraction enhances search accuracy.'),
 ('Natural language processing techniques enhances keyword extraction efficiency.',
  'Understsnding document structure aids in keyword extraction.'),
 ('Natural language processing techniques enhances keyword extraction efficiency.',
  'Document analysis involves extracting keywords.')]

In [23]:
# Score every (query, document) pair with the cross-encoder; the result is
# zipped with top_4_docs below, so scores are expected to come back in the
# same order as `pairs`.
scores = encoder.predict(pairs)
scores

array([ 2.6330397,  3.4095516, -2.921854 , -3.2398996], dtype=float32)

In [24]:
# Pair each cross-encoder score with its document.
# Materialise the pairs as a list: a bare zip object is a one-shot iterator, so
# it displays as "<zip at 0x...>" and is left empty after its first consumer
# (re-running the sorting cell would then silently yield an empty ranking).
scored_docs = list(zip(scores, top_4_docs))
scored_docs

<zip at 0x180584e9400>

In [25]:
# Rank the (score, document) pairs by cross-encoder score, best first.
# The pairs are copied into a fresh list and sorted in place; the key looks only
# at the score, so the document text is never compared.
reranked_docs_cross_encoder = list(scored_docs)
reranked_docs_cross_encoder.sort(key=lambda scored: scored[0], reverse=True)
reranked_docs_cross_encoder

[(np.float32(3.4095516),
  'Efficient keyword extraction enhances search accuracy.'),
 (np.float32(2.6330397),
  'Machine learning techniques can optimize keyword extraction methods.'),
 (np.float32(-2.921854),
  'Understsnding document structure aids in keyword extraction.'),
 (np.float32(-3.2398996), 'Document analysis involves extracting keywords.')]