In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [48]:
# Sample corpus: each string is one document to be ranked against the query
documents = [
    "Anime is great",
    "Where is hell have you been",
    "Anime is the greatest media ever created",
    "Movie is a media used for generation"
]



In [49]:
# Free-text search query that the documents are ranked against
query = "Best media ever"

In [50]:
import re

def preprocess_text(text):
  """Normalize raw text before it is vectorized.

  Lowercases the input and deletes every character that is neither a word
  character (letter, digit, underscore) nor whitespace.

  Args:
    text: The raw string to normalize.

  Returns:
    The lowercased string with punctuation deleted. Whitespace is left
    untouched and deleted characters are not replaced by spaces, so
    "stop-me" becomes "stopme".
  """
  # Lowercase first, then strip everything outside the word/whitespace classes
  return re.sub(r'[^\w\s]', '', text.lower())



In [51]:
# Apply the same normalization to every document, preserving corpus order
preprocessed_doc = list(map(preprocess_text, documents))

In [52]:
preprocessed_doc

['anime is great',
 'where is hell have you been',
 'anime is the greatest media ever created',
 'movie is a media used for generation']

In [53]:
# Normalize the query exactly like the documents so their tokens are comparable
preprocessed_query = preprocess_text(query)

In [54]:
preprocessed_query

'best media ever'

In [55]:
# TF-IDF vectorizer with default settings; yields sparse term-weight vectors
vector = TfidfVectorizer()

In [56]:
# Learn the vocabulary and IDF weights from the corpus and vectorize it:
# one row per document, one column per vocabulary term
X = vector.fit_transform(preprocessed_doc)

In [57]:
# Dense view of the sparse document-term matrix, for inspection only
X.toarray()

array([[0.5728925 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.72664149, 0.        , 0.        , 0.        ,
        0.37919167, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.43551105, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.43551105, 0.43551105,
        0.22726773, 0.        , 0.        , 0.        , 0.        ,
        0.43551105, 0.43551105],
       [0.33570696, 0.        , 0.42580171, 0.42580171, 0.        ,
        0.        , 0.        , 0.42580171, 0.        , 0.        ,
        0.222201  , 0.33570696, 0.        , 0.42580171, 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.45203489,
        0.45203489, 0.        , 0.        , 0.        , 0.        ,
        0.23589056, 0.3563895 , 0.45203489, 0.        , 0.45203489,
        0.        , 0.        ]])

In [58]:
# Vectorize the query with the vocabulary already fitted on the corpus
# (transform only, no refit); query words that never occur in the corpus,
# such as "best", contribute nothing to the query vector
query_embedding = vector.transform([preprocessed_query])

In [59]:
# Dense view of the query vector; it shares its column layout with X
query_embedding.toarray()

array([[0.        , 0.        , 0.        , 0.78528828, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.6191303 , 0.        , 0.        , 0.        ,
        0.        , 0.        ]])

Here we use TF-IDF instead of a Hugging Face transformer, so each document is represented as a sparse vector rather than a dense embedding.


In [60]:
# Cosine similarity of every document row against the single query row;
# the result has one row per document and one column for the query
similarities = cosine_similarity(X, query_embedding)

In [61]:
similarities

array([[0.        ],
       [0.        ],
       [0.54222344],
       [0.22065154]])

In [63]:
# Document indices ordered from least to most similar (ascending sort)
np.argsort(similarities, axis = 0)

array([[0],
       [1],
       [3],
       [2]])

In [64]:
# argsort is ascending, so flip the rows to get best-match-first order,
# then collapse the single-column result into a 1-D index array
ranked_indices = np.flipud(np.argsort(similarities, axis=0)).flatten()

In [65]:
ranked_indices

array([2, 3, 1, 0])

In [66]:
# Reorder the original (un-preprocessed) documents from most to least similar
ranked_documents = [documents[doc_index] for doc_index in ranked_indices]

In [67]:
ranked_documents

['Anime is the greatest media ever created',
 'Movie is a media used for generation',
 'Where is hell have you been',
 'Anime is great']

In [68]:
# Print the ranking; positions are zero-based, so the best match is "Rank 0"
for position, document in enumerate(ranked_documents):
  print(f"Rank {position} : {document}")

Rank 0 : Anime is the greatest media ever created
Rank 1 : Movie is a media used for generation
Rank 2 : Where is hell have you been
Rank 3 : Anime is great
