In [1]:
pip install txtai

Collecting txtai
  Downloading txtai-5.5.1-py3-none-any.whl (169 kB)
     -------------------------------------- 169.8/169.8 kB 1.1 MB/s eta 0:00:00
Collecting transformers>=4.22.0
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
     ---------------------------------------- 7.2/7.2 MB 2.9 MB/s eta 0:00:00
Collecting faiss-cpu>=1.7.1.post2
  Downloading faiss_cpu-1.7.4-cp39-cp39-win_amd64.whl (10.8 MB)
     ---------------------------------------- 10.8/10.8 MB 2.9 MB/s eta 0:00:00
Collecting torch>=1.6.0
  Downloading torch-2.0.1-cp39-cp39-win_amd64.whl (172.4 MB)
     -------------------------------------- 172.4/172.4 MB 1.5 MB/s eta 0:00:00
Collecting safetensors>=0.3.1
  Downloading safetensors-0.3.1-cp39-cp39-win_amd64.whl (263 kB)
     ------------------------------------ 263.9/263.9 kB 704.5 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
     -------------------------------------- 236.8/236.8 kB 1

In [2]:
from platform import python_version

# Print the version of the Python interpreter running this kernel.
kernel_python = python_version()
print(kernel_python)

3.9.13


In [3]:
from txtai.embeddings import Embeddings
import json

In [4]:
# Configuration for the txtai Embeddings instance: only the model path is set.
# NOTE(review): `embeddings` is not referenced by the TF-IDF cells visible in
# this section — confirm it is still needed.
embeddings_config = {"path": "sentence-transformers/all-MiniLM-L6-v2"}
embeddings = Embeddings(embeddings_config)

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Small in-memory corpus that search() ranks against. search() returns the
# original (unprocessed) strings from this list, looked up by position.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [10]:
# Fetch the NLTK data packages this notebook needs: "stopwords" backs the
# stopwords.words() call below; "punkt" is presumably what word_tokenize
# relies on (confirm against the installed NLTK version).
for nltk_package in ("punkt", "stopwords"):
    nltk.download(nltk_package)

# English stop words as a set so membership checks in preprocess_text are fast.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to C:\Users\Rafay
[nltk_data]     Khan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Rafay
[nltk_data]     Khan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def preprocess_text(text):
    """Normalise ``text`` into a space-separated string of content words.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens that
    are not purely alphanumeric (e.g. punctuation) or that appear in the
    module-level ``stop_words`` set are dropped.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    kept_words = []
    for word in word_tokenize(text.lower()):
        # Skip punctuation/mixed-symbol tokens first, then stop words.
        if not word.isalnum() or word in stop_words:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

In [12]:
# Run every corpus entry through preprocess_text, keeping the same order as
# `documents` so positions line up between the two lists.
processed_documents = list(map(preprocess_text, documents))

In [13]:
# Fit a TF-IDF model on the preprocessed corpus. Both names are module-level
# globals read by search(): `vectorizer` transforms the query, and
# `tfidf_matrix` is the corpus side of the cosine-similarity comparison.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_documents)

In [14]:
def search(query, top_k=None):
    """Rank the entries of ``documents`` by TF-IDF cosine similarity to ``query``.

    The query is normalised with ``preprocess_text`` and transformed with the
    already-fitted module-level ``vectorizer``, then compared against
    ``tfidf_matrix``.

    Args:
        query: Free-text query string.
        top_k: Optional maximum number of results to return. ``None`` (the
            default) returns every document, including zero-score ones.

    Returns:
        A list of ``(document, score)`` tuples ordered from highest to lowest
        score. Documents with equal scores are ordered deterministically,
        with the later corpus entry first.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be non-negative or None")

    processed_query = preprocess_text(query)
    query_vector = vectorizer.transform([processed_query])
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # The default argsort algorithm is not stable, so the relative order of
    # tied scores would be an implementation detail. A stable sort followed by
    # the reversal makes the ranking reproducible.
    ranked_indices = similarity_scores.argsort(kind="stable")[::-1][:top_k]
    return [(documents[i], similarity_scores[i]) for i in ranked_indices]

In [15]:
# Run one example query; `results` holds (document, score) pairs as returned
# by search(), best match first.
query = "This is a document"
results = search(query)

In [16]:
# Print each ranked document together with its cosine-similarity score.
for result, score in results:
    print(f"Document: {result}\nSimilarity Score: {score}\n")

Document: This document is the second document.
Similarity Score: 0.78722297610404

Document: Is this the first document?
Similarity Score: 0.6292275146695526

Document: This is the first document.
Similarity Score: 0.6292275146695526

Document: And this is the third one.
Similarity Score: 0.0

