In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
# Toy corpus for the sparse (TF-IDF) retrieval demo: four short sentences,
# two of which contain the phrase "keyword-based search".
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [4]:
query="keyword-based search"

In [5]:
for doc in documents:
    print(doc)

This is a list which containig sample documents.
Keywords are important for keyword-based search.
Document analysis involves extracting keywords.
Keyword-based search relies on sparse embeddings.


In [6]:
import re
def preprocess_text(text):
    """Normalise a string for keyword matching.

    The input is lower-cased and every character that is neither a word
    character (letters, digits, underscore) nor whitespace is deleted.
    Deleted characters are not replaced by a space, so hyphenated words
    are fused together ("keyword-based" -> "keywordbased").

    Args:
        text: The raw string to clean.

    Returns:
        The lower-cased string with punctuation removed.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)


In [7]:
# Clean every document in the corpus, preserving the original order.
preprocess_documents = list(map(preprocess_text, documents))

In [8]:
preprocess_documents

['this is a list which containig sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [9]:
print("Preprocessed Documents:")
for doc in preprocess_documents:
    print(doc)

Preprocessed Documents:
this is a list which containig sample documents
keywords are important for keywordbased search
document analysis involves extracting keywords
keywordbased search relies on sparse embeddings


In [13]:
print("Preprocessed Query:")

Preprocessed Query:


In [14]:
preprocessed_query = preprocess_text(query)

In [15]:
preprocessed_query

'keywordbased search'

In [16]:
# TF-IDF vectorizer with default settings; it is fitted on the cleaned corpus in the next cell.
vector=TfidfVectorizer()

In [17]:
# Learn the vocabulary and IDF weights from the cleaned corpus and encode
# each document as one row of a sparse TF-IDF matrix.
X=vector.fit_transform(preprocess_documents)

In [18]:
X.toarray()

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [19]:
X.toarray()

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [20]:
# Encode the query with the already-fitted vectorizer (transform only, no refit)
# so that it is expressed in the same vocabulary as the document matrix X.
query_embedding=vector.transform([preprocessed_query])

In [22]:
query_embedding.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

In [23]:
# Cosine score of every document row in X against the single query row;
# the result is a column with one score per document.
similarities = cosine_similarity(X, query_embedding)

In [24]:
similarities

array([[0.        ],
       [0.50551777],
       [0.        ],
       [0.48693426]])

In [26]:
np.argsort(similarities,axis=0)

array([[0],
       [2],
       [3],
       [1]], dtype=int64)

In [27]:
# Ranking: argsort orders the scores ascending, so flip along the document
# axis to put the best match first, then collapse the column to a 1-D index vector.
ranked_indices = np.flip(np.argsort(similarities, axis=0), axis=0).flatten()

In [28]:
# Look up the original (un-preprocessed) text for each ranked position.
ranked_documents = [documents[doc_idx] for doc_idx in ranked_indices]

In [29]:
ranked_indices


array([1, 3, 2, 0], dtype=int64)

In [30]:
# Output the ranked documents
for i, doc in enumerate(ranked_documents):
    print(f"Rank {i+1}: {doc}")

Rank 1: Keywords are important for keyword-based search.
Rank 2: Keyword-based search relies on sparse embeddings.
Rank 3: Document analysis involves extracting keywords.
Rank 4: This is a list which containig sample documents.


In [31]:
query

'keyword-based search'

In [32]:
# Re-declares the same four sample sentences that were defined at the start of the notebook.
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [26]:
#https://huggingface.co/sentence-transformers

In [34]:
# Hand-written 5-dimensional vectors used as stand-in dense embeddings, one row per document.
# NOTE(review): only three rows are defined while `documents` holds four strings —
# presumably this dense demo is independent of that list; confirm.
document_embeddings = np.array([
    [0.634, 0.234, 0.867, 0.042, 0.249],
    [0.123, 0.456, 0.789, 0.321, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456]
])

In [35]:
# Sample search query (represented as a dense vector)
query_embedding = np.array([[0.789, 0.321, 0.654, 0.987, 0.123]])

In [36]:
# Calculate cosine similarity between query and documents
similarities = cosine_similarity(document_embeddings, query_embedding)

In [37]:
similarities

array([[0.73558979],
       [0.67357898],
       [0.71517305]])

In [38]:
# argsort is ascending; flip along the document axis so the highest score comes
# first, then collapse the column to a 1-D index vector.
ranked_indices = np.flip(np.argsort(similarities, axis=0), axis=0).flatten()

In [39]:
ranked_indices

array([0, 2, 1], dtype=int64)

In [40]:
# Output the ranked documents
for i, idx in enumerate(ranked_indices):
    print(f"Rank {i+1}: Document {idx+1}")

Rank 1: Document 1
Rank 2: Document 3
Rank 3: Document 2


In [41]:
doc_path = "Medical_book.pdf"

In [42]:
!pip install pypdf





In [36]:
!pip install langchain_community

Collecting langchain_community
  Using cached langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.34 (from langchain_community)
  Downloading langchain_core-0.3.35-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.18 (from langchain_community)
  Using cached langchain-0.3.18-py3-none-any.whl.metadata (7.8 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain_community)
  Downloading SQLAlchemy-2.0.38-cp310-cp310-win_amd64.whl.metadata (9.9 kB)
Collecting requests<3,>=2 (from langchain_community)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting PyYAML>=5.3 (from langchain_community)
  Using cached PyYAML-6.0.2-cp310-cp310-win_amd64.whl.metadata (2.1 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain_community)
  Downloading aiohttp-3.11.12-cp310-cp310-win_amd64.whl.metadata (8.0 kB)
Collecting tenacity!=8.4.0,<10,>=8.1.0 (from langchain_community)
  Using cached tenacity-9.0.0-py3-none-any.whl.me

  You can safely remove it manually.
  You can safely remove it manually.


In [43]:
from langchain_community.document_loaders import PyPDFLoader

In [44]:
loader=PyPDFLoader(doc_path)

In [45]:
docs=loader.load()

In [46]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [47]:
# Split pages into chunks of at most 200 with an overlap of 30 between neighbouring chunks
# (sizes are character counts under the splitter's default length function).
splitter = RecursiveCharacterTextSplitter(chunk_size=200,chunk_overlap=30)

In [48]:
chunks = splitter.split_documents(docs)

In [None]:
len(chunks)

'The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'

In [50]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

In [51]:
import os

# Never commit access tokens to a notebook: read the Hugging Face token from the
# environment instead (e.g. export HF_TOKEN=... or put it in a .env file).
# The value is None when the variable is not set. Any token that was previously
# hardcoded here should be treated as leaked and revoked.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [52]:
import os

from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
if not GOOGLE_API_KEY:
    # Without this check the assignment below fails with an opaque
    # "TypeError: str expected, not NoneType" when the key is missing.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to your .env file or export it before running this cell."
    )
os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY

# Google Generative AI embedding model used to embed the PDF chunks.
embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")

In [53]:
!pip install chromadb

Collecting importlib-metadata>=4.6 (from build>=1.0.3->chromadb)
  Downloading importlib_metadata-8.5.0-py3-none-any.whl.metadata (4.8 kB)
Downloading importlib_metadata-8.5.0-py3-none-any.whl (26 kB)
Installing collected packages: importlib-metadata
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib_metadata 8.6.1
    Uninstalling importlib_metadata-8.6.1:
      Successfully uninstalled importlib_metadata-8.6.1
Successfully installed importlib-metadata-8.5.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
crewai 0.102.0 requires litellm==1.60.2, but you have litellm 1.63.11 which is incompatible.


In [60]:
from langchain.vectorstores import Chroma

In [None]:
vectorstore = Chroma.from_documents(chunks,embeddings)

In [None]:
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
vectorstore_retreiver

VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000028996010100>, search_kwargs={'k': 3})

In [None]:
!pip install rank_bm25





In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [None]:
keyword_retriever = BM25Retriever.from_documents(chunks)

In [None]:
keyword_retriever.k =  3

In [None]:
# Hybrid retriever: fuses the dense (Chroma) and sparse (BM25) result lists.
# Weights are matched to retrievers by position: 0.3 for the vector store
# retriever and 0.7 for the BM25 keyword retriever.
# NOTE(review): per the LangChain docs the fusion is weighted Reciprocal Rank Fusion
# over ranks, not a linear blend of raw scores — confirm against the installed version.
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver,keyword_retriever],weights=[0.3, 0.7])

# Mixing vector search and keyword search for Hybrid search

## hybrid_score = (1 - alpha) * sparse_score + alpha * dense_score

In [63]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [64]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-win_amd64.whl.metadata (5.9 kB)
Collecting torch<3,>=2.0 (from bitsandbytes)
  Downloading torch-2.6.0-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting networkx (from torch<3,>=2.0->bitsandbytes)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch<3,>=2.0->bitsandbytes)
  Using cached jinja2-3.1.5-py3-none-any.whl.metadata (2.6 kB)
Collecting sympy==1.13.1 (from torch<3,>=2.0->bitsandbytes)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch<3,>=2.0->bitsandbytes)
  Using cached MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Downloading bitsandbytes-0.45.2-py3-none-win_amd64.whl (69.1 MB)
   ---------------------------------------- 0.0/69.1 MB ? eta -:--:--
   -- ------------------------------------- 4.7/69.1 MB 22.0 MB/s eta 0:00:03
   ------ --------------------------------- 10.5/69.1 MB 25.2 MB/s eta 

In [65]:
!pip install accelerate

Collecting accelerate
  Using cached accelerate-1.3.0-py3-none-any.whl.metadata (19 kB)
Collecting safetensors>=0.4.3 (from accelerate)
  Using cached safetensors-0.5.2-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Using cached accelerate-1.3.0-py3-none-any.whl (336 kB)
Using cached safetensors-0.5.2-cp38-abi3-win_amd64.whl (303 kB)
Installing collected packages: safetensors, accelerate
Successfully installed accelerate-1.3.0 safetensors-0.5.2


In [76]:
from langchain.chains import RetrievalQA

In [77]:
# Imported here so the cell does not depend on an earlier cell having imported os.
import os

from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAI

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
if not GOOGLE_API_KEY:
    # Without this check the assignment below fails with an opaque
    # "TypeError: str expected, not NoneType" when the key is missing.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to your .env file or export it before running this cell."
    )
os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY

In [78]:
llm = GoogleGenerativeAI(model = "models/gemini-pro")

In [79]:
# Baseline QA chain: context comes only from the dense vector-store retriever.
# chain_type="stuff" places all retrieved chunks into a single prompt for the LLM.
normal_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=vectorstore_retreiver
)

In [91]:
# Hybrid QA chain: same LLM and "stuff" chain type as normal_chain, but context
# comes from the ensemble retriever (vector store + BM25) instead.
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=ensemble_retriever
)

In [98]:
response1 = normal_chain.invoke("what is Abdominal ultrasound")

In [99]:
response1

{'query': 'what is Abdominal ultrasound',
 'result': 'Abdominal ultrasound is a diagnostic imaging technique that uses high-frequency sound waves to create images of the organs and structures in the abdomen.'}

In [100]:
print(response1.get("result"))

Abdominal ultrasound is a diagnostic imaging technique that uses high-frequency sound waves to create images of the organs and structures in the abdomen.


In [101]:
response2 = hybrid_chain.invoke("what is Abdominal ultrasound")

In [102]:
response2

{'query': 'what is Abdominal ultrasound',
 'result': 'Abdominal ultrasound is a diagnostic imaging technique that uses sound waves to create images of the abdominal organs.'}

In [103]:
print(response2.get("result"))

Abdominal ultrasound is a diagnostic imaging technique that uses sound waves to create images of the abdominal organs.
