In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Sample corpus: four short sentences that are ranked against the query.
documents = [
    "Anime is great",
    "Where is hell have you been",
    "Anime is the greatest media ever created",
    "Movie is a media used for generation"
]



In [3]:
# Free-text search query that the sample documents are ranked against.
query = "Best media ever"

In [4]:
import re

def preprocess_text(text):
  """Normalise a string for TF-IDF matching.

  The text is lower-cased, then every character that is neither a word
  character (letters, digits, underscore) nor whitespace is removed.
  Whitespace itself is left untouched.

  Args:
    text: the raw string to normalise.

  Returns:
    The lower-cased string with punctuation stripped.
  """
  lowered = text.lower()
  return re.sub(r'[^\w\s]', '', lowered)



In [5]:
# Normalise every sample document with preprocess_text, keeping the original order.
preprocessed_doc = list(map(preprocess_text, documents))

In [6]:
preprocessed_doc

['anime is great',
 'where is hell have you been',
 'anime is the greatest media ever created',
 'movie is a media used for generation']

In [7]:
# Apply the same normalisation to the query as to the documents.
preprocessed_query = preprocess_text(query)

In [8]:
preprocessed_query

'best media ever'

In [9]:
# TF-IDF vectoriser with scikit-learn's default settings (no arguments overridden).
vector = TfidfVectorizer()

In [10]:
# Fit the vocabulary and IDF weights on the documents and return their
# TF-IDF matrix (one row per document).
X = vector.fit_transform(preprocessed_doc)

In [11]:
X.toarray()

array([[0.5728925 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.72664149, 0.        , 0.        , 0.        ,
        0.37919167, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.43551105, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.43551105, 0.43551105,
        0.22726773, 0.        , 0.        , 0.        , 0.        ,
        0.43551105, 0.43551105],
       [0.33570696, 0.        , 0.42580171, 0.42580171, 0.        ,
        0.        , 0.        , 0.42580171, 0.        , 0.        ,
        0.222201  , 0.33570696, 0.        , 0.42580171, 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.45203489,
        0.45203489, 0.        , 0.        , 0.        , 0.        ,
        0.23589056, 0.3563895 , 0.45203489, 0.        , 0.45203489,
        0.        , 0.        ]])

In [12]:
# Encode the query with the vocabulary already fitted on the documents;
# transform() does not refit, so query words absent from the documents are ignored.
query_embedding = vector.transform([preprocessed_query])

In [13]:
query_embedding.toarray()

array([[0.        , 0.        , 0.        , 0.78528828, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.6191303 , 0.        , 0.        , 0.        ,
        0.        , 0.        ]])

Using TF-IDF instead of a Hugging Face transformer, so that each text is represented by a sparse vector rather than a dense one.


In [14]:
# Cosine similarity between each document row of X and the query vector;
# the result has one row per document and a single column.
similarities = cosine_similarity(X, query_embedding)

In [15]:
similarities

array([[0.        ],
       [0.        ],
       [0.54222344],
       [0.22065154]])

In [16]:
np.argsort(similarities, axis = 0)

array([[0],
       [1],
       [3],
       [2]])

In [17]:
# argsort orders the rows ascending along axis 0; [::-1] reverses that to
# most-similar-first, and flatten() turns the column into a 1-D index array.
# Documents with equal scores keep whatever relative order the reversed argsort gives them.
ranked_indices = np.argsort(similarities, axis = 0)[::-1].flatten()

In [18]:
ranked_indices

array([2, 3, 1, 0])

In [19]:
# Reorder the original (un-preprocessed) documents from most to least similar.
ranked_documents = [documents[doc_idx] for doc_idx in ranked_indices]

In [20]:
ranked_documents

['Anime is the greatest media ever created',
 'Movie is a media used for generation',
 'Where is hell have you been',
 'Anime is great']

In [21]:
# Print the ranking, best match first; positions are numbered from 0.
for position, document in enumerate(ranked_documents):
  print(f"Rank {position} : {document}")

Rank 0 : Anime is the greatest media ever created
Rank 1 : Movie is a media used for generation
Rank 2 : Where is hell have you been
Rank 3 : Anime is great


In [22]:
# Path of the PDF to index.
# NOTE(review): hard-coded absolute path under /content — presumably a file uploaded
# to the Colab session; confirm it exists before running the loader cells.
docs_path = "/content/Deep Learning Techniques for Time Series Forecasting_ A Comprehensive Guide _ by Huntress Elle _ Medium.pdf"

In [23]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/298.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m297.0/298.0 kB[0m [31m10.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.1.0


In [24]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.11-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.11 (from langchain_community)
  Downloading langchain-0.3.11-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.24 (from langchain_community)
  Downloading langchain_core-0.3.24-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [26]:
from langchain_community.document_loaders import PyPDFLoader

In [27]:
# Create a LangChain PDF loader for the file at docs_path.
loader = PyPDFLoader(docs_path)

In [29]:
# Read the PDF into a list of LangChain Document objects.
docs = loader.load()

In [32]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [33]:
# Text splitter producing chunks of at most 200 characters, with 30 characters
# of overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=30,
)

In [35]:
# Split the loaded PDF documents into overlapping chunks; both retrievers are built from these.
chunks = splitter.split_documents(docs)

In [36]:
from langchain.embeddings import HuggingFaceEmbeddings

In [39]:
# Embedding model used for the dense (vector-store) side of retrieval.
# The model files are fetched from the Hugging Face Hub when they are not cached locally.
embeddings = HuggingFaceEmbeddings( model_name = "BAAI/bge-base-en-v1.5")

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [40]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.28.2-py3

In [41]:
from langchain.vectorstores import Chroma

In [42]:
# Embed every chunk and index it in a Chroma vector store (no persist directory is passed).
vectorstore = Chroma.from_documents(chunks, embeddings)

In [43]:
# Dense retriever over the Chroma index; returns the 3 closest chunks per query.
vectorstore_retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3},
)

In [45]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [46]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [47]:
# Keyword (BM25) retriever built over the same chunks as the vector store.
keyword_retriver = BM25Retriever.from_documents(chunks)

Set how many results the keyword (BM25) similarity search should fetch for each query.

In [48]:
# Number of chunks the BM25 retriever returns per query (same as k=3 on the vector retriever).
keyword_retriver.k = 3

In [50]:
# Hybrid retriever combining the dense and BM25 results; the weights follow the
# order of `retrievers`, i.e. 0.3 for the vector store and 0.7 for BM25.
retriver = EnsembleRetriever(
    retrievers=[vectorstore_retriever, keyword_retriver],
    weights=[0.3, 0.7],
)

In [51]:
# Hugging Face Hub id of the language model.
# NOTE(review): presumably loaded further down (bitsandbytes/accelerate are installed next) — not visible in this section.
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [52]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0


In [53]:
!pip install accelerate

