In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Sample documents
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [3]:
query="keyword-based search"

In [4]:
import re
def preprocess_text(text):
    """Normalise a string for keyword matching.

    The input is lower-cased and every character that is neither a word
    character (``\\w``) nor whitespace (``\\s``) is deleted outright, so a
    hyphenated term such as ``keyword-based`` collapses to ``keywordbased``.

    text: the string to normalise.
    return: the lower-cased string with punctuation removed.
    """
    lowered = text.lower()
    # Punctuation is removed, not replaced by a space; existing whitespace is kept.
    return re.sub(r'[^\w\s]', '', lowered)


In [5]:
# Run every sample document through the same normalisation, preserving order.
preprocess_documents = list(map(preprocess_text, documents))

In [6]:
preprocess_documents

['this is a list which containig sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [7]:
print("Preprocessed Documents:")
for doc in preprocess_documents:
    print(doc)

Preprocessed Documents:
this is a list which containig sample documents
keywords are important for keywordbased search
document analysis involves extracting keywords
keywordbased search relies on sparse embeddings


In [8]:
print("Preprocessed Query:")
# Normalise the query the same way as the documents; printing `query` itself
# would show the raw text under a heading that promises the preprocessed one.
print(preprocess_text(query))

Preprocessed Query:
keyword-based search


In [9]:
preprocessed_query = preprocess_text(query)

In [10]:
preprocessed_query

'keywordbased search'

In [11]:
vector=TfidfVectorizer()

In [12]:
X=vector.fit_transform(preprocess_documents)

In [13]:
X.toarray()

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [14]:
X.toarray()[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [15]:
query_embedding=vector.transform([preprocessed_query])

In [16]:
query_embedding.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

In [17]:
similarities = cosine_similarity(X, query_embedding)

In [18]:
similarities

array([[0.        ],
       [0.50551777],
       [0.        ],
       [0.48693426]])

In [19]:
np.argsort(similarities,axis=0)

array([[0],
       [2],
       [3],
       [1]])

In [20]:
ranked_indices=np.argsort(similarities,axis=0)[::-1].flatten()


In [21]:
ranked_documents = [documents[i] for i in ranked_indices]

In [22]:
# Ranking: argsort orders ascending along axis 0, so the result is reversed to
# put the highest similarity first and flattened to a 1-D array of indices.
# NOTE(review): this repeats the assignment made in In [20] verbatim.
ranked_indices=np.argsort(similarities,axis=0)[::-1].flatten()

In [23]:
ranked_indices


array([1, 3, 2, 0])

In [24]:
# Output the ranked documents
for i, doc in enumerate(ranked_documents):
    print(f"Rank {i+1}: {doc}")

Rank 1: Keywords are important for keyword-based search.
Rank 2: Keyword-based search relies on sparse embeddings.
Rank 3: Document analysis involves extracting keywords.
Rank 4: This is a list which containig sample documents.


In [25]:
query

'keyword-based search'

In [26]:
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [27]:
#https://huggingface.co/sentence-transformers

In [28]:
# Hand-written 5-dimensional vectors used as stand-in dense document embeddings.
# NOTE(review): there are three rows here while `documents` above lists four
# texts, so row i is not tied to documents[i] - confirm this is only illustrative.
document_embeddings = np.array([
    [0.634, 0.234, 0.867, 0.042, 0.249],
    [0.123, 0.456, 0.789, 0.321, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456]
])

In [29]:
# Sample search query (represented as a dense vector)
query_embedding = np.array([[0.789, 0.321, 0.654, 0.987, 0.123]])

In [30]:
# Calculate cosine similarity between query and documents
similarities = cosine_similarity(document_embeddings, query_embedding)

In [31]:
similarities

array([[0.73558979],
       [0.67357898],
       [0.71517305]])

In [32]:
ranked_indices = np.argsort(similarities, axis=0)[::-1].flatten()

In [33]:
ranked_indices

array([0, 2, 1])

In [34]:
# Output the ranked documents
for i, idx in enumerate(ranked_indices):
    print(f"Rank {i+1}: Document {idx+1}")

Rank 1: Document 1
Rank 2: Document 3
Rank 3: Document 2


In [40]:
doc_path="/content/countingtheinvisible_pakistan.pdf"

In [41]:
!pip install pypdf



In [42]:
!pip install langchain_community



In [43]:
from langchain_community.document_loaders import PyPDFLoader

In [44]:
loader=PyPDFLoader(doc_path)

In [45]:
docs=loader.load()

In [46]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [47]:
splitter = RecursiveCharacterTextSplitter(chunk_size=200,chunk_overlap=30)

In [48]:
chunks = splitter.split_documents(docs)

In [49]:
chunks

[Document(page_content='Technical Report – Pakistan', metadata={'source': '/content/countingtheinvisible_pakistan.pdf', 'page': 0}),
 Document(page_content='plan-international.org  Counting  the Invisible  – Technical  Report  Pakistan  1  \n \nAcronyms    3 \nGlossary    4 \nAcknowledgement    7 \nSection 1:  Intro duction and rationale    8', metadata={'source': '/content/countingtheinvisible_pakistan.pdf', 'page': 1}),
 Document(page_content='1.1 Introduction  8 \n1.2 Background and rationale  8 \n1.3 Research objectives  9 \n1.4 Methodology  12 \n1.4.1 Research sample  12 \n1.4.2 Characteristics and identities of girls  12', metadata={'source': '/content/countingtheinvisible_pakistan.pdf', 'page': 1}),
 Document(page_content='1.4.3 Field data collection  18 \n1.4.4 Quantitative survey questions  19 \n1.4.5 Qualitative survey quest ions 19 \n1.4.6 Creative and reflective workshops  20 \n1.4.7 Research task teams  20', metadata={'source': '/content/countingtheinvisible_pakistan.pdf',

In [50]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

In [51]:
# Never store an API token in the notebook itself: read it from the HF_TOKEN
# environment variable and fall back to an interactive, non-echoing prompt.
# NOTE(review): the token that used to be committed on this line is exposed
# and should be revoked in the Hugging Face account settings.
import os
from getpass import getpass

HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face API token: ")

In [52]:
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5")

In [53]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.3-py3-none-any.whl (559 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/559.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m337.9/559.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from c

In [54]:
from langchain.vectorstores import Chroma

In [56]:
vectorstore=Chroma.from_documents(chunks,embeddings)

In [57]:
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})

In [58]:
vectorstore_retreiver

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceInferenceAPIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7b8976b407f0>, search_kwargs={'k': 3})

In [59]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [60]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [61]:
keyword_retriever = BM25Retriever.from_documents(chunks)

In [62]:
keyword_retriever.k =  3

In [63]:
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver,keyword_retriever],weights=[0.3, 0.7])

# Mixing vector search and keyword search for Hybrid search

## hybrid_score = (1 - alpha) * sparse_score + alpha * dense_score

In [64]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [65]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (4

In [66]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m307.2/309.4 kB[0m [31m10.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0


In [67]:
import torch
from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, )
from langchain import HuggingFacePipeline

In [68]:
# Load a causal language model with 4-bit quantization
def load_quantized_model(model_name: str):
    """
    Load ``model_name`` as a causal LM using a 4-bit bitsandbytes configuration.

    The quantization settings are NF4 with double quantization enabled and
    bfloat16 as the compute dtype; bfloat16 is also passed as ``torch_dtype``.

    model_name: Name or path of the model to be loaded.
    return: The model returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization,
        torch_dtype=torch.bfloat16,
    )

In [69]:
# Tokenizer setup
def initialize_tokenizer(model_name: str):
    """
    Create the tokenizer for ``model_name``.

    ``return_token_type_ids=False`` is forwarded to
    ``AutoTokenizer.from_pretrained`` and the beginning-of-sentence token id
    is then overwritten with 1.

    model_name: Name or path of the model for tokenizer initialization.
    return: The configured tokenizer.
    """
    tok = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
    # Hard-coded BOS id; NOTE(review): presumably matches this model's vocabulary - confirm.
    tok.bos_token_id = 1
    return tok

In [70]:
tokenizer = initialize_tokenizer(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [71]:
model = load_quantized_model(model_name)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [72]:
# Build the text-generation pipeline from the already loaded model/tokenizer.
# The factory is imported under an alias because this cell rebinds the name
# `pipeline` to the resulting pipeline *instance* (a later cell passes it on
# under that name). Calling the bare name on a re-run would invoke the instance
# instead of the factory; the alias keeps the cell re-runnable.
from transformers import pipeline as hf_pipeline

pipeline = hf_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,  # sampling is restricted to the 5 most likely tokens (top_k)
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

In [73]:
llm = HuggingFacePipeline(pipeline=pipeline)

  warn_deprecated(


In [74]:
from langchain.chains import RetrievalQA

In [75]:
normal_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=vectorstore_retreiver
)

In [76]:
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=ensemble_retriever
)

In [84]:
response1 = normal_chain.invoke("What are the challenges that adolescent girls face in terms of violence and mobility?")

In [85]:
response1

{'query': 'What are the challenges that adolescent girls face in terms of violence and mobility?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\ndiscussing  violence  in focus  groups,  girls talked  of the restrictions  on their mobility  as \nviolence.  Girls reported  not feeling safe in their homes,  in public  places  nor in their\n\nsafeguard  them  from violence :  \n \n [Interviewer:]   What  types  of violence  are girls more  exposed  to? \n[Girls:]    Restriction  on girls’ mobility .\n\ngirls  in public  and private  spaces,  including  the risks  that adolescent  girls  face \ncompared  to boys,  their  perceptions  and experiences  of reporting  violence,  and their\n\nQuestion: What are the challenges that adolescent girls face in terms of violence and mobility?\nHelpful Answer: According to focus groups, girls reported not feeling safe in b

In [86]:
print(response1.get("result"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

discussing  violence  in focus  groups,  girls talked  of the restrictions  on their mobility  as 
violence.  Girls reported  not feeling safe in their homes,  in public  places  nor in their

safeguard  them  from violence :  
 
 [Interviewer:]   What  types  of violence  are girls more  exposed  to? 
[Girls:]    Restriction  on girls’ mobility .

girls  in public  and private  spaces,  including  the risks  that adolescent  girls  face 
compared  to boys,  their  perceptions  and experiences  of reporting  violence,  and their

Question: What are the challenges that adolescent girls face in terms of violence and mobility?
Helpful Answer: According to focus groups, girls reported not feeling safe in both their homes and public places. They identified restrictions on their mobility as a form of violence they are more exposed

In [87]:
response2 = hybrid_chain.invoke("What is commmunity leaders?")

In [82]:
response2

{'query': 'What is commmunity leaders?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n7.8 Experience and perceptions of violence  81 \n7.9 What adolescent boys think of violence against women and girls – who is responsible?  82\n\n7.9 What  adolescent  boys  think  of violence  again st women  and girls  – who is \nresponsible?   \n \nWhile boys acknowledged that girls are exposed to and experience violence in the home and in\n\nsafeguard  them  from violence :  \n \n [Interviewer:]   What  types  of violence  are girls more  exposed  to? \n[Girls:]    Restriction  on girls’ mobility .\n\nbe capable leaders, it is men who are the preferred  leaders  in the community .  \n \n“Women cannot become a leader while men are l eaders.” ( Jats adolescent girl )\n\n7.3 The importance of the community as an enabling environment  78 \n7.4 Women community leaders as r

In [83]:
print(response2.get("result"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

7.8 Experience and perceptions of violence  81 
7.9 What adolescent boys think of violence against women and girls – who is responsible?  82

7.9 What  adolescent  boys  think  of violence  again st women  and girls  – who is 
responsible?   
 
While boys acknowledged that girls are exposed to and experience violence in the home and in

safeguard  them  from violence :  
 
 [Interviewer:]   What  types  of violence  are girls more  exposed  to? 
[Girls:]    Restriction  on girls’ mobility .

be capable leaders, it is men who are the preferred  leaders  in the community .  
 
“Women cannot become a leader while men are l eaders.” ( Jats adolescent girl )

7.3 The importance of the community as an enabling environment  78 
7.4 Women community leaders as role models  78 
7.5 Gender norms  78 
7.6 Education and pathways to empow