# RAG with FOREIGNERS DIVISION Documents

- Extract the documents (from website or from google drive)
- partition the pdf files using Unstructured library
- Chunk the pdf files into documents
- Embed the documents using an open source embedings (or cloudflare)
- store the embeddings in a vector database (MongoDB, FAISS, ChromaDB)
- Try querying the database with a new document and create a simple interface to query the database
- Create a RAG chatbot to query the database using Streamlit, Gradio, and Flask

In [None]:
!pip install "unstructured[all-docs]"
!pip install gdown

### Extract the documents (from website or from google drive)

In [None]:
from bs4 import BeautifulSoup
import requests
import urllib.parse

# source of data
foreigners_division = "https://www.mha.gov.in/en/divisionofmha/foreigners-division"
main_content_id = "block-mhanew-content" # use Inspect and find the main content id for the page (hardcoded part)
tree_links_levels = 3 # how deep you want to go into the website (hardcoded part or estimated check the website)

# simple function to extract links in proper format
def extract_links(links):
    extracted_links = []
    for link in links:
        if link.get('href') is None:
            continue
        if link.get('href').startswith('http'):
            extracted_links.append(link.get('href'))
        elif link.get('href').startswith('/'):
            extracted_links.append(urllib.parse.urljoin(foreigners_division, link.get('href')))
    return extracted_links

# extract the links from the main div only as the given id
def extract_links_from_div(url: str, div_id: str):
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        div = soup.find(id=div_id)
        links = div.find_all('a')
        links = extract_links(links)
        print(f"Links extracted successfully from {url}")
        return links
    except Exception as e:
        print(f"Error while extracting links from {url} - {e}")
        return []

# extract links for the given levels only
for i in range(tree_links_levels):
    if i == 0:
        links = extract_links_from_div(foreigners_division, main_content_id)
    else:
        temp_links = []
        for link in links:
            if link.endswith('.pdf'):
                continue
            temp_links.extend(extract_links_from_div(link, main_content_id))
        links = temp_links

# drop all duplicates and links that are not pdf file
links = [link for link in set(links) if link.endswith('.pdf')]

print(f"Total links extracted: {len(links)}")


In [None]:
import os

# download the pdfs and store them in the same directory
directory = "pdfs"
os.makedirs(directory, exist_ok=True)

for link in links:
    try:
        filename = os.path.join(directory, link.split('/')[-1])
        response = requests.get(link)

        with open(filename, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded {filename}")
    except Exception as e:
        print(f"Error while downloading {link} - {e}")

print("Download completed")

In [None]:
# OR load the pdfs from a google drive folder

import gdown

google_dirve_folder = "https://drive.google.com/drive/folders/1scxrYUke_pVz2I1ZEQWqsPSJFiw7e5HA?usp=sharing"
files_with_path = gdown.download_folder(google_dirve_folder)
print(f"Downloaded {len(files_with_path)} files from google drive folder")


### partition the pdf files using Unstructured library

I have checked the pdf files and they are not very consistant, so I will use the pdf partition function with auto mode so if the text is not extractable then OCR will be performed, so you need to install Full Unstrcutured installation from [here](https://docs.unstructured.io/open-source/installation/full-installation)

Install Unstructured library using pip
```bash
pip install "unstructured[all-docs]"
# or for pdf files only
pip install "unstructured[pdf]"

# then install system dependencies
# tesseract
sudo apt install tesseract-ocr # or brew install tesseract
sudo apt install libtesseract-dev # or brew install tesseract-lang # you will need to set the TESSDATA_PREFIX environment variable to the directory containing the tessdata directory
sudo apt install tesseract-ocr-hin # for support of hindi language

# poppler
sudo apt install poppler-utils # or brew install poppler

# libreoffice
sudo apt install libreoffice # or brew install --cask libreoffice

# pandoc, check the latest version from https://github.com/jgm/pandoc/releases
wget https://github.com/jgm/pandoc/releases/download/3.2.1/pandoc-3.2.1-1-amd64.deb
sudo dpkg -i pandoc-3.2.1-1-amd64.deb

```

In [None]:
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev
!sudo apt install tesseract-ocr-hin
!sudo apt install poppler-utils
!sudo apt install libreoffice
!wget https://github.com/jgm/pandoc/releases/download/3.2.1/pandoc-3.2.1-1-amd64.deb
!sudo dpkg -i pandoc-3.2.1-1-amd64.deb

In [3]:
from unstructured.partition.auto import partition
import os

pdfs_directory = "Foreigners Division MHA"

def preprocess_pdfs(directory):
  elements = []
  for root, _, files in os.walk(directory):
    for file in files:
        if file.endswith(".pdf"):
            elems = partition(filename=os.path.join(root,file))
            elements.extend(elems)
  return elements

final_elements = preprocess_pdfs(pdfs_directory)

print(f"Extracted {len(final_elements)} elements from the pdf")

Extracted 6950 elements from the pdf


In [None]:
!pip install sacremoses transformers # for translation model dependencies

In [5]:
import re
from unstructured.cleaners.core import (
    clean, remove_punctuation, clean_non_ascii_chars,
    clean_ordered_bullets, group_broken_paragraphs
)

para_split_re = re.compile(r"(\s*\n\s*){3}")

def clean_text(x):
  # Handle the case when x.text is empty or whitespace
  if x.text.strip():  # Check if text is not empty after stripping whitespace
    x.text = clean_ordered_bullets(x.text)
    x.text = group_broken_paragraphs(x.text,paragraph_split=para_split_re)
    x.text = clean(x.text, extra_whitespace=True, bullets=True,
                      dashes=True, trailing_punctuation=True, lowercase=True)
    x.text = remove_punctuation(x.text)
    x.text = clean_non_ascii_chars(x.text)

for element in final_elements:
  clean_text(element)

In [6]:
from collections import Counter

display(Counter(type(element) for element in final_elements))

Counter({unstructured.documents.elements.NarrativeText: 2045,
         unstructured.documents.elements.Text: 2019,
         unstructured.documents.elements.Title: 1877,
         unstructured.documents.elements.ListItem: 788,
         unstructured.documents.elements.Footer: 182,
         unstructured.documents.elements.Header: 39})

In [7]:
# chunk by title
from unstructured.chunking.title import chunk_by_title

chunks = chunk_by_title(final_elements)
print(f"Chunks created: {len(chunks)}")

Chunks created: 1577


In [8]:
print(chunks[0].metadata.to_dict())

{'file_directory': 'Foreigners Division MHA', 'filename': 'ForeignVisa_030214[1].pdf', 'filetype': 'application/pdf', 'languages': ['eng'], 'last_modified': '2024-07-14T19:34:42', 'page_number': 1, 'orig_elements': 'eJzllE1v1DAQhv+KlXNbbMf56g0JIZAAIVG4rFarSTxJXCV2lDhtl8J/Z5xklwrKkVNvyeN34vnKu3uMsMMerT8YHV2zCHieKp0lRaXzBDEtYi641lKmpeJFmkcXLOrRgwYPpH+MKudGbSx4nJb3Do5u9ocWTdN6IipLM4rZ8L3RviUaJ7EiOjhjfYjb7YTM+BW/YGkmrvj+gp2BkLFciSyS7HmyBhGJpuPksQ+VfDYP2H0ZoMLoJx3UpsODNiNW3o3HIHjrRkrS4jixN+bOTMZZ9vHd62gTW+jxieybmeDAYy6F2on91aDrk9Afh0UIw9CZCjx959V23IFtZmiW1uwitE20X+jkD73Tpja4NF1yqS55dinUjSiuY3WtZIgeKPJg577EkVQiVOHxwS8RCZdScC65iFnNfrDG3eFowyCZq5mx2gDrjTWTH4+BtK5HBnUNhsqtf1euT5VbXVXMdKycTUfzbNgt0IFtWjY60MziPdPYtYb5FpnM6djOQN+mDFRI9tSFr5ZagI0bzXfUNyFdyvvPLeNVDEmqapGqUiZYxCLTOi51EqNO0xT+25blatspla8bdAYp38Bpx/4CS8gL2rGnU/1Ai/Q+FP3MMEuhsBBVBUpAnmc8y+qcx1rztNRCJfq/DVNm62TidBvmCSSiOPuD+gdZg17QOKe5vKVKyBvIE+Yq3DmxETu6nf527+i5CXaxZBMsY3MJZhcC3cQq12/a1WBI16OmAjpGzYDzC3iPVgN5UcBP9+gTjOGCO1ydYf8LOYn+cg==

In [12]:
!pip install langchain
!pip install langchain_community
!pip install sentence-transformers
!pip install faiss-cpu
!pip install chromadb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.7 kB)
Downloading faiss_cpu-1.8.0.post1-cp312-cp312-macosx_11_0_arm64.whl (6.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0.post1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [13]:
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

documents = []
for element in chunks:
    metadata = element.metadata.to_dict()
    documents.append(Document(page_content=element.text, metadata=metadata))

db = FAISS.from_documents(documents, HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5"))
db.save_local("db")

In [14]:
for document in documents:
  document.metadata["languages"] = "en"

In [23]:
# embeddings with cloudflare with chroma
from langchain_community.embeddings.cloudflare_workersai import CloudflareWorkersAIEmbeddings

account_id = "4da66dac8f0c0483794586300c5ccc66"
api_token = "BZwp7xKEITzRnORUTRIzMBBz_TZzRMgx26DZSqgI"
model_name =  "@cf/baai/bge-small-en-v1.5"

cf = CloudflareWorkersAIEmbeddings(
    account_id=account_id,
    api_token=api_token,
    model_name=model_name
)

from langchain_community.vectorstores import Chroma

vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=cf,
    collection_name="fd_data_hello",
    persist_directory="./local_chroma",
    
)



  vectorstore = Chroma.from_documents(


In [38]:
vectorstore.persist()

  warn_deprecated(


In [24]:
%pip install --quiet langchain langchain-mongodb langchain-openai pymongo pypdf


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [25]:
# cloudflare embedings and store index into mongodb
import getpass, os, pymongo, pprint
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymongo import MongoClient

ATLAS_CONNECTION_STRING = "mongodb+srv://mohammedbinbasri:rOYYdFRTG9ErECH8@cluster0.cyqutk5.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
# Connect to your Atlas cluster
client = MongoClient(ATLAS_CONNECTION_STRING)

# Define collection and index name
db_name = "langchain_db"
collection_name = "test"
atlas_collection = client[db_name][collection_name]
vector_search_index = "vector_index"

In [26]:
# Create the vector store
vector_store = MongoDBAtlasVectorSearch.from_documents(
    documents = documents,
    embedding = cf,
    collection = atlas_collection,
    index_name = vector_search_index
)

In [28]:
retrieve = vector_store.as_retriever(k=5)

In [29]:
query = "foreigners division"
results = retrieve(query)
for result in results:
    print(result.metadata)
    print(result.text)

TypeError: 'VectorStoreRetriever' object is not callable