In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.llms import CTransformers
from langchain_community.embeddings import HuggingFaceEmbeddings
import os
import json

In [2]:
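# Prerequisite: the langchain-community package must be installed in the kernel's environment.
# Uncomment the next line on first run if it is missing:
# %pip install langchain-community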

In [3]:
def _entry_to_document(entry, filepath):
    """Turn one decoded JSON value into a ``Document``.

    For a dict, the page text is the first truthy value among the
    ``"markdown"``, ``"text"`` and ``"content"`` keys, falling back to the
    JSON serialization of the whole dict; the URL is the ``"url"`` key,
    defaulting to ``filepath``. A bare string is used as the text directly,
    and any other value is serialized with ``json.dumps``.
    """
    if isinstance(entry, dict):
        text = (
            entry.get("markdown")
            or entry.get("text")
            or entry.get("content")
            or json.dumps(entry)
        )
        url = entry.get("url", filepath)
    else:
        # Array items are not guaranteed to be objects; calling .get() on a
        # string or number would raise AttributeError.
        text = entry
        url = filepath

    # page_content must be a string; serialize anything else (e.g. a nested
    # list stored under "content") instead of passing it through.
    if not isinstance(text, str):
        text = json.dumps(text)

    return Document(page_content=text, metadata={"source": filepath, "url": url})


def load_json(folder_path):
    """Load every ``*.json`` file in ``folder_path`` into ``Document`` objects.

    Args:
        folder_path: Directory to scan (non-recursive).

    Returns:
        A list of ``Document`` objects. A file whose top-level value is a
        list contributes one document per item; a top-level dict contributes
        a single document. Files whose top-level value is anything else are
        ignored. Each document's metadata carries ``"source"`` (the file
        path) and ``"url"``.

    Raises:
        OSError: If the folder or a file cannot be read.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    documents = []

    # os.listdir returns names in arbitrary order; sorting keeps the document
    # order stable between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(folder_path, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)

        if isinstance(data, list):
            documents.extend(_entry_to_document(entry, filepath) for entry in data)
        elif isinstance(data, dict):
            documents.append(_entry_to_document(data, filepath))

    return documents


In [4]:
# Ingest every scraped JSON file under data/ and report how many documents came back.
extracted_data = load_json(folder_path="data/")
print(f"Loaded {len(extracted_data)} documents")

Loaded 163 documents


In [5]:
def text_split(extracted_data, chunk_size=300, chunk_overlap=20):
    """Split documents into overlapping chunks.

    Args:
        extracted_data: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value passed to the splitter as ``chunk_size``.
            Defaults to 300, the value this notebook has always used.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 20.

    Returns:
        Whatever ``split_documents`` returns for the given input.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

# Chunk the loaded documents; `text_chunks` is what the FAISS index is built from.
text_chunks = text_split(extracted_data=extracted_data)
print(f"Number of chunks: {len(text_chunks)}")

Number of chunks: 6807


In [6]:
def download_hugging_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the ``HuggingFaceEmbeddings`` object used to embed text.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``"sentence-transformers/all-MiniLM-L6-v2"``, the
            model this notebook has always used.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Instantiate the embedding model once; `embeddings` is reused when the index is built.
embeddings = download_hugging_face_embeddings()

# Smoke test: embed one short string and report the length of the returned vector.
query_result = embeddings.embed_query("Hello world")
print(f"Query Embedding Length: {len(query_result)}")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


Query Embedding Length: 384


In [7]:
# FAISS is imported in the notebook's first cell, so it is not re-imported here.

# Build FAISS vector store from text chunks
docsearch = FAISS.from_documents(
    documents=text_chunks,
    embedding=embeddings
)

# Perform similarity search on a query
query = "What are allergies?"
docs = docsearch.similarity_search(query, k=3)

# Display the top 3 most relevant documents (content truncated to 500 characters)
for i, doc in enumerate(docs, 1):
    print(f"\nDocument {i}:\n{doc.page_content[:500]}...")
    print(f"Metadata: {doc.metadata}")



Document 1:
These may include vaccines for measles, mumps, and rubella (MMR), diphtheria, tetanus, pertussis (whooping cough), polio, varicella (chickenpox), meningococcal disease, seasonal flu, and others. Pre-travel vaccines and medications When travelling in this destination, you might be at risk for...
Metadata: {'source': 'data/scrape_results_20250526_124734.json', 'url': 'https://travel.gc.ca/destinations/india'}

Document 2:
dust, and other flying particles. Wear the protective equipment that is intended and recommended for your particular task. Seat belts Safety glasses or goggles Protective clothing, headgear, and/or footgear Safety harnesses, etc. ---- Workplace Safety Tips: Sources ---- (1) (2) (3) (4) (5) (6) (7)...
Metadata: {'source': 'data/scrape_results_20250526_140158.json', 'url': 'https://www.atlantictraining.com/safety-tips/workplace-safety-tips.php'}

Document 3:
a well-fitting mask, especially: if you’re sick and need to be around others when you’re at large indo

In [8]:
# Prompt for the QA chain: {context} and {question} are the two placeholders
# declared as input variables below.
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Passed through to the chain constructor as `chain_type_kwargs`.
chain_type_kwargs = dict(prompt=PROMPT)

In [9]:
# CTransformers is imported from langchain_community.llms in the notebook's first
# cell; re-importing it here from the legacy `langchain.llms` path would rebind
# that name, so the duplicate import is omitted.

# Local GGML Llama-2 chat model loaded through CTransformers.
llm = CTransformers(
    model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={
        'max_new_tokens': 512,
        'temperature': 0.8
    }
)


In [14]:
# RetrievalQA is imported in the notebook's first cell, so it is not re-imported here.

# "stuff" chain over the FAISS retriever. The retriever is configured with
# k=1, and source documents are returned alongside the answer so the chat
# loop can print their URLs.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={"k": 1}),
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True
)



In [17]:
# Interactive question/answer loop. Ends when the user types 'exit', interrupts
# the kernel, or the input stream is closed.
while True:
    try:
        # Strip surrounding whitespace so " exit " still ends the session.
        user_input = input("\nInput your question (type 'exit' to stop): ").strip()
        if user_input.lower() == 'exit':
            print("\nExiting chat... Goodbye!")
            break

        # A blank line carries no question; prompt again instead of querying the chain.
        if not user_input:
            continue

        result = qa_chain.invoke({"query": user_input})
        response = result["result"]

        print(f"\nUser Question: {user_input}")
        print(f"Bot Response: {response}\n")

        # Show source document URLs, falling back to the file path stored in "source"
        if "source_documents" in result:
            for doc in result["source_documents"]:
                url = doc.metadata.get("url") or doc.metadata.get("source", "Unknown")
                print(f"Source: {url}")

    except (KeyboardInterrupt, EOFError):
        # EOFError is what input() raises when stdin is exhausted or closed.
        print("\nExiting chat...")
        break



User Question: How can I keep my belongings secure on a crowded bus or metro?
Bot Response: Consider using anti-theft backpacks or purses that are designed specifically for public transportation security. These bags typically have slash-proof fabric, reinforced zippers, and other features to help keep your belongings safe from thieves and pickpockets. Additionally, try to keep your valuables close to you at all times and be aware of your surroundings to avoid losing or having them stolen.

Source: https://www.littleexplainers.com/public-transport-safety/

User Question: How do I identify and avoid risky areas when walking through a city?
Bot Response: You can identify risky areas in a city by being aware of your surroundings and paying attention to local crime statistics. Look for areas with high crime rates, especially at night or in isolated locations. Additionally, you can ask locals or shop owners for advice on safe routes and areas. Finally, use online resources such as crime map