In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain, RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
import numpy as np

In [None]:
# --- Ingest -----------------------------------------------------------------
# Load the PDFs from the ./data directory into LangChain documents.
loader = PyPDFDirectoryLoader("./data")
docs = loader.load()

# --- Chunk ------------------------------------------------------------------
# Split the loaded documents into overlapping chunks (size 1000, overlap 200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
text_split = text_splitter.split_documents(docs)

# --- Embed and index --------------------------------------------------------
# Embed the chunks with the Ollama model and store them in a FAISS index.
embedding_model = OllamaEmbeddings(model="llama3.2:3b")
vector_db = FAISS.from_documents(
    documents=text_split,
    embedding=embedding_model,
)

  embedding_model = OllamaEmbeddings(model="llama3.2:3b")


In [None]:
# Text embeddings from the Hugging Face BGE model "BAAI/bge-small-en-v1.5".
# The model is placed on the CPU and asked to normalise its output vectors.
bge_model_kwargs = {"device": "cpu"}
bge_encode_kwargs = {"normalize_embeddings": True}

huggingface_embedding = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs=bge_model_kwargs,
    encode_kwargs=bge_encode_kwargs,
)

In [None]:
# Embed the first chunk once and display the vector's shape.
# The embedding is kept in a variable so the model is not run a second time
# just to read `.shape`.
sample_embedding = np.array(
    huggingface_embedding.embed_query(text_split[0].page_content)
)
sample_embedding.shape

(384,)

In [58]:
# Build a FAISS index over the first 120 chunks using the Hugging Face embeddings.
indexed_chunks = text_split[:120]
db1 = FAISS.from_documents(
    documents=indexed_chunks,
    embedding=huggingface_embedding,
)


In [59]:
# Question to look up in the FAISS index.
query = "WHAT IS HEALTH INSURANCE COVERAGE?"

# Run a similarity search and display the text of the first hit.
search = db1.similarity_search(query=query)
search[0].page_content

'2 U.S. Census Bureau\nWHAT IS HEALTH INSURANCE COVERAGE?\nThis brief presents state-level estimates of health insurance coverage \nusing data from the American Community Survey (ACS). The  \nU.S. Census Bureau conducts the ACS throughout the year; the \nsurvey asks respondents to report their coverage at the time of \ninterview. The resulting measure of health insurance coverage, \ntherefore, reflects an annual average of current comprehensive \nhealth insurance coverage status.* This uninsured rate measures a \ndifferent concept than the measure based on the Current Population \nSurvey Annual Social and Economic Supplement (CPS ASEC). \nFor reporting purposes, the ACS broadly classifies health insurance \ncoverage as private insurance or public insurance. The ACS defines \nprivate health insurance as a plan provided through an employer \nor a union, coverage purchased directly by an individual from an \ninsurance company or through an exchange (such as healthcare.'

In [60]:
# Expose the FAISS index as a retriever: similarity search with k=3.
retriever = db1.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000018705A37050>, search_kwargs={'k': 3})

In [None]:
import os

# Read the Hugging Face access token from the environment (None when unset).
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

# The HuggingFaceHub wrapper looks the token up under HUGGINGFACEHUB_API_TOKEN,
# so publish it under that name as well. `setdefault` keeps any value that is
# already configured, and nothing is written when no token was found.
if HUGGINGFACE_TOKEN:
    os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", HUGGINGFACE_TOKEN)

In [62]:
from langchain_community.llms import HuggingFaceHub

# Generation settings forwarded to the hosted model.
generation_kwargs = {"temperature": 0.1, "max_length": 500}

# LLM served through the Hugging Face Hub wrapper.
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs=generation_kwargs,
)

# Query the LLM directly, without any retrieved context.
query = "What is the health insurance coverage?"
hf.invoke(query)



"What is the health insurance coverage? Health insurance coverage is a type of insurance that covers the cost of medical and surgical expenses incurred by the insured. Here are some key aspects of health insurance coverage:\n\n1. **Types of Coverage:**\n   - **Indemnity Plans:** These plans allow you to visit any healthcare provider, and the insurance company will reimburse you for a portion of the cost.\n   - **Managed Care Plans:** These plans have networks of healthcare providers, and you typically need to choose a primary care physician (PCP) who will coordinate your care. Examples include HMOs (Health Maintenance Organizations) and PPOs (Preferred Provider Organizations).\n   - **High-Deductible Health Plans (HDHPs):** These plans have lower premiums but higher deductibles. They often come with a Health Savings Account (HSA) for tax-advantaged savings.\n\n2. **What's Covered:**\n   - **Preventive Care:** Regular check-ups, immunizations, screenings, and other services to prevent i

In [44]:
# Prompt text for the QA chain. `{context}` and `{question}` are the two
# placeholders that get filled in when the template is formatted.
prompt_template = (
    "\n"
    "Use the following piece of context to answer the question asked.\n"
    "Please try to provide the answer only based on the context\n"
    "\n"
    "{context}\n"
    "Question:{question}\n"
    "\n"
    "Helpful Answers:\n"
)

In [45]:
# Wrap the template text in a PromptTemplate with its two input variables.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)


In [49]:
# Assemble the RetrievalQA chain: the "stuff" chain type with the custom
# prompt, the FAISS retriever, and source documents included in the result.
qa_chain_kwargs = {"prompt": prompt}

retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=qa_chain_kwargs,
    return_source_documents=True,
)

In [50]:
# Define the query
# This string is the question passed to the RetrievalQA chain in the next cell.
query = "DIFFERENCES IN THE UNINSURED RATE BY STATE IN 2022"

In [51]:
# Run the QA chain on the query and print the text stored under "result".
qa_inputs = {"query": query}
result = retrievalQA.invoke(qa_inputs)
print(result["result"])




Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

comparison of ACS and CPS ASEC measures 
of health insurance coverage, refer to < www.
census.gov/topics/health/health-insurance/
guidance.html >.
9 Respondents may have more than one 
health insurance coverage type at the time 
of interview. As a result, adding the total 
number of people with private coverage and 
the total number with public coverage will 
sum to more than the total number with any 
coverage.• From 2021 to 2022, nine states 
reported increases in private 
coverage, while seven reported 
decreases (Appendix Table B-2). 
DIFFERENCES IN THE 
UNINSURED RATE BY STATE 
IN 2022
In 2022, uninsured rates at the 
time of interview ranged across 
states from a low of 2.4 percent 
in Massachusetts to a high of 16.6 
percent in Texas, compared to the 
national rate of 8.0 percent.10 Ten 
of the 15 states with uninsured 
10 The uninsured rates in the Distr

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain, RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
import numpy as np
import pandas as pd
from Main import analysis_data  

# 1. Load the CSV file so its rows can be converted to text.
file_path = "Regions.csv"  # put your file name here
df = pd.read_csv(file_path)  # read the CSV file

# 2. Convert every row into one "column: value | column: value | ..." string.
documents = [
    " | ".join(f"{col}: {str(row[col])}" for col in df.columns)
    for _, row in df.iterrows()
]
    
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Chunk the CSV row strings built above. `documents` is a list of plain
# strings, so `create_documents` is used: it wraps each text in a Document and
# splits it. Splitting `docs` here would index whatever an earlier cell left
# in the kernel rather than the CSV that was just loaded.
text_split = text_splitter.create_documents(documents)

# Initialize the embedding model
embedding_model = OllamaEmbeddings(model="llama3.2:3b")

# Initialize the vector database
vector_db = FAISS.from_documents(text_split, embedding_model)

# Text embeddings from the Hugging Face BGE model, run on CPU with
# normalised output vectors.
huggingface_embedding = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# Embed the first chunk once and report the vector shape. A bare expression in
# the middle of a cell displays nothing, so the shape is printed explicitly
# and the embedding is not computed a second time.
sample_embedding = np.array(
    huggingface_embedding.embed_query(text_split[0].page_content)
)
print(sample_embedding.shape)

# Build the FAISS index over the first 120 chunks.
db1 = FAISS.from_documents(text_split[:120], huggingface_embedding)

# Query the vector database.
# The question is defined here so that this cell does not depend on a `query`
# value left behind by a previously executed cell.
query = "Analyze the dataset"
search = db1.similarity_search(query)

# Show the best match, if any; a bare expression mid-cell would display nothing.
if search:
    print(search[0].page_content)

# Initialize the retriever (similarity search, k=3).
retriever = db1.as_retriever(search_type="similarity", search_kwargs={"k": 3})

import os

# Read the Hugging Face access token from the environment (None when unset).
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

# The HuggingFaceHub wrapper looks the token up under HUGGINGFACEHUB_API_TOKEN,
# so publish it under that name as well. `setdefault` keeps any value that is
# already configured, and nothing is written when no token was found.
if HUGGINGFACE_TOKEN:
    os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", HUGGINGFACE_TOKEN)

from langchain_community.llms import HuggingFaceHub

# Generation settings forwarded to the hosted model.
generation_kwargs = {"temperature": 0.1, "max_length": 500}

# LLM served through the Hugging Face Hub wrapper.
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs=generation_kwargs,
)

# Query the LLM directly, without any retrieved context.
query = "Analyze the dataset"
hf.invoke(query)

# Prompt text for the QA chain. `{context}` and `{question}` are the two
# placeholders that get filled in when the template is formatted.
prompt_template = (
    "\n"
    "Use the following piece of context to answer the question asked.\n"
    "Please try to provide the answer only based on the context\n"
    "\n"
    "{context}\n"
    "Question:{question}\n"
    "\n"
    "Helpful Answers:\n"
)

# Wrap the template text in a PromptTemplate with its two input variables.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Assemble the RetrievalQA chain: the "stuff" chain type with the custom
# prompt, the FAISS retriever, and source documents included in the result.
qa_chain_kwargs = {"prompt": prompt}

retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=qa_chain_kwargs,
    return_source_documents=True,
)

# Build the query with the project helper, passing the DataFrame and the chain.
# NOTE(review): analysis_data is imported from Main and is not visible here;
# presumably it returns the question text to ask - confirm against Main.
query = analysis_data(df, retrievalQA)

# Run the QA chain on that query and print the text stored under "result".
qa_inputs = {"query": query}
result = retrievalQA.invoke(qa_inputs)
print(result["result"])