In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain, RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
import numpy as np

In [2]:
# Read every PDF found under ./data into LangChain documents.
loader = PyPDFDirectoryLoader("./data")
docs = loader.load()

# Chunking configuration: chunk_size of 1000 with an overlap of 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)

# Break the loaded documents into chunks.
text_split = text_splitter.split_documents(docs)

# Embeddings come from the Ollama "llama3.2:3b" model.
embedding_model = OllamaEmbeddings(model="llama3.2:3b")

# Embed every chunk and keep the vectors in a FAISS index.
vector_db = FAISS.from_documents(documents=text_split, embedding=embedding_model)

  embedding_model = OllamaEmbeddings(model="llama3.2:3b")


In [3]:
# Hugging Face BGE embedder: the small English model, run on the CPU,
# with embedding normalization switched on.
huggingface_embedding = HuggingFaceBgeEmbeddings(
    encode_kwargs=dict(normalize_embeddings=True),
    model_kwargs=dict(device="cpu"),
    model_name="BAAI/bge-small-en-v1.5",
)




In [4]:
# Embed the first chunk a single time and display the dimensionality of the
# resulting vector. Only the last expression of a cell is rendered, so the
# embedding is kept in a variable instead of being computed twice.
first_chunk_vector = np.array(
    huggingface_embedding.embed_query(text_split[0].page_content)
)
first_chunk_vector.shape

(384,)

In [5]:
# Build a FAISS index over the first 120 chunks using the Hugging Face embedder.
db1 = FAISS.from_documents(
    documents=text_split[:120],
    embedding=huggingface_embedding,
)


In [6]:
# Question to look up in the index.
query = "WHAT IS HEALTH INSURANCE COVERAGE?"

# Fetch the chunks most similar to the question and show the top hit's text.
search = db1.similarity_search(query=query)
search[0].page_content

'2 U.S. Census Bureau\nWHAT IS HEALTH INSURANCE COVERAGE?\nThis brief presents state-level estimates of health insurance coverage \nusing data from the American Community Survey (ACS). The  \nU.S. Census Bureau conducts the ACS throughout the year; the \nsurvey asks respondents to report their coverage at the time of \ninterview. The resulting measure of health insurance coverage, \ntherefore, reflects an annual average of current comprehensive \nhealth insurance coverage status.* This uninsured rate measures a \ndifferent concept than the measure based on the Current Population \nSurvey Annual Social and Economic Supplement (CPS ASEC). \nFor reporting purposes, the ACS broadly classifies health insurance \ncoverage as private insurance or public insurance. The ACS defines \nprivate health insurance as a plan provided through an employer \nor a union, coverage purchased directly by an individual from an \ninsurance company or through an exchange (such as healthcare.'

In [7]:
# Expose the index as a similarity retriever that returns 3 chunks per query,
# then display its repr.
retriever = db1.as_retriever(
    search_kwargs=dict(k=3),
    search_type="similarity",
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000027C69C1F140>, search_kwargs={'k': 3})

In [15]:
import os
# Token taken from the HUGGINGFACE_TOKEN environment variable (None when unset).
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

In [16]:
from langchain_community.llms import HuggingFaceHub

# Initialize the HuggingFaceHub model.
# The token read into HUGGINGFACE_TOKEN is handed over explicitly: without it
# the constructor only looks at the HUGGINGFACEHUB_API_TOKEN environment
# variable and raises a ValidationError when that variable is missing.
# When HUGGINGFACE_TOKEN is None the environment lookup still applies.
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    huggingfacehub_api_token=HUGGINGFACE_TOKEN,
    model_kwargs={"temperature": 0.1, "max_length": 500},
)

# Smoke-test the model directly, without retrieval.
query = "What is the health insurance coverage?"
hf.invoke(query)

ValidationError: 1 validation error for HuggingFaceHub
  Value error, Did not find huggingfacehub_api_token, please add an environment variable `HUGGINGFACEHUB_API_TOKEN` which contains it, or pass `huggingfacehub_api_token` as a named parameter. [type=value_error, input_value={'repo_id': 'mistralai/Mi...acehub_api_token': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/value_error

In [None]:
# Prompt text for the QA chain, assembled line by line. The empty first and
# last entries produce the leading and trailing newline of the template;
# {context} and {question} are the placeholders filled in at query time.
prompt_template = "\n".join(
    [
        "",
        "Use the following piece of context to answer the question asked.",
        "Please try to provide the answer only based on the context",
        "",
        "{context}",
        "Question:{question}",
        "",
        "Helpful Answers:",
        "",
    ]
)

In [None]:
# Wrap the raw template text; it is filled from "context" and "question".
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)


In [None]:
# Assemble the QA chain: a "stuff" chain driven by the custom prompt, fed by
# the retriever, and configured to hand back the source documents as well.
retrievalQA = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=hf,
    chain_type_kwargs=dict(prompt=prompt),
    chain_type="stuff",
    return_source_documents=True,
)

In [None]:
# Question passed to the QA chain.
query = "DIFFERENCES IN THE UNINSURED RATE BY STATE IN 2022"

In [None]:
# Run the chain on the query and print the generated text.
result = retrievalQA.invoke(dict(query=query))
print(result["result"])




Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

comparison of ACS and CPS ASEC measures 
of health insurance coverage, refer to < www.
census.gov/topics/health/health-insurance/
guidance.html >.
9 Respondents may have more than one 
health insurance coverage type at the time 
of interview. As a result, adding the total 
number of people with private coverage and 
the total number with public coverage will 
sum to more than the total number with any 
coverage.• From 2021 to 2022, nine states 
reported increases in private 
coverage, while seven reported 
decreases (Appendix Table B-2). 
DIFFERENCES IN THE 
UNINSURED RATE BY STATE 
IN 2022
In 2022, uninsured rates at the 
time of interview ranged across 
states from a low of 2.4 percent 
in Massachusetts to a high of 16.6 
percent in Texas, compared to the 
national rate of 8.0 percent.10 Ten 
of the 15 states with uninsured 
10 The uninsured rates in the Distr

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain, RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import numpy as np
import pandas as pd
from DataAnalyzer import DataAnalyzer
import os

# HuggingFaceHub is used below; import it here so this cell does not rely on
# an earlier cell having been executed.
from langchain_community.llms import HuggingFaceHub

# The Hugging Face token must come from the environment. It is never written
# into the notebook, and the variable is never overwritten with a placeholder.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    raise EnvironmentError(
        "HUGGINGFACEHUB_API_TOKEN is not set; export it before running this cell."
    )

# Load the dataset
file_path = "Regions.csv"
df = pd.read_csv(file_path)

# One LangChain Document per CSV row, rendered as "col: value | col: value".
documents = [
    Document(page_content=" | ".join(f"{col}: {row[col]!s}" for col in df.columns))
    for _, row in df.iterrows()
]

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Split the documents into chunks
text_split = text_splitter.split_documents(documents)

# Embedding using Huggingface
huggingface_embedding = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",  # High-quality embeddings for English text
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# Create the FAISS vector store. This is the only index the retriever reads,
# so no Ollama-backed index is built. Every chunk is indexed: a fixed-size
# slice would silently drop rows once the CSV grows past that size.
db1 = FAISS.from_documents(text_split, huggingface_embedding)

# Initialize the retriever
retriever = db1.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Initialize the HuggingFaceHub model
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature": 0.1, "max_length": 500}
)

# Define the prompt template
prompt_template = """
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question: {question}

Helpful Answers:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Create the RetrievalQA chain
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

# Initialize the DataAnalyzer with the bare LLM (not the retrieval chain).
analyzer = DataAnalyzer(df, llm=hf)

# The analyzer's output is used as the question for the QA chain.
# NOTE(review): DataAnalyzer lives in another file; its return value is assumed
# to be a string -- confirm.
query = analyzer.analysis_data()

# Call the QA chain with our query
result = retrievalQA.invoke({"query": query})
print(result['result'])




Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

region_id: 19 | sales_district: Seattle | sales_region: North West

region_id: 23 | sales_district: Salem | sales_region: North West

region_id: 22 | sales_district: Portland | sales_region: North West
Question: 
        You are a data analyst. You are provided with a dataset about <class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   region_id       109 non-null    int64 
 1   sales_district  109 non-null    object
 2   sales_region    109 non-null    object
dtypes: int64(1), object(2)
memory usage: 2.7+ KB

        Here is the dataset structure:
        <class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_community.llms import HuggingFaceHub
import pandas as pd
import os

# The Hugging Face token must come from the environment. It is never written
# into the notebook, and the variable is never overwritten with a placeholder.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    raise EnvironmentError(
        "HUGGINGFACEHUB_API_TOKEN is not set; export it before running this cell."
    )

# Load the CSV data
file_path = "Regions.csv"
df = pd.read_csv(file_path)

# Build one document per row, rendered as "col: value | col: value"
documents = [
    Document(page_content=" | ".join(f"{col}: {row[col]!s}" for col in df.columns))
    for _, row in df.iterrows()
]

# Split the texts into small chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
text_split = text_splitter.split_documents(documents)

# Initialize the embedding model
embedding_model = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# Create the vector database
vector_db = FAISS.from_documents(text_split, embedding_model)

# Set up the retriever (3 most similar chunks per query)
retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Set up the language model from Hugging Face
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature": 0.1, "max_length": 500}
)

# Set up the prompt template
prompt_template = """
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context.

{context}
Question: {question}

Helpful Answers:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Set up the retrieval question-answering chain
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

# Define the data-analysis query.
# Only the 3 retrieved rows reach the model, so the answer describes those
# rows rather than the whole dataset.
query = "Analyze this dataset based on statistical trends and patterns."

# Run the analysis; the input is passed under the chain's "query" key.
result = retrievalQA.invoke({"query": query})

# Print the result
print("Analysis Result:")
print(result.get('result', 'No result found'))



Analysis Result:

Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context.

region_id: 105 | sales_district: Victoria | sales_region: Canada West

region_id: 42 | sales_district: San Francisco | sales_region: Central West

region_id: 31 | sales_district: San Francisco | sales_region: Central West
Question: Analyze this dataset based on statistical trends and patterns.

Helpful Answers:
- The sales_district "San Francisco" appears twice, both times in the sales_region "Central West".
- The sales_region "Canada West" appears only once, associated with the sales_district "Victoria".
- The sales_region "Central West" appears twice, associated with the sales_district "San Francisco" both times.
- There are no duplicate region_id values in the dataset.
- The dataset contains 3 unique region_id values, 2 unique sales_district values, and 2 unique sales_region values.


In [None]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate  # ✅ Fixed missing import
import pandas as pd
import fitz  # PyMuPDF for reading PDF files
import os  # For file path checking
from DataAnalyzer import DataAnalyzer  # Importing the class from another file

# Location on disk where the FAISS index is saved to and loaded from.
FAISS_DB_PATH = "faiss_index"

# 1️⃣ Load rules from PDF memory
def load_analysis_rules_from_memory(pdf_content):
    """Turn an in-memory PDF into one LangChain Document per page.

    Args:
        pdf_content: Raw bytes of a PDF file, passed to PyMuPDF as a stream.

    Returns:
        list[Document]: One Document per page, in page order, whose
        ``page_content`` is the text PyMuPDF extracts from that page.
    """
    # The context manager closes the PyMuPDF document once the page text has
    # been copied out, instead of leaving the handle open.
    with fitz.open(stream=pdf_content, filetype="pdf") as doc:
        return [Document(page_content=page.get_text()) for page in doc]

# 2️⃣ Train RAG system with FAISS
def train_rag_system(documents, model_name="llama2", index_path=None):
    """Build a RetrievalQA chain backed by a FAISS index, reusing a saved index.

    If a saved index exists at ``index_path`` it is loaded as-is and
    ``documents`` is NOT consulted; delete the index directory to force a
    rebuild after the source documents or the embedding model change.
    Otherwise the documents are chunked, embedded, indexed, and the index is
    saved to ``index_path``.

    Args:
        documents: LangChain documents to index when no saved index exists.
        model_name: Ollama model used for both the embeddings and the LLM.
        index_path: Directory of the FAISS index; defaults to FAISS_DB_PATH.

    Returns:
        tuple: ``(retrievalQA, llm)`` -- the retrieval chain and the Ollama
        LLM it wraps.

    Raises:
        ValueError: If no saved index exists and ``documents`` yields no
            chunks to embed.
    """
    if index_path is None:
        index_path = FAISS_DB_PATH

    embedding_model = OllamaEmbeddings(model=model_name)

    if os.path.exists(index_path):
        print("\n🔄 Loading existing FAISS index...")
        # NOTE(security): load_local unpickles data from disk, which can
        # execute arbitrary code. The flag below must only be used for index
        # directories that this code wrote itself, never for untrusted files.
        vector_db = FAISS.load_local(
            index_path,
            embedding_model,
            allow_dangerous_deserialization=True
        )
    else:
        print("\n🛠️ Generating new embeddings and saving FAISS index...")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        texts = text_splitter.split_documents(documents)
        # Fail with a clear message rather than an obscure error from inside
        # FAISS when there is nothing to embed.
        if not texts:
            raise ValueError(
                "Cannot build a FAISS index: the supplied documents contain no text."
            )
        vector_db = FAISS.from_documents(texts, embedding_model)
        vector_db.save_local(index_path)

    # Retrieve the 3 most similar chunks for each question.
    retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    llm = Ollama(model=model_name)

    # Define custom prompt
    prompt_template = """
    Use the following piece of context to answer the question asked.
    Please try to provide the answer only based on the context.

    {context}
    Question: {question}

    Helpful Answer:
    """

    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    retrievalQA = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt}
    )

    return retrievalQA, llm

# 3️⃣ Load CSV file
def load_csv(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

# 🚀 Main execution
if __name__ == "__main__":
    # Step 1: read the rules PDF into memory and turn it into documents.
    pdf_file_path = "storying.pdf"
    if os.path.exists(pdf_file_path):
        with open(pdf_file_path, "rb") as pdf_file:
            pdf_content = pdf_file.read()
    else:
        raise FileNotFoundError(f"🚨 Error: The file '{pdf_file_path}' was not found!")

    documents = load_analysis_rules_from_memory(pdf_content)

    # Step 2: build (or load) the retrieval chain and its LLM.
    retrievalQA, llm = train_rag_system(documents)

    # Step 3: load the CSV that is to be analysed.
    csv_file_path = "Regions.csv"
    if os.path.exists(csv_file_path):
        df = load_csv(csv_file_path)
    else:
        raise FileNotFoundError(f"🚨 Error: The file '{csv_file_path}' was not found!")

    # Step 4: let the analyzer produce the query, then answer it with the chain.
    analyzer = DataAnalyzer(df, llm=llm)
    query = analyzer.analysis_data()
    result = retrievalQA.invoke({"query": query})

    # Step 5: report the answer followed by the text of each source document.
    print("\n🔍 Final Analysis Result:")
    print(result['result'])

    print("\n📚 Source Documents:")
    source_documents = result['source_documents']
    for source_doc in source_documents:
        print(source_doc.page_content)


🔄 Loading existing FAISS index...

🔍 Final Analysis Result:
Based on the context provided, here are some potential actionable insights and recommendations that could be derived from the analysis:

1. Investigate the reasons behind the higher average sales amounts in the sales district: Is there a specific strategy or approach being used in the districts that is leading to higher sales? Are there any differences in the customer base or market conditions that could explain the difference?
2. Analyze the segmented data to identify patterns and trends: By breaking down the data into different segments based on regions, districts, or other factors, it may be possible to identify unique opportunities or challenges for each segment. For example, are there any specific regions where sales are underperforming, and if so, what could be the reasons?
3. Develop targeted marketing campaigns: Based on the analysis, it may be worth identifying which regions or districts are underperforming and devel