In [2]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from OprFuncs import data_infer, extract_code, extract_questions
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
import re
import pandas as pd
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_ollama import OllamaEmbeddings, OllamaLLM  # ✅ Updated imports


class DataAnalyzer:
    def __init__(self, dataframe, llm, retriever):
        self.dataframe = dataframe
        self.llm = llm
        self.retriever = retriever
        self.data_info = data_infer(dataframe)
        self.memory = []

    def analysis_data(self):
        data_info = self.data_info
        
        analysis_prompt = ''' 
        You are a data analyst. You have been provided with a dataset about {data_info}.
        Here is the dataset structure:
        {data_info}

        To enhance your analysis, you have access to a knowledge base containing relevant domain knowledge, best practices, and analytical rules.

        Please analyze the data by retrieving relevant insights from the knowledge base and provide a structured analysis in the following format: 

        1. *Key Trends and Patterns*:
        - [Describe the key trends and patterns in the data based on both the dataset and retrieved knowledge].

        2. *Anomalies or Outliers*:
        - [Identify any anomalies or outliers in the data, incorporating relevant insights from the knowledge base].

        Ensure your analysis is specific, data-driven, and incorporates retrieved domain knowledge for deeper insights.
        '''

        # Retrieve relevant knowledge from FAISS
        retrieved_docs = self.retriever.get_relevant_documents(query=data_info)
        retrieved_knowledge = "\n".join([doc.page_content for doc in retrieved_docs])

        # Define the prompt template
        analysis_template = PromptTemplate(
            input_variables=["data_info", "retrieved_knowledge"],
            template=analysis_prompt
        )

        # Create a chain for analysis data
        analysis_chain = LLMChain(llm=self.llm, prompt=analysis_template)

        # Run the analysis chain with retrieved knowledge
        analysis = analysis_chain.invoke({"data_info": data_info, "retrieved_knowledge": retrieved_knowledge})

        # Ensure that `analysis` is a string before adding it to memory
        if isinstance(analysis, dict) and "text" in analysis:
            analysis = analysis["text"]
        else:
            analysis = str(analysis)

        formatted_analysis_prompt = analysis_prompt.format(data_info=data_info, retrieved_knowledge=retrieved_knowledge)
        self.memory.append(HumanMessage(content=formatted_analysis_prompt))
        self.memory.append(AIMessage(content=analysis))

        # Return the analysis
        return analysis 


from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
import fitz  # PyMuPDF for reading PDF files
import os  # For file path checking


# Define FAISS database path
FAISS_DB_PATH = "faiss_index"

# 1️⃣ Load rules from PDF memory
def load_analysis_rules_from_memory(pdf_content):
    doc = fitz.open(stream=pdf_content, filetype="pdf")
    documents = [Document(page_content=page.get_text()) for page in doc]
    return documents

# 2️⃣ Train RAG system with FAISS
def train_rag_system(documents):
    """Train or load the RAG model with FAISS to avoid recomputation."""
    embedding_model = OllamaEmbeddings(model="llama2")  # ✅ Updated class
    
    if os.path.exists(FAISS_DB_PATH):
        print("\n🔄 Loading existing FAISS index...")
        vector_db = FAISS.load_local(
            FAISS_DB_PATH, 
            embedding_model, 
            allow_dangerous_deserialization=True
        )
    else:
        print("\n🛠️ Generating new embeddings and saving FAISS index...")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
        texts = text_splitter.split_documents(documents)
        vector_db = FAISS.from_documents(texts, embedding_model)
        vector_db.save_local(FAISS_DB_PATH)
    
    retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={"k": 5})
    llm = OllamaLLM(model="llama2")  # ✅ Updated class
    
    retrievalQA = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )
    
    return retrievalQA, llm

# 3️⃣ Load CSV file
def load_csv(file_path):
    return pd.read_csv(file_path)

# 🚀 Main execution
if __name__ == "__main__":
    # Load rules from PDF file
    pdf_file_path = "storying.pdf"
    if not os.path.exists(pdf_file_path):
        raise FileNotFoundError(f"🚨 Error: The file '{pdf_file_path}' was not found!")
    
    with open(pdf_file_path, "rb") as file:
        pdf_content = file.read()
    
    documents = load_analysis_rules_from_memory(pdf_content)
    
    # Train RAG model
    retrievalQA, llm = train_rag_system(documents)
    
    # Load CSV data
    csv_file_path = "Regions.csv"
    if not os.path.exists(csv_file_path):
        raise FileNotFoundError(f"🚨 Error: The file '{csv_file_path}' was not found!")

    df = load_csv(csv_file_path)
    
    # ✅ Pass retriever properly
    analyzer = DataAnalyzer(df, llm=llm, retriever=retrievalQA.retriever)
    
    # Perform data analysis and generate query
    analysis_result = analyzer.analysis_data()
    print("\n📊 Analysis Result:")
    print(analysis_result)

    # Send a specific question to RAG Agent
    question = "What is the average sales?"
    result = retrievalQA.invoke({"query": question})
    print("\n🔍 Answer:")
    print(result['result'])



🔄 Loading existing FAISS index...


  retrieved_docs = self.retriever.get_relevant_documents(query=data_info)
  analysis_chain = LLMChain(llm=self.llm, prompt=analysis_template)



📊 Analysis Result:

1. Key Trends and Patterns:
Based on the provided dataset and knowledge base, we can identify several key trends and patterns in the sales data:

a) Region-wise Sales Distribution: The dataset shows a clear pattern of higher sales in regions with a larger population and lower sales in rural areas. This trend is consistent across all three years, indicating that there is a strong correlation between population density and sales.

b) Seasonality: There is a noticeable seasonal pattern in the data, with higher sales during the summer months (June to August) and lower sales during the winter months (December to February). This trend suggests that sales are influenced by weather patterns and seasonal events.

c) Product Popularity: The dataset shows that certain products are consistently more popular than others across different regions. For example, product A is consistently the top-selling product in region 1, while product B is the top-selling product in region 2. Th

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from OprFuncs import data_infer, extract_code, extract_questions
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
import re
import pandas as pd
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
import fitz  # PyMuPDF for reading PDF files
import os  # For file path checking


class DataAnalyzer:
    def __init__(self, dataframe, llm, retriever):
        self.dataframe = dataframe
        self.llm = llm
        self.retriever = retriever
        self.data_info = data_infer(dataframe)
        self.memory = []

    def analysis_data(self):
        data_info = self.data_info
        
        analysis_prompt = ''' 
        You are a data analyst. You have been provided with a dataset about {data_info}.
        Here is the dataset structure:
        {data_info}

        To enhance your analysis, you have access to a knowledge base containing relevant domain knowledge, best practices, and analytical rules.

        Please analyze the data by retrieving relevant insights from the knowledge base and provide a structured analysis in the following format: 

        1. *Key Trends and Patterns*:
        - [Describe the key trends and patterns in the data based on both the dataset and retrieved knowledge].

        2. *Anomalies or Outliers*:
        - [Identify any anomalies or outliers in the data, incorporating relevant insights from the knowledge base].

        Ensure your analysis is specific, data-driven, and incorporates retrieved domain knowledge for deeper insights.
        '''

        # Retrieve relevant knowledge from FAISS
        retrieved_docs = self.retriever.invoke(input=data_info)  # Fixed: Use `input` instead of `query`
        retrieved_knowledge = "\n".join([doc.page_content for doc in retrieved_docs])

        # Define the prompt template
        analysis_template = PromptTemplate(
            input_variables=["data_info", "retrieved_knowledge"],
            template=analysis_prompt
        )

        # Create a chain for analysis data using RunnableSequence
        analysis_chain = analysis_template | self.llm  # Updated approach

        # Run the analysis chain with retrieved knowledge
        analysis = analysis_chain.invoke({"data_info": data_info, "retrieved_knowledge": retrieved_knowledge})

        # Ensure that `analysis` is a string before adding it to memory
        if isinstance(analysis, dict) and "text" in analysis:
            analysis = analysis["text"]
        else:
            analysis = str(analysis)

        formatted_analysis_prompt = analysis_prompt.format(data_info=data_info, retrieved_knowledge=retrieved_knowledge)
        self.memory.append(HumanMessage(content=formatted_analysis_prompt))
        self.memory.append(AIMessage(content=analysis))
        
        self.retriever.vectorstore.add_documents([Document(page_content=analysis)])

        # Return the analysis
        return analysis 


# Define FAISS database path
FAISS_DB_PATH = "faiss_index"

# 1️⃣ Load rules from PDF memory
def load_analysis_rules_from_memory(pdf_content):
    doc = fitz.open(stream=pdf_content, filetype="pdf")
    documents = [Document(page_content=page.get_text()) for page in doc]
    return documents

# 2️⃣ Train RAG system with FAISS
def train_rag_system(documents):
    """Train or load the RAG model with FAISS to avoid recomputation."""
    embedding_model = OllamaEmbeddings(model="llama2")
    
    if os.path.exists(FAISS_DB_PATH):
        print("\n🔄 Loading existing FAISS index...")
        vector_db = FAISS.load_local(
            FAISS_DB_PATH, 
            embedding_model, 
            allow_dangerous_deserialization=True
        )
    else:
        print("\n🛠️ Generating new embeddings and saving FAISS index...")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
        texts = text_splitter.split_documents(documents)
        vector_db = FAISS.from_documents(texts, embedding_model)
        vector_db.save_local(FAISS_DB_PATH)
    
    retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 10})
    llm = OllamaLLM(model="llama2")
    
    retrievalQA = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )
    
    return retrievalQA, llm

# 3️⃣ Load CSV file
def load_csv(file_path):
    return pd.read_csv(file_path)

# 🚀 Main execution
if __name__ == "__main__":
    # Load rules from PDF file
    pdf_file_path = "storying.pdf"
    if not os.path.exists(pdf_file_path):
        raise FileNotFoundError(f"🚨 Error: The file '{pdf_file_path}' was not found!")
    
    with open(pdf_file_path, "rb") as file:
        pdf_content = file.read()
    
    documents = load_analysis_rules_from_memory(pdf_content)
    
    # Train RAG model
    retrievalQA, llm = train_rag_system(documents)
    
    # Load CSV data
    csv_file_path = "Regions.csv"
    if not os.path.exists(csv_file_path):
        raise FileNotFoundError(f"🚨 Error: The file '{csv_file_path}' was not found!")

    df = load_csv(csv_file_path)
    
    # ✅ Pass retriever properly
    analyzer = DataAnalyzer(df, llm=llm, retriever=retrievalQA.retriever)
    
    # Perform data analysis and generate query
    analysis_result = analyzer.analysis_data()
    print("\n📊 Analysis Result:")
    print(analysis_result)

    # Send a specific question to RAG Agent
    question = "What is the average sales?"
    result = retrievalQA.invoke({"query": question})
    print("\n🔍 Answer:")
    print(result['result'])


🔄 Loading existing FAISS index...

📊 Analysis Result:
1. Key Trends and Patterns:

Based on both the provided dataset and the knowledge base, several trends and patterns can be observed in the sales data:

* Region-wise sales performance: The dataset reveals that regions 2 and 3 have consistently outperformed region 1 in terms of sales revenue. This pattern is consistent across all years, indicating a sustained competitive advantage for these regions.
* Seasonality: The dataset shows a clear seasonal pattern in sales, with higher sales volumes during the winter months (December to February) and lower volumes during the summer months (June to August). This trend is common across most industries and can be attributed to factors like weather, holidays, and consumer behavior.
* Shift in market share: The dataset suggests that region 1 has experienced a decline in market share over the past two years, while regions 2 and 3 have gained significantly. This trend may indicate a change in cons

In [6]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from OprFuncs import data_infer, extract_code, extract_questions
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
import re
import pandas as pd
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
import fitz  # PyMuPDF for reading PDF files
import os  # For file path checking


class DataAnalyzer:
    def __init__(self, dataframe, llm, retriever):
        self.dataframe = dataframe
        self.llm = llm
        self.retriever = retriever
        self.data_info = data_infer(dataframe)
        self.memory = []

    def analysis_data(self):
        data_info = self.data_info

        analysis_prompt = ''' 
        You are a data analyst. You have been provided with a dataset about {data_info}.
        Here is the dataset structure:
        {data_info}

        To enhance your analysis, you have access to a knowledge base containing relevant domain knowledge, best practices, and analytical rules.

        Please analyze the data by retrieving relevant insights from the knowledge base and provide a structured analysis in the following format: 

        1. *Key Trends and Patterns*:
        - [Describe the key trends and patterns in the data based on both the dataset and retrieved knowledge].

        2. *Anomalies or Outliers*:
        - [Identify any anomalies or outliers in the data, incorporating relevant insights from the knowledge base].

        Ensure your analysis is specific, data-driven, and incorporates retrieved domain knowledge for deeper insights.
        '''

        # Retrieve relevant knowledge from FAISS
        retrieved_docs = self.retriever.invoke(data_info)
        retrieved_knowledge = "\n".join([doc.page_content for doc in retrieved_docs]) if retrieved_docs else "No relevant legal rules found."

        # Define the prompt template
        analysis_template = PromptTemplate(
            input_variables=["data_info", "retrieved_knowledge"],
            template=analysis_prompt
        )

        # Create a chain for analysis data using RunnableSequence
        analysis_chain = analysis_template | self.llm

        # Run the analysis chain with retrieved knowledge
        analysis = analysis_chain.invoke({"data_info": data_info, "retrieved_knowledge": retrieved_knowledge})

        # Ensure that `analysis` is a string before adding it to memory
        if isinstance(analysis, dict) and "text" in analysis:
            analysis = analysis["text"]
        else:
            analysis = str(analysis)

        formatted_analysis_prompt = analysis_prompt.format(data_info=data_info, retrieved_knowledge=retrieved_knowledge)
        self.memory.append(HumanMessage(content=formatted_analysis_prompt))
        self.memory.append(AIMessage(content=analysis))

        # ✅ Ensure FAISS is updated with new knowledge
        self.retriever.vectorstore.add_documents([Document(page_content=analysis)])

        # Return the analysis
        return analysis


# Define FAISS database path
FAISS_DB_PATH = "faiss_index"

# 1️⃣ Load rules from PDF memory
def load_analysis_rules_from_memory(pdf_content):
    doc = fitz.open(stream=pdf_content, filetype="pdf")
    documents = [Document(page_content=page.get_text()) for page in doc]
    return documents

# 2️⃣ Train RAG system with FAISS
def train_rag_system(documents):
    """Train or load the RAG model with FAISS to avoid recomputation."""
    embedding_model = OllamaEmbeddings(model="llama2")

    if os.path.exists(FAISS_DB_PATH):
        print("\n🔄 Loading existing FAISS index...")
        vector_db = FAISS.load_local(
            FAISS_DB_PATH,
            embedding_model,
            allow_dangerous_deserialization=True
        )
    else:
        print("\n🛠️ Generating new embeddings and saving FAISS index...")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
        texts = text_splitter.split_documents(documents)
        vector_db = FAISS.from_documents(texts, embedding_model)
        vector_db.save_local(FAISS_DB_PATH)

    retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 10})
    llm = OllamaLLM(model="llama2")

    retrievalQA = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )

    return retrievalQA, llm

# 3️⃣ Load CSV file
def load_csv(file_path):
    return pd.read_csv(file_path)

# 🚀 Main execution
if __name__ == "__main__":
    # Load rules from PDF file
    pdf_file_path = "storying.pdf"
    if not os.path.exists(pdf_file_path):
        raise FileNotFoundError(f"🚨 Error: The file '{pdf_file_path}' was not found!")

    with open(pdf_file_path, "rb") as file:
        pdf_content = file.read()

    documents = load_analysis_rules_from_memory(pdf_content)

    # Train RAG model
    retrievalQA, llm = train_rag_system(documents)

    # Load CSV data
    csv_file_path = "Regions.csv"
    if not os.path.exists(csv_file_path):
        raise FileNotFoundError(f"🚨 Error: The file '{csv_file_path}' was not found!")

    df = load_csv(csv_file_path)

    # ✅ Pass retriever properly
    analyzer = DataAnalyzer(df, llm=llm, retriever=retrievalQA.retriever)

    # Perform data analysis and generate query
    analysis_result = analyzer.analysis_data()
    print("\n📊 Analysis Result:")
    print(analysis_result)

    # Send a specific question to RAG Agent
    question = "What is the average sales?"
    result = retrievalQA.invoke({"query": question})
    print("\n🔍 Answer:")
    print(result['result'])



🔄 Loading existing FAISS index...

📊 Analysis Result:

1. Key Trends and Patterns:

Based on the provided dataset and knowledge base, we can identify several key trends and patterns in the sales data:

* Region-wise distribution: The sales data reveals that there is a significant variation in sales across different regions. While some regions have consistently high sales, others have low sales. This suggests that there may be regional differences in customer preferences or market conditions.
* Sales growth over time: The dataset shows a steady increase in sales over the past 3 years. This trend indicates that the company's strategy is effective in growing its business and increasing revenue.
* Seasonality: There are noticeable seasonal fluctuations in sales, with higher sales during the summer months and lower sales during winter. This suggests that customer buying habits are influenced by seasonal factors such as holidays and weather conditions.
* Customer behavior: The data indicate

In [7]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from OprFuncs import data_infer, extract_code, extract_questions
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
import re
import pandas as pd
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
import fitz  # PyMuPDF for reading PDF files
import os  # For file path checking


class DataAnalyzer:
    def __init__(self, dataframe, llm, retriever):
        self.dataframe = dataframe
        self.llm = llm
        self.retriever = retriever
        self.data_info = data_infer(dataframe)
        self.memory = []

    def analysis_data(self):
        data_info = self.data_info

        analysis_prompt = ''' 
        You are a data analyst. You have been provided with a dataset about {data_info}.
        Here is the dataset structure:
        {data_info}

        To enhance your analysis, you have access to a knowledge base containing relevant domain knowledge, best practices, and analytical rules.

        Please analyze the data by retrieving relevant insights from the knowledge base and provide a structured analysis in the following format: 

        1. *Key Trends and Patterns*:
        - [Describe the key trends and patterns in the data based on both the dataset and retrieved knowledge].

        2. *Anomalies or Outliers*:
        - [Identify any anomalies or outliers in the data, incorporating relevant insights from the knowledge base].

        Ensure your analysis is specific, data-driven, and incorporates retrieved domain knowledge for deeper insights.
        '''

        # Retrieve relevant knowledge from FAISS
        retrieved_docs = self.retriever.invoke(data_info)
        retrieved_knowledge = "\n".join([doc.page_content for doc in retrieved_docs]) if retrieved_docs else "No relevant legal rules found."

        # Define the prompt template
        analysis_template = PromptTemplate(
            input_variables=["data_info", "retrieved_knowledge"],
            template=analysis_prompt
        )

        # Create a chain for analysis data using RunnableSequence
        analysis_chain = analysis_template | self.llm

        # Run the analysis chain with retrieved knowledge
        analysis = analysis_chain.invoke({"data_info": data_info, "retrieved_knowledge": retrieved_knowledge})

        # Ensure that `analysis` is a string before adding it to memory
        if isinstance(analysis, dict) and "text" in analysis:
            analysis = analysis["text"]
        else:
            analysis = str(analysis)

        formatted_analysis_prompt = analysis_prompt.format(data_info=data_info, retrieved_knowledge=retrieved_knowledge)
        self.memory.append(HumanMessage(content=formatted_analysis_prompt))
        self.memory.append(AIMessage(content=analysis))

        # ✅ Ensure FAISS is updated with new knowledge
        self.retriever.vectorstore.add_documents([Document(page_content=analysis)])

        # Return the analysis
        return analysis


# Define FAISS database path
FAISS_DB_PATH = "faiss_index"

# 1️⃣ Load rules from PDF memory
def load_analysis_rules_from_memory(pdf_content):
    doc = fitz.open(stream=pdf_content, filetype="pdf")
    documents = [Document(page_content=page.get_text()) for page in doc]
    return documents

# 2️⃣ Train RAG system with FAISS
def train_rag_system(documents):
    """Train or load the RAG model with FAISS to avoid recomputation."""
    embedding_model = OllamaEmbeddings(model="llama2")

    if os.path.exists(FAISS_DB_PATH):
        print("\n🔄 Loading existing FAISS index...")
        vector_db = FAISS.load_local(
            FAISS_DB_PATH,
            embedding_model,
            allow_dangerous_deserialization=True
        )
    else:
        print("\n🛠️ Generating new embeddings and saving FAISS index...")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
        texts = text_splitter.split_documents(documents)
        vector_db = FAISS.from_documents(texts, embedding_model)
        vector_db.save_local(FAISS_DB_PATH)

    retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 10})
    llm = OllamaLLM(model="llama2")

    retrievalQA = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )

    return retrievalQA, llm

# 3️⃣ Load CSV file
def load_csv(file_path):
    return pd.read_csv(file_path)

# 🚀 Main execution
if __name__ == "__main__":
    # Load rules from PDF file
    pdf_file_path = "storying.pdf"
    if not os.path.exists(pdf_file_path):
        raise FileNotFoundError(f"🚨 Error: The file '{pdf_file_path}' was not found!")

    with open(pdf_file_path, "rb") as file:
        pdf_content = file.read()

    documents = load_analysis_rules_from_memory(pdf_content)

    # Train RAG model
    retrievalQA, llm = train_rag_system(documents)

    # Load CSV data
    csv_file_path = "Regions.csv"
    if not os.path.exists(csv_file_path):
        raise FileNotFoundError(f"🚨 Error: The file '{csv_file_path}' was not found!")

    df = load_csv(csv_file_path)

    # ✅ Pass retriever properly
    analyzer = DataAnalyzer(df, llm=llm, retriever=retrievalQA.retriever)

    # Perform data analysis and generate query
    analysis_result = analyzer.analysis_data()
    print("\n📊 Analysis Result:")
    print(analysis_result)

    # Send a specific question to RAG Agent
    question = "What is the average sales?"
    result = retrievalQA.invoke({"query": question})
    print("\n🔍 Answer:")
    print(result['result'])


🔄 Loading existing FAISS index...

📊 Analysis Result:
1. Key Trends and Patterns:
After analyzing the dataset and consulting the knowledge base, we can identify several key trends and patterns in the sales data:

* Regions with higher population density tend to have higher sales volumes. This is consistent with the idea that more people in a given area are likely to result in more sales opportunities.
* There is a positive correlation between sales in different regions, suggesting that strong sales in one region can lead to increased sales in other regions.
* The distribution of sales across districts within regions is uneven, with some districts experiencing significantly higher sales volumes than others. This could indicate a lack of even distribution of resources or opportunities within certain regions.
* There are instances of anomalous sales patterns, such as sudden spikes or dips in sales volume, which may be indicative of changes in consumer behavior or other external factors.


In [None]:
# Use ConversationalRetrievalChain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain_community.llms import Ollama
import pandas as pd
import fitz  # PyMuPDF for reading PDF files
import os  # For file path checking
from DataAnalyzer import DataAnalyzer  # Importing DataAnalyzer

# 🔹 Define FAISS database path
FAISS_DB_PATH = "faiss_index"

# 1️⃣ Load analysis rules from PDF memory
def load_analysis_rules_from_memory(pdf_content):
    doc = fitz.open(stream=pdf_content, filetype="pdf")
    documents = [Document(page_content=page.get_text()) for page in doc]
    return documents

# 2️⃣ Train RAG system and set up FAISS
def train_rag_system(documents):
    """Train or load the RAG model with FAISS to avoid recomputation."""
    embedding_model = OllamaEmbeddings(model="llama2")

    if os.path.exists(FAISS_DB_PATH):
        print("\n🔄 Loading existing FAISS index...")
        vector_db = FAISS.load_local(
            FAISS_DB_PATH,
            embedding_model,
            allow_dangerous_deserialization=True
        )
    else:
        print("\n🛠️ Generating new embeddings and saving FAISS index...")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
        texts = text_splitter.split_documents(documents)
        vector_db = FAISS.from_documents(texts, embedding_model)
        vector_db.save_local(FAISS_DB_PATH)

    retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={"k": 5})
    llm = Ollama(model="llama2")

    # 🔹 Create an interactive Agent using ConversationalRetrievalChain
    rag_agent = ConversationalRetrievalChain.from_llm(llm, retriever=retriever)

    return rag_agent, llm

# 3️⃣ Load CSV file
def load_csv(file_path):
    return pd.read_csv(file_path)

# 🚀 Main execution
if __name__ == "__main__":
    # 🔹 Load analysis rules from PDF file
    pdf_file_path = "storying.pdf"
    if not os.path.exists(pdf_file_path):
        raise FileNotFoundError(f"🚨 Error: The file '{pdf_file_path}' was not found!")

    with open(pdf_file_path, "rb") as file:
        pdf_content = file.read()

    documents = load_analysis_rules_from_memory(pdf_content)

    # 🔹 Train RAG model and create an Agent
    rag_agent, llm = train_rag_system(documents)

    # 🔹 Load CSV data
    csv_file_path = "Regions.csv"
    if not os.path.exists(csv_file_path):
        raise FileNotFoundError(f"🚨 Error: The file '{csv_file_path}' was not found!")

    df = load_csv(csv_file_path)

    # 🔹 Create a DataAnalyzer instance and analyze data
    analyzer = DataAnalyzer(df, llm=rag_agent)
    analysis_result, questions = analyzer.analysis_data()

    print("\n📊 Analysis Result:")
    print(analysis_result)

    # 🔹 Pass extracted questions to the Agent for answers
    for question in questions:
        print(f"\n❓ Question: {question}")
        result = rag_agent.invoke({"question": question})
        print("🔍 Answer:")
        print(result['answer'])
