In [1]:
from OprFuncs import data_infer

In [2]:
def analysis_data(df,agent):
    data_info = data_infer(df)

    # Prompt and Chain for Analysis Data
    analysis_prompt = ''' 
    You are a data analyst. You are provided with a dataset about {data_info}.
    Here is the dataset structure:
    {data_info}

    Please analyze the data and provide insights in the following format:

    1. *Key Trends and Patterns*:
    - [Describe the key trends and patterns in the data].

    2. *Anomalies or Outliers*:
    - [Identify any anomalies or outliers in the data].

    Ensure your analysis is specific, data-driven, and actionable.

    '''
    formatted_analysis_prompt = analysis_prompt.format(data_info=data_info)
    print(type(formatted_analysis_prompt))

    analysis = agent.invoke({"query":formatted_analysis_prompt})


    # Return the analysis
    return analysis

In [3]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama
import pandas as pd
import fitz  # PyMuPDF for reading PDF files
import os  # For file path checking

# Define FAISS database path
FAISS_DB_PATH = "faiss_index"

# 1️⃣ Load rules from PDF memory
def load_analysis_rules_from_memory(pdf_content):
    doc = fitz.open(stream=pdf_content, filetype="pdf")
    documents = [Document(page_content=page.get_text()) for page in doc]
    return documents

# 2️⃣ Train RAG system with FAISS
def train_rag_system(documents):
    """Train or load the RAG model with FAISS to avoid recomputation."""
    embedding_model = OllamaEmbeddings(model="llama2")
    
    if os.path.exists(FAISS_DB_PATH):
        print("\n🔄 Loading existing FAISS index...")
        vector_db = FAISS.load_local(
            FAISS_DB_PATH, 
            embedding_model, 
            allow_dangerous_deserialization=True
        )
    else:
        print("\n🛠️ Generating new embeddings and saving FAISS index...")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
        texts = text_splitter.split_documents(documents)
        vector_db = FAISS.from_documents(texts, embedding_model)
        vector_db.save_local(FAISS_DB_PATH)
    
    retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={"k": 5})
    llm = Ollama(model="llama2")
    
    retrievalQA = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )
    
    return retrievalQA, llm

# 3️⃣ Load CSV file
def load_csv(file_path):
    return pd.read_csv(file_path)

# 🚀 Main execution
if __name__ == "__main__":
    # Load rules from PDF file
    pdf_file_path = "storying.pdf"
    if not os.path.exists(pdf_file_path):
        raise FileNotFoundError(f"🚨 Error: The file '{pdf_file_path}' was not found!")
    
    with open(pdf_file_path, "rb") as file:
        pdf_content = file.read()
    
    documents = load_analysis_rules_from_memory(pdf_content)
    
    # Train RAG model
    retrievalQA, llm = train_rag_system(documents)
    
    # Load CSV data
    csv_file_path = "Regions.csv"
    if not os.path.exists(csv_file_path):
        raise FileNotFoundError(f"🚨 Error: The file '{csv_file_path}' was not found!")

    df1 = load_csv(csv_file_path)
        
    # Perform data analysis and generate query
analysis_result = analysis_data(df=df1,agent=retrievalQA)
print("\n📊 Analysis Result:")
print(analysis_result)

  embedding_model = OllamaEmbeddings(model="llama2")



🔄 Loading existing FAISS index...


  llm = Ollama(model="llama2")


<class 'str'>

📊 Analysis Result:
{'query': " \n    You are a data analyst. You are provided with a dataset about <class 'pandas.core.frame.DataFrame'>\nRangeIndex: 109 entries, 0 to 108\nData columns (total 3 columns):\n #   Column          Non-Null Count  Dtype \n---  ------          --------------  ----- \n 0   region_id       109 non-null    int64 \n 1   sales_district  109 non-null    object\n 2   sales_region    109 non-null    object\ndtypes: int64(1), object(2)\nmemory usage: 2.7+ KB\n.\n    Here is the dataset structure:\n    <class 'pandas.core.frame.DataFrame'>\nRangeIndex: 109 entries, 0 to 108\nData columns (total 3 columns):\n #   Column          Non-Null Count  Dtype \n---  ------          --------------  ----- \n 0   region_id       109 non-null    int64 \n 1   sales_district  109 non-null    object\n 2   sales_region    109 non-null    object\ndtypes: int64(1), object(2)\nmemory usage: 2.7+ KB\n\n\n    Please analyze the data and provide insights in the following forma