In [28]:
import os
from openai import OpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
import re

# The OpenAI API key must come from the environment (shell export, .env loader,
# or a secrets manager) and must never be written into source code.
# SECURITY: any key that has ever appeared in a file, notebook, or commit should
# be treated as leaked: revoke it in the OpenAI dashboard and issue a new one.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment before "
        "running this script; do not hard-code the key in source."
    )
# Load the source PDF and split it into overlapping chunks for retrieval.
# NOTE(review): the path is an absolute, machine-specific Windows path; the script
# only runs where this exact file exists. Consider making it configurable.
file_path = r"C:\Users\AkhilNarasimhaS\Downloads\DUMMY_TABLE_TO_TEXT.pdf"
pdf_reader = PyPDFLoader(file_path)
documents = pdf_reader.load()
# chunk_size / chunk_overlap are presumably measured in characters (the splitter's
# default length function) — TODO confirm. The overlap keeps some shared text
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

# Embed the chunks with OpenAI embeddings and index them in a FAISS vector store.
# OpenAIEmbeddings() is constructed without arguments, so it relies on the
# OPENAI_API_KEY environment variable being set beforehand.
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(documents=chunks, embedding=embeddings)

# Prompt template with two input variables, {chat_history} and {question}.
# It is passed to the chain below as `condense_question_prompt`, i.e. the prompt
# that turns a follow-up question into a standalone question.
# NOTE(review): the template additionally asks the model to perform calculations
# and show its work. In ConversationalRetrievalChain the condense step normally
# only rewrites the question that is then used for retrieval/answering, so these
# instructions may not influence the final answer the way they read — confirm
# whether they belong in the answering prompt instead.
AGGREGATION_PROMPT = PromptTemplate.from_template("""
Given the following conversation and a followup question, rephrase the followup question to be a standalone question.
If the question requires any calculations or aggregations, please perform them and show your work.
Make sure to filter the data based on any specified conditions (e.g., region, person, division, territory).
Provide a step-by-step breakdown of your calculations.

Chat History: {chat_history}
Follow up Input: {question}

Standalone question with calculations (if needed):
""")

# Build the conversational retrieval chain used by ask_question().
# The chat model is created with temperature=0; no model name is given, so the
# library default model is used.
llm = ChatOpenAI(temperature=0)
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    # Retriever over the FAISS index `db`, with default retriever settings.
    retriever=db.as_retriever(),
    # Custom prompt for rewriting follow-up questions as standalone questions.
    condense_question_prompt=AGGREGATION_PROMPT,
    # Source documents are requested here, but the visible caller only reads
    # result['answer'].
    return_source_documents=True,
    verbose=False
)

# Matches "<label>: <number>". The label is a run of ASCII letters/whitespace.
# The number is unsigned and is either
#   * digits grouped with thousands separators ("1,234" / "12,345,678.90"), or
#   * plain digits with an optional decimal part ("35" / "4.5").
# The (?!\d) guard rejects malformed groupings such as "1,2345", which then fall
# back to the plain-digits alternative.
_LABELLED_NUMBER_RE = re.compile(
    r'([A-Za-z\s]+):\s*'
    r'(\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?)'
)


def extract_numbers_with_context(text):
    """Extract ``label: number`` pairs from *text*.

    Args:
        text: Free-form text, e.g. an LLM answer.

    Returns:
        A list of ``(label, number)`` tuples of strings, in order of appearance.
        ``label`` is the run of letters/whitespace immediately before the colon,
        returned unstripped (it may carry leading whitespace or newlines).
        ``number`` is the numeric text with thousands separators removed, so it
        can be passed straight to ``float()``. Negative signs, currency symbols
        and numbers not preceded by ``label:`` are not recognised.
    """
    return [
        # Drop thousands separators so "1,234.50" becomes "1234.50"; previously
        # such values were truncated at the first comma ("1").
        (label, number.replace(",", ""))
        for label, number in _LABELLED_NUMBER_RE.findall(text)
    ]

def post_process_aggregation(question, answer):
    """Prefix *answer* with a heuristic total of the ``label: number`` pairs in it.

    The question is scanned for a filter keyword followed by a value (for
    example ``"region east"``). Keywords are tried in priority order and the
    first one that is actually followed by a word is used. Earlier keywords
    that merely occur inside another word (``"regional"``) or have nothing
    after them no longer prevent later keywords from being considered.

    Args:
        question: The user's question.
        answer: The raw answer text to post-process.

    Returns:
        * If a filter was found and some extracted labels contain the filter
          value: a message with the sum of those numbers, followed by *answer*.
        * If a filter was found but no label contains the value: a
          "No specific data found" message followed by *answer*.
        * If no filter was found but numbers were extracted: a message with the
          sum of all numbers, followed by *answer*.
        * Otherwise *answer* unchanged.

    Note:
        This is a text heuristic. The word after the keyword is taken verbatim
        as the filter value (``"region of east"`` yields ``"of"``), and every
        matched number is summed regardless of what it measures.
    """
    lower_question = question.lower()
    numbers_with_context = extract_numbers_with_context(answer)

    filter_words = ("region", "person", "division", "territory", "representative")
    filter_condition = None
    filter_value = None
    for word in filter_words:
        # A regex hit implies the keyword is present, so no separate substring
        # check is needed. The question is already lower-cased.
        value_match = re.search(rf"{word}\s+(\w+)", lower_question)
        if value_match:
            filter_condition = word
            filter_value = value_match.group(1)
            break

    if filter_condition is not None:
        # Keep only the numbers whose label mentions the requested value.
        filtered_numbers = [
            float(num)
            for context, num in numbers_with_context
            if filter_value in context.lower()
        ]
        if filtered_numbers:
            total = sum(filtered_numbers)
            return f"The total sales for {filter_condition} '{filter_value.capitalize()}' is {total}. Details: {answer}"
        return f"No specific data found for {filter_condition} '{filter_value.capitalize()}'. Raw answer: {answer}"

    # No usable filter in the question: fall back to summing everything found.
    if numbers_with_context:
        total = sum(float(num) for _, num in numbers_with_context)
        return f"The total of all sales mentioned is {total}. This may not be specific to your query. Details: {answer}"

    return answer

# Conversation memory shared by every ask_question() call: a list of
# (question, answer) tuples, oldest first.
chat_history = []


def ask_question(query):
    """Run *query* through the QA chain and return the post-processed answer.

    On success the (query, post-processed answer) pair is appended to the
    module-level ``chat_history`` so later questions see it as context.
    Any exception raised along the way is caught and reported as a string;
    in that case nothing is added to the history.

    Args:
        query: The user's question.

    Returns:
        The post-processed answer, or an error message string on failure.
    """
    try:
        response = qa({"question": query, "chat_history": chat_history})
        final_answer = post_process_aggregation(query, response['answer'])
        # The history stores the post-processed text, not the raw chain output.
        chat_history.append((query, final_answer))
    except Exception as exc:
        return f"An error occurred while processing your question: {exc!s}"
    return final_answer

# Example usage: run each sample question through ask_question() and print the
# question followed by its answer. Because ask_question() appends to the shared
# chat_history, later questions in this list would be answered with the earlier
# ones as conversation context.
queries = [
    "who is the  top perfomer in terms of sales"
]

for query in queries:
    # The question is printed before the (potentially slow) chain call.
    print(f"Question: {query}")
    answer = ask_question(query)
    print(f"Answer: {answer}\n")


Question: who is the  top perfomer in terms of sales
Answer: Ben Cohen, a Spec C in New York, is the top performer in terms of sales. He made 4 year-to-date calls, resulting in 35 sales and 4 samples.

