# Import necessary libraries

In [10]:
import os
import pandas as pd
from dotenv import load_dotenv, find_dotenv

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Load Environment Variables (for OpenAI API Key)

In [None]:
# Find a .env file with find_dotenv() and load its variables via load_dotenv().
load_dotenv(find_dotenv())

# Warn (without raising) when the OpenAI key is missing or empty after loading.
if os.getenv("OPENAI_API_KEY") in (None, ""):
    print("OPENAI_API_KEY not found in .env file. Please set it.")


In [16]:
# --- Paths ---
# CSV of processed emails that the loader cell reads.
DATA_PATH = "../data/proc_email.csv"
# Directory the FAISS index is saved to and loaded from.
FAISS_INDEX_PATH = "../faiss_index"

# --- Corpus size ---
# Upper bound on how many loaded emails are kept for indexing.
EMAIL_COUNT = 60

# --- Text splitting (passed to RecursiveCharacterTextSplitter) ---
CHUNK_SIZE = 1_000
CHUNK_OVERLAP = 20

# --- OpenAI model identifiers ---
# Used for OpenAIEmbeddings and ChatOpenAI respectively.
EMBEDDING_MODEL_NAME = "text-embedding-3-small"
LLM_MODEL_NAME = "gpt-4o-mini"

# Dataset Preparation

In [17]:
# Load the email CSV into LangChain documents and keep at most EMAIL_COUNT of them.
print(f"Loading data from: {DATA_PATH}")

# source_column="to_index" stores each row's to_index text in metadata["source"]
# (see the "Sample document metadata" line printed below).
loader = CSVLoader(
    file_path=DATA_PATH,
    encoding="utf8",
    source_column="to_index",
)

documents = loader.load()
print(f"Loaded {len(documents)} total emails.")

# Cap the working set at EMAIL_COUNT documents; warn when the file holds fewer.
if len(documents) >= EMAIL_COUNT:
    selected_documents = documents[:EMAIL_COUNT]
    print(f"Selected {len(selected_documents)} emails for processing.")
else:
    selected_documents = documents
    print(f"Warning: Fewer than {EMAIL_COUNT} emails available. Using all {len(selected_documents)} loaded emails.")

# Fail fast. The previous version only printed "Exiting." and carried on, so the
# notebook continued with an empty corpus and broke later with an unrelated error.
if not selected_documents:
    raise ValueError(
        f"No documents were loaded or selected from {DATA_PATH}; "
        "check the CSV contents and EMAIL_COUNT before continuing."
    )

# Preview the first document. Its page_content carries every CSV column, not only
# to_index, as the recorded sample output shows.
print("\nSample document content (from 'to_index' column):")
print(selected_documents[0].page_content[:500] + "...")
print(f"\nSample document metadata (source): {selected_documents[0].metadata['source'][:100]}...")
    
    
# Break the selected emails into overlapping chunks sized for embedding.
print("\n--- Text Splitting ---")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
all_splits = text_splitter.split_documents(selected_documents)

print(f"Split {len(selected_documents)} documents into {len(all_splits)} chunks.")

# Show the beginning of the first chunk as a sanity check, if any chunk exists.
if not all_splits:
    print("No splits created.")
else:
    print(f"Sample split chunk: {all_splits[0].page_content[:200]}...")


Loading data from: ../data/proc_email.csv
Loaded 99 total emails.
Selected 60 emails for processing.

Sample document content (from 'to_index' column):
To: frozenset({'robert.walker@enron.com'})
From: frozenset({'daren.farmer@enron.com'})
X-To: Robert Walker
X-From: Daren J Farmer
content: ENA Contact

Daren Farmer
Phone # 713-853-6905
Fax# 713-646-2391

EB3211F
to_index: From Daren J Farmer to Robert Walker: ENA Contact

Daren Farmer
Phone # 713-853-6905
Fax# 713-646-2391

EB3211F...

Sample document metadata (source): From Daren J Farmer to Robert Walker: ENA Contact

Daren Farmer
Phone # 713-853-6905
Fax# 713-646-23...

--- Text Splitting ---
Split 60 documents into 188 chunks.
Sample split chunk: To: frozenset({'robert.walker@enron.com'})
From: frozenset({'daren.farmer@enron.com'})
X-To: Robert Walker
X-From: Daren J Farmer
content: ENA Contact

Daren Farmer
Phone # 713-853-6905
Fax# 713-646-2...


# Embedding

In [None]:
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)
print(f"Initialized OpenAIEmbeddings with model: {EMBEDDING_MODEL_NAME}")


def _build_and_save_faiss_index(splits, embedding_model, index_path):
    """Embed ``splits`` into a new FAISS index, save it under ``index_path``, and return it.

    Args:
        splits: Document chunks to embed (the output of the text splitter).
        embedding_model: Embeddings object handed to ``FAISS.from_documents``.
        index_path: Directory the index is written to; created if missing.

    Returns:
        The newly built FAISS vector store.

    Raises:
        ValueError: If ``splits`` is empty. Raising here stops the notebook at the
            real cause instead of leaving ``vector_store`` undefined for later cells.
    """
    if not splits:
        raise ValueError("No document splits to create index from. Cannot proceed with FAISS creation.")
    print("Creating new FAISS index...")
    store = FAISS.from_documents(documents=splits, embedding=embedding_model)
    os.makedirs(index_path, exist_ok=True)
    store.save_local(index_path)
    print(f"FAISS index created and saved to: {index_path}")
    return store


# Reuse a previously saved index when one exists; otherwise (or if loading fails)
# build a fresh one from the current splits.
# NOTE: a saved index is reused as-is. Changes to DATA_PATH, EMAIL_COUNT, the chunking
# settings or the embedding model are NOT picked up until FAISS_INDEX_PATH is deleted.
vector_store = None
# isdir (not exists) so a stray file at this path is not passed to os.listdir.
if os.path.isdir(FAISS_INDEX_PATH) and os.listdir(FAISS_INDEX_PATH):
    print(f"Loading existing FAISS index from: {FAISS_INDEX_PATH}")
    try:
        # SECURITY: allow_dangerous_deserialization=True opts in to unpickling the saved
        # index. Only load index directories that this notebook itself produced.
        vector_store = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        print("FAISS index loaded successfully.")
    except Exception as e:
        # Best-effort cache: any load failure falls through to a rebuild below.
        print(f"Error loading FAISS index: {e}. Recreating index.")

if vector_store is None:
    vector_store = _build_and_save_faiss_index(all_splits, embeddings, FAISS_INDEX_PATH)
        

Initialized OpenAIEmbeddings with model: text-embedding-3-small
Creating new FAISS index...
FAISS index created and saved to: ../faiss_index


# Retriever and QA Chain Setup

In [28]:
# Number of chunks fetched per query. Shared by the retriever and the smoke test below
# so the preview always uses the same k as the QA chain's retriever.
RETRIEVAL_TOP_K = 3

retriever = vector_store.as_retriever(search_kwargs={"k": RETRIEVAL_TOP_K})
print("Retriever created from FAISS vector store (for QA chain).")

# Smoke test: query the vector store directly and show what comes back with scores.
sample_query_for_similarity_search = "What is the ENA contact information for Daren Farmer?"
print(f"\nTesting FAISS similarity_search_with_score with query: '{sample_query_for_similarity_search}'")
docs_and_scores = vector_store.similarity_search_with_score(
    query=sample_query_for_similarity_search,
    k=RETRIEVAL_TOP_K,
)

print(f"Retrieved {len(docs_and_scores)} documents with scores.")
# NOTE(review): the scores look like distances (the best match is listed first with the
# smallest value), so lower presumably means closer. Confirm against the FAISS store's
# distance strategy.
for rank, (doc, score) in enumerate(docs_and_scores, start=1):
    print(f"--- Document {rank} (Score: {score:.4f}) ---")
    print(doc.page_content[:300] + "...")
    # str() keeps the slice safe if a document carries a non-string source value.
    print(f"(Source: {str(doc.metadata.get('source', 'N/A'))[:100]}...)")
    print("-" * 20)

# Prompt for the QA chain: {context} receives the retrieved email chunks and
# {question} receives the user's query.
prompt_template = """You are an Email Wizard's Assistant. Use the following pieces of context, which are past emails, to answer the question at the end.
If you don't know the answer from the context, just say that you don't know, don't try to make up an answer.
Provide a concise answer, and if possible, quote relevant parts from the retrieved emails.
If the question is a general greeting or not answerable from the emails, respond politely.
Context:
{context}
Question: {question}
Helpful Answer:"""
QA_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Chat model that writes the final answer.
# NOTE(review): temperature=0.7 makes answers vary between runs; confirm this is
# intended for a notebook whose recorded outputs are meant to be reproducible.
llm = ChatOpenAI(model_name=LLM_MODEL_NAME, temperature=0.7)
print(f"\nInitialized ChatOpenAI with model: {LLM_MODEL_NAME}")

# Assemble the RAG chain: the retriever supplies chunks, the "stuff" chain type places
# them into QA_PROMPT, and the LLM produces the answer. Source documents are returned
# alongside the answer in the chain's result.
# NOTE(review): newer LangChain releases may mark RetrievalQA as deprecated; confirm
# against the installed version.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": QA_PROMPT},
    return_source_documents=True,
)
print("RAG QA chain created.")


Retriever created from FAISS vector store (for QA chain).

Testing FAISS similarity_search_with_score with query: 'What is the ENA contact information for Daren Farmer?'
Retrieved 3 documents with scores.
--- Document 1 (Score: 0.6297) ---
To: frozenset({'robert.walker@enron.com'})
From: frozenset({'daren.farmer@enron.com'})
X-To: Robert Walker
X-From: Daren J Farmer
content: ENA Contact

Daren Farmer
Phone # 713-853-6905
Fax# 713-646-2391

EB3211F
to_index: From Daren J Farmer to Robert Walker: ENA Contact

Daren Farmer
Phone # 713-8...
(Source: From Daren J Farmer to Robert Walker: ENA Contact

Daren Farmer
Phone # 713-853-6905
Fax# 713-646-23...)
--------------------
--- Document 2 (Score: 0.9362) ---
Group-

Earlier this week, we met Tana Jones (Senior Legal Specialist) and Leslie 
Hansen (Legal Counsel) with ENA (servicing ENW) to discuss the process for 
executing NDAs.  Leslie will serve as in-house legal counsel for EMS while 
Tana will be the Senior Legal Assistant for all doc

# Testing RAG System

In [33]:
# Sample questions. The first four reference topics expected in the indexed emails;
# the last two presumably probe the prompt's "not answerable" and greeting handling.
queries = [
    "What is the phone number for Daren Farmer at ENA?",
    "What is the discussion about i2 Technologies and Tax?",
    "Who should be contacted about the MDEA Agreement scheduling?",
    "Tell me about the Vitro/Termination agreement.",
    "What's the status of my project?",
    "Hi, how are you?",
]

# Run each question through the chain. A failure on one query is reported and the
# loop moves on to the next.
for query in queries:
    print(f"\n\nQuery: {query}")
    try:
        result = qa_chain.invoke({"query": query})
        print("\nLLM Answer:")
        print(result["result"])
    except Exception as e:
        print(f"Error processing query '{query}': {e}")
        



Query: What is the phone number for Daren Farmer at ENA?

LLM Answer:
The phone number for Daren Farmer at ENA is 713-853-6905.


Query: What is the discussion about i2 Technologies and Tax?

LLM Answer:
The discussion about i2 Technologies and Tax revolves around the approval of contracts related to Enron Credit.com. Tana Jones is seeking confirmation that Erica's legal advice, which expresses no problem with the agreement, should apply to not just the Non-Disclosure Agreement (NDA) in question, but also to all NDAs and the entire business originating from the Houston-based EnronCredit.com team. Tana states, "Erica's advice needs to apply not only to this NDA, but probably all of the NDA's that are going to originate from the Houston based EnronCredit.com business team." Additionally, Mark Taylor emphasizes the need to confirm whether Tax has no issues with these contracts being signed in the U.S.


Query: Who should be contacted about the MDEA Agreement scheduling?

LLM Answer:
The