# Import necessary libraries

In [10]:
import os
import pandas as pd
from dotenv import load_dotenv, find_dotenv

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Load Environment Variables (for OpenAI API Key)

In [None]:
# Find a .env file and load its variables into the process environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Warn only (do not stop the notebook) when the key is unset or empty.
api_key_missing = not os.environ.get("OPENAI_API_KEY")
if api_key_missing:
    print("OPENAI_API_KEY not found in .env file. Please set it.")


In [16]:
# --- Locations (relative to the notebook's working directory) ---
DATA_PATH = "../data/proc_email.csv"   # CSV file read by the loader
FAISS_INDEX_PATH = "../faiss_index"    # directory the FAISS index is saved to / loaded from

# --- Corpus size and chunking ---
EMAIL_COUNT = 60      # maximum number of loaded emails kept for processing
CHUNK_SIZE = 1000     # chunk_size handed to the text splitter
CHUNK_OVERLAP = 20    # chunk_overlap handed to the text splitter

# --- Model names ---
EMBEDDING_MODEL_NAME = "text-embedding-3-small"   # model passed to OpenAIEmbeddings
LLM_MODEL_NAME = "gpt-4o-mini"                    # presumably the chat model used in a later cell

# Dataset Preparation

In [17]:
# Load every CSV row as one LangChain Document, then keep at most EMAIL_COUNT of them.
print(f"Loading data from: {DATA_PATH}")

# NOTE(review): source_column="to_index" stores the full "to_index" text in
# metadata["source"] (visible in the sample output below), so the email body is
# duplicated in the metadata. A short identifier column would be a more usual
# choice of source -- confirm before changing, since later cells may rely on it.
loader = CSVLoader(file_path=DATA_PATH,
                   encoding="utf8",
                   source_column="to_index")

documents = loader.load()
print(f"Loaded {len(documents)} total emails.")

# Slicing already copes with "fewer than EMAIL_COUNT available", so a single
# slice replaces the previous if/else; only the status message differs.
selected_documents = documents[:EMAIL_COUNT]
if len(documents) >= EMAIL_COUNT:
    print(f"Selected {len(selected_documents)} emails for processing.")
else:
    print(f"Warning: Fewer than {EMAIL_COUNT} emails available. Using all {len(selected_documents)} loaded emails.")

# Fail fast: the original printed "Exiting." but kept running, which let the
# splitting and embedding steps continue with nothing to work on.
if not selected_documents:
    raise ValueError(
        f"No documents were loaded or selected from {DATA_PATH!r}; cannot continue."
    )

# page_content holds every CSV column rendered as "column: value" lines.
print("\nSample document content (from 'to_index' column):")
print(selected_documents[0].page_content[:500] + "...")
print(f"\nSample document metadata (source): {selected_documents[0].metadata['source'][:100]}...")
    
    
print("\n--- Text Splitting ---")

# Chunking parameters come from the notebook-level configuration constants.
splitter_options = {"chunk_size": CHUNK_SIZE, "chunk_overlap": CHUNK_OVERLAP}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break each selected email into chunks.
all_splits = text_splitter.split_documents(selected_documents)
print(f"Split {len(selected_documents)} documents into {len(all_splits)} chunks.")

# Show a short preview of the first chunk as a sanity check.
if not all_splits:
    print("No splits created.")
else:
    first_chunk = all_splits[0]
    print(f"Sample split chunk: {first_chunk.page_content[:200]}...")


Loading data from: ../data/proc_email.csv
Loaded 99 total emails.
Selected 60 emails for processing.

Sample document content (from 'to_index' column):
To: frozenset({'robert.walker@enron.com'})
From: frozenset({'daren.farmer@enron.com'})
X-To: Robert Walker
X-From: Daren J Farmer
content: ENA Contact

Daren Farmer
Phone # 713-853-6905
Fax# 713-646-2391

EB3211F
to_index: From Daren J Farmer to Robert Walker: ENA Contact

Daren Farmer
Phone # 713-853-6905
Fax# 713-646-2391

EB3211F...

Sample document metadata (source): From Daren J Farmer to Robert Walker: ENA Contact

Daren Farmer
Phone # 713-853-6905
Fax# 713-646-23...

--- Text Splitting ---
Split 60 documents into 188 chunks.
Sample split chunk: To: frozenset({'robert.walker@enron.com'})
From: frozenset({'daren.farmer@enron.com'})
X-To: Robert Walker
X-From: Daren J Farmer
content: ENA Contact

Daren Farmer
Phone # 713-853-6905
Fax# 713-646-2...


# Embedding

In [None]:
def _build_and_save_faiss_index(splits, embedding_model, index_path):
    """Embed ``splits`` into a new FAISS index, save it to ``index_path`` and return it.

    Raises:
        ValueError: if ``splits`` is empty, since there is nothing to index.
    """
    if not splits:
        # The original only printed this and left `vector_store` undefined,
        # which surfaced later as a confusing NameError.
        raise ValueError("No document splits to create index from. Cannot proceed with FAISS creation.")
    print("Creating new FAISS index...")
    store = FAISS.from_documents(documents=splits, embedding=embedding_model)
    os.makedirs(index_path, exist_ok=True)
    store.save_local(index_path)
    print(f"FAISS index created and saved to: {index_path}")
    return store


embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)
print(f"Initialized OpenAIEmbeddings with model: {EMBEDDING_MODEL_NAME}")

vector_store = None

# Reuse a previously saved index when the directory exists and is non-empty.
# isdir (rather than exists) keeps os.listdir from failing on a plain file.
# NOTE(review): a cached index is reused even if EMAIL_COUNT, CHUNK_SIZE,
# CHUNK_OVERLAP or EMBEDDING_MODEL_NAME changed since it was built; delete the
# FAISS_INDEX_PATH directory to force a rebuild after changing them.
if os.path.isdir(FAISS_INDEX_PATH) and os.listdir(FAISS_INDEX_PATH):
    print(f"Loading existing FAISS index from: {FAISS_INDEX_PATH}")
    try:
        # SECURITY: allow_dangerous_deserialization=True unpickles data from disk.
        # Only load index directories that this notebook itself produced.
        vector_store = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        print("FAISS index loaded successfully.")
    except Exception as e:
        # Broad on purpose: any load failure falls through to a rebuild below.
        print(f"Error loading FAISS index: {e}. Recreating index.")

# No usable cached index (absent, empty, or failed to load): build it once here
# instead of duplicating the creation logic in two branches.
if vector_store is None:
    vector_store = _build_and_save_faiss_index(all_splits, embeddings, FAISS_INDEX_PATH)
        

Initialized OpenAIEmbeddings with model: text-embedding-3-small
Creating new FAISS index...
FAISS index created and saved to: ../faiss_index
