In [1]:
import pandas as pd
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_ollama.embeddings import OllamaEmbeddings

In [2]:
# Read the e-mail CSV, then draw a reproducible 0.5% random sample
# (fixed random_state) and renumber its rows from 0.
df = pd.read_csv('../data/filtered_enron_emails.csv')
sampled_df = df.sample(frac=0.005, random_state=47)
sampled_df = sampled_df.reset_index(drop=True)

In [3]:
# Keep only non-null e-mail bodies that are at least 500 characters long.
bodies = sampled_df['body'].dropna()
is_long_enough = bodies.str.len() >= 500
filtered_docs = bodies[is_long_enough].tolist()

# Every entry of filtered_docs is one complete e-mail body (>= 500 chars).
print(f"Total documents with len >= 500: {len(filtered_docs)}")

Total documents with len >= 500: 863


In [5]:
# Chunking configuration: target 500 characters per chunk with a 50-character
# overlap so neighbouring chunks share a little context. The separator list
# runs from paragraph breaks down to single characters, giving the splitter a
# fallback when a text has no newlines.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_overlap=50,
    chunk_size=500,
)

# create_documents yields one Document per chunk; the later cells only need
# the raw chunk text.
docs = splitter.create_documents(filtered_docs)
chunks = [piece.page_content for piece in docs]

In [6]:
len(chunks)

7600

In [15]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import requests

def get_ollama_embedding(text, model="mxbai-embed-large", timeout=120):
    """Return the embedding of ``text`` from the local Ollama server.

    Args:
        text: The string to embed.
        model: Name of the Ollama embedding model to query.
        timeout: Seconds to wait for the server before giving up, so a stalled
            server cannot block the caller (or a worker thread) forever.

    Returns:
        The value of the ``"embedding"`` field of the server's JSON reply.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no reply arrives within ``timeout`` seconds.
        KeyError: If a successful reply carries no ``"embedding"`` field.
    """
    response = requests.post(
        "http://localhost:11434/api/embeddings",
        json={"model": model, "prompt": text},
        timeout=timeout,
    )
    # Surface HTTP failures (e.g. an unknown model) as HTTPError instead of an
    # opaque KeyError on the missing "embedding" field.
    response.raise_for_status()
    return response.json()["embedding"]

def embed_chunks_parallel(chunks, max_workers=10, raise_on_failure=False):
    """Embed every chunk concurrently and return the vectors in input order.

    Each chunk is submitted to ``get_ollama_embedding`` on a thread pool. A
    chunk whose request raises is reported and its slot is left as ``None``.

    Args:
        chunks: Sequence of strings to embed.
        max_workers: Size of the thread pool.
        raise_on_failure: When True, raise once all work has finished if any
            chunk failed, instead of returning a list containing ``None``.

    Returns:
        A list as long as ``chunks``; entry ``i`` is the embedding of
        ``chunks[i]``, or ``None`` if that chunk failed.

    Raises:
        RuntimeError: If ``raise_on_failure`` is True and at least one chunk
            failed.
    """
    embeddings = [None] * len(chunks)
    failed = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its position so results land in input order
        # even though they complete in arbitrary order.
        futures = {
            executor.submit(get_ollama_embedding, chunk): idx
            for idx, chunk in enumerate(chunks)
        }
        for future in tqdm(as_completed(futures), total=len(chunks), desc="Embedding chunks"):
            idx = futures[future]
            try:
                embeddings[idx] = future.result()
            except Exception as e:
                failed.append(idx)
                print(f"❌ Chunk {idx} failed: {e}")

    if failed:
        # Per-chunk messages scroll past with the progress bar; repeat the
        # outcome once at the end so None entries are not overlooked.
        failed.sort()
        shown = ", ".join(str(i) for i in failed[:10])
        if len(failed) > 10:
            shown += ", ..."
        summary = f"{len(failed)} of {len(chunks)} chunks failed to embed (indices: {shown})"
        if raise_on_failure:
            raise RuntimeError(summary)
        print(f"WARNING: {summary}; their entries are None")
    return embeddings

In [13]:
class OllamaEmbeddingFunction(Embeddings):
    """LangChain ``Embeddings`` implementation backed by an Ollama HTTP endpoint.

    Args:
        model: Name of the Ollama embedding model to query.
        url: Full URL of the Ollama embeddings endpoint.
        timeout: Seconds to wait for each request before giving up.
    """

    def __init__(
        self,
        model="mxbai-embed-large",
        url="http://localhost:11434/api/embeddings",
        timeout=120,
    ):
        self.model = model
        self.url = url
        self.timeout = timeout

    def _embed(self, text):
        """POST one text to the endpoint and return its ``"embedding"`` field."""
        response = requests.post(
            self.url,
            json={"model": self.model, "prompt": text},
            timeout=self.timeout,
        )
        # Report HTTP failures as HTTPError rather than a KeyError on the
        # missing "embedding" field.
        response.raise_for_status()
        return response.json()["embedding"]

    def embed_documents(self, texts):
        """Embed each text in turn (with a progress bar) and return the list of vectors."""
        return [self._embed(text) for text in tqdm(texts, desc="Embedding via Ollama")]

    def embed_query(self, text):
        """Embed a single query string and return its vector."""
        return self._embed(text)

In [None]:
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# Step 1: Wrap every chunk in a LangChain Document
documents = [Document(page_content=text) for text in chunks]

# Step 2: Embed all chunks concurrently (tune max_workers to what the Ollama host can serve)
chunk_embeddings = embed_chunks_parallel(chunks, max_workers=10)


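## The two commented-out lines below raised the TypeError recorded in this cell's output:
## FAISS.from_embeddings() requires the `text_embeddings` and `embedding` arguments.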

# # Step 3: Store in FAISS
# faiss_index = FAISS.from_embeddings(embeddings=chunk_embeddings, documents=documents)

# # Step 4: Save to disk (root folder)
# faiss_index.save_local("../faiss_index")

Embedding chunks: 100%|██████████| 7600/7600 [32:12<00:00,  3.93it/s]  


TypeError: FAISS.from_embeddings() missing 2 required positional arguments: 'text_embeddings' and 'embedding'

In [None]:
from langchain.vectorstores.faiss import FAISS
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.docstore.document import Document
import numpy as np
import faiss 

# Step 1: Keep only chunks whose embedding succeeded, then convert to float32.
# embed_chunks_parallel leaves None for failed chunks; those would break the
# array conversion and misalign vectors with their documents.
embedded_pairs = [
    (doc, vector)
    for doc, vector in zip(documents, chunk_embeddings)
    if vector is not None
]
if not embedded_pairs:
    raise ValueError("No chunk embeddings available; cannot build the FAISS index")
indexed_documents = [doc for doc, _ in embedded_pairs]
embedding_vectors = np.asarray([vector for _, vector in embedded_pairs], dtype="float32")

# Step 2: Create FAISS index
dimension = embedding_vectors.shape[1]  # set by the embedding model (1024 in the recorded run)
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embedding_vectors)

# Step 3: Wrap documents. Row i of the index maps to docstore id str(i);
# LangChain's FAISS wrapper is typed for string docstore ids.
docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(indexed_documents)})
index_to_docstore_id = {i: str(i) for i in range(len(indexed_documents))}

# Step 4: Create the vectorstore. The embedder is instantiated here so this
# cell does not depend on a later cell having been run first.
embedding_func = OllamaEmbeddingFunction()
vectorstore = FAISS(
    index=faiss_index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
    embedding_function=embedding_func
)

# Step 5: Save the index
vectorstore.save_local("../faiss_index")

In [22]:
embedding_vectors.shape

(7600, 1024)

In [None]:
from langchain.vectorstores import FAISS

# Embedder handed to the loaded store for embedding queries.
embedding_func = OllamaEmbeddingFunction()

# Loading deserializes pickled data, so the opt-in flag is required; only do
# this for an index folder you created yourself or otherwise trust.
faiss_index = FAISS.load_local(
    "../faiss_index",
    embedding_func,
    allow_dangerous_deserialization=True,
)


In [None]:
def retrieve_similar_docs(query, faiss_index, top_k=3):
    """Fetch the stored documents nearest to ``query``.

    The query is embedded with ``get_ollama_embedding`` and the resulting
    vector is passed to ``faiss_index.similarity_search_by_vector``.

    Args:
        query: Text to search for.
        faiss_index: Object exposing ``similarity_search_by_vector(vector, k=...)``.
        top_k: Number of results to request.

    Returns:
        Whatever ``similarity_search_by_vector`` returns for the query vector.
    """
    query_vector = get_ollama_embedding(query)
    return faiss_index.similarity_search_by_vector(query_vector, k=top_k)


In [28]:
# Example request: fetch the nearest chunks and join them into one context string.
query = "Generate an email requesting project status update from a colleague"
similar_docs = retrieve_similar_docs(query, faiss_index)

retrieved_texts = [doc.page_content for doc in similar_docs]
context = "\n---\n".join(retrieved_texts)

In [32]:
context

"peter this email will confirm the site visit by your company at the three plants\n---\n. thus, it is very important that i hear from you. thank you much kim  forwarded by kim nguyenewcenron on 06242002 1006 am  jeff duff 06212002 0622 pm to kim nguyenewcenronenron cc ronald brzezinskiewcenronenron, kevin cousineauewcenronenron, joe chapmanewcenronenron, clemens wstedeveloptwtdetwtde, markus altenschultedeveloptwtdetwtde subject re autodownload tool kim, first, hollis will be out of the office until further notice. i'll be coordinating his tasks for the time being\n---\n. please confirm to me that hal is fully dedicated to this project now and will continue to be until it is completed. i would actually like his to sit with our group at least until the project is complete. i would also like a breakdown of what the other it people listed in the elaboration document will be responsible for, and discuss the 65 day estimation with you. thanks shona"

In [41]:
def generate_answer(query, context, model="llama3.1:8b", timeout=600):
    """Ask the local Ollama server to answer ``query`` using ``context``.

    Args:
        query: The instruction for the model (e.g. the e-mail to write).
        context: Retrieved text placed in the prompt's "Context" section.
        model: Name of the Ollama generation model to use.
        timeout: Seconds to wait for the non-streaming reply before giving up.

    Returns:
        The value of the ``"response"`` field of the server's JSON reply.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no reply arrives within ``timeout`` seconds.
        KeyError: If a successful reply carries no ``"response"`` field.
    """
    # The template text (including its leading indentation) is sent as written.
    prompt = f"""
    You are a helpful assistant. Use the context below to answer or generate the requested email.

    Context:
    {context}

    Instruction:
    {query}

    Response:
    """

    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model, "prompt": prompt, "stream": False},
        timeout=timeout,
    )
    # Surface HTTP failures (e.g. model not pulled) as HTTPError instead of an
    # opaque KeyError on the missing "response" field.
    response.raise_for_status()

    return response.json()["response"]

In [42]:
# End-to-end run: retrieve supporting chunks, join them into a context block,
# then ask the model to write the e-mail.
query = "Generate an email requesting project status update from a colleague"
similar_docs = retrieve_similar_docs(query, faiss_index)
context = "\n---\n".join(doc.page_content for doc in similar_docs)
response = generate_answer(query, context)

In [44]:
print(response)

Here is an email to Peter confirming the site visit and requesting a project status update:

Subject: Confirmation of Site Visit and Project Update Request

Dear Peter,

I hope this email finds you well. I wanted to confirm that your company's site visit at our three plants has been scheduled, as previously discussed.

However, I also wanted to touch base with you regarding the ongoing project. As per Kim's earlier request, could you please confirm to me that Hal is fully dedicated to this project and will continue to be involved until its completion? Additionally, would it be possible for him to sit with our group at least until the project is finished?

Furthermore, I would appreciate an update on the roles and responsibilities of the other IT personnel listed in the elaboration document. Could you also provide me with a breakdown of what each team member will be responsible for during the project?

Lastly, I'd like to discuss the 65-day estimation for this project. Could we schedule

In [45]:
from IPython.display import Markdown, display

In [46]:
display(Markdown(response))

Here is an email to Peter confirming the site visit and requesting a project status update:

Subject: Confirmation of Site Visit and Project Update Request

Dear Peter,

I hope this email finds you well. I wanted to confirm that your company's site visit at our three plants has been scheduled, as previously discussed.

However, I also wanted to touch base with you regarding the ongoing project. As per Kim's earlier request, could you please confirm to me that Hal is fully dedicated to this project and will continue to be involved until its completion? Additionally, would it be possible for him to sit with our group at least until the project is finished?

Furthermore, I would appreciate an update on the roles and responsibilities of the other IT personnel listed in the elaboration document. Could you also provide me with a breakdown of what each team member will be responsible for during the project?

Lastly, I'd like to discuss the 65-day estimation for this project. Could we schedule a meeting or call at your earliest convenience to review the project timeline and any potential roadblocks?

Looking forward to hearing back from you.

Best regards,

[Your Name]

In [47]:
# Second end-to-end run with a different instruction: retrieve, build the
# context block, generate.
query = "Generate an email to remind a team member about an upcoming task deadline."
similar_docs = retrieve_similar_docs(query, faiss_index)
context = "\n---\n".join(doc.page_content for doc in similar_docs)
response = generate_answer(query, context)

In [48]:

print(response)

Here's a generated email:

Subject: Reminder: Update Your Address Book and Complete Timesheet by Monday, 4pm

Dear [Team Member],

I wanted to follow up on the memo from Constance Charles regarding the rollout of SAP. As mentioned in the memo, it's essential that we update our address book and complete our timesheets as soon as possible.

Could you please make sure to update your address book by providing us with any changes (yes/no) and let us know if you have access to a shared calendar (if yes, which one)? This will help us ensure a smooth transition during the migration process.

Additionally, please don't forget to complete your timesheet for the current period. You can input your time online at [http://ehronline.enron.com](http://ehronline.enron.com), but I'll also continue to email you reminders regarding this task.

Please confirm by responding to this email that you've updated your address book and completed your timesheet by Monday, 4pm. If there are any issues or concerns, f

In [49]:
print(context)

responsible for updating your address book no if yes, who do you have access to a shared calendar no if yes, which shared calendar do you have any distribution groups that messaging maintains for you for mass mailings no if yes, please list here please list all notes databases applications that you currently use in our efforts to plan the exact datetime of your migration, we also will need to know what are your normal work hours from 815 to 600 will you be out of the office in the near future
---
fyi. i sent this monday 4pm if you hear any rumblings
---
.  with the rollout of sap, you have the ability to go online  httpehronline.enron.com and input your time. i will continue to email for timesheets regardless if you go online. this memo is to inform you to complete your timesheet. quick reminder you may receive your present or previous paychecks  eb 3539a if delivery is not setup to your locationmail stop. thank you for your cooperation constance charles human resources associate  anal

In [50]:
display(Markdown(response))

Here's a generated email:

Subject: Reminder: Update Your Address Book and Complete Timesheet by Monday, 4pm

Dear [Team Member],

I wanted to follow up on the memo from Constance Charles regarding the rollout of SAP. As mentioned in the memo, it's essential that we update our address book and complete our timesheets as soon as possible.

Could you please make sure to update your address book by providing us with any changes (yes/no) and let us know if you have access to a shared calendar (if yes, which one)? This will help us ensure a smooth transition during the migration process.

Additionally, please don't forget to complete your timesheet for the current period. You can input your time online at [http://ehronline.enron.com](http://ehronline.enron.com), but I'll also continue to email you reminders regarding this task.

Please confirm by responding to this email that you've updated your address book and completed your timesheet by Monday, 4pm. If there are any issues or concerns, feel free to reach out to me directly.

Best regards,

[Your Name]