# Step 1: Load and Chunk Documents

In [57]:
import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment before the key is read.
load_dotenv()

# os.getenv yields None for an unset variable; an empty value is rejected too.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if GOOGLE_API_KEY is None or GOOGLE_API_KEY == "":
    raise ValueError("GOOGLE_API_KEY not found in environment variables.")

In [58]:
from langchain.document_loaders import TextLoader

# Build the loader once, with an explicit encoding, and reuse it for the load.
# (Previously a loader without an encoding was created and never used, while a
# second, separately constructed loader did the actual work.)
loader = TextLoader("docs/internal_manual.txt", encoding="utf-8")
docs = loader.load()

# Preview the raw text of the first loaded document.
docs[0].page_content

'# Internal Deployment Standards and DevOps Practices\n\n## 1. Introduction\n\nThis document outlines the deployment and infrastructure practices followed by the engineering team at ACME Corp. The goal is to ensure reliable, repeatable, and secure deployments across environments.\n\n## 2. Environments\n\nWe use three main environments:\n- **Development**: For local dev and testing\n- **Staging**: Mirrors production, runs CI smoke tests\n- **Production**: Customer-facing environment, under strict change control\n\n## 3. CI/CD Pipeline\n\nAll services are deployed using GitHub Actions and ArgoCD. The typical pipeline includes:\n1. Lint and format check (pre-commit)\n2. Unit tests with coverage threshold (80% minimum)\n3. Docker build and push to internal registry\n4. Helm chart update and deployment via ArgoCD\n\nChanges to `main` auto-deploy to staging.\nProduction deploys require an approved GitHub PR with `#deploy-production` tag and a passing security scan.\n\n## 4. Secrets and Confi

In [59]:
from langchain.document_loaders import PyPDFLoader
# Sanity check that the loader class resolved at import time.
assert PyPDFLoader is not None

# Read the employment contract template. Note that this rebinds `loader` and
# `docs`, so the objects created for the text manual are no longer reachable
# under those names.
loader = PyPDFLoader("US_Employment_Contract_Template.pdf")
docs = loader.load()

# Dump the loaded Document objects (metadata and page_content) for inspection.
print(docs)

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-08-15T18:16:29+03:00', 'author': 'carlos el badawi', 'moddate': '2025-08-15T18:16:29+03:00', 'source': 'US_Employment_Contract_Template.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1'}, page_content='US Employment Contract Template \n \n \nHow to use this template \nThis is a customizable document created to help companies solidify the terms and conditions of \nemployment between employee and employer by defining the responsibilities and obligations of \nthe working relationship. \n \nThis template is tailored to US employment standards, including all necessary elements such as \nemployer and employee details, job description, salary, working hours, holiday entitlement, and \nmore to ensure compliance with local employment laws. \n \nDisclaimer: The content in this document is provided for general information purposes only and does not \nconst

In [62]:
from typing import List
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import sent_tokenize
from langchain.schema import Document

def semantic_chunker(text: str, similarity_threshold: float = 0.85) -> List[Document]:
    """Split ``text`` into chunks of consecutive, semantically similar sentences.

    The text is tokenized into sentences, each sentence is embedded with the
    ``multi-qa-mpnet-base-dot-v1`` model, and every sentence is compared with
    the one directly before it. A sentence is appended to the current chunk
    when that cosine similarity is strictly greater than
    ``similarity_threshold``; otherwise the current chunk is closed and the
    sentence starts a new one.

    Args:
        text: Raw text to split.
        similarity_threshold: Cosine-similarity cut-off between adjacent
            sentences. Higher values produce more, smaller chunks.

    Returns:
        One ``Document`` per chunk, in original order, whose ``page_content``
        is the chunk's sentences joined with single spaces. An empty list is
        returned when the tokenizer finds no sentences in ``text``.
    """
    # Tokenize text into sentences
    sentences = sent_tokenize(text)

    # Nothing to chunk: bail out before indexing sentences[0] (which would
    # raise IndexError) and before paying for the model load.
    if not sentences:
        return []

    # Load pre-trained embedding model.
    # NOTE(review): the model is re-instantiated on every call; consider
    # hoisting it to module level if this function is called repeatedly.
    model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

    # Encode sentences into embeddings
    embeddings = model.encode(sentences, convert_to_tensor=True)

    chunks = []
    current_chunk = [sentences[0]]

    # Compare each sentence with its predecessor to decide chunk boundaries
    for i in range(1, len(sentences)):
        similarity = util.pytorch_cos_sim(embeddings[i - 1], embeddings[i]).item()
        if similarity > similarity_threshold:
            current_chunk.append(sentences[i])
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentences[i]]

    # The chunk still being built always holds at least one sentence here,
    # so it is flushed unconditionally.
    chunks.append(' '.join(current_chunk))

    return [Document(page_content=chunk) for chunk in chunks]


In [63]:
# Flatten the text of every loaded Document into one space-separated string,
# then split it into semantic chunks.
full_text = " ".join(doc.page_content for doc in docs)
chunks = semantic_chunker(full_text)

# Number of chunks produced.
len(chunks)


135

# Step 2: Embed and Index with FAISS

In [64]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
# from langchain_huggingface import HuggingFaceEmbeddings



# Embed the chunks with the same model name the chunker uses and build an
# in-memory FAISS index over them.
embedding_model = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")
db = FAISS.from_documents(documents=chunks, embedding=embedding_model)

# Persist the index to a local folder so it can be reloaded later.
db.save_local(folder_path="faiss_index_google")

# Step 3: User Query and Retrieval

In [65]:
# MMR retriever over the FAISS index, returning three documents per query.
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 3})

# NOTE(review): this sample question does not look related to the indexed
# contract, so the retrieved context is unlikely to contain an answer.
query = "What are embeddings?"
retrieved_docs = retriever.get_relevant_documents(query)

# Print a 100-character preview of each hit.
for idx, hit in enumerate(retrieved_docs):
    preview = hit.page_content[:100]
    print(f"\n\nDOC: [{idx}] {preview}...")



DOC: [0] Non-Disparagement....


DOC: [1] US Employment Contract Template 
 
 
How to use this template 
This is a customizable document creat...


DOC: [2] 3.3 18 U.S.C....


# Step 4: Prompt and Generate Response

## Prompt Construction

In [66]:
# Stitch the retrieved passages together, separated by blank lines, and embed
# them in the prompt ahead of the user's question.
context = "\n\n".join(doc.page_content for doc in retrieved_docs)
prompt = f"""Use the following context to answer the question.

Context:
{context}

Question: {query}
Answer:"""

In [67]:
# Show the fully rendered prompt (context + question) for inspection.
print(prompt)

Use the following context to answer the question.

Context:
Non-Disparagement.

US Employment Contract Template 
 
 
How to use this template 
This is a customizable document created to help companies solidify the terms and conditions of 
employment between employee and employer by defining the responsibilities and obligations of 
the working relationship.

3.3 18 U.S.C.

Question: What are embeddings?
Answer:


In [None]:
from google import genai

# Gemini client authenticated with the key read from the environment.
client = genai.Client(api_key=GOOGLE_API_KEY)

# Send the assembled prompt and print the text of the model's reply.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
)
print(response.text)

According to Exhibit B, specifically Section 2870(a) of the California Labor Code, an invention is excluded from assignment to the employer if the employee developed it entirely on their own time without using the employer’s equipment, supplies, facilities, or trade secret information, AND the invention does not:
(1) Relate at the time of conception or reduction to practice to the employer’s business, or actual or demonstrably anticipated research or development of the employer; or
(2) Result from any work performed by the employee for the employer.


In [72]:
# Evaluation set for the RAG pipeline. Each entry is a dict with:
#   "query"           - the question posed to the retriever and the LLM
#   "expected_output" - a reference answer the generated answer is scored against
# The first group asks for facts stated in the contract; the second group asks
# questions that combine or interpret several provisions.
prompts = [
    # --- Fact-based prompts ---
    {
        "query": "What is the employment type mentioned in the contract template?",
        "expected_output": "Employment can be Salary or Hourly, as specified in Point 7 of the Table."
    },
    {
        "query": "What does the agreement state about the employment relationship being 'at will'?",
        "expected_output": "The employment is at will, meaning either the employee or the company may terminate the employment at any time, for any reason or no reason."
    },
    {
        "query": "How many days of Paid Time Off (PTO) are granted per year in the contract?",
        "expected_output": "The number of PTO days is customizable and specified in Point 9 of the Table (e.g., '[INSERT NUMBER OF DAYS] days of Paid Time Off per year')."
    },
    {
        "query": "What are the company’s policies regarding unused PTO days?",
        "expected_output": "Unused PTO is not paid out and does not roll over to the next year unless required by law."
    },
    {
        "query": "Under what conditions may an employee receive a discretionary bonus?",
        "expected_output": "At the sole discretion of the company, based on the employee’s performance and the company’s performance."
    },
    {
        "query": "What does the agreement require employees to provide before commencing employment under U.S. law?",
        "expected_output": "Employees must provide a completed Form I-9 to verify their employment eligibility and identity."
    },
    {
        "query": "What must the employee return to the company upon termination of employment?",
        "expected_output": "All company property, including phones, computers, keys, documents, reports, business cards, electronic files, and other equipment."
    },
    {
        "query": "What are the main obligations under the confidentiality clause in Exhibit A?",
        "expected_output": "The employee must not disclose, copy, remove, or use company confidential information during or after employment without prior written consent."
    },
    {
        "query": "According to Exhibit B, under California law, which inventions are excluded from assignment to the employer?",
        "expected_output": "Inventions developed entirely on the employee’s own time without using the employer’s resources, unless they relate to the employer’s business or result from work performed for the employer."
    },
    {
        "query": "What law governs the terms of the employment agreement?",
        "expected_output": "The agreement is governed by the laws of the state specified in the contract (placeholder: '[INSERT EMPLOYEE STATE HERE]')."
    },

    # --- Reasoning-style prompts ---
    {
        "query": "If an employee works full-time but wants to take vacation, how is this covered in the contract?",
        "expected_output": "Vacation is covered under Paid Time Off (PTO), which is available to full-time employees and can be used for vacation, personal time, or sick leave as specified in Point 9 of the Table."
    },
    {
        "query": "If an employee is terminated without cause, what benefits might the company offer?",
        "expected_output": "The company may, at its sole discretion, offer severance benefits. These are conditional on the employee signing a full release of claims against the company and not revoking the release."
    },
    {
        "query": "What are the employee’s obligations regarding inventions created during their employment?",
        "expected_output": "Any invention or development related to the company’s business, created during employment or using company resources, is automatically assigned to the company. Employees must also assist with securing intellectual property rights for such inventions."
    },
    {
        "query": "How does the contract address confidentiality and non-disparagement together?",
        "expected_output": "The confidentiality clause prohibits disclosure or misuse of company information, while the non-disparagement clause prohibits criticizing the company or its employees during or after employment. Both obligations survive termination of employment."
    },
    {
        "query": "What happens if a clause in the contract is found invalid?",
        "expected_output": "The remaining provisions continue in full force and effect. Invalidity of one clause does not affect the validity or enforceability of the rest of the agreement."
    }
]


In [None]:

import torch

# Sentence-embedding model used to score generated answers against references.
embed_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")


def semantic_score(expected, generated):
    """Return the cosine similarity between two texts' embeddings as a float.

    Both ``expected`` and ``generated`` are encoded with the module-level
    ``embed_model`` before being compared.
    """
    expected_vec, generated_vec = (
        embed_model.encode(text, convert_to_tensor=True)
        for text in (expected, generated)
    )
    similarity = util.pytorch_cos_sim(expected_vec, generated_vec)
    return float(similarity)



# Run every evaluation case through retrieval + generation, score the generated
# answer against the reference answer, and report per-case and average scores.
# NOTE: the loop rebinds the notebook-level names `retrieved_docs`, `context`,
# `prompt` and `response` on every iteration.
results = []

for case in prompts:
    # Retrieve supporting passages for this question and join them into one context.
    retrieved_docs = retriever.get_relevant_documents(case["query"])
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt = f"""Use the following context to answer the question, do not include any personal opinions or information, answer ONLY based on the context provided. If no context is available tell the user you don't know.
    If there are [] in the document especially if they are highlighted and in bold, consider them as filler for dates or numbers or names.

    Context:
    {context}

    Question: {case["query"]}
    Answer:"""

    response = client.models.generate_content(
        model="gemini-2.0-flash-lite",
        contents=prompt
    )
    # `response.text` can be None when the reply carries no text; fall back to
    # an empty string so scoring does not crash the run part-way through.
    generated = response.text or ""

    score = semantic_score(case["expected_output"], generated)

    results.append({
        "prompt": case["query"],
        "expected": case["expected_output"],
        "generated": generated,
        "semantic_score": score
    })

    print(f"Prompt: {case['query']}")
    print(f"Semantic Score: {score:.2f}")
    print(f"Generated Answer: {generated}")
    print(f"Expected Answer: {case['expected_output']}")
    print("-" * 80)


# Mean similarity across all evaluated cases.
avg_semantic_score = sum(r["semantic_score"] for r in results) / len(results)
print(f"Average Semantic Similarity: {avg_semantic_score:.2f}")

Prompt: What is the employment type mentioned in the contract template?
Semantic Score: 0.81
Generated Answer: The employment type is specified in Point 8 of the Table.

Expected Answer: Employment can be Salary or Hourly, as specified in Point 7 of the Table.
--------------------------------------------------------------------------------
Prompt: What does the agreement state about the employment relationship being 'at will'?
Semantic Score: 0.82
Generated Answer: The Employee's employment will, at all times, remain “at will” employment.

Expected Answer: The employment is at will, meaning either the employee or the company may terminate the employment at any time, for any reason or no reason.
--------------------------------------------------------------------------------
Prompt: How many days of Paid Time Off (PTO) are granted per year in the contract?
Semantic Score: 0.89
Generated Answer: The document states the number of days of Paid Time Off ("PTO") per year is set forth in Poin