In [None]:
!pip install -q chromadb sentence-transformers transformers torch accelerate


**Data Documents**

In [None]:
import os

# Build the sample policy corpus: three short plain-text documents written
# under data/, each containing one policy statement per line.
os.makedirs("data", exist_ok=True)

hr_policy = """The company promotes equal opportunity for all employees.
Employees must adhere to professional conduct at all times.
Working hours are defined by the employment contract.
Remote work is allowed based on manager approval.
Employees are entitled to paid leave as per company guidelines.
Harassment of any form will not be tolerated.
Performance reviews are conducted annually.
Employees must protect confidential information.
Training programs are provided for skill development.
HR policies may be updated periodically.
"""

company_policy = """The company aims to maintain ethical business practices.
All employees must comply with legal regulations.
Company assets should be used responsibly.
Data security is a top priority for the organization.
Employees must follow communication guidelines.
Conflicts of interest must be disclosed.
The company encourages innovation and collaboration.
Safety protocols must be followed at all locations.
Violations of policy may lead to disciplinary action.
Company policies apply to all departments equally.
"""

project_policy = """Each project must have a defined scope and timeline.
Project managers are responsible for deliverables.
Regular status updates are required from team members.
Documentation must be maintained throughout the project.
Risk assessment should be conducted early.
Changes to project scope require approval.
Quality standards must be met before delivery.
Stakeholders should be informed of major changes.
Deadlines must be respected to ensure success.
Post-project reviews help improve future projects.
"""

# Destination path -> document text, listed in the order the files are written.
policy_files = {
    "data/hr_policy.txt": hr_policy,
    "data/company_policy.txt": company_policy,
    "data/project_policy.txt": project_policy,
}

for policy_path, policy_text in policy_files.items():
    with open(policy_path, "w") as policy_file:
        policy_file.write(policy_text)

print("Data files created")

Data files created


**Ingestion**

In [None]:
import os
import chromadb
from sentence_transformers import SentenceTransformer

DATA_DIR = "data"
DB_DIR = "vector_db"

# Sentence-embedding model; the same instance is reused for every file below.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Chroma client. PersistentClient stores the index on disk under DB_DIR.
# NOTE(review): chromadb.Client(Settings(persist_directory=...)) appears to
# stay in-memory unless persistence is enabled explicitly, which left DB_DIR
# unused -- confirm against the installed chromadb version.
client = chromadb.PersistentClient(path=DB_DIR)

# Collection
collection = client.get_or_create_collection(name="policies")

# Ingest documents. Sorting makes the processing order deterministic.
for file_name in sorted(os.listdir(DATA_DIR)):
    file_path = os.path.join(DATA_DIR, file_name)

    # os.listdir also yields sub-directories (e.g. .ipynb_checkpoints), which
    # open() cannot read -- skip anything that is not a regular file.
    if not os.path.isfile(file_path):
        continue

    with open(file_path, "r") as f:
        text = f.read()

    # Sentence-level chunking: naive split on "." (the period itself is dropped).
    sentences = [s.strip() for s in text.split(".") if s.strip()]
    if not sentences:
        # Nothing to embed or store for an empty file.
        continue

    # Create embeddings for all sentences of this file in one call.
    embeddings = embedding_model.encode(sentences)

    # Store in Chroma with a single batched call per file. upsert (rather than
    # add) keeps the cell idempotent: re-running it overwrites the entries
    # with the same "<file>_<index>" ids instead of re-adding them.
    collection.upsert(
        ids=[f"{file_name}_{i}" for i in range(len(sentences))],
        documents=sentences,
        embeddings=embeddings.tolist(),
        metadatas=[{"source": file_name} for _ in sentences],
    )

print("✅ Ingestion completed successfully")

✅ Ingestion completed successfully


**Retrieval**

In [None]:
def retrieve(query, top_k=3):
    """Return the stored documents most similar to ``query``.

    Args:
        query: Text to embed and search for in ``collection``.
        top_k: Maximum number of documents to return.

    Returns:
        A list of at most ``top_k`` document strings, in the order the
        collection query returns them. The list is empty when the collection
        holds no entries or ``top_k`` is not positive.
    """
    # Never request more results than the collection holds, and skip the
    # query entirely when there is nothing to fetch.
    n_results = min(top_k, collection.count())
    if n_results <= 0:
        return []

    # encode() receives a one-element list, so the embedding is already in the
    # list-of-vectors form passed as query_embeddings.
    query_embedding = embedding_model.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results,
    )
    # Results are grouped per query; only one query was sent.
    return results["documents"][0]


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint name shared by the tokenizer and the seq2seq model below.
MODEL_NAME = "google/flan-t5-large"

# Both objects are loaded from the same checkpoint name so they stay matched.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Show which concrete class the Auto* loader resolved to.
print(type(llm_model))


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>


**Generation**

In [None]:
def generate_answer(question, top_k=3, max_new_tokens=40):
    """Answer ``question`` using retrieved documents as the only context.

    Args:
        question: The user's question; it is used both as the retrieval query
            and inside the prompt.
        top_k: Number of documents requested from ``retrieve`` as context.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded model output with surrounding whitespace removed. If the
        output contains "Answer:", only the text after its last occurrence is
        returned.
    """
    context = retrieve(question, top_k=top_k)
    # Retrieved chunks are concatenated with single spaces.
    context_text = " ".join(context)

    prompt = (
        "Answer the question using only the context. "
        "If the answer is not present, say 'Not mentioned'.\n\n"
        f"Context: {context_text}\n\n"
        f"Question: {question}\n"
        "Answer:"
    )

    # Tokenize (truncating at 512 tokens) and place the tensors on the same
    # device as the model so generation also works when the model is on a GPU.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(llm_model.device)

    # Sampling is disabled and a single beam is used.
    outputs = llm_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_beams=1
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Drop an echoed "Answer:" prefix if the model reproduces it.
    if "Answer:" in decoded:
        decoded = decoded.split("Answer:")[-1]

    return decoded.strip()

**Eval**

In [None]:
# Sample questions, one per policy document, used as a quick smoke test of the
# retrieve-then-generate pipeline.
questions = [
    "What happens if someone violates company policy?",
    "Who is responsible for project deliverables?",
    "Does the company allow remote work?",
]

separator = "=" * 60

for question in questions:
    print(separator)
    print("Q:", question)
    # The answer is generated after the question has been printed.
    print("A:", generate_answer(question))

Q: What happens if someone violates company policy?
A: disciplinary action
Q: Who is responsible for project deliverables?
A: Project managers
Q: Does the company allow remote work?
A: Remote work is allowed based on manager approval
