In [1]:
from pathlib import Path
from typing import List, Dict, Any
from tqdm import tqdm
from pypdf import PdfReader
import pandas as pd
import numpy
import os
from docx import Document
import re
import warnings
import ollama
from langchain_core.documents import Document as LCDocument 
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import Ollama

DATA_DIR = Path("data")
DB_DIR = "db"
COLLECTION_NAME = "policies"
DATA_DIR.mkdir(exist_ok=True)
pd.options.display.max_colwidth = 200
pd.options.display.max_seq_items = 200
pd.set_option('display.max_rows', None)
warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_txt(path: Path) -> str:
    return path.read_text(encoding="utf-8", errors="ignore")

def load_docx(path: Path) -> str:
    doc = Document(str(path))
    return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])

def load_pdf(path: Path) -> str:
    reader = PdfReader(str(path))
    pages = []
    for i, page in enumerate(reader.pages):
        text = page.extract_text() or ""
        if text.strip():
            pages.append(text)
    return "\n".join(pages)

def load_documents(data_dir: Path) -> List[Dict[str, Any]]:
    docs = []
    for p in data_dir.rglob("*"):
        if p.is_dir():
            continue
        ext = p.suffix.lower()
        if ext not in [".pdf", ".docx", ".txt"]:
            continue

        if ext == ".pdf":
            text = load_pdf(p)
        elif ext == ".docx":
            text = load_docx(p)
        else:
            text = load_txt(p)

        if text.strip():
            docs.append({"text": text, "source": p.name, "path": str(p)})
    return docs


In [3]:
raw_docs = load_documents(DATA_DIR)
len(raw_docs), [d["source"] for d in raw_docs[:5]]

(6,
 ['2021 PTO Policy.pdf',
  'Access the MyInfo Portal - DM Payroll.pdf',
  'CMS Creditable Coverage Notice Final 2020_10_13.pdf',
  'Computer Use Policy.pdf',
  'EMPLOYEE NOVATIME WEB SERVICES INSTRUCTIONS.pdf'])

In [4]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=900,
    chunk_overlap=150,
    separators=["\n\n", "\n", ". ", " ", ""],
)

lc_docs = []
for d in raw_docs:
    chunks = splitter.split_text(d["text"])
    for idx, chunk in enumerate(chunks):
        lc_docs.append(
            LCDocument(
                page_content=chunk,
                metadata={"source": d["source"], "path": d["path"], "chunk": idx},
            )
        )

embeddings = OllamaEmbeddings(model="nomic-embed-text")

vectordb = Chroma(
    collection_name=COLLECTION_NAME,
    persist_directory=DB_DIR,
    embedding_function=embeddings,
)

batch_size = 200
for i in tqdm(range(0, len(lc_docs), batch_size), desc="Indexing"):
    vectordb.add_documents(lc_docs[i:i+batch_size])

vectordb.persist()
print(f"✅ Indexed {len(lc_docs)} chunks into ./{DB_DIR}")


Indexing: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [07:00<00:00, 420.92s/it]

✅ Indexed 187 chunks into ./db





In [8]:
SYSTEM_RULES = """You answer questions using ONLY the provided policy excerpts.
If the answer is not in the excerpts, say: "I couldn't find that in the provided documents."
Be precise. If a rule has conditions/exceptions, include them.
Always include citations as: (source, chunk).
"""

llm = Ollama(model="llama3.1:8b", temperature=0.2)

def format_context(hits):
    out = []
    for h in hits:
        src = h.metadata.get("source")
        ch = h.metadata.get("chunk")
        out.append(f"[{src} | chunk {ch}] {h.page_content}")
    return "\n\n".join(out)

def ask(question: str, k: int = 5):
    retriever = vectordb.as_retriever(search_kwargs={"k": k})

    hits = retriever.invoke(question)

    context = format_context(hits)

    prompt = f"""{SYSTEM_RULES}

    Question: {question}

    Policy excerpts:
    {context}

    Answer (with citations):
    """
    return llm.invoke(prompt)

In [9]:
print(ask("What's the resignation notice period?"))

The resignation notice period is at least a 2 week notice, with no guarantee that the request will be granted (handbook.pdf | chunk 69). 

Additionally, to receive pay for unused PTO earned from the last calendar year upon termination, employees must provide a written two-week notice of resignation and work their scheduled hours over the two-week resignation period (2021 PTO Policy.pdf | chunk 3 and [handbook.pdf | chunk 106]).


In [10]:
print(ask("How many leaves each year?"))

There is no specific information on how many leaves each year. However, it mentions that up to 72 hours of PTO can be used per year after it is accrued (earned) and up to 72 hours of PTO can be rolled over from one year to the next (handbook.pdf | chunk 104).


In [11]:
print(ask("What are the things that I should not do in the workplace?"))

Based on the provided policy excerpts, here is a list of things that you should not do in the workplace:

* Engage in any activity that is illegal under local, state, federal or international law while utilizing Holiday Market-owned resources. (Computer Use Policy.pdf | chunk 9)
* Participate in system and network activities that are strictly prohibited, with no exceptions (e.g. hacking, unauthorized access). (Computer Use Policy.pdf | chunk 9)
* Make threats, even as a joke or prank, during working hours or in company vehicles. (handbook.pdf | chunk 34)
* Commit violent acts or threaten violence during non-working hours or away from the workplace, unless:
	+ The associate's conduct adversely affects the Company's reputation. (A) (handbook.pdf | chunk 34)
	+ The Company determines that the effects of the off-duty conduct may be carried into the workplace and/or pose a threat to Company associates, visitors or property. (B) (handbook.pdf | chunk 34)
	+ The conduct results in the convict

In [12]:
print(ask("What is the policy for health insurace?"))

The policy for health insurance is not explicitly stated in the provided excerpts, but there are mentions of related policies:

* Holiday Market pays Workers’ Compensation pursuant to state regulations for injuries on the job or occupation disease. (Source: handbook.pdf, chunk 116)
* The company may require an employee to provide a doctor’s certification of serious health condition for certain types of leave. (Source: handbook.pdf, chunk 116)
* Medicare prescription drug coverage became available in 2006 and can be obtained by joining a Medicare Prescription Drug Plan or a Medicare Advantage Plan that offers prescription drug coverage. (Source: CMS Creditable Coverage Notice Final 2020_10_13.pdf, chunk 2)

I couldn't find any information on the specific health insurance policy offered by Holiday Market.


In [13]:
print(ask("What is the restriction about the computer usage?"))

Here is the answer based on the provided policy excerpts:

**Restriction about computer usage:**

* All computing devices must be secured with a password-protected screensaver with the automatic activation feature set to 10 minutes or less. You must lock the screen or log off when the device is unattended. (4.2.1)
* Employees must use extreme caution when opening e-mail attachments received from unknown senders, which may contain malware. (4.2.3)
* Accessing data, a server or an account for any purpose other than conducting Holiday Market business is prohibited. (4.3.1)
* Exporting software, technical information, encryption software or technology in violation of international or regional export control laws is illegal. (4.3)
* Introduction of malicious programs into the network or server is prohibited. (5)
* Revealing your account password to others or allowing use of your account by others is prohibited, including family and other household members when work is being done at home. (6