In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from utils import read_techcorp_docs

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the corpus; doc_paths is presumably parallel to docs (one path per text) - verify in utils.
docs, doc_paths = read_techcorp_docs()

# The same sentence-transformer model embeds both the documents and the query.
model = SentenceTransformer('all-MiniLM-L6-v2')
query = "distributed workforce policies"
doc_embeddings = model.encode(docs)
query_embedding = model.encode(query)

# Dot product of the query vector with every document vector, flattened to 1-D.
similarities = np.dot(query_embedding, doc_embeddings.T).flatten()

# Document indices ordered from the highest score to the lowest.
top_indices = np.argsort(similarities)[::-1]

In [None]:
# Print rank, similarity score and file name for every document, best match first.
for rank, doc_idx in enumerate(top_indices, start=1):
    file_name = doc_paths[doc_idx].rsplit("/", 1)[-1]
    print(rank, similarities[doc_idx], file_name)

1 0.3982089 remote-work-policy.md
2 0.29151878 q3-planning-meeting.md
3 0.23645885 pet-policy.md
4 0.23560174 cloudsync-pro.md
5 0.20834768 product-launch-review.md
6 0.18836069 general-faqs.md
7 0.1730206 benefits-overview.md
8 0.099087164 datavault.md


In [None]:
def chunck_document(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping character chunks, preferring sentence ends.

    Each chunk is at most ``chunk_size`` characters long. When a chunk does not
    reach the end of the text and its last period lies beyond 70% of
    ``chunk_size``, the chunk is cut right after that period. The next chunk
    starts ``overlap`` characters before the end of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Number of characters shared by consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of chunk strings; an empty list for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. These
            values previously made the loop run forever or silently skip text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    chunks = []
    text_length = len(text)
    start = 0

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]

        if end < text_length:
            # Prefer to end on a sentence boundary, but only if that keeps
            # more than 70% of the requested chunk size.
            last_period = chunk.rfind('.')
            if last_period > chunk_size * 0.7:
                chunk = chunk[:last_period + 1]
                end = start + last_period + 1

        chunks.append(chunk)

        # The text is exhausted: stepping back by `overlap` here would only
        # emit an extra chunk that is a pure suffix of the one just added.
        if end >= text_length:
            break

        # A sentence cut can be shorter than `overlap`; always move forward
        # by at least one character so the loop is guaranteed to terminate.
        start = max(end - overlap, start + 1)

    return chunks




In [None]:
import chromadb

client = chromadb.Client()
collection = client.get_or_create_collection("policy_chunks")

# Read the policy text. The encoding is explicit so the result does not depend
# on the platform's default locale.
with open('./techcorp-docs/customer-faqs/general-faqs.md', 'r', encoding='utf-8') as f:
  text = f.read()

text_chunks = chunck_document(text)

# Add all chunks in a single call instead of one call per chunk; IDs keep the
# "chunk_<index>" scheme. An empty document is skipped so no empty batch is sent.
if text_chunks:
  collection.add(
    documents = text_chunks,
    ids = [f"chunk_{i}" for i in range(len(text_chunks))]
  )

query = "What are the password requirments?"
results = collection.query(query_texts=[query], n_results=3)

# results["documents"] holds one list of matches per query text; a single
# query was sent, so index 0 is its list of matching chunks.
for doc in results["documents"][0]:
    print(doc)

1 minute

### What's the durability guarantee?
99.999999999% (eleven 9's) durability with geo-redundant storage across multiple regions.

## Troubleshooting

### Sync isn't working. What should I do?
1. Check internet connection
2. Verify you're logged in
3. Check available storage
4. Restart the app
5. Contact support if issues persist

### I forgot my password. How do I reset it?
1. Click "Forgot Password" on login page
2. Enter your email
3. Check email for reset link
4.
40-2)
- Zero-trust architecture
- Air-gap backup option
- Immutable storage
- Quantum-resistant encryption

### Can I comply with regulations?
Yes! DataVault is certified for:
- HIPAA/HITECH
- GDPR
- SOC 2
- PCI DSS
- FedRAMP (in process)

### How fast is data recovery?
- Hot tier: Instant
- Warm tier: < 1 minute
- Cold tier: < 5 minutes
- Archive tier: < 1 hour
- RTO: 15 minutes, RPO: 1 minute

### What's the durability guarantee?
99.
ault** - Military-grade secure storage solution
3. **TechCorp AI Assistant** - Co

In [None]:
# Display the raw response returned by collection.query for inspection.
results

{'ids': [['chunk_9', 'chunk_8', 'chunk_1']],
 'embeddings': None,
 'documents': [['1 minute\n\n### What\'s the durability guarantee?\n99.999999999% (eleven 9\'s) durability with geo-redundant storage across multiple regions.\n\n## Troubleshooting\n\n### Sync isn\'t working. What should I do?\n1. Check internet connection\n2. Verify you\'re logged in\n3. Check available storage\n4. Restart the app\n5. Contact support if issues persist\n\n### I forgot my password. How do I reset it?\n1. Click "Forgot Password" on login page\n2. Enter your email\n3. Check email for reset link\n4.',
   "40-2)\n- Zero-trust architecture\n- Air-gap backup option\n- Immutable storage\n- Quantum-resistant encryption\n\n### Can I comply with regulations?\nYes! DataVault is certified for:\n- HIPAA/HITECH\n- GDPR\n- SOC 2\n- PCI DSS\n- FedRAMP (in process)\n\n### How fast is data recovery?\n- Hot tier: Instant\n- Warm tier: < 1 minute\n- Cold tier: < 5 minutes\n- Archive tier: < 1 hour\n- RTO: 15 minutes, RPO: 1 

- Splitting with LangChain

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive character splitter configured with chunk_size=100 and no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=0,
)

# Split the FAQ text that was read into `text` earlier in the notebook.
texts = text_splitter.split_text(text)

In [None]:
# Display the list of chunks produced by text_splitter.split_text.
texts

['# TechCorp Customer FAQ\n\n**Last Updated**: June 1, 2024  \n**Version**: 2.3\n\n## General Questions',
 '### What is TechCorp?',
 'TechCorp is a leading provider of enterprise cloud storage and synchronization solutions. Founded',
 'in 2015, we serve over 5,000 companies worldwide with our flagship products CloudSync Pro and',
 'DataVault.',
 '### What products does TechCorp offer?',
 '1. **CloudSync Pro** - Real-time file synchronization across all devices',
 '2. **DataVault** - Military-grade secure storage solution',
 '3. **TechCorp AI Assistant** - Coming Q4 2024',
 '### How do I contact support?\n- **Email**: support@techcorp.com',
 '- **Phone**: 1-800-TECHCORP (1-800-832-4267)\n- **Chat**: Available on our website 24/7',
 '- **Response Times**: \n  - Enterprise: 15 minutes\n  - Professional: 4 hours\n  - Basic: 24 hours',
 '## Account & Billing',
 '### How do I create an account?\n1. Visit www.techcorp.com/signup\n2. Choose your product and plan',
 '3. Enter your company infor

In [None]:
from langchain_text_splitters.spacy import SpacyTextSplitter
import spacy


# NOTE(review): `nlp` is not passed to the splitter below; presumably it is
# loaded only to confirm that the spaCy model is installed - confirm.
nlp = spacy.load("en_core_web_sm")

spacy_splitter = SpacyTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
)

# Last expression of the cell, so the resulting chunk list is displayed.
spacy_splitter.split_text(text)

Created a chunk of size 101, which is longer than the specified 100
Created a chunk of size 117, which is longer than the specified 100
Created a chunk of size 317, which is longer than the specified 100
Created a chunk of size 242, which is longer than the specified 100
Created a chunk of size 412, which is longer than the specified 100
Created a chunk of size 164, which is longer than the specified 100
Created a chunk of size 156, which is longer than the specified 100
Created a chunk of size 170, which is longer than the specified 100
Created a chunk of size 365, which is longer than the specified 100
Created a chunk of size 228, which is longer than the specified 100
Created a chunk of size 188, which is longer than the specified 100
Created a chunk of size 130, which is longer than the specified 100
Created a chunk of size 379, which is longer than the specified 100
Created a chunk of size 254, which is longer than the specified 100
Created a chunk of size 124, which is longer tha

['# TechCorp Customer FAQ\n\n**Last Updated**: June 1, 2024  \n**Version**: 2.3\n\n## General Questions\n\n###',
 'What is TechCorp?',
 'TechCorp is a leading provider of enterprise cloud storage and synchronization solutions.',
 'Founded in 2015, we serve over 5,000 companies worldwide with our flagship products CloudSync Pro and DataVault.\n\n###',
 'What products does TechCorp offer?',
 '1. **CloudSync Pro** - Real-time file synchronization across all devices\n2.',
 '**DataVault** - Military-grade secure storage solution\n3.',
 '**TechCorp AI Assistant** - Coming Q4 2024\n\n### How do I contact support?\n- **Email**: support@techcorp.com\n- **Phone**: 1-800-TECHCORP (1-800-832-4267)\n- **Chat**: Available on our website 24/7\n- **Response Times**: \n  - Enterprise: 15 minutes\n  - Professional: 4 hours\n  - Basic: 24 hours\n\n## Account & Billing\n\n###',
 'How do I create an account?\n1.\n\nVisit www.techcorp.com/signup\n2.\n\nChoose your product and plan\n3.',
 'Enter your company

#### Complete RAG Pipeline

In [None]:
import os 
import time
from typing import List, Dict, Any
import chromadb
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
import numpy as np
# import glob
from pathlib import Path


from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.agents import create_agent

In [None]:
# section 1: loading and splitting text to chunks
def load_and_chunk_doc() -> dict:
    """Read every file under ``./techcorp-docs`` and split it into text chunks.

    Returns:
        A dict mapping each file's base name (``Path.name``) to the list of
        chunk strings produced by ``RecursiveCharacterTextSplitter`` with
        ``chunk_size=500`` and ``chunk_overlap=100``.
    """
    # The splitter configuration is the same for every file, so build it once
    # instead of once per file.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

    chunks = {}
    for file_path in Path("./techcorp-docs").rglob("*"):
        if not file_path.is_file():
            continue

        # Decode explicitly as UTF-8: the documents contain non-ASCII
        # characters (e.g. emoji), which a platform-default encoding may
        # fail to decode.
        text = file_path.read_text(encoding="utf-8")

        # NOTE(review): keyed by base name only, so two files with the same
        # name in different sub-directories overwrite each other - confirm
        # that file names are unique across the corpus.
        chunks[file_path.name] = text_splitter.split_text(text)

    return chunks

# section 2 : adding chunks to vectordb
def add_to_vectordb(chunks: Dict[str, List[str]]):
    """Store every chunk in the ``policy_chunks`` Chroma collection.

    Args:
        chunks: Mapping of file name to the list of chunk strings of that file.

    Returns:
        The collection obtained from ``client.get_or_create_collection``.
    """
    client = chromadb.Client()
    collection = client.get_or_create_collection("policy_chunks")

    for file_name, file_chunks in chunks.items():
        # A file without text produces no chunks; skip it rather than sending
        # an empty batch to the collection.
        if not file_chunks:
            continue

        # IDs follow the "<file name>_<chunk index>" scheme.
        ids = [f"{file_name}_{i}" for i in range(len(file_chunks))]
        collection.add(ids=ids, documents=file_chunks)

    # NOTE(review): calling this again for the same files re-submits the same
    # IDs to an existing collection - confirm how the installed Chroma version
    # treats duplicate IDs.
    return collection


# section 3: query processing
def process_user_query(query: str) -> str:
    """Normalise a user query: lower-case it and strip surrounding whitespace.

    Args:
        query: Raw query text.

    Returns:
        The normalised query string.
    """
    # str methods return a new string; the result has to be returned,
    # otherwise the caller receives the original, unnormalised query.
    return query.lower().strip()

# section 4: vector search
def search_vectordb(collection, query: str, n_results: int = 3):
    """Query ``collection`` with a single query text.

    Args:
        collection: Object exposing ``query(query_texts=..., n_results=...)``.
        query: The query text.
        n_results: Number of matches to request; defaults to 3, the value
            that was previously hard-coded.

    Returns:
        Whatever ``collection.query`` returns.
    """
    return collection.query(query_texts=[query], n_results=n_results)




In [None]:
def run_rag_pipeline(query: str):
    """Run load -> index -> normalise -> search for ``query`` and print the hits.

    Nothing is returned; the search result is only printed.
    """
    # Load and chunk the documents, then index them.
    collection = add_to_vectordb(load_and_chunk_doc())

    # Normalise the query and search the freshly built collection with it.
    hits = search_vectordb(collection, process_user_query(query))

    print(hits)


# Demo run: prints the raw search result for a sample query.
run_rag_pipeline("distributed workforce policies")





{'ids': [['remote-work-policy.md_0', 'remote-work-policy.md_8', 'q3-planning-meeting.md_4']], 'embeddings': None, 'documents': [['# TechCorp Remote Work Policy 🏠\n\n**Effective Date**: March 15, 2024  \n**Policy Number**: HR-REM-002  \n**Last Updated**: March 1, 2024\n\n## Policy Statement\n\nTechCorp embraces flexible work arrangements to promote work-life balance and productivity. This policy outlines our hybrid work model and remote work guidelines.\n\n## Hybrid Work Schedule', '## Emergency Situations\n\nDuring emergencies (weather, health, etc.):\n- 100% remote work may be authorized\n- Essential personnel notified separately\n- Business continuity plan activated\n\n---\n*Questions? Contact HR at remotework@techcorp.com or visit the HR Portal*', '### 4. Remote Work Policy Enhancement\n**Presenter**: Amy Rodriguez\n\n**DECISION MADE**: ✅ Expanding remote work benefits\n- Co-working space allowance: Increased to $300/month\n- Home office refresh: Annual $1,000 stipend\n- Internet re

## Caching 

In [None]:
import redis
import json
import hashlib

from langchain_core.prompts import ChatPromptTemplate

In [None]:
def generate_rag_response(query, context):
    """Placeholder generator: print the query and the context, one per line.

    Returns nothing.
    """
    print(query, context, sep="\n")

In [None]:
def generate_rag_response(query, context):
    """Temporary stand-in for the real generator: print both arguments."""
    # temporary response: one line for the query, one for the context
    for part in (query, context):
        print(part)


# Redis client (localhost:6379, database 0) used by get_cashed_response as the response cache.
cache = redis.Redis(host = "localhost", port=6379, db =0)

def get_cashed_response(query, context):
    """Return the RAG answer for ``(query, context)``, using the Redis cache.

    The cache key is the MD5 hex digest of ``f"{query}_{context}"``. On a
    cache miss the answer comes from ``generate_rag_response`` and is stored
    as JSON with a one-hour expiry.

    Args:
        query: The user query.
        context: The context the answer is based on; its string form is part
            of the cache key.

    Returns:
        The cached (JSON-decoded) answer on a hit, otherwise the freshly
        generated answer.
    """
    # MD5 only derives a compact key here; it is not used for security.
    cache_key = hashlib.md5(f'{query}_{context}'.encode()).hexdigest()

    # check cache
    cached = cache.get(cache_key)
    if cached:
        # json.loads, not json.load: the cached payload is a bytes/str value,
        # not a file object, so json.load would raise AttributeError.
        return json.loads(cached)

    response = generate_rag_response(query, context)

    # cache for 1 hour
    cache.setex(cache_key, 3600, json.dumps(response))

    return response

In [None]:
# section 1: loading and splitting text to chunks
def load_and_chunk_doc() -> dict:
    """Read every file under ``./techcorp-docs`` and split it into text chunks.

    Returns:
        A dict mapping each file's base name (``Path.name``) to the list of
        chunk strings produced by ``RecursiveCharacterTextSplitter`` with
        ``chunk_size=500`` and ``chunk_overlap=100``.
    """
    # The splitter configuration is the same for every file, so build it once
    # instead of once per file.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

    chunks = {}
    for file_path in Path("./techcorp-docs").rglob("*"):
        if not file_path.is_file():
            continue

        # Decode explicitly as UTF-8: the documents contain non-ASCII
        # characters (e.g. emoji), which a platform-default encoding may
        # fail to decode.
        text = file_path.read_text(encoding="utf-8")

        # NOTE(review): keyed by base name only, so two files with the same
        # name in different sub-directories overwrite each other - confirm
        # that file names are unique across the corpus.
        chunks[file_path.name] = text_splitter.split_text(text)

    return chunks


# section 2 : adding chunks to vectordb
def add_to_vectordb(chunks: Dict[str, List[str]]):
    """Store every chunk in the ``policy_chunks`` Chroma collection.

    Args:
        chunks: Mapping of file name to the list of chunk strings of that file.

    Returns:
        The collection obtained from ``client.get_or_create_collection``.
    """
    client = chromadb.Client()
    collection = client.get_or_create_collection("policy_chunks")

    for file_name, file_chunks in chunks.items():
        # A file without text produces no chunks; skip it rather than sending
        # an empty batch to the collection.
        if not file_chunks:
            continue

        # IDs follow the "<file name>_<chunk index>" scheme.
        ids = [f"{file_name}_{i}" for i in range(len(file_chunks))]
        collection.add(ids=ids, documents=file_chunks)

    # NOTE(review): calling this again for the same files re-submits the same
    # IDs to an existing collection - confirm how the installed Chroma version
    # treats duplicate IDs.
    return collection


# section 3: query processing
def process_user_query(query: str) -> str:
    """Normalise a user query: lower-case it and strip surrounding whitespace.

    Args:
        query: Raw query text.

    Returns:
        The normalised query string.
    """
    # str methods return a new string; the result has to be returned,
    # otherwise the caller receives the original, unnormalised query.
    return query.lower().strip()


# section 4: vector search
def search_vectordb(collection, query: str, n_results: int = 3):
    """Query ``collection`` with a single query text.

    Args:
        collection: Object exposing ``query(query_texts=..., n_results=...)``.
        query: The query text.
        n_results: Number of matches to request; defaults to 3, the value
            that was previously hard-coded.

    Returns:
        Whatever ``collection.query`` returns.
    """
    return collection.query(query_texts=[query], n_results=n_results)


# run_rag_pipeline("distributed workforce policies")


def generate_rag_response(query, context):
    """Ask the Gemini chat model to answer ``query`` from ``context``.

    A system message and a user message (with ``{context}`` and ``{question}``
    placeholders) are assembled into a ``ChatPromptTemplate`` and piped into a
    ``ChatGoogleGenerativeAI`` model. The API key is read from the
    ``GEMINI_API_KEY`` environment variable.

    Args:
        query: The question; fills the ``{question}`` placeholder.
        context: The policy context; fills the ``{context}`` placeholder.

    Returns:
        The ``content`` attribute of the model response.
    """
    system_message = """You are an HR in the company.

    You are provided with the company policy data.

    You’ll be given:
    - The company policy context
    - A question from an employee

    You must answer the employee clearly and politely based only on the given context.
    """
    user_message = "context:\n{context},\n question:\n{question}"

    messages = [("system", system_message), ("user", user_message)]
    prompt = ChatPromptTemplate(messages)

    llm = ChatGoogleGenerativeAI(
        api_key=os.getenv("GEMINI_API_KEY"),
        model="gemini-2.5-flash",
    )

    # Pipe the prompt into the model and run the chain with both placeholders filled.
    answer = (prompt | llm).invoke({"context": context, "question": query})
    return answer.content


# Redis client (localhost:6379, database 0) used by get_cashed_response as the response cache.
cache = redis.Redis(host="localhost", port=6379, db=0)


def get_cashed_response(query, context):
    """Look up the answer for ``(query, context)`` in Redis, generating it on a miss.

    The key is the MD5 hex digest of ``f"{query}_{context}"``. A hit is
    JSON-decoded and returned; on a miss the answer is generated, stored as
    JSON with a 3600-second expiry, and returned.
    """
    raw_key = f"{query}_{context}".encode()
    cache_key = hashlib.md5(raw_key).hexdigest()

    hit = cache.get(cache_key)
    if not hit:
        # Cache miss: generate the answer and keep it for one hour.
        answer = generate_rag_response(query, context)
        cache.setex(cache_key, 3600, json.dumps(answer))
        return answer

    # Cache hit: the stored payload is JSON text.
    return json.loads(hit)


def run_rag_pipeline(query: str):
    """Answer ``query`` with the full RAG pipeline.

    Steps: load and chunk the documents, index the chunks, normalise the
    query, retrieve the closest chunks, then obtain the (possibly cached)
    answer for the query and the retrieved text.

    Args:
        query: The user's question.

    Returns:
        The value returned by ``get_cashed_response``.
    """
    # NOTE(review): the corpus is re-loaded and re-indexed on every call;
    # consider building the collection once and reusing it.
    chunks = load_and_chunk_doc()
    collection = add_to_vectordb(chunks)

    query = process_user_query(query)
    result = search_vectordb(collection, query)

    # Pass only the retrieved chunk text on, not the whole response dict
    # (ids, None-valued fields, ...): it keeps the prompt clean and makes the
    # cache key depend solely on the text the answer is based on.
    # result["documents"] holds one list per query text; one query was sent.
    retrieved_chunks = result["documents"][0]
    context = "\n\n".join(retrieved_chunks)

    return get_cashed_response(query, context)

In [None]:
# Demo run: the returned answer is displayed as the cell output.
run_rag_pipeline("what is the leave policy")

b'"Hello! Here is an overview of TechCorp\'s leave policy based on the information provided:\\n\\n**Holidays:**\\n*   **Company Holidays**: 12 fixed holidays\\n*   **Floating Holidays**: 3 personal choice days\\n*   **Birthday PTO**: You get your birthday off (or the nearest workday)\\n*   **Year-End Shutdown**: The company is shut down from December 24th to January 1st, which is paid time off.\\n\\n**Parental Leave:**\\n*   **Birth Parent**: 16 weeks fully paid leave\\n*   **Non-Birth Parent**: 12 weeks fully paid leave\\n*   **Adoption**: 12 weeks fully paid leave\\n*   **Gradual Return**: A part-time option is available for 4 weeks after parental leave.\\n\\nI hope this helps clarify our leave policies!"'
yes


"Hello! Here is an overview of TechCorp's leave policy based on the information provided:\n\n**Holidays:**\n*   **Company Holidays**: 12 fixed holidays\n*   **Floating Holidays**: 3 personal choice days\n*   **Birthday PTO**: You get your birthday off (or the nearest workday)\n*   **Year-End Shutdown**: The company is shut down from December 24th to January 1st, which is paid time off.\n\n**Parental Leave:**\n*   **Birth Parent**: 16 weeks fully paid leave\n*   **Non-Birth Parent**: 12 weeks fully paid leave\n*   **Adoption**: 12 weeks fully paid leave\n*   **Gradual Return**: A part-time option is available for 4 weeks after parental leave.\n\nI hope this helps clarify our leave policies!"