<a href="https://colab.research.google.com/github/Ammad12345390/RAG_Chatbot-/blob/main/rag_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install -U langchain-community langchain-openai faiss-cpu pypdf python-docx docx2txt openai



In [9]:
import os
from getpass import getpass
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI

In [10]:
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")


client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


embedding = OpenAIEmbeddings()


FOLDER_PATH = "./all_filesz"
INDEX_PATH = "./faiss_index"

Enter your OpenAI API key: ··········


In [11]:
FOLDER_PATH = "/content/all_filesz"
INDEX_PATH = "/content/faiss_index"

departments = {
    "compliance_and_legal": [
        "Contract Review Checklist",
        "GDPR Compliance Guidelines",
        "HIPAA Data Handling Policy"
    ],
    "marketing": [
        "Brand Guidelines Handbook",
        "Product Launch Campaign Report",
        "Q3 Content Marketing Strategy"
    ],
    "policies": [
        "Employee Leave & Absence Policy",
        "Remote Work & Hybrid Policy",
        "Workplace Code of Conduct"
    ],
    "technical_sops": [
        "Backend Service Monitoring SOP",
        "CI CD Deployment Pipeline Guide",
        "Production Incident Response Runbook"
    ]
}

embedding = OpenAIEmbeddings()

# ✅ Load existing FAISS index
if os.path.exists(f"{INDEX_PATH}/index.faiss"):
    print("✅ Loading existing FAISS index...")
    vectorstore = FAISS.load_local(INDEX_PATH, embedding, allow_dangerous_deserialization=True)

else:
    if not os.path.exists(FOLDER_PATH):
        os.makedirs(FOLDER_PATH)
        print(f"📂 Created '{FOLDER_PATH}'. Please add PDF/DOCX files and rerun.")
        raise SystemExit

    all_documents = []

    for file in os.listdir(FOLDER_PATH):
        path = os.path.join(FOLDER_PATH, file)
        file_name, ext = os.path.splitext(file)
        file_name = file_name.strip()

        matched_dept = ""
        matched_title = file_name

        # Normalize file name for matching
        normalized_file = file_name.lower().replace("_", " ").replace("-", " ")

        # 🔍 Match department and title
        for dept, docs in departments.items():
            for title in docs:
                if title.lower() == normalized_file:
                    matched_dept = dept
                    matched_title = title
                    break

        loader = None
        if ext.lower() == ".pdf":
            loader = PyPDFLoader(path)
        elif ext.lower() == ".docx":
            loader = Docx2txtLoader(path)
        else:
            continue

        try:
            documents = loader.load()

            for i, doc in enumerate(documents, 1):
                doc.metadata.update({
                    "source_file": file,
                    "chunk_number": i,
                    "page_number": i,
                    "department": matched_dept,
                    "title": matched_title
      })

                # ✅ Print info for each document
                print(f"📂 Department: {matched_dept}")
                print(f"📄 Title: {matched_title}")
                print(f"➡️ Source: {file}, Chunk: {i}\n")

            all_documents.extend(documents)

        except Exception as e:
            print(f"⚠️ Could not load {file}, skipping... ({e})")
            continue


📂 Department: marketing
📄 Title: Brand Guidelines Handbook
➡️ Source: Brand_Guidelines_Handbook.docx, Chunk: 1

📂 Department: policies
📄 Title: Workplace Code of Conduct
➡️ Source: Workplace_Code_of_Conduct.docx, Chunk: 1

📂 Department: marketing
📄 Title: Q3 Content Marketing Strategy
➡️ Source: Q3_Content_Marketing_Strategy.docx, Chunk: 1

📂 Department: technical_sops
📄 Title: Backend Service Monitoring SOP
➡️ Source: Backend_Service_Monitoring_SOP.docx, Chunk: 1

📂 Department: technical_sops
📄 Title: Production Incident Response Runbook
➡️ Source: Production_Incident_Response_Runbook.docx, Chunk: 1

📂 Department: technical_sops
📄 Title: CI CD Deployment Pipeline Guide
➡️ Source: CI-CD_Deployment_Pipeline_Guide.docx, Chunk: 1

📂 Department: 
📄 Title: Contract_Review_Checklist (1)
➡️ Source: Contract_Review_Checklist (1).docx, Chunk: 1

📂 Department: policies
📄 Title: Remote Work & Hybrid Policy
➡️ Source: Remote_Work_&_Hybrid_Policy.docx, Chunk: 1

📂 Department: policies
📄 Title: Empl

In [12]:
if not all_documents:
    print(f"⚠️ No valid PDF/DOCX files found in '{FOLDER_PATH}'.")
    raise SystemExit

os.makedirs(INDEX_PATH, exist_ok=True)
vectorstore = FAISS.from_documents(all_documents, embedding)
vectorstore.save_local(INDEX_PATH)
print("✅ FAISS index created and saved with department + title metadata!")


✅ FAISS index created and saved with department + title metadata!


In [13]:
vectorstore = FAISS.load_local(INDEX_PATH, embedding, allow_dangerous_deserialization=True)

# Get the total number of documents
total_docs = len(vectorstore.docstore._dict)  # FAISS stores docs in docstore
print(f"Total documents in vectorstore: {total_docs}")

Total documents in vectorstore: 10


In [14]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(all_documents)


vectorstore = FAISS.from_documents(chunks, embedding)
vectorstore.save_local(INDEX_PATH)
print("✅ FAISS index created and saved!")

retriever = vectorstore.as_retriever()


policy_prompt = """
You are a Company Policy Assistant. Only answer questions using the information
provided in the company's policy documents. If the answer is not found in the
policy, say: "I cannot find that information in the policy."
Be concise, professional, and accurate.
"""

✅ FAISS index created and saved!


In [15]:
results = retriever.get_relevant_documents(
    "GDPR rules",
    filter={
        "$or": [
            {
                "department": "marketing",
                "title": {"$in": [
                    "Brand Guidelines Handbook",
                    "Product Launch Campaign Report",
                    "Q3 Content Marketing Strategy"
                ]}
            },
            {
                "department": "policies",
                "title": {"$in": [
                    "Employee Leave & Absence Policy",
                    "Remote Work & Hybrid Policy",
                    "Workplace Code of Conduct"
                ]}
            },
            {
                "department": "technical sops",
                "title": {"$in": [
                    "Backend Service Monitoring SOP",
                    "CI/CD Deployment Pipeline Guide",
                    "Production Incident Response Runbook"
                ]}
            },
            {
                "department": "compliance_and_legal",
                "title": {"$in": [
                    "Contract Review Checklist",
                    "GDPR Compliance Guidelines",
                    "HIPAA Data Handling Policy"
                ]}
            }
        ]
    }
)

for doc in results:
    print(f"📂 Department: {doc.metadata.get('department')}")
    print(f"📄 Title: {doc.metadata.get('title')}")
    print(f"Content: {doc.page_content[:60]}...\n")


  results = retriever.get_relevant_documents(


📂 Department: policies
📄 Title: Remote Work & Hybrid Policy
Content: Page 4

Security Guidelines & VPN
- All employees must use t...

📂 Department: policies
📄 Title: Workplace Code of Conduct
Content: Workplace Code of Conduct

Department: HR

Region: Global

V...

📂 Department: marketing
📄 Title: Brand Guidelines Handbook
Content: Page 3

Content Guidelines
- Tone: avoid jargon, use active ...

📂 Department: policies
📄 Title: Workplace Code of Conduct
Content: Page 3

Conflict of Interest & Gifts
- Employees must disclo...



In [None]:
print("Chatbot ready! Type 'exit' to stop.")

while True:
    query = input("You: ")
    if query.lower() in ["exit", "quit"]:
        print("Goodbye!")
        break

    k = 1
    relevant_docs = retriever.get_relevant_documents(query)[:k]

    # Combine content and metadata for context
    context_list = []
    for i, doc in enumerate(relevant_docs, 1):
        context_list.append(
            f"Chunk {i} Content:\n{doc.page_content}\nMetadata: {doc.metadata}\n"
        )
    context = "\n".join(context_list)


    print("\n=== Retrieved Chunks ===")
    for doc in relevant_docs:
        print("Content:", doc.page_content[:40], "...")
        print("Metadata:", doc.metadata)
        print("-" * 30)

    messages = [
        {"role": "system", "content": policy_prompt},
        {"role": "user", "content": f"Context from documents:\n{context}\n\nQuestion: {query}"}
    ]

    response = client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=messages,
        max_tokens=300,
        temperature=0.7
    )

    print("Bot:", response.choices[0].message.content)

Chatbot ready! Type 'exit' to stop.
You: what is Overview of CI/CD

=== Retrieved Chunks ===
Content: CI/CD Deployment Pipeline Guide

Departm ...
Metadata: {'source': '/content/all_filesz/CI-CD_Deployment_Pipeline_Guide.docx', 'source_file': 'CI-CD_Deployment_Pipeline_Guide.docx', 'chunk_number': 1, 'page_number': 1, 'department': 'technical_sops', 'title': 'CI CD Deployment Pipeline Guide'}
------------------------------
Bot: The overview of CI/CD in the document outlines the automated workflow for deploying microservices, including the tools used (GitHub Actions, Docker, Kubernetes, Helm, ArgoCD), the separation of staging and production environments, and the management of pipelines via YAML configuration files in the root directory.
