<a href="https://colab.research.google.com/github/Ammad12345390/RAG_Chatbot-/blob/main/rag_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
!pip install -U langchain-community langchain-openai faiss-cpu pypdf python-docx docx2txt openai



In [35]:
import os
from getpass import getpass
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI

In [36]:
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")


client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


embedding = OpenAIEmbeddings()


FOLDER_PATH = "./all_filesz"
INDEX_PATH = "./faiss_index"

Enter your OpenAI API key: ··········


In [None]:
import os
import glob
a = glob.glob('/content/all_filesz/*docx')
a

In [38]:
import os
import glob
from collections import defaultdict
import shutil

# Folder containing all your files
base_folder = "/content/all_filesz"
files = glob.glob(os.path.join(base_folder, "*.docx"))

# Define department keywords (lowercase)
department_keywords = {
    "compliance_and_legal": ["contract", "gdpr", "hipaa", "legal"],
    "marketing": ["brand", "product", "campaign", "marketing"],
    "policies": ["leave", "absence", "workplace", "remote", "policy"],
    "technical_sops": ["backend", "deployment", "pipeline", "incident", "ci-cd", "ci_cd"]
}

departments = defaultdict(list)

# Create department folders if not exist
for dept in department_keywords.keys():
    os.makedirs(os.path.join(base_folder, dept), exist_ok=True)
os.makedirs(os.path.join(base_folder, "uncategorized"), exist_ok=True)

# Categorize and move files
for file_path in files:
    file_name = os.path.basename(file_path).lower()
    assigned = False

    for dept, keywords in department_keywords.items():
        if any(keyword in file_name for keyword in keywords):
            departments[dept].append(file_name)
            # Move file to department folder
            shutil.move(file_path, os.path.join(base_folder, dept, os.path.basename(file_path)))
            assigned = True
            break

    if not assigned:
        departments["uncategorized"].append(file_name)
        shutil.move(file_path, os.path.join(base_folder, "uncategorized", os.path.basename(file_path)))

# Print result
for dept, docs in departments.items():
    print(f"{dept}: {docs}\n")


marketing: ['brand_guidelines_handbook.docx', 'q3_content_marketing_strategy.docx', 'production_incident_response_runbook.docx', 'product_launch_campaign_report.docx']

policies: ['workplace_code_of_conduct.docx', 'remote_work_&_hybrid_policy.docx', 'employee_leave_&_absence_policy.docx']

technical_sops: ['backend_service_monitoring_sop.docx', 'ci-cd_deployment_pipeline_guide.docx']

compliance_and_legal: ['contract_review_checklist (1).docx']



In [39]:
import os
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

FOLDER_PATH = "/content/all_filesz"
INDEX_PATH = "/content/faiss_index"

# Define department keywords for automatic matching
department_keywords = {
    "compliance_and_legal": ["contract", "gdpr", "hipaa", "legal"],
    "marketing": ["brand", "product", "campaign", "marketing", "q3"],
    "policies": ["leave", "absence", "workplace", "remote", "policy"],
    "technical_sops": ["backend", "deployment", "pipeline", "incident", "ci-cd", "ci_cd"]
}

embedding = OpenAIEmbeddings()

# Load existing FAISS index if it exists
if os.path.exists(os.path.join(INDEX_PATH, "index.faiss")):
    print("✅ Loading existing FAISS index...")
    vectorstore = FAISS.load_local(INDEX_PATH, embedding, allow_dangerous_deserialization=True)

else:
    if not os.path.exists(FOLDER_PATH):
        os.makedirs(FOLDER_PATH)
        print(f"📂 Created '{FOLDER_PATH}'. Please add PDF/DOCX files and rerun.")
        raise SystemExit

    all_documents = []

    # Walk through all files, including subfolders
    for root, dirs, files in os.walk(FOLDER_PATH):
        for file in files:
            path = os.path.join(root, file)
            file_name, ext = os.path.splitext(file)
            normalized_file = file_name.lower().replace("_", " ").replace("-", " ")

            # Automatic department matching using keywords
            matched_dept = "uncategorized"
            for dept, keywords in department_keywords.items():
                if any(keyword in normalized_file for keyword in keywords):
                    matched_dept = dept
                    break

            # Load document
            loader = None
            if ext.lower() == ".pdf":
                loader = PyPDFLoader(path)
            elif ext.lower() == ".docx":
                loader = Docx2txtLoader(path)
            else:
                continue

            try:
                documents = loader.load()
                for i, doc in enumerate(documents, 1):
                    doc.metadata.update({
                        "source_file": file,
                        "chunk_number": i,
                        "page_number": i,
                        "department": matched_dept,
                        "title": file_name
                    })
                    print(f"📂 Department: {matched_dept}")
                    print(f"📄 Title: {file_name}")
                    print(f"➡️ Source: {file}, Chunk: {i}\n")

                all_documents.extend(documents)

            except Exception as e:
                print(f"⚠️ Could not load {file}, skipping... ({e})")
                continue

    # Build and save FAISS index
    if all_documents:
        vectorstore = FAISS.from_documents(all_documents, embedding)
        os.makedirs(INDEX_PATH, exist_ok=True)
        vectorstore.save_local(INDEX_PATH)
        print(f"✅ FAISS index saved to {INDEX_PATH}")
    else:
        print("⚠️ No documents loaded, FAISS index not created.")


✅ Loading existing FAISS index...


In [40]:
if not all_documents:
    print(f"⚠️ No valid PDF/DOCX files found in '{FOLDER_PATH}'.")
    raise SystemExit

os.makedirs(INDEX_PATH, exist_ok=True)
vectorstore = FAISS.from_documents(all_documents, embedding)
vectorstore.save_local(INDEX_PATH)
print("✅ FAISS index created and saved with department + title metadata!")

✅ FAISS index created and saved with department + title metadata!


In [41]:
vectorstore = FAISS.load_local(INDEX_PATH, embedding, allow_dangerous_deserialization=True)


total_docs = len(vectorstore.docstore._dict)
print(f"Total documents in vectorstore: {total_docs}")

Total documents in vectorstore: 10


In [42]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(all_documents)


vectorstore = FAISS.from_documents(chunks, embedding)
vectorstore.save_local(INDEX_PATH)
print("✅ FAISS index created and saved!")

retriever = vectorstore.as_retriever()


policy_prompt = """
You are a Company Policy Assistant. Only answer questions using the information
provided in the company's policy documents. If the answer is not found in the
policy, say: "I cannot find that information in the policy."
Be concise, professional, and accurate.
"""

✅ FAISS index created and saved!


In [43]:
query = "GDPR rules"
departments = ["marketing", "policies", "technical_sops", "compliance_and_legal"]
k = 2  # top 2 relevant documents per department

all_results = []

for dept in departments:
    results = vectorstore.similarity_search(
        query,
        k=k,
        filter={"department": dept}  # filter by department
    )
    all_results.extend(results)

# Sort all results by relevance score if your vector store provides it
# (Some vector stores return results already sorted by similarity)

# Print only the relevant results
for doc in all_results:
    print(f"📂 Department: {doc.metadata.get('department')}")
    print(f"📄 Title: {doc.metadata.get('title')}")
    print(f"Content: {doc.page_content[:100]}...\n")



📂 Department: marketing
📄 Title: Brand Guidelines Handbook
Content: Page 3

Content Guidelines
- Tone: avoid jargon, use active voice, keep copy concise.
- Use case sto...

📂 Department: marketing
📄 Title: Brand Guidelines Handbook
Content: Brand Guidelines Handbook

Department: Marketing

Campaign ID: N/A

Region: Global

Language: Englis...

📂 Department: policies
📄 Title: Remote Work & Hybrid Policy
Content: Page 4

Security Guidelines & VPN
- All employees must use the company VPN when accessing internal s...

📂 Department: policies
📄 Title: Workplace Code of Conduct
Content: Workplace Code of Conduct

Department: HR

Region: Global

Version: v4.0

Last Updated: 2025-02-20

...

📂 Department: technical_sops
📄 Title: Backend Service Monitoring SOP
Content: Page 3

Log Management
All logs must be structured (JSON format).
Use Winston logger for Node.js app...

📂 Department: technical_sops
📄 Title: CI CD Deployment Pipeline Guide
Content: Page 4

Monitoring & Alerts
- Prometheus and G

In [None]:
print("Chatbot ready! Type 'exit' to stop.")

while True:
    query = input("You: ")
    if query.lower() in ["exit", "quit"]:
        print("Goodbye!")
        break

    k = 1
    relevant_docs = retriever.get_relevant_documents(query)[:k]

    # Combine content and metadata for context
    context_list = []
    for i, doc in enumerate(relevant_docs, 1):
        context_list.append(
            f"Chunk {i} Content:\n{doc.page_content}\nMetadata: {doc.metadata}\n"
        )
    context = "\n".join(context_list)


    print("\n=== Retrieved Chunks ===")
    for doc in relevant_docs:
        print("Content:", doc.page_content[:40], "...")
        print("Metadata:", doc.metadata)
        print("-" * 30)

    messages = [
        {"role": "system", "content": policy_prompt},
        {"role": "user", "content": f"Context from documents:\n{context}\n\nQuestion: {query}"}
    ]

    response = client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=messages,
        max_tokens=300,
        temperature=0.7
    )

    print("Bot:", response.choices[0].message.content)

Chatbot ready! Type 'exit' to stop.

=== Retrieved Chunks ===
Content: Workplace Code of Conduct

Department: H ...
Metadata: {'source': '/content/all_filesz/Workplace_Code_of_Conduct.docx', 'source_file': 'Workplace_Code_of_Conduct.docx', 'chunk_number': 1, 'page_number': 1, 'department': 'policies', 'title': 'Workplace Code of Conduct'}
------------------------------
Bot: Professional Conduct refers to employees upholding professionalism at all times, which includes respecting colleagues regardless of role or background, having no tolerance for offensive, aggressive, or disruptive behavior, and maintaining confidentiality and data privacy at all times.
