In [23]:
!pip install -U langchain-community langchain-openai faiss-cpu pypdf python-docx docx2txt openai



In [24]:
import os
from getpass import getpass
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI

In [25]:
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")


client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


embedding = OpenAIEmbeddings()


FOLDER_PATH = "./all_filesz"
INDEX_PATH = "./faiss_index"

Enter your OpenAI API key: ··········


In [26]:
import glob
import os

departments = ["marketing", "policies", "technical_sops", "compliance_and_legal"]

all_files = {}

for dept in departments:
    folder_path = f"/content/all_filesz/{dept}/*.docx"
    files = glob.glob(folder_path)

    all_files[dept] = files


for dept, files in all_files.items():
    for f in files:
        print(f"{os.path.basename(f)} --> {dept}")


Q3_Content_Marketing_Strategy.docx --> marketing
Brand_Guidelines_Handbook.docx --> marketing
Product_Launch_Campaign_Report.docx --> marketing
Workplace_Code_of_Conduct.docx --> policies
Remote_Work_&_Hybrid_Policy.docx --> policies
Employee_Leave_&_Absence_Policy.docx --> policies
CI-CD_Deployment_Pipeline_Guide.docx --> technical_sops
Backend_Service_Monitoring_SOP.docx --> technical_sops
Production_Incident_Response_Runbook.docx --> technical_sops
GDPR_Compliance_Guidelines.docx --> compliance_and_legal
HIPAA_Data_Handling_Policy.docx --> compliance_and_legal
Contract_Review_Checklist.docx --> compliance_and_legal


In [27]:
FOLDER_PATH = "/content/all_filesz"
INDEX_PATH = "/content/faiss_index"


departments = ["marketing", "policies", "technical_sops", "compliance_and_legal"]

embedding = OpenAIEmbeddings()

# ✅ Load existing FAISS index if available
if os.path.exists(f"{INDEX_PATH}/index.faiss"):
    print("✅ Loading existing FAISS index...")
    vectorstore = FAISS.load_local(INDEX_PATH, embedding, allow_dangerous_deserialization=True)

else:
    if not os.path.exists(FOLDER_PATH):
        os.makedirs(FOLDER_PATH)
        print(f"📂 Created '{FOLDER_PATH}'. Please add department subfolders and rerun.")
        raise SystemExit

    all_documents = []


    for dept in departments:
        folder_path = f"{FOLDER_PATH}/{dept}/*"
        files = glob.glob(folder_path)

        for file in files:
            file_name, ext = os.path.splitext(os.path.basename(file))
            file_name = file_name.strip()

            loader = None
            if ext.lower() == ".pdf":
                loader = PyPDFLoader(file)
            elif ext.lower() == ".docx":
                loader = Docx2txtLoader(file)
            else:
                continue

            try:
                documents = loader.load()

                for i, doc in enumerate(documents, 1):
                    doc.metadata.update({
                        "source_file": os.path.basename(file),
                        "chunk_number": i,
                        "page_number": i,
                        "department": dept,
                        "title": file_name
                    })


                    print(f"📂 Department: {dept}")
                    print(f"📄 Title: {file_name}")
                    print(f"➡️ Source: {os.path.basename(file)}, Chunk: {i}\n")

                all_documents.extend(documents)

            except Exception as e:
                print(f"⚠️ Could not load {file}, skipping... ({e})")
                continue


✅ Loading existing FAISS index...


In [33]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(all_documents)


vectorstore = FAISS.from_documents(chunks, embedding)
vectorstore.save_local(INDEX_PATH)
print("✅ FAISS index created and saved!")



✅ FAISS index created and saved!


In [29]:
policy_prompt = """
You are a Company Policy Assistant. Only answer questions using the information provided in the
company's policy documents.
If the answer is not found in the policy,
say: I cannot find that information in the policy.
 Be concise, professional, and accurate."""

In [None]:
print("Chatbot ready! Type 'exit' to stop.")

while True:
    query = input("You: ")
    if query.lower() in ["exit", "quit"]:
        print("Goodbye!")
        break


    classification_prompt = f"""
    You are a smart classifier. Classify the following query into ONE of these departments:
    - marketing
    - policies
    - technical_sops
    - compliance_and_legal

    Query: "{query}"

    Respond ONLY with the department name.
    """

    classify_response = client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=[{"role": "system", "content": classification_prompt}],
        max_tokens=10,
        temperature=0
    )

    department = classify_response.choices[0].message.content.strip().lower()
    print(f"🔍 GPT says this query is related to: {department}")


    relevant_docs = vectorstore.similarity_search(
        query,
        k=1,
        filter={"department": department}
    )


    context_list = []
    for i, doc in enumerate(relevant_docs, 1):
        context_list.append(
            f"Chunk {i} Content:\n{doc.page_content}\nDepartment: {doc.metadata.get('department')}\n"
        )
    context = "\n".join(context_list)

    print("\n=== Retrieved Chunks ===")
    for doc in relevant_docs:
        print("Content:", doc.page_content[:50], "...")
        print("Metadata:", doc.metadata)
        print("-" * 30)


    messages = [
        {"role": "system", "content": policy_prompt},
        {
            "role": "user",
            "content": f"Department: {department}\n\nContext:\n{context}\n\nQuestion: {query}"
        }
    ]

    response = client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=messages,
        max_tokens=300,
        temperature=0.7
    )

    print("Bot:", response.choices[0].message.content)


Chatbot ready! Type 'exit' to stop.
You: what is Overview of CI/CD
🔍 GPT says this query is related to: technical_sops

=== Retrieved Chunks ===
Content: CI/CD Deployment Pipeline Guide

Department: Engin ...
Metadata: {'source': '/content/all_filesz/technical_sops/CI-CD_Deployment_Pipeline_Guide.docx', 'source_file': 'CI-CD_Deployment_Pipeline_Guide.docx', 'chunk_number': 1, 'page_number': 1, 'department': 'technical_sops', 'title': 'CI-CD_Deployment_Pipeline_Guide'}
------------------------------
Bot: The Overview of CI/CD describes the automated workflow for deploying microservices, including the tools used (GitHub Actions, Docker, Kubernetes, Helm, ArgoCD), the separation of staging and production environments, and the management of pipelines via YAML configuration files.
