In [3]:
!pip install -U langchain-community langchain-openai faiss-cpu pypdf python-docx docx2txt openai

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.30-py3-none-any.whl.metadata (2.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Collecting openai
  Downloading openai-1.100.2-py3-none-any.whl.metadata (29 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1

In [47]:
import os
from getpass import getpass
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI


In [19]:
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")


client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


embedding = OpenAIEmbeddings()


FOLDER_PATH = "./all_filesz"
INDEX_PATH = "./faiss_index"

Enter your OpenAI API key: ··········


In [60]:
FOLDER_PATH = "/content/all_filesz"
INDEX_PATH = "/content/faiss_index"


# Production_Incident_Response_Runbook
# CI-CD_Deployment_Pipeline_Guide
# Department mapping
departments = {
    "compliance_and_legal": ["Contract Review Checklist", "GDPR Compliance Guidelines", "HIPAA Data Handling Policy"],
    "marketing": ["Brand Guidelines Handbook", "Product Launch Campaign Report", "Q3 Content Marketing Strategy"],
    "policies": ["Employee Leave & Absence Policy", "Remote Work & Hybrid Policy", "Workplace Code of Conduct"],
    "technical_sops": ["Backend Service Monitoring SOP", "CI CD Deployment Pipeline Guide", "Production Incident Response Runbook"]
}

embedding = OpenAIEmbeddings()

# ✅ Load existing FAISS index
if os.path.exists(f"{INDEX_PATH}/index.faiss"):
    print("✅ Loading existing FAISS index...")
    vectorstore = FAISS.load_local(INDEX_PATH, embedding, allow_dangerous_deserialization=True)

else:
    if not os.path.exists(FOLDER_PATH):
        os.makedirs(FOLDER_PATH)
        print(f"📂 Created '{FOLDER_PATH}'. Please add PDF/DOCX files and rerun.")
        raise SystemExit

    all_documents = []

    # 📥 Load documents
    for file in os.listdir(FOLDER_PATH):
        path = os.path.join(FOLDER_PATH, file)
        file_name, ext = os.path.splitext(file)
        file_name = file_name.strip()

        matched_dept = ""
        matched_title = file_name

        # normalize file name (replace underscores & dashes with spaces, lowercase)
        normalized_file = file_name.lower().replace("_", " ").replace("-", " ")

        for dept, docs in departments.items():
            for title in docs:
                if title.lower() == normalized_file:
                    matched_dept = dept
                    matched_title = title
                    break   # stop after first match

        loader = None
        if ext.lower() == ".pdf":
            loader = PyPDFLoader(path)
        elif ext.lower() == ".docx":
            loader = Docx2txtLoader(path)
        else:
            continue

        try:
            documents = loader.load()

            for i, doc in enumerate(documents, 1):
              doc.metadata.update({
                  "source_file": file,
                  "chunk_number": i,
                  "page_number": i,
                  "department": matched_dept,
                  "title": matched_title
              })
            print(doc)
            all_documents.extend(documents)
        except Exception as e:
            print(f"⚠️ Could not load {file}, skipping... ({e})")
            continue

        # 🏷️ Match department + title


print("Department:", matched_dept)
print("Title:", matched_title)

page_content='Product Launch Campaign Report

Department: Marketing

Campaign ID: Q1-2025-Launch

Region: North America

Language: English

Audience: Tech Decision Makers

Page 1

Campaign Overview
Product: AI-Powered Analytics Suite v2.0
Objective: Drive awareness and conversions from enterprise tech buyers.
Channels: LinkedIn, Twitter, Email, and Google Ads.
Campaign Duration: Jan 5 - Feb 29, 2025

Page 2

Performance Metrics
- Impressions: 4.2M
- CTR: 3.1%
- MQLs Generated: 3,560
- CAC: $71.25
Top-performing asset: LinkedIn Carousel Ad (47% of leads).

Page 3

Audience Insights
- 56% Traffic from USA, 22% from Canada, 14% from UK.
- Most active job titles: CTO, Head of Data, Product Manager.
- Peak engagement time: Weekdays 10–11 AM EST.
Feedback indicated strong interest in integration with existing CRMs.

Page 4

Lessons & Recommendations
- Invest more in video content for awareness phase.
- Test gated vs. ungated whitepapers.
- Automate retargeting for mid-funnel leads.
Next phas

In [None]:
if not all_documents:
    print(f"⚠️ No valid PDF/DOCX files found in '{FOLDER_PATH}'.")
    raise SystemExit

os.makedirs(INDEX_PATH, exist_ok=True)
vectorstore = FAISS.from_documents(all_documents, embedding)
vectorstore.save_local(INDEX_PATH)
print("✅ FAISS index created and saved with department + title metadata!")


In [62]:
vectorstore = FAISS.load_local(INDEX_PATH, embedding, allow_dangerous_deserialization=True)

# Get the total number of documents
total_docs = len(vectorstore.docstore._dict)  # FAISS stores docs in docstore
print(f"Total documents in vectorstore: {total_docs}")

Total documents in vectorstore: 9


In [63]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(all_documents)


vectorstore = FAISS.from_documents(chunks, embedding)
vectorstore.save_local(INDEX_PATH)
print("✅ FAISS index created and saved!")

retriever = vectorstore.as_retriever()


policy_prompt = """
You are a Company Policy Assistant. Only answer questions using the information
provided in the company's policy documents. If the answer is not found in the
policy, say: "I cannot find that information in the policy."
Be concise, professional, and accurate.
"""

✅ FAISS index created and saved!


In [None]:
results = retriever.get_relevant_documents(
    "GDPR rules",
    filter={
        "$or": [
            {
                "department": "marketing",
                "title": {"$in": [
                    "Brand Guidelines Handbook",
                    "Product Launch Campaign Report",
                    "Q3 Content Marketing Strategy"
                ]}
            },
            {
                "department": "policies",
                "title": {"$in": [
                    "Employee Leave & Absence Policy",
                    "Remote Work & Hybrid Policy",
                    "Workplace Code of Conduct"
                ]}
            },
            {
                "department": "technical sops",
                "title": {"$in": [
                    "Backend Service Monitoring SOP",
                    "CI/CD Deployment Pipeline Guide",
                    "Production Incident Response Runbook"
                ]}
            },
            {
                "department": "compliance_and_legal",
                "title": {"$in": [
                    "Contract Review Checklist",
                    "GDPR Compliance Guidelines",
                    "HIPAA Data Handling Policy"
                ]}
            }
        ]
    }
)

for doc in results:
    print(f"📂 Department: {doc.metadata.get('department')}")
    print(f"📄 Title: {doc.metadata.get('title')}")
    print(f"Content: {doc.page_content[:60]}...\n")


In [None]:
print("Chatbot ready! Type 'exit' to stop.")

while True:
    query = input("You: ")
    if query.lower() in ["exit", "quit"]:
        print("Goodbye!")
        break

    k = 1
    relevant_docs = retriever.get_relevant_documents(query)[:k]

    # Combine content and metadata for context
    context_list = []
    for i, doc in enumerate(relevant_docs, 1):
        context_list.append(
            f"Chunk {i} Content:\n{doc.page_content}\nMetadata: {doc.metadata}\n"
        )
    context = "\n".join(context_list)
    # print("\n=== Context ===")
    # print(context)
    # print("\n==== End ====")

    print("\n=== Retrieved Chunks ===")
    for doc in relevant_docs:
        print("Content:", doc.page_content[:40], "...")
        print("Metadata:", doc.metadata)
        print("-" * 30)

    messages = [
        {"role": "system", "content": policy_prompt},
        {"role": "user", "content": f"Context from documents:\n{context}\n\nQuestion: {query}"}
    ]

    response = client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=messages,
        max_tokens=300,
        temperature=0.7
    )

    print("Bot:", response.choices[0].message.content)


Chatbot ready! Type 'exit' to stop.
You: what is Strategy Overview

=== Retrieved Chunks ===
Content: Q3 Content Marketing Strategy

Departmen ...
Metadata: {'source': '/content/all_filesz/Q3_Content_Marketing_Strategy.docx', 'source_file': 'Q3_Content_Marketing_Strategy.docx', 'chunk_number': 1, 'page_number': 1, 'department': 'marketing', 'title': 'Q3 Content Marketing Strategy'}
------------------------------
Bot: The Strategy Overview is centered around the theme "Scaling with AI – Smart Growth for SaaS," with objectives to build thought leadership through long-form blogs and video content, and to generate over 2,500 Marketing Qualified Leads (MQLs) from inbound organic traffic. The core markets are Singapore, UAE, and Germany.
