In [27]:
!pip install -U langchain-community langchain-openai faiss-cpu pypdf python-docx docx2txt openai



In [28]:
import os
from getpass import getpass
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI

In [29]:
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")


client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


embedding = OpenAIEmbeddings()

Enter your OpenAI API key: ··········


In [30]:
# 📂 Paths
FOLDER_PATH = "/content/wedlii_docs"
INDEX_PATH = "/content/faiss_index"

# 🔑 Embeddings
embedding = OpenAIEmbeddings()

# ✅ Load existing FAISS index if available
if os.path.exists(f"{INDEX_PATH}/index.faiss"):
    print("✅ Loading existing FAISS index...")
    vectorstore = FAISS.load_local(INDEX_PATH, embedding, allow_dangerous_deserialization=True)

else:
    # 📂 If no docs folder, create it and stop
    if not os.path.exists(FOLDER_PATH):
        os.makedirs(FOLDER_PATH)
        print(f"📂 Created '{FOLDER_PATH}'. Please add Wedlii PDF/DOCX files and rerun.")
        raise SystemExit

    all_documents = []

    # 📥 Load all files inside folder
    for file in os.listdir(FOLDER_PATH):
        path = os.path.join(FOLDER_PATH, file)
        file_name, ext = os.path.splitext(file)

        loader = None
        if ext.lower() == ".pdf":
            loader = PyPDFLoader(path)
        elif ext.lower() == ".docx":
            loader = Docx2txtLoader(path)
        else:
            continue  # skip unsupported formats

        try:
            documents = loader.load()

            # Attach useful metadata
            for i, doc in enumerate(documents, 1):
                doc.metadata.update({
                    "source_file": file,
                    "chunk_number": i,
                    "page_number": i
                })

            print(f"✅ Loaded {file} with {len(documents)} chunks")
            all_documents.extend(documents)

        except Exception as e:
            print(f"⚠️ Could not load {file}, skipping... ({e})")
            continue

    print("📦 Total documents loaded:", len(all_documents))

    # ✂️ Split into smaller overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100
    )
    split_documents = text_splitter.split_documents(all_documents)
    print(f"✂️ After splitting: {len(split_documents)} chunks")

    # 🔍 Create FAISS index
    if split_documents:
        vectorstore = FAISS.from_documents(split_documents, embedding)
        vectorstore.save_local(INDEX_PATH)
        print("💾 FAISS index created and saved for Wedlii docs with chunking!")
    else:
        print("⚠️ No valid documents found. Please add PDFs/DOCX files.")

✅ Loading existing FAISS index...


In [31]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

if not all_documents:
    print(f"⚠️ No valid PDF/DOCX files found in '{FOLDER_PATH}'.")
    raise SystemExit

# ✅ Ensure index folder exists
os.makedirs(INDEX_PATH, exist_ok=True)

# ✂️ Split into smaller overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,     # each chunk ~500 characters
    chunk_overlap=100   # 100 characters overlap for context
)
split_documents = text_splitter.split_documents(all_documents)
print(f"✂️ After splitting: {len(split_documents)} chunks")

# 🔍 Build FAISS index for Wedlii
vectorstore = FAISS.from_documents(split_documents, embedding)
vectorstore.save_local(INDEX_PATH)

print("✅ FAISS index created and saved for Wedlii documents (with chunking)!")


✂️ After splitting: 63 chunks
✅ FAISS index created and saved for Wedlii documents (with chunking)!


In [32]:
# 🔍 Load FAISS index
vectorstore = FAISS.load_local(INDEX_PATH, embedding, allow_dangerous_deserialization=True)

# 📊 Get the total number of chunks in FAISS
total_chunks = len(vectorstore.index_to_docstore_id)
print(f"📦 Total chunks stored in Wedlii vectorstore: {total_chunks}")

📦 Total chunks stored in Wedlii vectorstore: 63


In [33]:
# ✂️ Split documents into smaller overlapping chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(all_documents)

# 🔍 Create FAISS index
vectorstore = FAISS.from_documents(chunks, embedding)
vectorstore.save_local(INDEX_PATH)
print("✅ FAISS index created and saved with chunked Wedlii docs!")

# 🔎 Create retriever (retrieve top 8 chunks for better context)
retriever = vectorstore.as_retriever(search_kwargs={"k": 8})

# 🤵 Wedlii Assistant Prompt
policy_prompt = """
You are a Wedlii Knowledge Assistant.
Only answer questions using the information provided in Wedlii documents.
If the answer is not found, say: "I cannot find that information in the Wedlii docs."
Be clear, concise, and professional.
"""

✅ FAISS index created and saved with chunked Wedlii docs!


In [None]:
print("🤵 Wedlii Knowledge Chatbot ready! Type 'exit' to stop.\n")

while True:
    query = input("You: ")
    if query.lower() in ["exit", "quit"]:
        print("👋 Goodbye! Thanks for chatting with Wedlii.")
        break

    # 🔹 Classify query into one Wedlii category
    classification_prompt = f"""
    You are a smart classifier. Classify the following query into ONE of these Wedlii categories:

    1. About Wedlii
    2. AI Wedding Visions
    3. BLOGS
    4. Bridal Hairstyle Types
    5. Bridal Makeup Types
    6. Deamy_Byron_Bay_Boho_Wedding_Styled_Shoot
    7. Vendors
    8. Venues
    9. Wedding Cake Types
    10. Wedding Flower Types
    11. Wedlii_Colour_Palette

    Query: "{query}"

    Respond ONLY with the exact category name.
    """

    classify_response = client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=[{"role": "system", "content": classification_prompt}],
        max_tokens=10,
        temperature=0
    )

    category = classify_response.choices[0].message.content.strip()
    print(f"🔍 GPT says this query is related to: {category}")

    # 🔹 Retrieve relevant docs (no metadata filter since 'title' wasn't saved)
    relevant_docs = vectorstore.similarity_search(query, k=1)

    # 🔹 Build context from retrieved docs
    context_list = []
    for i, doc in enumerate(relevant_docs, 1):
        context_list.append(
            f"Chunk {i} Content:\n{doc.page_content}\nSource: {doc.metadata.get('source_file')}\n"
        )
    context = "\n".join(context_list)

    print("\n=== Retrieved Chunks ===")
    for doc in relevant_docs:
        print("Content:", doc.page_content[:80], "...")
        print("Metadata:", doc.metadata)
        print("-" * 30)

    # 🔹 Build conversation
    messages = [
        {"role": "system", "content": policy_prompt},
        {
            "role": "user",
            "content": f"Category: {category}\n\nContext:\n{context}\n\nQuestion: {query}"
        }
    ]

    # 🔹 Get final answer from ChatGPT
    response = client.chat.completions.create(
        model="gpt-4.1-nano",   # you can switch to "gpt-4o-mini" for better answers
        messages=messages,
        max_tokens=300,
        temperature=0.5
    )

    print("Bot:", response.choices[0].message.content)
    print("=" * 80)


🤵 Wedlii Knowledge Chatbot ready! Type 'exit' to stop.

You: what is Rustic Cakes
🔍 GPT says this query is related to: Wedding Cake Types

=== Retrieved Chunks ===
Content: Wedding  Cake  Types  
Classic  Cakes  
●  Traditional  and  elegantly  decorate ...
Metadata: {'producer': 'Skia/PDF m141 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Untitled document', 'source': '/content/wedlii_docs/Wedding Cake Types.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'Wedding Cake Types.pdf', 'chunk_number': 1, 'page_number': 1}
------------------------------
Bot: Rustic Cakes are natural with rough edges, flowers, or greenery.
