# CSR RAG Project - Step 3: RAG Chatbot (FAISS Version)
# This notebook builds the final RAG chatbot using Groq API and FAISS

# [markdown]
## Setup and Imports

In [1]:
import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from groq import Groq
from dotenv import load_dotenv

load_dotenv()

  from pydantic.v1.fields import FieldInfo as FieldInfoV1


True

# [markdown]
## Configuration

In [2]:
VECTOR_DB_PATH = "faiss_index"
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
GROQ_MODEL = "llama-3.3-70b-versatile"
TOP_K_RESULTS = 4

#  [markdown]
## Setup Groq API Key

In [3]:
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if not GROQ_API_KEY:
    print("‚ö†Ô∏è  WARNING: GROQ_API_KEY not found!")
    print("\nPlease:")
    print("1. Get free API key from: https://console.groq.com/")
    print("2. Create a .env file with: GROQ_API_KEY=your_key_here")
    print("3. Restart this notebook")
else:
    print("‚úÖ Groq API key loaded!")

‚úÖ Groq API key loaded!


# [markdown]
## Load Vector Database

In [4]:
print("Loading vector database...")

embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

vectorstore = FAISS.load_local(
    VECTOR_DB_PATH, 
    embeddings,
    allow_dangerous_deserialization=True
)

print(f"‚úÖ Vector database loaded!")

Loading vector database...


  embeddings = HuggingFaceEmbeddings(


‚úÖ Vector database loaded!


# [markdown]
## Create Custom Prompt Template

In [5]:
template = """You are a helpful assistant specialized in Corporate Social Responsibility (CSR) information for Indonesian FMCG companies.

Use the following context from CSR reports to answer the question. The context includes information from various companies' annual reports and sustainability reports.

Context:
{context}

Question: {question}

Instructions:
- Answer based on the provided context
- If the information is from a specific year or company, mention it
- If the context doesn't contain enough information, say so honestly
- You can answer in English or Indonesian, matching the user's question language
- Be specific and cite which company/year when relevant
- If multiple companies have similar programs, compare them

Answer:"""

PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

print("‚úÖ Custom prompt template created")

‚úÖ Custom prompt template created


#  [markdown]
## Initialize Groq Client

In [6]:
groq_client = Groq(api_key=GROQ_API_KEY)

print(f"‚úÖ Groq client initialized")
print(f"   Model: {GROQ_MODEL}")

‚úÖ Groq client initialized
   Model: llama-3.3-70b-versatile


# [markdown]
## Create RAG Query Function

In [7]:
def query_csr_chatbot(question, top_k=TOP_K_RESULTS, verbose=True):
    """
    Query the CSR chatbot with RAG
    """
    
    if verbose:
        print("\n" + "=" * 60)
        print(f"üîç Question: {question}")
        print("=" * 60)
    
    # Retrieve relevant chunks
    if verbose:
        print(f"\nüìö Retrieving top {top_k} relevant documents...")
    
    relevant_docs = vectorstore.similarity_search(question, k=top_k)
    
    if verbose:
        print(f"‚úÖ Found {len(relevant_docs)} relevant chunks:\n")
        for i, doc in enumerate(relevant_docs, 1):
            print(f"   {i}. {doc.metadata['company']} {doc.metadata['year']} "
                  f"(Chunk {doc.metadata['chunk_index']+1}/{doc.metadata['total_chunks']})")
    
    # Prepare context
    context = "\n\n---\n\n".join([
        f"Source: {doc.metadata['company']} {doc.metadata['year']}\n{doc.page_content}"
        for doc in relevant_docs
    ])
    
    # Create prompt
    prompt_text = PROMPT.format(context=context, question=question)
    
    # Query Groq
    if verbose:
        print(f"\nü§ñ Querying {GROQ_MODEL}...")
    
    try:
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant specialized in Corporate Social Responsibility information for Indonesian FMCG companies."
                },
                {
                    "role": "user",
                    "content": prompt_text
                }
            ],
            model=GROQ_MODEL,
            temperature=0.3,
            max_tokens=1000
        )
        
        answer = chat_completion.choices[0].message.content
        
        response = {
            "question": question,
            "answer": answer,
            "sources": [
                {
                    "company": doc.metadata['company'],
                    "year": doc.metadata['year'],
                    "chunk_id": doc.metadata['chunk_id']
                }
                for doc in relevant_docs
            ],
            "raw_docs": relevant_docs
        }
        
        if verbose:
            print("\n" + "=" * 60)
            print("üí° ANSWER:")
            print("=" * 60)
            print(answer)
            print("\nüìé Sources:")
            for src in response['sources']:
                print(f"   - {src['company']} {src['year']}")
        
        return response
    
    except Exception as e:
        print(f"\n‚ùå Error: {e}")
        return None

print("‚úÖ RAG query function ready!")

‚úÖ RAG query function ready!


# [markdown]
## Test the Chatbot

In [8]:
# Test Query 1
response1 = query_csr_chatbot(
    "What is Unilever's CSR program in 2022?"
)

# %%
# Test Query 2
response2 = query_csr_chatbot(
    "What are the water conservation initiatives across all companies?"
)

# %%
# Test Query 3
response3 = query_csr_chatbot(
    "Apa program tanggung jawab sosial Indofood untuk pendidikan?"
)

# %%
# Test Query 4
response4 = query_csr_chatbot(
    "Compare energy efficiency programs between Danone and Mayora"
)


üîç Question: What is Unilever's CSR program in 2022?

üìö Retrieving top 4 relevant documents...
‚úÖ Found 4 relevant chunks:

   1. Unilever 2020 (Chunk 272/331)
   2. Unilever 2022 (Chunk 303/472)
   3. Unilever 2021 (Chunk 10/428)
   4. Unilever 2023 (Chunk 109/498)

ü§ñ Querying llama-3.3-70b-versatile...

üí° ANSWER:
Berdasarkan konteks yang disediakan, program CSR Unilever pada tahun 2022 mencakup beberapa aspek, antara lain:

1. Peningkatan produktivitas dan pengelolaan sampah: Unilever membantu beberapa pengepul untuk meningkatkan produktivitas dan mendorong peningkatan laju daur ulang sampah secara keseluruhan, seperti yang dilaporkan pada tahun 2020 (Unilever 2020).
2. Pemasaran dan pelabelan produk: Unilever tidak memiliki insiden signifikan terkait ketidakpatuhan terhadap peraturan pemasaran dan pelabelan pada tahun 2022 (Unilever 2022). Label kemasan produk es krim Unilever Indonesia juga mencakup informasi tentang nilai gizi, petunjuk penggunaan, dan jumlah kandunga

#  [markdown]
## Interactive Chat Loop

In [9]:
# %%
def interactive_chat():
    """
    Interactive chat loop
    """
    print("\n" + "=" * 60)
    print("CSR CHATBOT - INTERACTIVE MODE")
    print("=" * 60)
    print("Ask questions about CSR programs of:")
    print("  - Danone, Indofood, Mayora, Ultra Jaya, Unilever")
    print("  - Years: 2019-2023")
    print("\nType 'quit' or 'exit' to stop\n")
    
    while True:
        try:
            question = input("You: ").strip()
            
            if question.lower() in ['quit', 'exit', 'q']:
                print("\nüëã Goodbye!")
                break
            
            if not question:
                continue
            
            response = query_csr_chatbot(question, verbose=True)
            
            print("\n" + "-" * 60 + "\n")
            
        except KeyboardInterrupt:
            print("\n\nüëã Goodbye!")
            break
        except Exception as e:
            print(f"\n‚ùå Error: {e}\n")
# Uncomment to run
# interactive_chat()


#  [markdown]
## Advanced: Filter by Metadata



In [10]:
def query_with_filter(question, company=None, year=None, top_k=4):
    """
    Query with manual filtering by company/year
    Note: FAISS doesn't have built-in metadata filtering,
    so we filter results after retrieval
    """
    
    print(f"\nüîç Question: {question}")
    if company or year:
        print(f"üìå Filters: company={company}, year={year}")
    
    # Retrieve more results than needed
    relevant_docs = vectorstore.similarity_search(question, k=top_k*3)
    
    # Manual filtering
    filtered_docs = []
    for doc in relevant_docs:
        if company and doc.metadata['company'] != company:
            continue
        if year and doc.metadata['year'] != year:
            continue
        filtered_docs.append(doc)
        if len(filtered_docs) >= top_k:
            break
    
    if not filtered_docs:
        print("‚ùå No documents found matching the filters")
        return None
    
    print(f"‚úÖ Found {len(filtered_docs)} relevant chunks")
    
    context = "\n\n---\n\n".join([
        f"Source: {doc.metadata['company']} {doc.metadata['year']}\n{doc.page_content}"
        for doc in filtered_docs
    ])
    
    prompt_text = PROMPT.format(context=context, question=question)
    
    chat_completion = groq_client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a CSR information assistant."},
            {"role": "user", "content": prompt_text}
        ],
        model=GROQ_MODEL,
        temperature=0.3,
        max_tokens=1000
    )
    
    answer = chat_completion.choices[0].message.content
    
    print("\nüí° ANSWER:")
    print("=" * 60)
    print(answer)
    
    return answer

# Example
print("\n--- Filter by Unilever only ---")
query_with_filter(
    "What are the sustainability initiatives?",
    company="Unilever"
)

# %%
print("\n--- Filter by 2023 only ---")
query_with_filter(
    "What are the latest CSR programs?",
    year=2023
)


--- Filter by Unilever only ---

üîç Question: What are the sustainability initiatives?
üìå Filters: company=Unilever, year=None
‚úÖ Found 2 relevant chunks

üí° ANSWER:
Berdasarkan konteks yang disediakan, beberapa inisiatif keberlanjutan yang dapat disebutkan adalah:

1. Pengurangan sampah plastik: Unilever (2024) melalui Yayasan Rumah Pelangi, mempromosikan kesadaran akan pentingnya pengurangan sampah plastik dan melakukan pencatatan yang akurat terhadap jumlah sampah plastik yang terkumpul.
2. Edukasi dan pendampingan: Unilever (2024) juga menyusun strategi edukasi dan pendampingan yang lebih cermat bagi mitra pengumpul sampah.
3. Pendanaan untuk wastepreneur: Unilever (2024), bersama dengan USAID dan EY, berkolaborasi untuk memberikan pendanaan kepada wiraswasta dan UMKM yang bergerak di bidang limbah plastik (wastepreneur) untuk memperluas usaha mereka dalam pengurangan penggunaan kemasan plastik dan mengatasi limbah plastik.
4. Efisiensi energi: Unilever (2023) melanjutkan b

'Berdasarkan konteks yang disediakan, beberapa program CSR terbaru dari perusahaan FMCG di Indonesia adalah:\n\n1. **Pengurangan Emisi**: Indofood (2023) telah mengimplementasikan beberapa inisiatif untuk mengurangi emisi, seperti pemanfaatan smart sensor, teknologi inverter, dan motor dengan efisiensi tinggi untuk mengurangi konsumsi listrik dan emisi Scope 2. Mereka juga menggunakan mesin forklift bertenaga baterai untuk mengurangi emisi Scope 1 dari pembakaran bahan bakar cair.\n2. **Keselamatan dan Kesehatan Kerja**: Unilever (2023) telah mengimplementasikan program CSMS (Contractor Safety Management System) untuk meningkatkan keselamatan dan kesehatan kerja kontraktor. Mereka juga memiliki target "Vision Zero" untuk mencegah kematian dan cedera akibat kecelakaan kerja.\n3. **Pengembangan Teknologi**: Unilever (2023) telah mengimplementasikan Cloud Computer Telephony Integration (CTI) untuk meningkatkan pengalaman konsumen melalui panggilan telepon. Teknologi ini juga membantu mema

#  [markdown]
## Summary

In [11]:
print("\n" + "=" * 60)
print("‚úÖ CSR RAG CHATBOT COMPLETE!")
print("=" * 60)
print("""
Your chatbot is ready! You can:

1. Use query_csr_chatbot() for queries
2. Run interactive_chat() for testing
3. Use query_with_filter() for filtered searches

Example:
  response = query_csr_chatbot("What is Danone's CSR?")
  print(response['answer'])
""")


‚úÖ CSR RAG CHATBOT COMPLETE!

Your chatbot is ready! You can:

1. Use query_csr_chatbot() for queries
2. Run interactive_chat() for testing
3. Use query_with_filter() for filtered searches

Example:
  response = query_csr_chatbot("What is Danone's CSR?")
  print(response['answer'])

