In [None]:
# Cell 1: Install All Necessary Libraries

!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U bitsandbytes
!pip install -q -U langchain
!pip install -q -U langchain-community
!pip install -q -U sentence-transformers
!pip install -q -U faiss-cpu #






In [None]:
# Cell 2: Mount Google Drive, Define Paths, and Unzip the Vector Store

from google.colab import drive
import os
import zipfile

# Mount Google Drive so the zipped vector store and the output folder are reachable.
drive.mount('/content/drive')

# --- DEFINE ALL YOUR PATHS HERE ---
# Path to your zipped vector store in Google Drive
ZIP_PATH = '/content/drive/MyDrive/Colab_RAG_Project/vector_store.zip'
# Path where we will unzip the store in the Colab environment
UNZIP_PATH = '/content/vector_store'
# Path to the final unzipped FAISS index. Derived from UNZIP_PATH so the two
# cannot drift apart; resolves to '/content/vector_store/vector_store/db_faiss'.
VECTOR_STORE_PATH = os.path.join(UNZIP_PATH, 'vector_store', 'db_faiss')
# Path to the folder where we will save our results
OUTPUT_PATH = '/content/drive/MyDrive/Colab_RAG_Project/outputs/'

# --- SETUP THE ENVIRONMENT ---
# Create the output directory in Drive if it doesn't exist
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Fail early with a readable message if the archive is not where we expect it.
if not os.path.isfile(ZIP_PATH):
    raise FileNotFoundError(f"Vector store archive not found: {ZIP_PATH}")

# Unzip the vector store file for faster access
print(f"Unzipping {ZIP_PATH} to {UNZIP_PATH}...")
with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall(UNZIP_PATH)
print("Unzipping complete.")

# The archive layout is assumed to be 'vector_store/db_faiss/...'. Check it
# here so a mismatch is reported now rather than as a loader error later.
if not os.path.isdir(VECTOR_STORE_PATH):
    raise FileNotFoundError(
        f"Expected FAISS index folder not found after unzipping: {VECTOR_STORE_PATH}"
    )



In [None]:
# Cell 3: Load Vector Store and Embedding Model

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Sentence-transformers checkpoint used to embed incoming queries.
# NOTE(review): presumably this must match the model that built the index - confirm.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

print("Loading embedding model...")
embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)

print("Loading vector store...")
# SECURITY: allow_dangerous_deserialization=True opts in to unpickling the
# stored index data. Only load vector stores that you created yourself or
# otherwise trust.
db = FAISS.load_local(
    folder_path=VECTOR_STORE_PATH,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

# Ask the retriever for 5 documents per query.
retriever = db.as_retriever(search_kwargs=dict(k=5))
print("Retriever is ready.")



In [None]:
# Cell 4: Setup the LLM and RAG Chain (Final, Robust Version)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.runnables import RunnablePassthrough

print("\nSetting up the LLM...")
model_id = "microsoft/phi-2"

# Load the causal LM and its tokenizer from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally - keep it only for repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# temperature / top_p only take effect when sampling is enabled; without
# do_sample=True the pipeline decodes greedily and ignores them.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.1,
    top_p=0.95,
)
llm = HuggingFacePipeline(pipeline=pipe)
print("LLM setup complete.")

# Prompt with two placeholders: {context} receives the retrieved documents,
# {question} receives the user's question.
prompt_template = """
Instruct: You are a financial analyst assistant for CrediTrust. Use ONLY the following retrieved complaint excerpts to answer the question. If the context is not enough, say that you don't have enough information.

Context:
{context}

Question:
{question}

Output:
"""
prompt = PromptTemplate.from_template(prompt_template)

# This chain will take the 'question' and 'context' and generate an answer
combine_docs_chain = create_stuff_documents_chain(llm, prompt)

# This is the full chain that orchestrates everything:
#   1. input is a dict with a 'question' key;
#   2. 'context' is added by sending the question text through the retriever;
#   3. 'answer' is added by running combine_docs_chain on the enriched dict.
# The result therefore carries 'question', 'context' and 'answer'.
retrieval_chain = RunnablePassthrough.assign(
    context=(lambda x: x["question"]) | retriever
).assign(
    answer=combine_docs_chain
)

print("RAG Chain is ready.")

In [None]:
# Cell 5: Run Evaluation and Save All Results to Google Drive (Final Version)
import re


def _filename_fragment(text, max_len=20):
    """Return the first `max_len` characters of `text`, made safe for a file name.

    Each character that is not a word character or a hyphen (spaces, '?', '/',
    ':' ...) is replaced one-for-one by an underscore, so a question can never
    inject a path separator or another character that is invalid in file names.
    """
    return re.sub(r"[^\w-]", "_", text[:max_len])


def _format_report(number, question, answer, docs):
    """Build the plain-text evaluation report for a single question.

    Args:
        number: 1-based position of the question in the evaluation list.
        question: The question text that was sent to the chain.
        answer: The generated answer.
        docs: Retrieved documents; each must expose `.metadata` (a dict) and
            `.page_content`.

    Returns:
        The complete report as one string, terminated by a newline.
    """
    lines = [
        f"EVALUATION FOR QUESTION {number}",
        "=" * 40,
        f"Question: {question}",
        "=" * 40,
        "",
        "Generated Answer:",
        "-----------------",
        f"{answer}",
        "",
        "Retrieved Sources Used:",
        "-----------------------",
    ]
    for source_number, doc in enumerate(docs, start=1):
        lines += [
            "",
            f"--- Source {source_number} ---",
            f"Product: {doc.metadata.get('product', 'N/A')}",
            f"Text Snippet: {doc.page_content}",
            "------------------",
        ]
    return "\n".join(lines) + "\n"


# --- Define the list of questions for your evaluation ---
evaluation_questions = [
    "Why are people unhappy with BNPL?",
    "What are the main issues with credit card fees?",
    "Are there complaints about unauthorized transactions on savings accounts?",
    "Summarize the problems related to money transfers being delayed.",
    "My loan application was rejected, what are common reasons mentioned in complaints?",
    "Compare the top complaints for Personal Loans versus Credit Cards.",
    "What do customers say about closing their accounts?",
    "Is fraud a common issue in money transfer services?"
]

for number, question in enumerate(evaluation_questions, start=1):
    print(f"\n--- Processing Question {number}/{len(evaluation_questions)} ---")
    print(f"Question: {question}")

    # The chain expects a dictionary with the 'question' key and returns a
    # dictionary that additionally holds 'context' and 'answer'.
    response = retrieval_chain.invoke({"question": question})

    answer = response.get('answer', 'No answer generated.')
    context_docs = response.get('context', [])

    output_content = _format_report(number, question, answer, context_docs)

    output_filename = f"evaluation_q{number}_{_filename_fragment(question)}.txt"
    output_filepath = os.path.join(OUTPUT_PATH, output_filename)

    with open(output_filepath, 'w', encoding='utf-8') as f:
        f.write(output_content)

    print(f"✅ Results for Question {number} saved to: {output_filepath}")

print("\n\nAll evaluation questions have been processed and saved to your Google Drive in the 'outputs' folder.")