In [1]:
from IPython.display import clear_output
!pip install -U datasets huggingface_hub fsspec
!pip install sentence-transformers
!pip install faiss-cpu
!pip install ragas ragas[metrics]
!pip install streamlit
clear_output()

In [2]:
!pip install langchain langchain-core langchain-community langchain[google-genai]
clear_output()

# Data Preparation

In [3]:
import pandas as pd
from datasets import load_dataset
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    """Preprocess text: lowercase, remove special characters, tokenize, remove stopwords."""
    if not isinstance(text, str) or pd.isna(text):
        return ""  # Return empty string for non-string or NaN values
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens) if tokens else ""

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def load_and_preprocess_data():
    """Load and preprocess the Abirate/english_quotes dataset."""
    # Load dataset
    dataset = load_dataset("Abirate/english_quotes")
    df = pd.DataFrame(dataset['train'])
    
    # Handle missing values
    df = df.dropna(subset=['quote', 'author', 'tags'])
    
    # Convert tags to string if they are lists
    df['tags'] = df['tags'].apply(lambda x: x if isinstance(x, str) else ', '.join(x))
    
    # Preprocess quote text
    df['processed_quote'] = df['quote'].apply(preprocess_text)
    
    # Remove rows with empty processed quotes
    df = df[df['processed_quote'].str.strip() != '']
    
    # Ensure author and tags are strings
    df['author'] = df['author'].astype(str)
    df['tags'] = df['tags'].astype(str)
    
    # Save preprocessed data
    df.to_csv('preprocessed_quotes.csv', index=False)
    return df

In [5]:
df = load_and_preprocess_data()
print(f"Loaded and preprocessed {len(df)} quotes.")

README.md:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

quotes.jsonl:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

Loaded and preprocessed 2505 quotes.


In [6]:
df.head()

Unnamed: 0,quote,author,tags,processed_quote
0,“Be yourself; everyone else is already taken.”,Oscar Wilde,"be-yourself, gilbert-perreira, honesty, inspir...",everyone else already taken
1,"“I'm selfish, impatient and a little insecure....",Marilyn Monroe,"best, life, love, mistakes, out-of-control, tr...",im selfish impatient little insecure make mist...
2,“Two things are infinite: the universe and hum...,Albert Einstein,"human-nature, humor, infinity, philosophy, sci...",two things infinite universe human stupidity i...
3,"“So many books, so little time.”",Frank Zappa,"books, humor",many books little time
4,“A room without books is like a body without a...,Marcus Tullius Cicero,"books, simile, soul",room without books like body without soul


In [7]:
df.isna().sum()

quote              0
author             0
tags               0
processed_quote    0
dtype: int64

In [8]:
df[df['author'] == 'Albert Einstein']

Unnamed: 0,quote,author,tags,processed_quote
2,“Two things are infinite: the universe and hum...,Albert Einstein,"human-nature, humor, infinity, philosophy, sci...",two things infinite universe human stupidity i...
34,“There are only two ways to live your life. On...,Albert Einstein,"inspirational, life, live, miracle, miracles",two ways live life one though nothing miracle ...
55,“I am enough of an artist to draw freely upon ...,Albert Einstein,"1929, imagination, inspirational, viereck-inte...",enough artist draw freely upon imagination ima...
73,"“If you can't explain it to a six year old, yo...",Albert Einstein,"simplicity, understand",cant explain six year old dont understand
97,"“If you want your children to be intelligent, ...",Albert Einstein,"children, fairy-tales",want children intelligent read fairy tales wan...
99,“Logic will get you from A to Z; imagination w...,Albert Einstein,imagination,logic get z imagination get everywhere
124,“Life is like riding a bicycle. To keep your b...,Albert Einstein,"life, simile",life like riding bicycle keep balance must kee...
180,“Anyone who has never made a mistake has never...,Albert Einstein,mistakes,anyone never made mistake never tried anything...
245,"“I speak to everyone in the same way, whether ...",Albert Einstein,"life, respect",speak everyone way whether garbage man preside...
323,“Never memorize something that you can look up.”,Albert Einstein,"humor, science",never memorize something look


# Model Fine-Tuning

In [13]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

def generate_train_examples(df):
    """
    Generate semantic training pairs from the quotes dataset.
    Each pair consists of a natural language query and its expected response.
    """
    train_examples = []

    for _, row in df.iterrows():

        # Validate inputs
        if not isinstance(row['quote'], str) or not row['quote'].strip():
            continue
        if not isinstance(row['author'], str) or not row['author'].strip():
            continue
        if not isinstance(row['tags'], str) or not row['tags'].strip():
            continue
            
        quote = row["quote"]
        author = row["author"]
        tags = row["tags"]

        # Normalize tags
        tag_string = ", ".join(tags) if isinstance(tags, list) else str(tags)

        # Create the answer string
        target = f"{quote} — {author} | {tag_string}"

        queries = []

         # Rich query generation logic
        if author and tag_string:
            queries.extend([
                f"quotes about {tag_string} by {author}",
                f"{author} quotes tagged {tag_string}",
                f"what did {author} say about {tag_string}?",
                f"{tag_string} quotes attributed to {author}",
                f"show me {author}'s quotes related to {tag_string}"
            ])
        elif tag_string:
            queries.extend([
                f"quotes about {tag_string}",
                f"{tag_string} quotes",
                f"quotes tagged with {tag_string}",
                f"show me quotes related to {tag_string}",
                f"popular {tag_string} quotes"
            ])
        elif author:
            queries.extend([
                f"quotes by {author}",
                f"what did {author} say?",
                f"{author} quotes",
                f"famous quotes by {author}",
                f"show me {author}'s sayings"
            ])
        else:
            queries.extend([
                "famous quotes",
                "popular quotes",
                "inspirational quotes",
                "show me some quotes",
                "random quotes"
            ])

        # Create InputExamples
        for q in queries:
            train_examples.append(InputExample(texts=[q, target], label=1.0))

    return train_examples

In [14]:
def fine_tune_model():
    """Fine-tune a sentence embedding model."""
    # Load preprocessed data
    df = pd.read_csv('preprocessed_quotes.csv')
    
    # Validate DataFrame
    if df.empty:
        raise ValueError("Preprocessed DataFrame is empty. Check data_preparation.py.")
    
    # Prepare training examples
    train_examples = generate_train_examples(df)
    if not train_examples:
        raise ValueError("No valid training examples generated. Check input data.")
    
    # Load model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Define data loader
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
    
    # Define loss function
    train_loss = losses.CosineSimilarityLoss(model)
    
    # Fine-tune
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=3,
        warmup_steps=100,
        output_path='./fine_tuned_model'
    )
    print("Model fine-tuned and saved to './fine_tuned_model'.")

In [15]:
df = pd.read_csv('preprocessed_quotes.csv')
train_examples = generate_train_examples(df)
print(train_examples[0].texts)
print(train_examples[1].texts)
print(train_examples[2].texts)

['quotes about be-yourself, gilbert-perreira, honesty, inspirational, misattributed-oscar-wilde, quote-investigator by Oscar Wilde', '“Be yourself; everyone else is already taken.” — Oscar Wilde | be-yourself, gilbert-perreira, honesty, inspirational, misattributed-oscar-wilde, quote-investigator']
['Oscar Wilde quotes tagged be-yourself, gilbert-perreira, honesty, inspirational, misattributed-oscar-wilde, quote-investigator', '“Be yourself; everyone else is already taken.” — Oscar Wilde | be-yourself, gilbert-perreira, honesty, inspirational, misattributed-oscar-wilde, quote-investigator']
['what did Oscar Wilde say about be-yourself, gilbert-perreira, honesty, inspirational, misattributed-oscar-wilde, quote-investigator?', '“Be yourself; everyone else is already taken.” — Oscar Wilde | be-yourself, gilbert-perreira, honesty, inspirational, misattributed-oscar-wilde, quote-investigator']


In [16]:
import os
os.environ["WANDB_DISABLED"] = "true"

fine_tune_model()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0078
1000,0.0001
1500,0.0001
2000,0.0


Model fine-tuned and saved to './fine_tuned_model'.


# Build the RAG Pipeline with LangChain

In [17]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
# from langchain.llms import HuggingFacePipeline
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pandas as pd

def create_vector_store(df, model_path='./fine_tuned_model'):
    """Create a FAISS vector store from quotes."""
    embeddings = HuggingFaceEmbeddings(model_name=model_path)

    # Build rich text representation: Quote — Author | Tags
    texts = [
        f"{row['quote']} — {row['author']} | {', '.join(row['tags']) if isinstance(row['tags'], list) else row['tags']}"
        for _, row in df.iterrows()
    ]

    # Add metadata (optional, for reference and display)
    metadatas = [
        {"quote": row['quote'], "author": row['author'], "tags": row['tags']}
        for _, row in df.iterrows()
    ]

    # Create vector store
    vector_store = FAISS.from_texts(texts=texts, embedding=embeddings, metadatas=metadatas)
    print(vector_store)
    vector_store.save_local("faiss_index")
    return vector_store

In [18]:
df = pd.read_csv('preprocessed_quotes.csv')
vector_store = create_vector_store(df)

  embeddings = HuggingFaceEmbeddings(model_name=model_path)


<langchain_community.vectorstores.faiss.FAISS object at 0x7abfe17b0b90>


In [19]:
vector_store.get_by_ids([vector_store.index_to_docstore_id[0]])

[Document(id='6d07b866-bfba-46b6-a354-5db1e8ec9b5a', metadata={'quote': '“Be yourself; everyone else is already taken.”', 'author': 'Oscar Wilde', 'tags': 'be-yourself, gilbert-perreira, honesty, inspirational, misattributed-oscar-wilde, quote-investigator'}, page_content='“Be yourself; everyone else is already taken.” — Oscar Wilde | be-yourself, gilbert-perreira, honesty, inspirational, misattributed-oscar-wilde, quote-investigator')]

In [20]:
import getpass
import os

from langchain.chat_models import init_chat_model

if not os.environ.get("GOOGLE_API_KEY"):
  os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")


def setup_rag_pipeline():
    """Set up the RAG pipeline with LangChain."""
    df = pd.read_csv('preprocessed_quotes.csv')
    vector_store = create_vector_store(df)

    retriever = vector_store.as_retriever(search_kwargs={"k": 5})

    prompt = ChatPromptTemplate.from_template(
        """Answer the user's query based on the context below.
        If possible, return a **direct quote** from the context.
        
        <context>
        {context}
        </context>
        
        Question: {input}
        Answer:"""
    )

    # llm = ChatGoogleGenerativeAI(
    #     model="models/gemini-1.5-flash-latest",  
    #     temperature=0.7
    # )
    # Load LLM (e.g., Llama-3)
    # model_name = "meta-llama/Llama-3-8b"  
    # tokenizer = AutoTokenizer.from_pretrained(model_name)
    # model = AutoModelForCausalLM.from_pretrained(model_name)
    # llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=200)
    # llm = HuggingFacePipeline(pipeline=llm_pipeline)
    llm = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

    combine_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    rag_chain = create_retrieval_chain(retriever=retriever, combine_docs_chain=combine_chain)
    return rag_chain

Enter API key for Google Gemini:  ········


In [21]:
def query_rag(rag_chain, query):
    """Query the RAG pipeline and return structured output."""
    result = rag_chain.invoke({"input": query})
    context_docs = result.get("context", []) if isinstance(result, dict) else []

    response = {
        "answer": result['answer'] if isinstance(result, dict) else result,
        "source_quotes": [
            {
                "quote": doc.metadata.get('quote', ''),
                "author": doc.metadata.get('author', ''),
                "tags": doc.metadata.get('tags', ''),
            }
            for doc in context_docs
        ]
    }
    return response

In [22]:
rag_chain = setup_rag_pipeline()
queries = [
    "Quotes about insanity attributed to Einstein",
    "Motivational quotes tagged 'accomplishment'",
    "All Oscar Wilde quotes with humor",
    "Quotes about honesty attributed by Oscar Wilde"
]
for query in queries:
    result = query_rag(rag_chain, query)
    print(f"Query: {query}")
    print(f"Response: {result}\n")

<langchain_community.vectorstores.faiss.FAISS object at 0x7abfda189b90>
Query: Quotes about insanity attributed to Einstein
Response: {'answer': '"A question that sometimes drives me hazy: am I or are the others crazy?” — Albert Einstein | crazy, question, thought-provoking', 'source_quotes': [{'quote': '“You never fail until you stop trying.”', 'author': 'Albert Einstein', 'tags': 'inspirational'}, {'quote': '“Anyone who has never made a mistake has never tried anything new.”', 'author': 'Albert Einstein', 'tags': 'mistakes'}, {'quote': '“Black holes are where God divided by zero.”', 'author': 'Albert Einstein', 'tags': 'humor'}, {'quote': '“A clever person solves a problem. A wise person avoids it.”', 'author': 'Albert Einstein', 'tags': 'misattributed-to-einstein'}, {'quote': '“A question that sometimes drives me hazy: am I or are the others crazy?”', 'author': 'Albert Einstein', 'tags': 'crazy, question, thought-provoking'}]}

Query: Motivational quotes tagged 'accomplishment'
Resp

# RAG Evaluation with RAGAS

In [23]:
from ragas import evaluate
from ragas.metrics import faithfulness, context_precision
from datasets import Dataset
import pandas as pd
from sentence_transformers import SentenceTransformer, util

def evaluate_rag_pipeline(rag_chain):
    """Evaluate the RAG pipeline using RAGAS + semantic answer relevancy."""
    
    queries = [
        "Quotes about insanity attributed to Einstein",
        "Motivational quotes tagged 'accomplishment'",
        "All Oscar Wilde quotes with humor"
    ]

    ground_truths = [
        "A question that sometimes drives me hazy: am I or are the others crazy?",
        "The way to get started is to quit talking and begin doing.",
        "No good deed goes unpunished."
    ]

    rag_data = {
        "question": [],
        "answer": [],
        "contexts": [],
        "ground_truth": ground_truths
    }

    for query in queries:
        result = rag_chain.invoke({"input": query})

        rag_data["question"].append(query)
        rag_data["answer"].append(result["answer"])
        context_docs = result.get("context", result.get("documents", []))
        rag_data["contexts"].append([doc.page_content for doc in context_docs])

    # Convert to HuggingFace dataset
    dataset = Dataset.from_dict(rag_data)

    # Initialize Gemini model for RAGAS (used for faithfulness/context_precision)
    llm = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

    # Run RAGAS evaluation (excluding answer_relevancy to be computed separately)
    results = evaluate(
        dataset,
        metrics=[faithfulness, context_precision],
        llm=llm
    )

    # Convert RAGAS results to DataFrame
    df = results.to_pandas()

    # Compute semantic answer relevancy separately
    model = SentenceTransformer('./fine_tuned_model')
    relevancy_scores = []

    for a, gt in zip(rag_data["answer"], rag_data["ground_truth"]):
        if not a.strip() or not gt.strip():
            relevancy_scores.append(float("nan"))
            continue
        emb_a = model.encode(a, convert_to_tensor=True)
        emb_gt = model.encode(gt, convert_to_tensor=True)
        sim_score = util.cos_sim(emb_a, emb_gt).item()
        relevancy_scores.append(sim_score)

    df["semantic_answer_relevancy"] = relevancy_scores

    # Save results
    df.to_csv("rag_evaluation_with_semantic_scores.csv", index=False)
    print("✅ Evaluation saved to 'rag_evaluation_with_semantic_scores.csv'")
    return df


In [24]:
import os
os.environ["OPENAI_API_KEY"] = "sk-fake-key"

rag_chain = setup_rag_pipeline()
df = evaluate_rag_pipeline(rag_chain)

df

<langchain_community.vectorstores.faiss.FAISS object at 0x7abf0b566610>


Evaluating:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Evaluation saved to 'rag_evaluation_with_semantic_scores.csv'


Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,context_precision,semantic_answer_relevancy
0,Quotes about insanity attributed to Einstein,[“You never fail until you stop trying.” — Alb...,“A question that sometimes drives me hazy: am ...,A question that sometimes drives me hazy: am I...,1.0,0.2,0.949228
1,Motivational quotes tagged 'accomplishment',[“You never fail until you stop trying.” — Alb...,“To be yourself in a world that is constantly ...,The way to get started is to quit talking and ...,1.0,0.5,0.769861
2,All Oscar Wilde quotes with humor,[“I have nothing to declare except my genius.”...,* “I have nothing to declare except my geniu...,No good deed goes unpunished.,1.0,0.5,0.938424


In [25]:
import os
import zipfile

files_to_download = [
    '/kaggle/working/faiss_index',
    '/kaggle/working/fine_tuned_model',
    '/kaggle/working/preprocessed_quotes.csv',
    '/kaggle/working/rag_evaluation_with_semantic_scores.csv',
]

# Create a zip
with zipfile.ZipFile('model.zip', 'w') as zipf:
    for path in files_to_download:
        if os.path.isdir(path):
            for foldername, _, filenames in os.walk(path):
                for filename in filenames:
                    file_path = os.path.join(foldername, filename)
                    arcname = os.path.relpath(file_path, '/kaggle/working')
                    zipf.write(file_path, arcname)
        else:
            arcname = os.path.basename(path)
            zipf.write(path, arcname)
