In [1]:
!pip install --upgrade datasets fsspec huggingface_hub

from IPython import get_ipython
from IPython.display import display
from datasets import load_dataset

# Build the working corpus in one pipeline: load the quotes, drop rows with an
# empty quote or author, shuffle with a fixed seed, and keep the first 1000.
dataset = (
    load_dataset("Abirate/english_quotes", split="train")
    .filter(lambda row: row["quote"] and row["author"])
    .shuffle(seed=42)
    .select(range(1000))
)

# Display the first record as the cell output.
dataset[0]

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.32.0-py3-none-any.whl.metadata (14 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface_hub)
  Downloading hf_xet-1.1.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.32.0-py3-none-any.whl (509 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.3/509.3 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

quotes.jsonl:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2508 [00:00<?, ? examples/s]

{'quote': "“I don't mind making jokes, but I don't want to look like one.”",
 'author': 'Marilyn Monroe',
 'tags': ['appearance', 'jokes', 'marilyn-monroe']}

In [3]:
from sentence_transformers import SentenceTransformer

# Load the "all-MiniLM-L6-v2" sentence-embedding model (downloaded on first use).
model = SentenceTransformer("all-MiniLM-L6-v2")
# Pull the quote texts out of the dataset so they can be encoded in one call.
quotes_list = dataset["quote"]
# Embed every quote. Later lookups map an embedding's position back to the
# dataset row at the same position, so the order here must not change.
quote_embeddings = model.encode(quotes_list, show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [4]:
!pip install faiss-cpu
import faiss
import numpy as np

# Width of one embedding vector; the index must be created with this size.
embedding_dim = quote_embeddings.shape[1]
# Flat FAISS index that compares vectors by L2 distance.
index = faiss.IndexFlatL2(embedding_dim)
# Add all quote vectors. NOTE(review): retrieve_quotes assumes the id FAISS
# returns for a vector equals its row position in `dataset` — confirm if the
# index is ever built from a different ordering.
index.add(np.array(quote_embeddings))

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [5]:
def retrieve_quotes(query, dataset, top_k=5):
    """Return the quotes whose embeddings are closest to ``query``.

    Uses the notebook-level ``model`` to embed the query and the
    notebook-level FAISS ``index`` to find the nearest quote vectors.

    Args:
        query: Free-text search string.
        dataset: Dataset exposing "quote" and "author" columns, in the same
            row order that was used to populate ``index``.
        top_k: Maximum number of matches to return.

    Returns:
        A list of ``{"quote": ..., "author": ...}`` dicts in the order FAISS
        returned them. The list is shorter than ``top_k`` when the index
        holds fewer than ``top_k`` vectors.
    """
    # FAISS expects a 2-D float32 array of query vectors.
    query_embedding = np.asarray(model.encode([query]), dtype="float32")
    _, indices = index.search(query_embedding, top_k)

    quotes = dataset["quote"]
    authors = dataset["author"]

    results = []
    for raw_position in indices[0]:
        position = int(raw_position)
        # FAISS pads missing neighbours with -1. Used as a Python index that
        # would silently return the *last* quote, so skip the padding.
        if position < 0:
            continue
        results.append({
            "quote": quotes[position],
            "author": authors[position],
        })
    return results

# Example: fetch the default top 5 matches for a sample query.
retrieve_quotes("funny quotes about life", dataset)

[{'quote': '“love the life you live.live the life you love.”',
  'author': 'Bob Marley'},
 {'quote': '“When I was 5 years old, my mother always told me that happiness was the key to life. When I went to school, they asked me what I wanted to be when I grew up. I wrote down â€˜happyâ€™. They told me I didnâ€™t understand the assignment, and I told them they didnâ€™t understand life.”',
  'author': 'John Lennon'},
 {'quote': '“People say that life is the thing, but I prefer reading.”',
  'author': 'Logan Pearsall Smith'},
 {'quote': '“If my life is going to mean anything, I have to live it myself.”',
  'author': 'Rick Riordan,'},
 {'quote': '“The purpose of life is not to be happy. It is to be useful, to be honorable, to be compassionate, to have it make some difference that you have lived and lived well.”',
  'author': 'Ralph Waldo Emerson'}]

In [14]:
def format_as_rag_response(query, top_k=5):
    """Format the best-matching quotes for ``query`` as a Markdown string.

    Args:
        query: Free-text search string.
        top_k: Maximum number of quotes to include.

    Returns:
        A Markdown string: a query header followed by one block-quote line
        per retrieved quote with its author.
    """
    # retrieve_quotes takes the dataset as its second positional argument;
    # passing top_k there (as before) raised a TypeError on 5["quote"].
    results = retrieve_quotes(query, dataset, top_k=top_k)
    header = f"🔍 **Query:** {query}\n\n"
    quote_lines = [
        f"> *{r['quote']}*  — **{r['author']}**  \n" for r in results
    ]
    return header + "".join(quote_lines)

In [16]:
!pip install streamlit
import streamlit as st

# Minimal Streamlit UI: a text box whose value drives a semantic quote search.
st.title("Semantic Quote Search (RAG Style)")
query = st.text_input("Enter a query:", "")

# Only search once the user has typed something.
if query:
    # retrieve_quotes requires the dataset argument; omitting it raised a
    # TypeError as soon as a query was entered.
    results = retrieve_quotes(query, dataset, top_k=5)
    for r in results:
        st.markdown(f"> *{r['quote']}*  \n\n— **{r['author']}**")
        st.markdown("---")

Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInst

2025-05-25 20:37:27.389 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-05-25 20:37:27.397 Session state does not function when running a script without `streamlit run`


In [3]:
%%writefile app.py

Overwriting app.py


In [4]:
!streamlit run app.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://104.196.41.145:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
