In [1]:
!pip uninstall -y tensorflow tensorflow-cpu tensorflow-gpu tf-keras
!pip install -q torch sentence-transformers scikit-learn faiss-cpu
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from IPython.display import display
from sklearn.model_selection import train_test_split


Found existing installation: tensorflow 2.18.0
Uninstalling tensorflow-2.18.0:
  Successfully uninstalled tensorflow-2.18.0
[0mFound existing installation: tf_keras 2.18.0
Uninstalling tf_keras-2.18.0:
  Successfully uninstalled tf_keras-2.18.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [18]:
from sklearn.model_selection import StratifiedShuffleSplit


In [10]:
path = '/kaggle/input/book-summaries/books_summary.csv'
df = pd.read_csv(path)

print("Columns:", df.columns.tolist())
df.head(5)

Columns: ['Unnamed: 0', 'book_name', 'summaries', 'categories']


Unnamed: 0.1,Unnamed: 0,book_name,summaries,categories
0,0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,3,Brave New World,presents a futuristic society engineered perf...,science
4,4,1984,is the story of a man questioning the system ...,science


In [19]:
subset = (
    df[['book_name','summaries','categories']]
    .dropna()
    .reset_index(drop=True)
)
subset['text'] = (
    subset['book_name'].str.strip()
  + ". " + subset['summaries'].str.strip()
  + " Genre: " + subset['categories'].str.strip()
)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for trainIdx, testIdx in sss.split(subset, subset['categories']):
    trainDf = subset.loc[trainIdx].reset_index(drop=True)
    testDf  = subset.loc[testIdx].reset_index(drop=True)

print(f"Train size: {len(trainDf)}, Test size: {len(testDf)}")

Train size: 4155, Test size: 1039


In [20]:

modelNames = [
    'all-MiniLM-L6-v2',
    'paraphrase-MiniLM-L6-v2',
    'all-mpnet-base-v2'
]
models, indexes = {}, {}

for name in modelNames:
    m = SentenceTransformer(name)
    emb = m.encode(
        trainDf['text'].tolist(),
        batch_size=128,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    
    emb_norm = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    dim = emb_norm.shape[1]
    idx = faiss.IndexFlatIP(dim)
    idx.add(emb_norm)
    models[name]  = m
    indexes[name] = idx



Batches:   0%|          | 0/33 [00:00<?, ?it/s]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

In [21]:
def search_books_faiss(
    query: str,
    modelName: str = 'all-mpnet-base-v2',
    topK: int = 5,
    minScore: float = 0.1
) -> pd.DataFrame:

    qEmb = models[modelName].encode([query], convert_to_numpy=True)
    qEmb = qEmb / np.linalg.norm(qEmb, axis=1, keepdims=True)

 
    D, I = indexes[modelName].search(qEmb, topK * 2)


    Idxs, Scores = [], []
    for dist, idx in zip(D[0], I[0]):
        if dist >= minScore:
            Idxs.append(idx)
            Scores.append(dist)
        if len(Idxs) == topK:
            break

    
    results = trainDf.iloc[Idxs][['book_name','categories','summaries']].copy()
    results['score'] = Scores
    return results.reset_index(drop=True)


In [22]:
queries = ["space exploration", "business strategy", "historical romance"]
for q in queries:
    print(f"\nQuery: {q!r} ")
    for name in modelNames:
        print(f"Model: {name} ")
        display(search_books_faiss(q, modelName=name, topK=3))
    print("\n" + "=")



Query: 'space exploration' 
Model: all-MiniLM-L6-v2 


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,book_name,categories,summaries,score
0,An Astronaut’s Guide To Life On Earth,happiness,teaches you how to live better by taking less...,0.489332
1,Moonwalking With Einstein,science,not only educates you about the history of me...,0.450684
2,A Short History Of Nearly Everything,science,explains everything we’ve learned about our w...,0.436622


Model: paraphrase-MiniLM-L6-v2 


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,book_name,categories,summaries,score
0,An Astronaut’s Guide To Life On Earth,happiness,teaches you how to live better by taking less...,0.485627
1,Astrophysics for People in a Hurry,environment,"talks about the laws of nature, physics, astr...",0.453235
2,Astrophysics for People in a Hurry,education,"talks about the laws of nature, physics, astr...",0.446955


Model: all-mpnet-base-v2 


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,book_name,categories,summaries,score
0,An Astronaut’s Guide To Life On Earth,happiness,teaches you how to live better by taking less...,0.474891
1,Astrophysics for People in a Hurry,education,"talks about the laws of nature, physics, astr...",0.466278
2,Astrophysics for People in a Hurry,science,"talks about the laws of nature, physics, astr...",0.459441



=

Query: 'business strategy' 
Model: all-MiniLM-L6-v2 


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,book_name,categories,summaries,score
0,Nail It Then Scale It,business,teaches you how to craft the perfect business...,0.647665
1,Blue Ocean Strategy,business,talks about a new type of business strategy t...,0.604949
2,Blue Ocean Strategy,management,talks about a new type of business strategy t...,0.595512


Model: paraphrase-MiniLM-L6-v2 


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,book_name,categories,summaries,score
0,The Sales Advantage,business,offers a practical guide to acquiring custome...,0.627553
1,The Sales Advantage,management,offers a practical guide to acquiring custome...,0.618783
2,In Search Of Excellence,management,"is a study of America’s top 15 companies, rev...",0.608138


Model: all-mpnet-base-v2 


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,book_name,categories,summaries,score
0,Nail It Then Scale It,business,teaches you how to craft the perfect business...,0.584601
1,Blue Ocean Strategy,work,talks about a new type of business strategy t...,0.578191
2,Blue Ocean Strategy,business,talks about a new type of business strategy t...,0.569082



=

Query: 'historical romance' 
Model: all-MiniLM-L6-v2 


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,book_name,categories,summaries,score
0,Labor of Love,relationships,illustrates the history of modern dating as w...,0.544944
1,Labor of Love,communication,illustrates the history of modern dating as w...,0.530732
2,The Business Romantic,relationships,shows how doing business that is focused on p...,0.529987


Model: paraphrase-MiniLM-L6-v2 


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,book_name,categories,summaries,score
0,Labor of Love,relationships,illustrates the history of modern dating as w...,0.569214
1,Labor of Love,communication,illustrates the history of modern dating as w...,0.562143
2,The Business Romantic,relationships,shows how doing business that is focused on p...,0.531164


Model: all-mpnet-base-v2 


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,book_name,categories,summaries,score
0,The Year of Magical Thinking,relationships,,0.567115
1,Don Quixote,business,is a classic novel from 1605 which portraits ...,0.504461
2,A Tale of Two Cities,motivation,tells the stories of two connected families i...,0.496174



=


In [23]:
N = 50
sampled = testDf.sample(N, random_state=42).reset_index(drop=True)

testSet = [
    (row['book_name'], [row['book_name']])
    for _, row in sampled.iterrows()
]


In [24]:
def precision_at_k(preds, truths, k=5):
    return sum(1 for p in preds[:k] if p in truths) / k

def reciprocal_rank(preds, truths):
    for i, p in enumerate(preds, start=1):
        if p in truths:
            return 1.0 / i
    return 0.0

rows = []
for query, truths in testSet:
    for name in modelNames:
        dfRes = search_books_faiss(query, modelName=name, topK=5)
        preds = dfRes['book_name'].tolist()
        p5 = precision_at_k(preds, truths, k=5)
        rr = reciprocal_rank(preds, truths)
        rows.append({
            "Query": query,
            "Model": name,
            "Precision@5": p5,
            "MRR": rr
        })

evalDf = pd.DataFrame(rows)
print("Evaluation:")
display(evalDf)
print("\nAverage metrics by model:")
print(evalDf.groupby("Model")[["Precision@5","MRR"]].mean())


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluation:


Unnamed: 0,Query,Model,Precision@5,MRR
0,Make It Stick,all-MiniLM-L6-v2,0.0,0.0
1,Make It Stick,paraphrase-MiniLM-L6-v2,0.0,0.0
2,Make It Stick,all-mpnet-base-v2,0.4,1.0
3,Why Are We Yelling?,all-MiniLM-L6-v2,1.0,1.0
4,Why Are We Yelling?,paraphrase-MiniLM-L6-v2,1.0,1.0
...,...,...,...,...
145,Meditations On First Philosophy,paraphrase-MiniLM-L6-v2,0.4,1.0
146,Meditations On First Philosophy,all-mpnet-base-v2,0.4,1.0
147,High Performance Habits,all-MiniLM-L6-v2,0.8,1.0
148,High Performance Habits,paraphrase-MiniLM-L6-v2,0.8,1.0



Average metrics by model:
                         Precision@5       MRR
Model                                         
all-MiniLM-L6-v2               0.488  0.798667
all-mpnet-base-v2              0.524  0.900000
paraphrase-MiniLM-L6-v2        0.516  0.853333
