In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install opendatasets --upgrade

import opendatasets as od

dataset_url = 'https://www.kaggle.com/datasets/dylanjcastillo/7k-books-with-metadata?select=books.csv'
od.download(dataset_url)

In [None]:
!pip install pandas matplotlib seaborn python-dotenv \
langchain-community langchain-openai langchain-chroma \
transformers gradio ipywidgets


In [None]:
# Data manipulation & visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Environment variable handling
from dotenv import load_dotenv

# LangChain modules
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# Transformers (e.g., for Hugging Face models)
from transformers import pipeline

# UI & widgets
import gradio as gr
import ipywidgets as widgets


In [None]:
# The download cell stores the dataset in a folder next to the notebook, so a relative
# path works both on Colab (where the working directory is /content) and locally.
df = pd.read_csv('7k-books-with-metadata/books.csv')
df.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   isbn10          6810 non-null   object 
 2   title           6810 non-null   object 
 3   subtitle        2381 non-null   object 
 4   authors         6738 non-null   object 
 5   categories      6711 non-null   object 
 6   thumbnail       6481 non-null   object 
 7   description     6548 non-null   object 
 8   published_year  6804 non-null   float64
 9   average_rating  6767 non-null   float64
 10  num_pages       6767 non-null   float64
 11  ratings_count   6767 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 638.6+ KB


In [None]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

Unnamed: 0,0
isbn13,0
isbn10,0
title,0
subtitle,4429
authors,72
categories,99
thumbnail,329
description,262
published_year,6
average_rating,43


In [None]:
# Visualise where values are missing. The frame is transposed, so every heatmap row is
# a dataset column and every position along the x-axis is one record; the axis labels
# reflect that orientation.
fig, ax = plt.subplots()
sns.heatmap(df.isna().transpose(), cmap="YlGnBu", ax=ax)
ax.set_xlabel("row index")
ax.set_ylabel("column")
ax.set_title("Missing values by column")
plt.show()

In [None]:
# Year against which book age is measured. It is a fixed constant (not "today") so that
# rerunning the notebook reproduces the same numbers.
REFERENCE_YEAR = 2025

# Flag rows without a description: 1 = missing, 0 = present.
df["missing_description"] = df["description"].isna().astype(int)

# Convert the publication year into the age of the book in years.
df["age_of_book"] = REFERENCE_YEAR - df["published_year"]

In [None]:
# Column labels again, now including the derived missing_description and age_of_book.
df.columns

In [None]:
# Check whether a missing description is related to page count, age or rating.
# Spearman (rank) correlation is used for every pair of the selected columns.
col_of_interest = ["num_pages", "age_of_book", "missing_description", "average_rating"]

correlation_matix = df[col_of_interest].corr(method="spearman")

# Note: set_theme changes the seaborn style for every later plot as well.
sns.set_theme(style="white")
plt.figure(figsize=(8, 6))
heatmap = sns.heatmap(
    correlation_matix,
    annot=True,
    cmap="coolwarm",
    cbar_kws={"label": "Spearman correlation"},
)
heatmap.set_title("Correlation heatmap")
plt.show()

In [None]:
# Rows in which at least one of the key fields is missing.
key_fields = ["description", "num_pages", "average_rating", "published_year"]
df[df[key_fields].isnull().any(axis=1)]


In [None]:
# Keep only rows where all four key fields are present, then renumber the index.
df_After_rem_miss_val = (
    df.dropna(subset=["description", "num_pages", "average_rating", "published_year"])
      .reset_index(drop=True)
)

In [None]:
# Remaining missing values per column after the incomplete rows were removed.
df_After_rem_miss_val.isna().sum()

Unnamed: 0,0
isbn13,0
isbn10,0
title,0
subtitle,4226
authors,63
categories,33
thumbnail,214
description,0
published_year,0
average_rating,0


In [None]:
'''
The two cells above can also be written more concisely with the code below:

cols = ["description", "num_pages", "average_rating", "published_year"]
df[df[cols].isnull().any(axis=1)]
This gives you a boolean mask where each row is True if any of the listed columns are missing.

cols = ["description", "num_pages", "average_rating", "published_year"]
df[df[cols].notnull().all(axis=1)]
This gives True only when all columns in that row are present (non-null).
'''


In [None]:
# Display-option reminders (not executed here):
#   pd.set_option('display.max_rows', None)  -> show every row
#   pd.reset_option('display.max_rows')      -> restore the default
#   pd.set_option('display.max_rows', 10)    -> cap the output at 10 rows (or any number)
#
# Count books per category; reset_index() turns the counts into a frame with the
# columns "categories" and "count", which is then ordered by "count", largest first.
a = df_After_rem_miss_val["categories"].value_counts().reset_index().sort_values("count", ascending= False)


In [None]:
# First ten categories that occur fewer than 10 times.
a[a['count'] < 10].head(10)

Unnamed: 0,categories,count
46,Science fiction,9
47,Crafts & Hobbies,8
51,Pets,8
50,Young Adult Fiction,8
49,Americans,8
48,"Authors, American",8
57,Life on other planets,7
56,Fantasy,7
58,Short stories,7
52,City and town life,7


In [None]:
# One figure per numeric column: histogram with KDE on the left, boxplot on the right.
# "number" selects every numeric dtype, whereas the 'int'/'float' aliases only match
# the platform's default integer/float widths and can silently skip columns.
numeric_columns = df_After_rem_miss_val.select_dtypes(include="number").columns

for col in numeric_columns:
    fig, axes = plt.subplots(1, 2, figsize=(6, 3))  # 1 row, 2 columns

    # Histogram + KDE
    sns.histplot(df_After_rem_miss_val[col], kde=True, bins=30, ax=axes[0])
    axes[0].set_title(f'Distribution of {col}')
    axes[0].set_xlabel(col)
    axes[0].set_ylabel('Frequency')

    # Boxplot
    sns.boxplot(x=df_After_rem_miss_val[col], ax=axes[1])
    axes[1].set_title(f'Boxplot of {col}')
    axes[1].set_xlabel(col)

    fig.tight_layout()
    plt.show()
    # Release the figure so that figures do not pile up in memory across iterations.
    plt.close(fig)


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_After_rem_miss_val.describe()

Unnamed: 0,isbn13,published_year,average_rating,num_pages,ratings_count,missing_description,age_of_book
count,6507.0,6507.0,6507.0,6507.0,6507.0,6507.0,6507.0
mean,9780668000000.0,1998.966498,3.932683,346.071,21757.27,0.0,26.033502
std,586840400.0,9.837214,0.322427,233.440498,140250.5,0.0,9.837214
min,9780002000000.0,1876.0,0.0,0.0,0.0,0.0,6.0
25%,9780317000000.0,1997.0,3.77,208.0,175.0,0.0,20.0
50%,9780552000000.0,2002.0,3.95,304.0,1081.0,0.0,23.0
75%,9780807000000.0,2005.0,4.13,418.0,6309.5,0.0,28.0
max,9789042000000.0,2019.0,5.0,3342.0,5629932.0,0.0,149.0


In [None]:
# Independent copy, so columns added below do not touch df_After_rem_miss_val.
dfnew1 =  df_After_rem_miss_val.copy()

In [None]:
# Number of whitespace-separated words in each description.
# Despite its name, "new_description" holds a count, not text.
dfnew1["new_description"] = dfnew1['description'].str.split().str.len()

In [None]:
# Inspect descriptions of at most 10 words, together with their word counts.
b = dfnew1.loc[dfnew1['new_description'] <= 10, ["description", "new_description"]]
b

In [None]:
# Inspect descriptions of 25-34 words with their full text. option_context applies the
# display settings (max 10 rows, unlimited column width) to this output only, so no
# global pandas option is left changed afterwards.
with pd.option_context("display.max_rows", 10, "display.max_colwidth", None):
    display(
        dfnew1.loc[dfnew1["new_description"].between(25, 34),
                   ["description", "new_description"]]
    )


In [None]:
# Put the row-count and column-width display options back to their pandas defaults.
# Only these two are reset: reset_option("all") would also wipe unrelated settings.
pd.reset_option("display.max_rows")
pd.reset_option("display.max_colwidth")

In [None]:
# Keep only books whose description has at least 25 words.
dfnew2 = dfnew1.loc[dfnew1["new_description"].ge(25)]

In [None]:
# Per-column null counts for dfnew2, plus the same figure as a percentage of its rows.
null_counts = dfnew2.isnull().sum()
missing_df = pd.DataFrame({
    "Missing Values": null_counts,
    "Percentage (%)": (null_counts / dfnew2.shape[0]) * 100,
})
missing_df


In [None]:
# Take an explicit copy: later cells add columns to dfnew25, and assigning into a
# filtered slice of dfnew2 is chained assignment (SettingWithCopyWarning, with no
# guarantee about which object is modified).
dfnew25 = dfnew2.loc[dfnew2["new_description"] >= 25].copy()
dfnew25.head()

In [None]:
# Use the plain title when there is no subtitle, otherwise "title: subtitle".
has_no_subtitle = dfnew25["subtitle"].isnull()
joined_title = dfnew25["title"].astype(str) + ": " + dfnew25["subtitle"].astype(str)
dfnew25["title_and_subtitle"] = np.where(has_no_subtitle, dfnew25["title"], joined_title)


In [None]:
# Preview the combined title next to the columns it was built from and the helper columns.
dfnew25[["title_and_subtitle", "subtitle", "description", "missing_description", "age_of_book", "new_description"]].head()

In [None]:
# New column: the ISBN-13 followed by a space and the description, so that a retrieved
# text can be traced back to its book through the leading identifier.
isbn_text = dfnew25["isbn13"].astype(str)
description_text = dfnew25["description"].astype(str)
dfnew25["tagged_description"] = isbn_text + " " + description_text

In [None]:
# Columns that were only needed during cleaning are left out of the exported file.
helper_columns = ["subtitle", "missing_description", "age_of_book", "new_description"]

books_cleaned = dfnew25.drop(columns=helper_columns)
books_cleaned.to_csv("books_cleaned.csv", index=False)

Vector-search


In [None]:
# Reload the cleaned dataset from disk as the starting point for the vector-search section.
books = pd.read_csv("books_cleaned.csv")

LangChain's text loader works on plain-text files rather than pandas DataFrames, so the tagged descriptions are exported to a text file with one record per line.

In [None]:
# Write one "<isbn13> <description>" record per line.
# The lines are written by hand rather than with DataFrame.to_csv(sep="\n"), because the
# CSV writer wraps records containing quote characters or line breaks in extra quotes.
# Collapsing internal whitespace guarantees a description never spills onto a second
# line, so every line of the file starts with an ISBN.
with open("tagged_description.txt", "w", encoding="utf-8") as fh:
    for record in books["tagged_description"].astype(str):
        fh.write(" ".join(record.split()) + "\n")

In [None]:
# TextLoader, CharacterTextSplitter, Chroma and HuggingFaceEmbeddings come from the
# import cell at the top of the notebook; the legacy `langchain.*` import paths are
# not used, so they no longer shadow the maintained packages.

# Load the exported text file (written as UTF-8).
raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()

# Every non-empty line is one book record and must become exactly one document.
# split_documents() with chunk_size=1000 would merge consecutive short lines into a
# single chunk, after which only the first ISBN of the chunk can be recovered.
tagged_lines = []
line_metadata = []
for raw_doc in raw_documents:
    for line in raw_doc.page_content.split("\n"):
        line = line.strip()
        if line:
            tagged_lines.append(line)
            line_metadata.append(raw_doc.metadata)

# create_documents() splits each text on its own, and a single line contains no "\n"
# separator, so it yields one Document per book regardless of chunk_size.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=0)
documents = text_splitter.create_documents(tagged_lines, metadatas=line_metadata)

# Use Hugging Face sentence-transformer embeddings.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create the Chroma vector store from the per-book documents.
db_books = Chroma.from_documents(
    documents=documents,
    embedding=embedding
)

In [None]:
# Example search: fetch the 10 stored texts most similar to the query.
query = "A book about world war"
docs = db_books.similarity_search(query, k = 10)
docs

In [None]:
# Look up the best match by the ISBN that prefixes its text.
# Both sides are compared as strings, so the lookup works whether isbn13 is stored as
# int or str, and a leading quote left by CSV export no longer breaks the conversion.
top_isbn = docs[0].page_content.split()[0].strip("\"'")
books[books["isbn13"].astype(str) == top_isbn]

In [None]:
def retrieve_semantic_recommendation(query: str, top_k: int = 10) -> pd.DataFrame:
    """Return the books whose descriptions are most similar to ``query``.

    Args:
        query: Free-text description of the kind of book wanted.
        top_k: Maximum number of books to return.

    Returns:
        Up to ``top_k`` rows of ``books``, ordered from most to least similar. The
        ``isbn13`` column of the returned frame is a string column. The global
        ``books`` frame itself is not modified.
    """
    recs = db_books.similarity_search(query, k=top_k)

    # Each stored text starts with the ISBN-13; collect them in ranking order,
    # dropping surrounding quotes and skipping empty or repeated entries.
    ranked_isbns = []
    for rec in recs:
        tokens = rec.page_content.split()
        if not tokens:
            continue
        isbn = tokens[0].strip("\"'")
        if isbn not in ranked_isbns:
            ranked_isbns.append(isbn)

    # Compare on a string view of the ISBN column without touching the global frame,
    # so that re-running earlier cells still sees the original dtype.
    catalogue = books.assign(isbn13=books["isbn13"].astype(str))
    matches = catalogue[catalogue["isbn13"].isin(ranked_isbns)]

    # isin() returns rows in dataset order; restore the similarity ranking.
    rank_of = {isbn: position for position, isbn in enumerate(ranked_isbns)}
    order = matches["isbn13"].map(rank_of).to_numpy().argsort(kind="stable")
    return matches.iloc[order].head(top_k)

In [None]:
# Try the helper with a sample query, using the default top_k of 10.
retrieve_semantic_recommendation("book on world war")