# Load Required Libraries

In [1]:
# For loading plain text documents
from langchain_community.document_loaders import TextLoader

# For splitting large text into smaller chunks (used for embedding)
from langchain_text_splitters import CharacterTextSplitter

# For generating text embeddings using OpenAI
from langchain_openai import OpenAIEmbeddings

# For storing and retrieving embeddings from a vector store/database
from langchain_chroma import Chroma

# For loading environment variables (API keys)
from dotenv import load_dotenv

# For working with data
import pandas as pd


# Load OpenAI API Key from .env

In [2]:
# Load OpenAI API key and any other secrets from the .env file
load_dotenv()

True

# Load Cleaned Book Dataset

In [3]:
# Load the cleaned book data (output from data wrangling)
books = pd.read_csv("books_cleaned.csv")

# Preview the dataset
books


Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222 On A Train Journey Home To North...
5193,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 This book tells the tale of a ma...
5194,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...


In [4]:
books['tagged_description']

0       9780002005883 A NOVEL THAT READERS and critics...
1       9780002261982 A new 'Christie for Christmas' -...
2       9780006178736 A memorable, mesmerizing heroine...
3       9780006280897 Lewis' work on the nature of lov...
4       9780006280934 "In The Problem of Pain, C.S. Le...
                              ...                        
5192    9788172235222 On A Train Journey Home To North...
5193    9788173031014 This book tells the tale of a ma...
5194    9788179921623 Wisdom to Create a Life of Passi...
5195    9788185300535 This collection of the timeless ...
5196    9789027712059 Since the three volume edition o...
Name: tagged_description, Length: 5197, dtype: object

# Export Tagged Book Descriptions to Text File

In [5]:
# Save tagged descriptions (isbn13 + description) to a plain text file
# Each line represents a unique book description for embedding
books['tagged_description'].to_csv(
    "tagged_descriptions.txt",
    sep="\n",
    index=False,
    header=False
)


# Load and Split Text into Chunks

In [8]:
# Load the text file using LangChain's TextLoader
raw_documents = TextLoader("tagged_descriptions.txt").load()

# Instantiate a text splitter
# Each chunk will have up to 1000 characters with no overlap
text_splitter = CharacterTextSplitter(
    chunk_size=0,        # Adjust as needed
    chunk_overlap=0,        # No overlapping chunks
    separator="\n"          # Split at newlines
)

# Split the loaded text into manageable chunks
documents = text_splitter.split_documents(raw_documents)





Created a chunk of size 1168, which is longer than the specified 0
Created a chunk of size 1214, which is longer than the specified 0
Created a chunk of size 373, which is longer than the specified 0
Created a chunk of size 309, which is longer than the specified 0
Created a chunk of size 483, which is longer than the specified 0
Created a chunk of size 482, which is longer than the specified 0
Created a chunk of size 960, which is longer than the specified 0
Created a chunk of size 188, which is longer than the specified 0
Created a chunk of size 843, which is longer than the specified 0
Created a chunk of size 296, which is longer than the specified 0
Created a chunk of size 197, which is longer than the specified 0
Created a chunk of size 881, which is longer than the specified 0
Created a chunk of size 1088, which is longer than the specified 0
Created a chunk of size 1189, which is longer than the specified 0
Created a chunk of size 304, which is longer than the specified 0
Create

In [16]:
documents[0]

Document(metadata={'source': 'tagged_descriptions.txt'}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, G

# create the document embedding and store them in vector database

# Create Document Embeddings and Store in Vector Database


In [28]:
# Create OpenAI embedding model
embedding_model = OpenAIEmbeddings()

# Convert the text chunks into embeddings and store them in a vector database (Chroma)
# This enables fast similarity search
db_books = Chroma.from_documents(documents, embedding=embedding_model)


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


# Test Query with Semantic Search

In [33]:
# Test a semantic query to find similar book descriptions
query = 'A book to teach children about nature'

# Retrieve top 10 most semantically similar text chunks
docs = db_books.similarity_search(query, k=10)

# Display the top result (raw chunk of text: includes ISBN and description)
docs[0]


Document(id='87eb5d6f-ddd9-4f58-b41e-247cf87ed2ac', metadata={'source': 'tagged_descriptions.txt'}, page_content='9780786808069 Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.')

# Map Back to Original Book Using ISBN


This is important because users want book titles, not raw descriptions, so you map the embedding result back to your original structured data.

In [34]:

# Extract ISBN from the beginning of the chunk (first word in the content)
isbn_result = docs[0].page_content.split()[0].strip()

# Convert to integer and match with original book data
books[books['isbn13'] == int(isbn_result)]


Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
3747,9780786808069,786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Baby Einstein: Neighborhood Animals,9780786808069 Children will discover the excit...


In [None]:
# Generalized Retrieval Function for Semantic Recommendations

# Generalized Retrieval Function for Semantic Recommendations


In [35]:
def retrieve_semantic_recommendation(query: str, top_k: int = 10) -> pd.DataFrame:
    """
    Given a natural language query, retrieve top_k semantically similar books
    based on their description embeddings stored in Chroma.

    Args:
        query (str): User's search intent (e.g., "Books about ancient civilizations")
        top_k (int): Number of recommendations to return

    Returns:
        pd.DataFrame: Top k recommended books from the original dataset
    """
    # Perform semantic similarity search on the embedded documents
    recs = db_books.similarity_search(query, k=top_k * 5)  # Fetch more to avoid duplicates

    # Extract the ISBNs from the top-matching documents
    isbn_matches = []
    for doc in recs:
        try:
            isbn = int(doc.page_content.split()[0].strip())
            isbn_matches.append(isbn)
        except ValueError:
            continue  # Skip if the text chunk is malformed

    # Return top_k unique books from the original dataset
    return books[books['isbn13'].isin(isbn_matches)].drop_duplicates('isbn13').head(top_k)



In [37]:
retrieve_semantic_recommendation("Books about personal development for teenagers")



Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
74,9780007195718,7195710,Discover Your Destiny with the Monk Who Sold H...,Robin Sharma,Conduct of life,http://books.google.com/books/content?id=4hVbN...,A potent pathway to self-awakening that will h...,2004.0,3.9,240.0,1956.0,Discover Your Destiny with the Monk Who Sold H...,9780007195718 A potent pathway to self-awakeni...
266,9780060880125,60880120,How to Be Popular,Meg Cabot,Juvenile Fiction,http://books.google.com/books/content?id=uRsWy...,Sixteen-year-old Steph Landry finds an old boo...,2006.0,3.54,288.0,19183.0,How to Be Popular,9780060880125 Sixteen-year-old Steph Landry fi...
593,9780140196092,140196099,Who are You?,Malcolm Godwin,"Body, Mind & Spirit",http://books.google.com/books/content?id=2p8BA...,A noted artist introduces 101 easy-to-follow w...,2000.0,3.48,224.0,250.0,Who are You? 101 Ways of Seeing Yourself,9780140196092 A noted artist introduces 101 ea...
872,9780143039853,143039857,The Outsiders,S. E. Hinton;Jodi Picoult,Fiction,http://books.google.com/books/content?id=9PyIP...,The struggle of three brothers to stay togethe...,1967.0,4.08,160.0,1558.0,The Outsiders,9780143039853 The struggle of three brothers t...
1595,9780370307121,370307127,A Likely Lad,Gillian Avery,Adventure stories,,Pressured by his father to leave school for a ...,1986.0,4.08,224.0,12.0,A Likely Lad,9780370307121 Pressured by his father to leave...
1652,9780374528539,374528535,"Get Out of My Life, But First Could You Drive ...",Anthony E. Wolf,Family & Relationships,http://books.google.com/books/content?id=oKdbC...,A lighthearted but insightful guide to raising...,2002.0,3.94,240.0,1301.0,"Get Out of My Life, But First Could You Drive ...",9780374528539 A lighthearted but insightful gu...
1746,9780375829635,375829636,Young Warriors,Tamora Pierce;Josepha Sherman,Juvenile Fiction,http://books.google.com/books/content?id=6EnZw...,Fifteen original short stories by various auth...,2005.0,3.91,312.0,3583.0,Young Warriors Stories of Strength,9780375829635 Fifteen original short stories b...
1844,9780385481960,385481969,Youth in Revolt,C. D. Payne,Fiction,http://books.google.com/books/content?id=cZKdO...,"Nick Twisp, a dark and precocious fourteen-yea...",1996.0,4.02,512.0,8158.0,Youth in Revolt The Journals of Nick Twisp,"9780385481960 Nick Twisp, a dark and precociou..."
1888,9780393040586,393040585,The Confident Child,T. E. Apter,Family & Relationships,http://books.google.com/books/content?id=stxpQ...,"Focusing on children from age five to fifteen,...",1997.0,3.62,270.0,13.0,"The Confident Child Raising a Child to Try, Le...",9780393040586 Focusing on children from age fi...
1934,9780393703351,393703355,Living on the Razor's Edge,Matthew D. Selekman,Psychology,http://books.google.com/books/content?id=hTe40...,"Employing a multifaceted, flexible approach ba...",2002.0,4.0,223.0,4.0,Living on the Razor's Edge Solution-oriented B...,"9780393703351 Employing a multifaceted, flexib..."
