In [28]:
import os

import pandas as pd
from dotenv import load_dotenv
books = pd.read_csv("data_test/books_cleaned.csv")

load_dotenv()
os.environ["GOOGLE_API_KEY"] = "AIzaSyDlRO4062KgN98t1zx1hpZp2QxO0jj1htE"

In [29]:
from langchain.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader,CSVLoader,JSONLoader

In [30]:
loader=DirectoryLoader("data_test",glob="./*.txt",loader_cls=lambda path: TextLoader(path,encoding='utf-8')) #document loader

In [31]:
document=loader.load()

In [32]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=100)
text=text_splitter.split_documents(document)

In [33]:
len(text)

7357

# CREATING VECTOR-DATABASE

In [34]:
persist_directory="db_books"

In [35]:
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [108]:
vectordb=Chroma.from_documents(
    documents=text,
    embedding=embedding,
    persist_directory=persist_directory
)

In [109]:
vectordb.persist()

In [110]:
vectordb=None

In [111]:
vectordb=Chroma(persist_directory=persist_directory,embedding_function=embedding)

In [112]:
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x25da9cf2690>

# MAKING QUERIES

In [113]:
retriver=vectordb.as_retriever()

In [114]:
query="A book to teach children about nature"
docs=retriver.get_relevant_documents(query,k=3)

In [115]:
docs

[Document(metadata={'source': 'data_test\\tagged_description.txt'}, page_content='9780786808069 Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.'),
 Document(metadata={'source': 'data_test\\tagged_description.txt'}, page_content='9780786808069 Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.'),
 Document(metadata={'source': 'data_test\\tagged_description.txt'}, page_content="and a guide to capturing the simple power of discovery that Carson views as essential to life. In her insightful new introduction, Linda Lear remembers Rachel Carson's groundbreaking achieveme

In [117]:
books[books["isbn13"] == int(docs[0].page_content.split()[0].strip())]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,missing_values,missing_decription,title_and_subtitle,tagged_description
3747,9780786808069,786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,0,0,Baby Einstein: Neighborhood Animals,9780786808069 Children will discover the excit...


In [118]:
def retrieve_semantic_recommendation(
        query: str,
        top_k: int = 10,
) -> pd.DataFrame:
    recs = retriver.get_relevant_documents(query, k=50)
    books_list = []

    for i in range(0, len(recs)):
        try:
            # Ensure that there is content to parse
            content = recs[i].page_content.strip('"')
            if content:
                isbn = content.split()[0]  # Attempt to get the first word as ISBN
                # Optionally, you can validate the ISBN format here if needed
                books_list.append(int(isbn))  # Only add to list if it's a valid ISBN
        except Exception as e:
            print(f"Error processing document {i}: {e}")

    # Filter the books by ISBN and return
    return books[books["isbn13"].isin(books_list)]


In [119]:
retrieve_semantic_recommendation("A book to teach children about nature")

Error processing document 2: invalid literal for int() with base 10: 'and'
Error processing document 3: invalid literal for int() with base 10: 'and'
Error processing document 6: invalid literal for int() with base 10: 'in'
Error processing document 7: invalid literal for int() with base 10: 'in'
Error processing document 8: invalid literal for int() with base 10: 'wildlife,'
Error processing document 9: invalid literal for int() with base 10: 'wildlife,'
Error processing document 14: invalid literal for int() with base 10: 'two'
Error processing document 15: invalid literal for int() with base 10: 'two'
Error processing document 20: invalid literal for int() with base 10: 'floating'
Error processing document 21: invalid literal for int() with base 10: 'floating'
Error processing document 40: invalid literal for int() with base 10: 'needs'
Error processing document 41: invalid literal for int() with base 10: 'needs'


Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,missing_values,missing_decription,title_and_subtitle,tagged_description
228,9780060782139,0060782137,Time For Kids: Butterflies!,Editors of TIME For Kids,Juvenile Nonfiction,http://books.google.com/books/content?id=OdZxn...,"Butterflies There are 20,000 different kinds o...",2006.0,4.0,32.0,20.0,0,0,Time For Kids: Butterflies!,"9780060782139 Butterflies There are 20,000 dif..."
429,9780064434980,0064434982,The Deer in the Wood,Laura Ingalls Wilder,Juvenile Fiction,http://books.google.com/books/content?id=V7YDW...,Even the youngest child can enjoy a special ad...,1999.0,4.17,32.0,302.0,0,0,The Deer in the Wood,9780064434980 Even the youngest child can enjo...
442,9780067575208,006757520X,The Sense of Wonder,Rachel Carson,Nature,http://books.google.com/books/content?id=Zee5S...,"First published more than three decades ago, t...",1998.0,4.39,112.0,1160.0,0,0,The Sense of Wonder,9780067575208 First published more than three ...
1077,9780240806082,0240806085,Directing the Documentary,Michael Rabiger,Performing Arts,http://books.google.com/books/content?id=uoKli...,Michael Rabiger guides the reader through the ...,2004.0,4.23,648.0,173.0,0,0,Directing the Documentary,9780240806082 Michael Rabiger guides the reade...
1907,9780393315110,0393315118,Uncommon Ground: Rethinking the Human Place in...,William Cronon,Law,http://books.google.com/books/content?id=w04mj...,"Essays by revisionist historians, scientists, ...",1996.0,4.16,560.0,649.0,0,0,Uncommon Ground: Rethinking the Human Place in...,9780393315110 Essays by revisionist historians...
2112,9780439405577,0439405572,Don't Know Much about American History,Kenneth C. Davis,United States,,"Presents, in question and answer format, a his...",2003.0,3.9,222.0,21169.0,0,0,Don't Know Much about American History,"9780439405577 Presents, in question and answer..."
2503,9780486276809,0486276805,The Secret Garden Coloring Book,Frances Hodgson Burnett,Juvenile Nonfiction,http://books.google.com/books/content?id=sxHdA...,Ten-year-old Mary comes to live in a lonely ho...,2014.0,4.25,48.0,4.0,0,0,The Secret Garden Coloring Book,9780486276809 Ten-year-old Mary comes to live ...
3198,9780689716041,0689716044,The Bears on Hemlock Mountain,Alice Dalgliesh,Juvenile Fiction,http://books.google.com/books/content?id=2gt4h...,An eight-year-old boy who is frightened of tra...,1992.0,3.78,64.0,2017.0,0,0,The Bears on Hemlock Mountain,9780689716041 An eight-year-old boy who is fri...
3201,9780689823824,0689823827,A Child's Garden of Verses,Robert Louis Stevenson,Juvenile Fiction,http://books.google.com/books/content?id=luTZA...,"Here is a delightful look at childhood, writte...",1999.0,4.3,67.0,21780.0,0,0,A Child's Garden of Verses,9780689823824 Here is a delightful look at chi...
3203,9780689834196,0689834195,Salt in His Shoes,Deloris Jordan;Roslyn M. Jordan,Juvenile Nonfiction,http://books.google.com/books/content?id=hYzQw...,"Young Michael Jordan, who is smaller than the ...",2003.0,4.21,32.0,1054.0,0,0,Salt in His Shoes: Michael Jordan in Pursuit o...,"9780689834196 Young Michael Jordan, who is sma..."
