In [None]:
import os

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_chroma import Chroma

import warnings


warnings.filterwarnings('ignore')

In [None]:
from dotenv import load_dotenv

load_dotenv()

In [None]:
import pandas as pd
books = pd.read_csv('../data/books_cleaned.csv')

books.head(10)

In [None]:
books[['title', 'tagged_description']].head(5)

In [None]:
books['tagged_description'].to_csv(
    '../data/tagged_descriptions.txt',
    index=False,
    header=False,
    lineterminator='\n'
)

In [None]:
raw_documents = TextLoader(
    '../data/tagged_descriptions.txt',
    encoding='utf-8'
).load()

text_splitter = CharacterTextSplitter(
    chunk_size=1,            # must be >0
    chunk_overlap=0,
    separator="\n",
    keep_separator=False
)

documents = text_splitter.split_documents(raw_documents)


In [None]:
documents[0]

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain.embeddings.base import Embeddings
from langchain_text_splitters import CharacterTextSplitter

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
device = "cpu"
model.to(device)
model.eval()

def create_embeddings(texts, batch_size=32):
    all_embeds = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        with torch.no_grad():
            tokens = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(device)
            outputs = model(**tokens)
            token_embeddings = outputs.last_hidden_state
            attention_mask = tokens["attention_mask"]
            mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            embeddings = (token_embeddings * mask_expanded).sum(1) / mask_expanded.sum(1)
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
            all_embeds.extend(embeddings.cpu().tolist())
    return all_embeds

class SimpleEmbeddings(Embeddings):
    def embed_documents(self, texts):
        return create_embeddings(texts, batch_size=32)
    def embed_query(self, text):
        return create_embeddings([text], batch_size=1)[0]

# Load already line-separated descriptions
raw_documents = TextLoader(
    '../data/tagged_descriptions.txt',
    encoding='utf-8'
).load()

# Optional: keep each line as one document (large chunk size prevents splitting)
text_splitter = CharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=0,
    separator="\n",
    keep_separator=False
)
documents = text_splitter.split_documents(raw_documents)

emb = SimpleEmbeddings()
db_books = Chroma.from_documents(
    documents,
    embedding=emb,
    persist_directory="./chroma_db_cpu"
)

In [None]:
# Search for similar documents
query = "A book to teach children about nature"
results = db_books.similarity_search(query, k=3)

# Print the top 3 results
for result in results:
    print(result.page_content[:200])  # Print first 200 characters
    print("---")

In [None]:
import pandas as pd

books = pd.read_csv('../data/books_cleaned.csv')

In [None]:
matched = books.loc[
    pd.to_numeric(books['isbn13'], errors='coerce') ==
    int(''.join(ch for ch in results[0].page_content.split()[0].strip(' "\'') if ch.isdigit()))
]

matched

In [None]:
def retrieve_recommendation(
        query:str,
        top_k: int = 10
) -> pd.DataFrame:
    results = db_books.similarity_search(query, k=50)

    books_list = []

    for i in range(0, len(results)):
        books_list += [int(results[i].page_content.strip('""').split()[0])]

    return books[books['isbn13'].isin(books_list)].head(top_k)


In [57]:
retrieve_recommendation("a picture book about animals for kids")

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
404,9780064402453,0064402452,Racso and the Rats of NIMH,Jane Leslie Conly,Juvenile Fiction,http://books.google.com/books/content?id=MgoNv...,"‘Racso, a brash and boastful little rodent, is...",1988.0,3.76,288.0,3231.0,Racso and the Rats of NIMH,"9780064402453 ‘Racso, a brash and boastful lit..."
406,9780064403870,0064403874,"R-T, Margaret, and the Rats of NIMH",Jane Leslie Conly,Juvenile Fiction,http://books.google.com/books/content?id=WTHHH...,"When Margaret and her younger brother, Artie, ...",1991.0,3.52,272.0,631.0,"R-T, Margaret, and the Rats of NIMH",9780064403870 When Margaret and her younger br...
407,9780064404419,0064404412,The Rainbow People,Laurence Yep,Juvenile Fiction,http://books.google.com/books/content?id=5AHwq...,"""Culled from 69 stories collected in a [1930s]...",1992.0,3.75,208.0,202.0,The Rainbow People,"9780064404419 ""Culled from 69 stories collecte..."
416,9780064406925,006440692X,Winter on the Farm,Laura Ingalls Wilder,Juvenile Fiction,http://books.google.com/books/content?id=IvlKH...,The Little House books tell the story of a lit...,1997.0,4.13,32.0,400.0,Winter on the Farm,9780064406925 The Little House books tell the ...
427,9780064434874,0064434877,Christmas in the Big Woods,Laura Ingalls Wilder,Juvenile Fiction,http://books.google.com/books/content?id=FT1Yp...,"Long ago, a little girl named Laura Ingalls li...",1997.0,4.19,32.0,2062.0,Christmas in the Big Woods,"9780064434874 Long ago, a little girl named La..."
432,9780064440950,0064440958,Chester,Syd Hoff,Juvenile Fiction,http://books.google.com/books/content?id=DiGFB...,"Chester, a wild horse who wants to be tame, co...",1986.0,3.75,64.0,187.0,Chester,"9780064440950 Chester, a wild horse who wants ..."
812,9780142302279,0142302279,Dirty Beasts,Roald Dahl,Juvenile Nonfiction,,Poems tell the stories of a smart pig who outw...,2002.0,4.02,32.0,3953.0,Dirty Beasts,9780142302279 Poems tell the stories of a smar...
1078,9780241003008,0241003008,The Very Hungry Caterpillar,Eric Carle,Babytime resource,http://books.google.com/books/content?id=DpGEQ...,Eric Carle's children's classic is the story o...,1994.0,4.29,26.0,340101.0,The Very Hungry Caterpillar,9780241003008 Eric Carle's children's classic ...
1222,9780312319731,0312319738,Dominion,Matthew Scully,Nature,http://books.google.com/books/content?id=_htG-...,Argues for responsible action in the treatment...,2003.0,4.16,448.0,1131.0,"Dominion: The Power of Man, the Suffering of A...",9780312319731 Argues for responsible action in...
1228,9780312330866,0312330863,All Things Bright and Beautiful,James Herriot,Biography & Autobiography,http://books.google.com/books/content?id=skqN2...,A Yorkshire veterinarian describes the adventu...,2004.0,4.31,378.0,60280.0,All Things Bright and Beautiful,9780312330866 A Yorkshire veterinarian describ...
