In [1]:
# Import libraries
import os
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from datasets import load_dataset

import pprint

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, GenerationConfig

In [None]:
# Smoke test: load the "train" split of the books dataset and pretty-print
# its first record so the available fields can be inspected.
pp = pprint.PrettyPrinter(indent=2)
dataset = load_dataset("IsmaelMousa/books", split="train")
first_record = dataset[0]
pp.pprint(first_record)

In [3]:
# Build one text per dataset row for embedding: a short metadata header
# (title, author, genre) followed by the row's English text ('EN' field).
texts_to_embed = [
    f"Title: {row['title']}\nAuthor: {row['author']}\nGenre: {row['category']}\nText: {row['EN']}"
    for row in dataset
]

# Generate one primary key per text. The full UUID4 string is kept on purpose:
# slicing it to 8 hex characters would leave only 32 random bits and give up
# the uniqueness guarantee the ids are supposed to provide.
texts_ids = [str(uuid4()) for _ in texts_to_embed]

In [8]:
# Sanity check: show the first formatted text to confirm the template rendered as intended.
sample_text = texts_to_embed[0]
print(sample_text)


Title: Robinson Crusoe
Author: Daniel Defoe
Genre: Adventure
Text: CHAPTER I. START IN LIFE


I was born in the year 1632, in the city of York, of a good family,
though not of that country, my father being a foreigner of Bremen, who
settled first at Hull. He got a good estate by merchandise, and leaving
off his trade, lived afterwards at York, from whence he had married my
mother, whose relations were named Robinson, a very good family in that
country, and from whom I was called Robinson Kreutznaer; but, by the
usual corruption of words in England, we are now called nay we call
ourselves and write our name Crusoe; and so my companions always called
me.

I had two elder brothers, one of whom was lieutenant-colonel to an
English regiment of foot in Flanders, formerly commanded by the famous
Colonel Lockhart, and was killed at the battle near Dunkirk against the
Spaniards. What became of my second brother I never knew, any more than
my father or mother knew what became of me.

Being the t

In [4]:
# Embedding function handed to the Chroma collection; it wraps the
# sentence-transformers model named below.
# NOTE(review): this model presumably truncates long inputs (its model card
# mentions a 256 word-piece limit) - confirm whether the book texts should be
# chunked before embedding.
embed_model_name = "all-MiniLM-L6-v2"
embed_model = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=embed_model_name,
)

In [5]:
collection_name = 'books'

# NOTE(review): chromadb.Client() with no arguments is presumably the
# in-memory (ephemeral) client - confirm against the installed version.
client = chromadb.Client()

# Start from a clean slate so that re-running the notebook in the same kernel
# does not trip over a collection left behind by an earlier run.
try:
    client.delete_collection(name=collection_name)
except Exception as exc:
    # Cleanup stays best-effort (the collection normally does not exist on a
    # fresh kernel), but the reason is reported instead of being silently
    # discarded so unexpected failures remain visible. The exception type
    # raised for a missing collection differs between chromadb releases,
    # hence the broad catch.
    print(f"Skipped deleting collection '{collection_name}': {exc!r}")


In [6]:
# Create the collection and bind the embedding function to it, so documents
# and queries passed as raw text are embedded with embed_model.
collection = client.create_collection(
    embedding_function=embed_model,
    name=collection_name,
)

# Populate the collection only while it is still empty.
if not collection.count():
    print("Inserting chunks document into Chroma collection...")
    # Each document carries its complete dataset row as metadata.
    row_metadata = [dict(row) for row in dataset]
    collection.add(
        ids=texts_ids,
        documents=texts_to_embed,
        metadatas=row_metadata,
    )

print(f"Number of documents in collection '{collection_name}': {collection.count()}")

Inserting chunks document into Chroma collection...
Number of documents in collection 'books': 40


In [7]:
# LLM setup: google/flan-t5-small, loaded as a sequence-to-sequence model.
model_name = "google/flan-t5-small"

# The tokenizer and the model weights are both resolved from the same checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [26]:
query = "Recommend a book in the Biography genre"

# Ask the collection for the five entries closest to the query text.
# NOTE(review): this is similarity search only; a metadata filter on
# 'category' may be needed if results must match the requested genre - confirm intent.
results = collection.query(n_results=5, query_texts=[query])

In [27]:
# Results are batched per query; index 0 selects the hits for the single query above.
hit_documents = results["documents"][0]
hit_metadatas = results["metadatas"][0]

# Print a short summary per hit. The full text is also stored in the
# metadata under 'EN' but is left out to keep the output readable.
for _doc, meta in zip(hit_documents, hit_metadatas):
    print(f"Title: {meta['title']} (Author: {meta['author']})")
    print(f"Genre: {meta['category']}")
    print()

Title: A Princess of Mars (Author: Edgar Rice Burroughs)
Genre: Science Fiction

Title: The Life of Julius Caesar (Author: Herman Melville)
Genre: Biographies

Title: Robinson Crusoe (Author: Daniel Defoe)
Genre: Adventure

Title: Pride and Prejudice (Author: Jane Austen)
Genre: Historical Fiction

Title: The Mystery of the Yellow Room (Author: Gaston Leroux)
Genre: Mystery

