In [1]:
from langchain_core.documents import Document
from langchain_community.vectorstores.chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import lark
import getpass
import os
import warnings

# Disabling warnings:
warnings.filterwarnings("ignore")



In [2]:
# Each record pairs the text that gets embedded (page_content) with the
# structured metadata stored alongside it.
_book_records = [
    (
        "A tale about a young wizard and his journey in a magical school.",
        {
            "title": "Harry Potter and the Philosopher's Stone",
            "author": "J.K. Rowling",
            "year_published": 1997,
            "genre": "Fiction",
            "isbn": "978-0747532699",
            "publisher": "Bloomsbury",
            "language": "English",
            "page_count": 223,
            "summary": "The first book in the Harry Potter series where Harry discovers his magical heritage.",
            "rating": 4.8,
        },
    ),
    (
        "An epic tale of power, betrayal and love set in a fantastical world.",
        {
            "title": "A Game of Thrones",
            "author": "George R.R. Martin",
            "year_published": 1996,
            "genre": "Fantasy",
            "isbn": "978-0553103540",
            "publisher": "Bantam",
            "language": "English",
            "page_count": 694,
            "summary": "The first book in A Song of Ice and Fire series, introducing the intricate world of Westeros.",
            "rating": 4.6,
        },
    ),
    (
        "A futuristic society where firemen burn books to maintain order.",
        {
            "title": "Fahrenheit 451",
            "author": "Ray Bradbury",
            "year_published": 1953,
            "genre": "Science Fiction",
            "isbn": "978-1451673319",
            "publisher": "Simon & Schuster",
            "language": "English",
            "page_count": 249,
            "summary": "In a future society, books are banned and firemen are tasked to burn any they find, leading one fireman to question his role.",
            "rating": 4.4,
        },
    ),
    (
        "A young woman's life in the South during the Civil War and Reconstruction.",
        {
            "title": "Gone with the Wind",
            "author": "Margaret Mitchell",
            "year_published": 1936,
            "genre": "Historical Fiction",
            "isbn": "978-0684830681",
            "publisher": "Macmillan",
            "language": "English",
            "page_count": 1037,
            "summary": "The tale of Scarlett O'Hara and her love affair with Rhett Butler, set against the backdrop of the American Civil War.",
            "rating": 4.3,
        },
    ),
    (
        "A story about a hobbit's journey to destroy a powerful ring.",
        {
            "title": "The Lord of the Rings",
            "author": "J.R.R. Tolkien",
            "year_published": 1954,
            "genre": "Fantasy",
            "isbn": "978-0618640157",
            "publisher": "Houghton Mifflin",
            "language": "English",
            "page_count": 1216,
            "summary": "The epic tale of Frodo Baggins and his quest to destroy the One Ring, accompanied by a group of diverse companions.",
            "rating": 4.7,
        },
    ),
]

# Materialise the records as Document objects, preserving their order.
docs = [
    Document(page_content=text, metadata=book_metadata)
    for text, book_metadata in _book_records
]

In [3]:
from langchain_openai.chat_models import ChatOpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [5]:
from dotenv import load_dotenv
# Load environment variables from a .env file into the process environment.
# NOTE(review): presumably this supplies OPENAI_API_KEY for the OpenAI embeddings
# and chat model created in later cells — confirm the .env file defines it.
load_dotenv()

True

In [6]:
# Create the embeddings and vectorstore.
# A single embeddings client is created and reused for the store (previously
# `embeddings` was built but never used, and a second client was created inline).
embeddings = OpenAIEmbeddings()

# Use each book's ISBN as its record id. With stable ids, re-running this cell
# upserts the same five records instead of appending a fresh set of duplicates
# to the collection under newly generated random ids.
vectorstore = Chroma.from_documents(
    docs,
    embeddings,
    ids=[doc.metadata["isbn"] for doc in docs],
)

In [7]:
# Schema handed to the self-query retriever: one AttributeInfo per metadata key,
# grouped by theme and then flattened into a single list.

# Identification fields
basic_info = [
    AttributeInfo(
        name="title",
        description="The title of the book",
        type="string",
    ),
    AttributeInfo(
        name="author",
        description="The author of the book",
        type="string",
    ),
    AttributeInfo(
        name="year_published",
        description="The year the book was published",
        type="integer",
    ),
]

# Bibliographic details
detailed_info = [
    AttributeInfo(
        name="genre",
        description="The genre of the book",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="isbn",
        description="The International Standard Book Number for the book",
        type="string",
    ),
    AttributeInfo(
        name="publisher",
        description="The publishing house that published the book",
        type="string",
    ),
    AttributeInfo(
        name="language",
        description="The primary language the book is written in",
        type="string",
    ),
    AttributeInfo(
        name="page_count",
        description="Number of pages in the book",
        type="integer",
    ),
]

# Descriptive / review-derived fields
analysis = [
    AttributeInfo(
        name="summary",
        description="A brief summary or description of the book",
        type="string",
    ),
    AttributeInfo(
        name="rating",
        description="An average rating for the book (from reviews), ranging from 1-5",
        type="float",
    ),
]

# Flatten the three groups, in order, into the list the retriever consumes.
metadata_field_info = [*basic_info, *detailed_info, *analysis]

In [8]:
# Description of what each Document's page_content holds. It is passed to the
# retriever as the document-contents description, so it must match the corpus:
# these documents are books (the previous text said "movie").
document_content_description = "Brief summary of a book"

# temperature=0 is passed to the chat model that builds the structured query.
llm = ChatOpenAI(temperature=0)

# Self-query retriever: the LLM turns a natural-language question into a
# semantic query plus a metadata filter over `metadata_field_info`.
retriever = SelfQueryRetriever.from_llm(
    llm, vectorstore, document_content_description, metadata_field_info
)

In [10]:
# Looking for sci-fi books
documents = retriever.invoke("What are some sci-fi books?")

# De-duplicate on metadata while keeping the order in which the retriever
# returned the results (a set would discard that ranking). The first
# occurrence of each distinct metadata mapping wins.
unique_metadata = {}
for doc in documents:
    # Sort the items so the key does not depend on the metadata's key order.
    fingerprint = tuple(sorted(doc.metadata.items()))
    unique_metadata.setdefault(fingerprint, dict(doc.metadata))
deduped_documents = list(unique_metadata.values())

print(deduped_documents)

[{'author': 'J.R.R. Tolkien', 'genre': 'Fantasy', 'isbn': '978-0618640157', 'language': 'English', 'page_count': 1216, 'publisher': 'Houghton Mifflin', 'rating': 4.7, 'summary': 'The epic tale of Frodo Baggins and his quest to destroy the One Ring, accompanied by a group of diverse companions.', 'title': 'The Lord of the Rings', 'year_published': 1954}, {'author': 'Ray Bradbury', 'genre': 'Science Fiction', 'isbn': '978-1451673319', 'language': 'English', 'page_count': 249, 'publisher': 'Simon & Schuster', 'rating': 4.4, 'summary': 'In a future society, books are banned and firemen are tasked to burn any they find, leading one fireman to question his role.', 'title': 'Fahrenheit 451', 'year_published': 1953}, {'author': 'J.K. Rowling', 'genre': 'Fiction', 'isbn': '978-0747532699', 'language': 'English', 'page_count': 223, 'publisher': 'Bloomsbury', 'rating': 4.8, 'summary': 'The first book in the Harry Potter series where Harry discovers his magical heritage.', 'title': "Harry Potter a

In [11]:
# Ask for J.K. Rowling's Harry Potter books and display only the first hit:
rowling_matches = retriever.invoke(
    "I want some books that are published by the author J.K. Rowling for Harry Potter."
)
rowling_matches[0]

Document(page_content='A tale about a young wizard and his journey in a magical school.', metadata={'author': 'J.K. Rowling', 'genre': 'Fiction', 'isbn': '978-0747532699', 'language': 'English', 'page_count': 223, 'publisher': 'Bloomsbury', 'rating': 4.8, 'summary': 'The first book in the Harry Potter series where Harry discovers his magical heritage.', 'title': "Harry Potter and the Philosopher's Stone", 'year_published': 1997})

In [12]:
# Rebuild the retriever with enable_limit=True, then issue a query that names
# an explicit result count.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    enable_limit=True,
)

retriever.invoke("Return 1 Fantasy book")

[Document(page_content='An epic tale of power, betrayal and love set in a fantastical world.', metadata={'author': 'George R.R. Martin', 'genre': 'Fantasy', 'isbn': '978-0553103540', 'language': 'English', 'page_count': 694, 'publisher': 'Bantam', 'rating': 4.6, 'summary': 'The first book in A Song of Ice and Fire series, introducing the intricate world of Westeros.', 'title': 'A Game of Thrones', 'year_published': 1996})]