In [1]:
import sys
sys.path.append("../../")

import random

from langchain.chains.query_constructor.base import (
    AttributeInfo,
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.retrievers.self_query.chroma import ChromaTranslator

from self_query_rag.utils.utils import build_path, load_config_yaml
from self_query_rag.dataloader.dataloader import DataLoader

In [2]:
DIR_NAME = 'config'
FILE_NAME = 'config.yaml'

config_path = build_path(DIR_NAME, FILE_NAME)
config = load_config_yaml(config_path)

dataloader = DataLoader(config)
data = dataloader.transform()

data.head()

Unnamed: 0,year,title,director,cast,genre,wiki_page,description,country,id
0,1901,Kansas Saloon Smashers,unknown,unknown,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",United States,0
1,1901,Love by the Light of the Moon,unknown,unknown,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...",United States,1
2,1901,The Martyred Presidents,unknown,unknown,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...",United States,2
3,1901,"Terrible Teddy, the Grizzly King",unknown,unknown,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,United States,3
4,1902,Jack and the Beanstalk,"George S. Fleming, Edwin S. Porter",unknown,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,United States,4


In [3]:
data

Unnamed: 0,year,title,director,cast,genre,wiki_page,description,country,id
0,1901,Kansas Saloon Smashers,unknown,unknown,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",United States,0
1,1901,Love by the Light of the Moon,unknown,unknown,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...",United States,1
2,1901,The Martyred Presidents,unknown,unknown,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...",United States,2
3,1901,"Terrible Teddy, the Grizzly King",unknown,unknown,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,United States,3
4,1902,Jack and the Beanstalk,"George S. Fleming, Edwin S. Porter",unknown,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,United States,4
...,...,...,...,...,...,...,...,...,...
34881,2014,The Water Diviner,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ...",Turkey,34881
34882,2017,Çalgı Çengi İkimiz,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the...",Turkey,34882
34883,2017,Olanlar Oldu,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i...",Turkey,34883
34884,2017,Non-Transferable,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...,Turkey,34884


In [4]:
metadata_field_info = [
    AttributeInfo(
        name="year",
        description="The year the movie was released",
        type="integer",
    ),
    AttributeInfo(
        name="title",
        description="The title of the movie",
        type="string",
    ),
    AttributeInfo(
        name="director",
        description="The name of the movie director. It can be multiple names",
        type="string",
    ),
    AttributeInfo(
        name="cast",
        description="The name of the actors in the movie",
        type="string",
    ),
    AttributeInfo(
        name="genre",
        description="The genre of the movie. It can be multiple genres",
        type="string",
    ),
    AttributeInfo(
        name="description",
        description="Long form description of movie plot, i.e. the sequence of most important events in the movie",
        type="string",
    ),
    AttributeInfo(
        name="country",
        description="The cpuntry where the movie was produced",
        type="string",
    ),
]

In [5]:
METADATA_COLUMNS = ['year', 'title', 'cast', 'genre', 'country']

documents = []
for d in data.to_dict(orient="records"):
    documents.append(
        Document(
            page_content = d["description"],
            metadata = {col: d[col] for col in METADATA_COLUMNS},
        )
    )
    
len(documents)

34886

In [6]:
documents_subset = random.sample(documents, 1000)
len(documents_subset)

1000

In [7]:
vectorstore = Chroma.from_documents(documents_subset, OpenAIEmbeddings())

In [10]:
document_content_description = "Brief summary of a movie"

llm = ChatOpenAI(temperature=0)

prompt = get_query_constructor_prompt(
    document_content_description,
    metadata_field_info,
)

output_parser = StructuredQueryOutputParser.from_components()

In [11]:
query_constructor = prompt | llm | output_parser

output_format = query_constructor.invoke(
    {
        "query": "What are some sci-fi movies from the 90's directed by Luc Besson about taxi drivers"
    }
)

output_format

StructuredQuery(query='taxi driver', filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='sci-fi'), Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GTE: 'gte'>, attribute='year', value=1990), Comparison(comparator=<Comparator.LT: 'lt'>, attribute='year', value=2000)]), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='director', value='Luc Besson')]), limit=None)

In [12]:
retriever = SelfQueryRetriever(
    query_constructor=query_constructor,
    vectorstore=vectorstore,
    structured_query_translator=ChromaTranslator(),
)

In [13]:
retriever.invoke(
    "What are some sci-fi movies from the 90's directed by Luc Besson"
)

OutputParserException: Parsing text
```json
{
    "query": "sci-fi",
    "filter": "and(eq(\"genre\", \"sci-fi\"), and(gte(\"year\", 1990), lt(\"year\", 2000), eq(\"director\", \"Luc Besson\"))"
}
```
 raised following error:
Unexpected token Token('$END', '') at line 1, column 97.
Expected one of: 
	* RPAR
	* COMMA
