In [1]:
import sys
sys.path.append("../../")

import os
from typing import List, Optional, Union
from dotenv import load_dotenv
import pandas as pd
import random
from langchain_core.documents import Document

import chromadb.utils.embedding_functions as embedding_functions

from self_query_summarization.utils.utils import build_path, load_config_yaml
from self_query_summarization.dataloader.dataloader import DataLoader
from self_query_summarization.retriever.retriever import ChromaRetriever

### Settings

In [192]:
# Seed Python's global RNG so anything drawing from `random` is repeatable.
random.seed(42)

# Load variables from a local .env file (if present) into the environment,
# then read the OpenAI key; API_KEY is None when the variable is not set.
load_dotenv()
API_KEY = os.environ.get("OPENAI_API_KEY")

# Directory and file name of the YAML configuration file.
DIR_NAME, FILE_NAME = 'config', 'config.yaml'

### Input data

In [193]:
# Build the path to the configuration file from the settings constants.
config_path = build_path(DIR_NAME, FILE_NAME)
# Load the YAML configuration that is handed to the data loader.
config = load_config_yaml(config_path)

# Create the loader from the configuration and produce the movie table used
# by the rest of the notebook.
dataloader = DataLoader(config)
data = dataloader.transform()

# Preview the first rows (year, title, director, cast, genre, description, ...).
data.head()

Unnamed: 0,year,title,director,cast,genre,wiki_page,description,country,id
0,1901,Kansas Saloon Smashers,unknown,unknown,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",United States,0
1,1901,Love by the Light of the Moon,unknown,unknown,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...",United States,1
2,1901,The Martyred Presidents,unknown,unknown,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...",United States,2
3,1901,"Terrible Teddy, the Grizzly King",unknown,unknown,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,United States,3
4,1902,Jack and the Beanstalk,"George S. Fleming, Edwin S. Porter",unknown,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,United States,4


### Initialize ChromaDB

In [194]:
# Column holding the text of each document (presumably what gets embedded and
# searched by similarity — confirm against ChromaRetriever).
DOCUMENT_COLUMN = 'description'
# Columns stored next to each document as filterable metadata. `director` is
# listed because the self-query attribute schema in section 2 advertises it;
# an attribute that is not stored here can never satisfy a generated filter.
METADATA_COLUMNS = ['year', 'title', 'director', 'cast', 'genre', 'country']

# The recorded output shows that constructing the retriever deletes and then
# re-creates the `wiki_movie_plots` collection.
# NOTE(review): the keyword is spelled `metadata_colums`; it presumably matches
# ChromaRetriever's signature, so it is left untouched — confirm before renaming.
retriever = ChromaRetriever(
    document_column = DOCUMENT_COLUMN,
    metadata_colums = METADATA_COLUMNS,
    collection_name = "wiki_movie_plots",
    embedding_provider="sentence-transformers"
)

Deleting collection wiki_movie_plots
Creating collection wiki_movie_plots


### Embed and load documents

In [195]:
# Upload the movie table to the collection. With n_samples=1000 the recorded
# output reports embeddings being generated for 1000 documents.
retriever.upload(data, n_samples=1000)

Generating embeddings for 1000 documents


### 1. Query documents through DB

In [206]:
def show_results(results):
    """Print the title of every hit returned for the first query, one per line.

    Args:
        results: Mapping with a ``'metadatas'`` entry that holds one list of
            metadata dicts per query; each dict must contain a ``'title'`` key.
    """
    first_query_hits = results['metadatas'][0]
    titles = []
    for hit in first_query_hits:
        titles.append(hit['title'])
    print('\n'.join(titles))

##### 1.1 Query by sentence similarity

In [212]:
# Plot-style description; the expected best match is "The Slayer".
query_text = (
    "Four young people vacation on a barren island. "
    "One of them, a female artist, has dreams that depict ghastly murders"
)

# Similarity search only: no metadata or document filter is passed.
results = retriever.query(query_text)
show_results(results)

length of query_embeddings 384
The Slayer
Black Moon
The Reincarnation of Peter Proud
Hide and Seek
The Boys


##### 1.2 Query by sentence similarity with a metadata filter

In [214]:
# Same plot description as in 1.1 (expected best match there: "The Slayer").
query_text = (
    "Four young people vacation on a barren island. "
    "One of them, a female artist, has dreams that depict ghastly murders"
)

# Both bounds are strict, so only years 1991-1999 pass the filter.
# NOTE(review): 1990 itself is excluded by `$gt` — confirm that is intended.
metadata_filter = {"$and": [{"year": {"$gt": 1990}}, {"year": {"$lt": 2000}}]}

# Similarity search restricted to documents whose metadata passes the filter.
results = retriever.query(query_text, where=metadata_filter)
show_results(results)

length of query_embeddings 384
Ringmaster
Aakhri Cheekh
The Birds II: Land's End
Hideaway
The Midas Touch


##### 1.3 Query by sentence similarity with metadata and document-content filters

In [216]:
# Same plot description as in 1.1 (expected best match there: "The Slayer").
query_text = (
    "Four young people vacation on a barren island. "
    "One of them, a female artist, has dreams that depict ghastly murders"
)
# Metadata condition: `genre` must be one of the listed values.
metadata_filter = {"genre": {"$in": ["horror"]}}
# Document condition: the document text must contain "dream".
document_filter = {"$contains": "dream"}

# Similarity search narrowed by both the metadata and the document filter.
results = retriever.query(query_text, where=metadata_filter, where_document=document_filter)
show_results(results)

length of query_embeddings 384
The Slayer
The Reincarnation of Peter Proud
Excision


### 2. Query documents through Self-Query

In [227]:
from self_query_summarization.query_constructor.chains import get_results_basic, SelfQueryParser
from langchain.output_parsers import PydanticOutputParser

# Natural-language request that mixes a plot description with constraints
# ("comic", "from the 60s") that belong in a metadata filter.
query_text = "Comic movie from the 60s about young people going to vacation on a barren island. One of them, a female artist, has dreams that depict ghastly murders"

# JSON description of the filterable attributes, passed to the self-query chain.
# It must stay valid JSON: no doubled closing quotes and no trailing commas.
# NOTE(review): every attribute advertised here should also be stored as
# metadata in the collection, otherwise a generated filter on it matches nothing.
metadata_attributes = """
{
    "attributes": {
        "year": {
            "type": "integer",
            "description": "The year the movie was released"
        },
        "title": {
            "type": "string",
            "description": "The title of the movie"
        },
        "genre": {
            "type": "string",
            "description": "The genre of the movie in lowercase. It can contains multiple genres"
        },
        "director": {
            "type": "string",
            "description": "The name of the movie director. It can be multiple names"
        },
        "cast": {
            "type": "string",
            "description": "The name of the actors in the movie"
        },
        "country": {
            "type": "string",
            "description": "The country where the movie was produced"
        }
    }
}
"""

# Hand the attribute schema and the user request to the self-query chain
# (presumably an LLM call — see get_results_basic for details).
model_output = get_results_basic(attributes=metadata_attributes, query=query_text)

# Parse the raw chain output into a SelfQueryParser object.
parser = PydanticOutputParser(pydantic_object=SelfQueryParser)
parser_output = parser.invoke(model_output)

# Rebind the notebook-level names to the rewritten query and generated filter,
# then print each on its own line.
query_text, metadata_filter = parser_output.query, parser_output.filter
print(query_text, metadata_filter, sep="\n")

Comic movie from the 60s about young people going to vacation on a barren island having nightmares about murders
{'$and': [{'year': {'$gt': 1959}}, {'year': {'$lt': 1970}}, {'genre': {'$in': ['comedy']}}]}


In [228]:
# Run the search with the query text and metadata filter produced by the
# self-query step.
results = retriever.query(query_text, where=metadata_filter)
show_results(results)

length of query_embeddings 384
Goodbye Charlie
Bob & Carol & Ted & Alice
The Disorderly Orderly
Everything's Ducky
No My Darling Daughter
