In [28]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.chains.query_constructor.base import AttributeInfo
from datetime import datetime
from langchain_openai import ChatOpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
import os
from langchain.indexes import SQLRecordManager, index

from dotenv import load_dotenv
load_dotenv()

True

In [29]:
# Loading in data from all csv files
loader = DirectoryLoader(
    path="./data",
    glob="*.csv",
    loader_cls=CSVLoader,
    show_progress=True)

docs = loader.load()

metadata_field_info = [
    AttributeInfo(
        name="Title", description="The title of the movie", type="string"),
    AttributeInfo(name="Runtime (minutes)",
                  description="The runtime of the movie in minutes", type="integer"),
    AttributeInfo(name="Language",
                  description="The language of the movie", type="string"),
    AttributeInfo(name="Release Date",
                  description="The release date of the movie", type="date"),
    AttributeInfo(name="Genre", description="The genre of the movie",
                  type="string or list[string]"),
    AttributeInfo(name="Keywords", description="The keywords of the movie",
                  type="string or list[string]"),
    AttributeInfo(name="Actors", description="The actors in the movie",
                  type="string or list[string]"),
    AttributeInfo(name="Directors", description="The directors of the movie",
                  type="string or list[string]"),
    AttributeInfo(name="Stream", description="The streaming platforms for the movie",
                  type="string or list[string]"),
    AttributeInfo(name="Buy", description="The platforms where the movie can be bought",
                  type="string or list[string]"),
    AttributeInfo(name="Rent", description="The platforms where the movie can be rented",
                  type="string or list[string]"),
    AttributeInfo(name="Production Companies",
                  description="The production companies of the movie", type="string or list[string]"),
    AttributeInfo(name="Website",
                  description="The website of the movie", type="string"),
]

def convert_to_list(doc, field):
    if field in doc.metadata and doc.metadata[field] is not None:
        doc.metadata[field] = [item.strip()
                               for item in doc.metadata[field].split(',')]

fields_to_convert = ['Genre', 'Actors', 'Directors',
                     'Keywords', 'Production Companies', 'Stream', 'Buy', 'Rent']

# Set 'overview' as 'page_content' and other fields as 'metadata'
for i, doc in enumerate(docs):
    # Parse the page_content string into a dictionary
    page_content_dict = dict(line.split(": ", 1)
                             for line in doc.page_content.split("\n") if ": " in line)
    
    # Set 'overview' as 'page_content' and other fields as 'metadata'
    doc.page_content = page_content_dict.get('Overview')
    doc.metadata = {field.name: page_content_dict.get(field.name) for field in metadata_field_info}

    # Convert 'Runtime (minutes)' from string to integer
    if 'Runtime (minutes)' in doc.metadata and doc.metadata['Runtime (minutes)'] is not None:
        doc.metadata['Runtime (minutes)'] = int(
            doc.metadata['Runtime (minutes)'])

    # Convert 'Release Date' from string to date
    # if 'Release Date' in doc.metadata and doc.metadata['Release Date'] is not None:
    #     doc.metadata['Release Date'] = datetime.strptime(
    #         doc.metadata['Release Date'], '%Y-%m-%d').date()
        
    # Convert fields from string to list of strings
    for field in fields_to_convert:
        convert_to_list(doc, field)        

100%|██████████| 5/5 [00:00<00:00, 335.78it/s]


In [30]:
print(docs[0])

page_content='Vic and Melinda Van Allen are a couple in the small town of Little Wesley. Their loveless marriage is held together only by a precarious arrangement whereby, in order to avoid the messiness of divorce, Melinda is allowed to take any number of lovers as long as she does not desert her family.' metadata={'Title': 'Deep Water', 'Runtime (minutes)': 116, 'Language': 'English', 'Release Date': '2022-03-18', 'Genre': ['Drama', 'Mystery', 'Thriller'], 'Keywords': ['based on novel or book', 'marriage', 'dysfunctional marriage', 'murder', 'cuckold', 'erotic thriller', 'cuckolded husband', 'psychological', 'snails'], 'Actors': ['Lil Rel Howery', 'Tracy Letts', 'Ana de Armas', 'Dash Mihok', 'Ben Affleck'], 'Directors': ['Adrian Lyne'], 'Stream': ['Hulu'], 'Buy': [''], 'Rent': [''], 'Production Companies': ['New Regency Pictures', 'Entertainment 360', 'Film Rites', 'Keep Your Head Productions'], 'Website': 'https://www.hulu.com/movie/deep-water-2c4ae82e-2c1b-41d5-a651-82189d1c8b2c'}


In [31]:
index_name = "film-bot-index"

# Create empty index
PINECONE_KEY, PINECONE_INDEX_NAME = os.getenv(
    'PINECONE_API_KEY'), os.getenv('PINECONE_INDEX_NAME')

pc = Pinecone(api_key=PINECONE_KEY)

# Uncomment if index is not created already
# pc.create_index(
#     name="film-bot-index",
#     dimension=1536,
#     metric="cosine",
#     spec=PodSpec(
#         environment="gcp-starter"
#     )
# )

# Target index and check status
pc_index = pc.Index(index_name)
print(pc_index.describe_index_stats())

embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

vectorstore = PineconeVectorStore(
    pc_index, embeddings
)

# Create record manager
namespace = f"pinecone/{index_name}"
record_manager = SQLRecordManager(
    namespace, db_url="sqlite:///record_manager_cache.sql"
)

record_manager.create_schema()

{'dimension': 1536,
 'index_fullness': 0.00506,
 'namespaces': {'': {'vector_count': 506}},
 'total_vector_count': 506}


In [43]:
def _clear():
    """
    Hacky helper method to clear content.
    """
    index([], record_manager, vectorstore,
          cleanup="full", source_id_key="Website")

# Uncomment this line if you want to clear the Pinecone vectorstore
_clear()

# Upload documents to pinecome
index(docs, record_manager, vectorstore,
      cleanup="full", source_id_key="Website")

{'num_added': 494, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [61]:
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)

document_content_description = "Brief summary of a movie"

# Define allowed comparators list
allowed_comparators = [
    "$eq",  # Equal to (number, string, boolean)
    "$ne",  # Not equal to (number, string, boolean)
    "$gt",  # Greater than (number)
    "$gte",  # Greater than or equal to (number)
    "$lt",  # Less than (number)
    "$lte",  # Less than or equal to (number)
    "$in",  # In array (string or number)
    "$nin",  # Not in array (string or number)
]

constructor_prompt = get_query_constructor_prompt(
    document_content_description,
    metadata_field_info,
    allowed_comparators=allowed_comparators,
)


chat_model = ChatOpenAI(
    # model='gpt-3.5-turbo-0125',
    model='gpt-4-0125-preview',
    temperature=0,
    streaming=True,
)

output_parser = StructuredQueryOutputParser.from_components()
query_constructor = constructor_prompt | chat_model | output_parser

In [64]:
question = "Recommend some films by Yorgos Lanthimos."

print(constructor_prompt.format(query=question))
print(type(constructor_prompt))

Your goal is to structure the user's query to match the request schema provided below.

<< Structured Request Schema >>
When responding use a markdown code snippet with a JSON object formatted in the following schema:

```json
{
    "query": string \ text string to compare to document contents
    "filter": string \ logical condition statement for filtering documents
}
```

The query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.

A logical condition statement is composed of one or more comparison and logical operation statements.

A comparison statement takes the form: `comp(attr, val)`:
- `comp` ($eq | $ne | $gt | $gte | $lt | $lte | $in | $nin): comparator
- `attr` (string):  name of attribute to apply the comparison to
- `val` (string): is the comparison value

A logical operation statement takes the form `op(statement1, statement2, ...)`:
- `op` (and | or | not): logica

In [65]:
query_constructor.invoke(
    {
        "query": question
    }
)

StructuredQuery(query='films by Yorgos Lanthimos', filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='Directors', value='Yorgos Lanthimos'), limit=None)

In [66]:
from langchain.retrievers.self_query.pinecone import PineconeTranslator

retriever = SelfQueryRetriever(
    query_constructor=query_constructor,
    vectorstore=vectorstore,
    structured_query_translator=PineconeTranslator(),
)

In [67]:
retriever.invoke(
    question
)

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import HumanMessagePromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

def format_docs(docs):
    return "\n\n".join(f"{doc.page_content}\n\nMetadata: {doc.metadata}" for doc in docs)

chat_model = ChatOpenAI(
    model='gpt-3.5-turbo-0125',
    # model='gpt-4-0125-preview',
    temperature=0,
    streaming=True,
)

prompt = ChatPromptTemplate.from_messages(
    [
        (
            'system',
            """
            Your goal is to recommend films to users based on their query and the retrieved context. Only recommend films 
            shown in your context. If a film doesn't seem relevant, omit it from your response. You should recommend no more than five films. 
            Your recommendation should be original, insightful, and at least two to three sentences long. Determine whether the query
            would best be answered using full text search or semantic search before picking your tool.

            # TEMPLATE FOR OUTPUT
            - [Title of Film](source link):
                - Runtime:
                - Release Date:
                - (Your reasoning for recommending this film)
            
            Question: {question} 
            Context: {context} 
            """
        ),
    ]
)

# Create a chatbot Question & Answer chain from the retriever
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | chat_model
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

question = "Recommend some Yorgos Lanthimos films."

query_response = query_constructor.invoke(
    {
        "query": question
    }
)

print(query_response)

# Only prints final answer
# for chunk in rag_chain_with_source.stream(question):
#     for key in chunk:
#         if key == 'answer':
#             print(chunk[key], end="", flush=True)

# Prints everything
output = {}
curr_key = None
for chunk in rag_chain_with_source.stream(question):
    for key in chunk:
        if key not in output:
            output[key] = chunk[key]
        else:
            output[key] += chunk[key]
        if key != curr_key:
            print(f"\n\n{key}: {chunk[key]}", end="", flush=True)
        else:
            print(chunk[key], end="", flush=True)
        curr_key = key
output

query='Yorgos Lanthimos' filter=None limit=None


question: Recommend some Yorgos Lanthimos films.

context: [Document(page_content='World-famous detective Benoit Blanc heads to Greece to peel back the layers of a mystery surrounding a tech billionaire and his eclectic crew of friends.', metadata={'Actors': ['Leslie Odom Jr.', 'Daniel Craig', 'Edward Norton', 'Kathryn Hahn', 'Janelle Monáe'], 'Buy': [''], 'Directors': ['Rian Johnson'], 'Genre': ['Comedy', 'Crime', 'Mystery'], 'Keywords': ['detective', 'greece', 'investigation', 'mona lisa (la gioconda)', 'satire', 'sequel', 'murder', 'billionaire', 'whodunit', 'puzzle box', 'clue', 'identical twin', 'murder mystery', 'arrested development', 'covid-19'], 'Language': 'English', 'Production Companies': ['T-Street'], 'Release Date': '2022-11-12', 'Rent': [''], 'Runtime (minutes)': 140.0, 'Stream': ['Netflix', 'Netflix basic with Ads'], 'Title': 'Glass Onion: A Knives Out Mystery', 'Website': 'https://glassonionmovie.com'}), Document(page_c

{'question': 'Recommend some Yorgos Lanthimos films.',
 'context': [Document(page_content='World-famous detective Benoit Blanc heads to Greece to peel back the layers of a mystery surrounding a tech billionaire and his eclectic crew of friends.', metadata={'Actors': ['Leslie Odom Jr.', 'Daniel Craig', 'Edward Norton', 'Kathryn Hahn', 'Janelle Monáe'], 'Buy': [''], 'Directors': ['Rian Johnson'], 'Genre': ['Comedy', 'Crime', 'Mystery'], 'Keywords': ['detective', 'greece', 'investigation', 'mona lisa (la gioconda)', 'satire', 'sequel', 'murder', 'billionaire', 'whodunit', 'puzzle box', 'clue', 'identical twin', 'murder mystery', 'arrested development', 'covid-19'], 'Language': 'English', 'Production Companies': ['T-Street'], 'Release Date': '2022-11-12', 'Rent': [''], 'Runtime (minutes)': 140.0, 'Stream': ['Netflix', 'Netflix basic with Ads'], 'Title': 'Glass Onion: A Knives Out Mystery', 'Website': 'https://glassonionmovie.com'}),
  Document(page_content='A stage director and an actress 