In [1]:
import os
import base64
import langchain
# from langchain.document_loaders.csv_loader import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [2]:
# Turn on langchain's global debug flag.
# NOTE(review): presumably this is what prints the [chain/start] / [llm/start]
# traces visible in later cell outputs — confirm.
langchain.debug = True

In [3]:
import json

# Read the base64-encoded OpenAI credentials and export them as environment
# variables. The file is parsed with json.load instead of eval(): eval() would
# execute any Python expression stored in the file.
with open("../data/secrets.json") as secrets:
    secrets_dict = json.load(secrets)

# The API key is mandatory: a missing "openai_api_key" entry raises KeyError.
open_api_key = base64.b64decode(secrets_dict["openai_api_key"]).decode('ascii')
os.environ["OPENAI_API_KEY"] = open_api_key

# The organization id is optional; export it only when the file provides one.
if "organization_id" in secrets_dict:
    openai_organization = base64.b64decode(secrets_dict["organization_id"]).decode('ascii')
    os.environ["OPENAI_ORGANIZATION"] = openai_organization

In [4]:
import csv
from typing import Dict, List, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class CSVLoader(BaseLoader):
    """Loads a CSV file into a list of documents.

    Each document represents one row of the CSV file. Every row is converted into
    key/value pairs, one ``column: value`` line per column, which form the
    document's page_content.

    Document metadata contains only the columns named in
    `metadata_columns_dtypes`. Columns declared as "int" or "float" are converted
    to numbers; columns declared with any other type keep the raw string read
    from the CSV. No `source` or `row` entry is written to the metadata.

    When `source_column` is given it is only validated: a column name that does
    not exist in the CSV raises `ValueError`.

    Output Example:
        .. code-block:: txt

            column1: value1
            column2: value2
            column3: value3
    """

    def __init__(
        self,
        file_path: str,
        source_column: Optional[str] = None,
        metadata_columns_dtypes: Optional[Dict[str, str]] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
        """

        Args:
            file_path: The path to the CSV file.
            source_column: The name of the column in the CSV file to use as the source.
              Optional. Defaults to None.
            metadata_columns_dtypes: Name of column as keys and data type as values.
              Optional. Defaults to None.
            csv_args: A dictionary of arguments to pass to the csv.DictReader.
              Optional. Defaults to None.
            encoding: The encoding of the CSV file. Optional. Defaults to None.
        """
        self.file_path = file_path
        self.source_column = source_column
        self.metadata_columns_dtypes = metadata_columns_dtypes
        self.encoding = encoding
        self.csv_args = csv_args or {}

    @staticmethod
    def _as_text(cell) -> str:
        """Render a DictReader key or value as stripped text."""
        if cell is None:
            # csv.DictReader uses None for missing trailing cells (restval) and
            # as the default key for cells beyond the header (restkey).
            return ""
        if isinstance(cell, str):
            return cell.strip()
        if isinstance(cell, (list, tuple)):
            # Cells beyond the header are collected by DictReader into a list.
            return ", ".join(str(item).strip() for item in cell)
        return str(cell).strip()

    @staticmethod
    def _cast_metadata(column, value, dtype: str, row_index: int):
        """Convert a raw CSV cell to the type declared for its metadata column.

        Only "int" and "float" trigger a conversion; any other dtype returns
        `value` untouched. Parsing is done with int()/float() rather than eval()
        so CSV content is never executed as code.

        Raises:
            ValueError: If the cell is missing or is not a valid number of the
              declared type.
        """
        if dtype not in ("int", "float"):
            return value
        try:
            if dtype == "float":
                return float(value)
            try:
                return int(value)
            except ValueError:
                # Accept integral values written with a decimal point ("5.0"),
                # but refuse to silently truncate a fractional one ("5.8").
                number = float(value)
                if not number.is_integer():
                    raise
                return int(number)
        except (TypeError, ValueError):
            raise ValueError(
                f"Cannot convert value {value!r} of column '{column}' "
                f"(row {row_index}) to {dtype}."
            ) from None

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            One Document per CSV row, in file order.

        Raises:
            ValueError: If `source_column` is set but is not a column of the CSV,
              or if a cell of an "int"/"float" metadata column cannot be parsed.
        """
        dtypes = self.metadata_columns_dtypes or {}
        docs = []
        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
            for i, row in enumerate(csv_reader):
                # The source value itself is not stored in the metadata; the
                # column is only checked so a misspelt name fails loudly.
                if self.source_column is not None and self.source_column not in row:
                    raise ValueError(
                        f"Source column '{self.source_column}' not found in CSV file."
                    )

                content = "\n".join(
                    f"{self._as_text(k)}: {self._as_text(v)}" for k, v in row.items()
                )

                # Keep row order so metadata keys appear in CSV column order.
                metadata = {
                    k: self._cast_metadata(k, v, dtypes[k], i)
                    for k, v in row.items()
                    if k in dtypes
                }

                docs.append(Document(page_content=content, metadata=metadata))

        return docs

In [11]:
# Place your CSV file in the data folder and load it here.
# "vote_average" is declared as a float metadata column, so each document carries
# it as a number in its metadata.
loader = CSVLoader('../data/movies_title_overview_vote.csv', metadata_columns_dtypes={"vote_average": "float"})

In [12]:
# Splitter handed to the index creator: chunk size 5000, no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=0,
)

In [13]:
# Create the vector index over the CSV documents: `text_splitter` chunks the
# documents produced by `loader`, and the resulting index is kept in `docsearch`.
index_creator = VectorstoreIndexCreator(text_splitter=text_splitter)
docsearch = index_creator.from_loaders([loader])

# Self-query retriever

## Definition

In [14]:

# Chat model used by the self-query retriever (temperature 0, max_tokens 1000).
llm_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo", max_tokens=1000)
# Alternative model, kept for quick switching:
# llm_model = ChatOpenAI(temperature=0, model="gpt-4", max_tokens=1000)

In [15]:
# Metadata attributes the self-query retriever is told about. Only
# "vote_average" is described, matching the single metadata column loaded from
# the CSV.
metadata_field_info = [
    AttributeInfo(
        name="vote_average",
        description="The average score given to the movie.",
        type="float",
    )
]
# Natural-language description of the document contents passed to the retriever.
document_content_description = "List of movies with an overview and scoring from a public website."

# TODO: explore the use_original_query argument and find out whether the prompt
# template can be changed.
# NOTE(review): search_kwargs is presumably forwarded to the vector store search,
# with "k" being the number of documents to return — confirm.
sq_retriever = SelfQueryRetriever.from_llm(
    llm=llm_model,
    vectorstore=docsearch.vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    enable_limit=False,
    verbose=True,
    search_kwargs={"k": 20}
)

## Explore internal prompts and processes

In [16]:
# Show the few-shot examples (data_source / user_query / structured_request)
# stored on the retriever's internal prompt.
print(sq_retriever.llm_chain.prompt.examples)

[{'i': 1, 'data_source': '```json\n{{\n    "content": "Lyrics of a song",\n    "attributes": {{\n        "artist": {{\n            "type": "string",\n            "description": "Name of the song artist"\n        }},\n        "length": {{\n            "type": "integer",\n            "description": "Length of the song in seconds"\n        }},\n        "genre": {{\n            "type": "string",\n            "description": "The song genre, one of "pop", "rock" or "rap""\n        }}\n    }}\n}}\n```', 'user_query': 'What are songs by Taylor Swift or Katy Perry about teenage romance under 3 minutes long in the dance pop genre', 'structured_request': '```json\n{{\n    "query": "teenager love",\n    "filter": "and(or(eq(\\"artist\\", \\"Taylor Swift\\"), eq(\\"artist\\", \\"Katy Perry\\")), lt(\\"length\\", 180), eq(\\"genre\\", \\"pop\\"))"\n}}\n```'}, {'i': 2, 'data_source': '```json\n{{\n    "content": "Lyrics of a song",\n    "attributes": {{\n        "artist": {{\n            "type": "str

In [17]:
# Variables the internal prompt expects; the output shows only 'query'.
sq_retriever.llm_chain.prompt.input_variables

['query']

In [27]:
# Render the full internal prompt for a sample query to inspect its wording.
print(sq_retriever.llm_chain.prompt.format_prompt(query="I want to watch a movie about outer space exploration."))

text='Your goal is to structure the user\'s query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    "query": string \\ text string to compare to document contents\n    "filter": string \\ logical condition statement for filtering documents\n}\n```\n\nThe query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.\n\nA logical condition statement is composed of one or more comparison and logical operation statements.\n\nA comparison statement takes the form: `comp(attr, val)`:\n- `comp` (eq | ne | gt | gte | lt | lte): comparator\n- `attr` (string):  name of attribute to apply the comparison to\n- `val` (string): is the comparison value\n\nA logical operation statement takes the form `op(statement1, statement2, ...)`:\n- `op` (and | or): 

In [23]:
# Build the prompts and the stop value for a sample query through the chain's
# own prep_prompts, so the next cells can replay the chain step by step.
prompt_eip, stop_eip = sq_retriever.llm_chain.prep_prompts(
    [{"query": "I want to watch a movie about outer space exploration."}])

In [24]:
# Inspect the prepared prompt values (a list of StringPromptValue in the output).
prompt_eip

[StringPromptValue(text='Your goal is to structure the user\'s query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    "query": string \\ text string to compare to document contents\n    "filter": string \\ logical condition statement for filtering documents\n}\n```\n\nThe query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.\n\nA logical condition statement is composed of one or more comparison and logical operation statements.\n\nA comparison statement takes the form: `comp(attr, val)`:\n- `comp` (eq | ne | gt | gte | lt | lte): comparator\n- `attr` (string):  name of attribute to apply the comparison to\n- `val` (string): is the comparison value\n\nA logical operation statement takes the form `op(statement1, statement2, ...)`:\n

In [25]:
# Call the chain's underlying LLM directly with the prepared prompts and stop value.
resp_eip = sq_retriever.llm_chain.llm.generate_prompt(prompt_eip, stop_eip, callbacks=None)

[32;1m[1;3m[llm/start][0m [1m[1:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Your goal is to structure the user's query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    \"query\": string \\ text string to compare to document contents\n    \"filter\": string \\ logical condition statement for filtering documents\n}\n```\n\nThe query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.\n\nA logical condition statement is composed of one or more comparison and logical operation statements.\n\nA comparison statement takes the form: `comp(attr, val)`:\n- `comp` (eq | ne | gt | gte | lt | lte): comparator\n- `attr` (string):  name of attribute to apply the comparison to\n- `val` (string): is the compariso

In [26]:
# Convert the raw LLM result into chain outputs and show the first one: the
# structured request (query + filter) generated for the sample query.
sq_retriever.llm_chain.create_outputs(resp_eip)[0]

{'text': '```json\n{\n    "query": "outer space exploration",\n    "filter": "NO_FILTER"\n}\n```'}

In [28]:
# Run the retriever end to end on the same sample query and display the documents.
sq_retriever.get_relevant_documents("I want to watch a movie about outer space exploration.")



[32;1m[1;3m[chain/start][0m [1m[1:retriever:Retriever > 2:chain:LLMChain] Entering Chain run with input:
[0m{
  "query": "I want to watch a movie about outer space exploration."
}
[32;1m[1;3m[llm/start][0m [1m[1:retriever:Retriever > 2:chain:LLMChain > 3:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Your goal is to structure the user's query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    \"query\": string \\ text string to compare to document contents\n    \"filter\": string \\ logical condition statement for filtering documents\n}\n```\n\nThe query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.\n\nA logical condition statement is composed of one or more comparison and logical operation 

[Document(page_content='title: Sphere\noverview: The OSSA discovers a spacecraft thought to be at least 300 years old at the bottom of the ocean. Immediately following the discovery, they decide to send a team down to the depths of the ocean to study the space craft.They are the best of best, smart and logical, and the perfect choice to learn more about the spacecraft.\nvote_average: 5.8', metadata={'vote_average': 5.8}),
 Document(page_content="title: Space Battleship Yamato\noverview: In 2199, five years after the Gamilons began an invasion of Earth, the planet has been ravaged by the aliens' bombs. The remnants of humanity have fled underground to escape the irradiated surface. One day, former pilot Susumu Kodai discovers a capsule sent from the planet Iscandar that tells of a device that can remove the radiation from the Earth's surface. The Earth Defense Force rebuilds the battleship Yamato with a new type of propulsion system to make the 148,000 light year trip to Iscandar in hop

# Retrieval QA Chain

## Definition

In [29]:
# Prompt for the answering step. It has two placeholders: {context} and
# {question}, the same two names declared in input_variables below.
# NOTE(review): presumably the RetrievalQA chain fills {context} with the
# retrieved documents and {question} with the user query — confirm.
template = """Use the following movies data to find the best matches for the user request in the question overview topic. Rules:
- You can return more than one movie if they are a good match. 
- Answer with movie names of the medias and a short text justifying the choice.
- The justification must take into account overview topic matching and vote average score (higher=better).

Media data:
{context}

Question overview topic: 
{question}

Example answer:
1. Title: [selected movie title]
- Justification: [Given justification]
- Score: [vote_average]

Movies attending to rules and ordered from best to worst:
"""

# Wrap the template so it can be passed to the chain as its prompt.
alt_retrieval_prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [30]:
# Chat model that writes the final answer in the RetrievalQA chain
# (temperature 0, max_tokens 1000).
agent_llm_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo", max_tokens=1000)
# Alternative model, kept for quick switching:
# agent_llm_model = ChatOpenAI(temperature=0, model="gpt-4", max_tokens=1000)

In [31]:
# Wire the answering model, the self-query retriever and the custom prompt into
# a RetrievalQA chain of type "stuff".
growwer_media = RetrievalQA.from_chain_type(
    llm=agent_llm_model,
    chain_type="stuff",
    retriever=sq_retriever,
    chain_type_kwargs={"prompt": alt_retrieval_prompt},
)

## Testing

In [32]:
# Simple question that combines a topic ("outer space exploration") with a
# rating constraint ("rating over 5"), run through the whole QA chain.
question = "A movie about outer space exploration with rating over 5"
response = growwer_media.run(question)
print(response)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "A movie about outer space exploration with rating over 5"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 3:chain:LLMChain] Entering Chain run with input:
[0m{
  "query": "A movie about outer space exploration with rating over 5"
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 3:chain:LLMChain > 4:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Your goal is to structure the user's query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    \"query\": string \\ text string to compare to document contents\n    \"filter\": string \\ logical condition statement for filtering documents\n}\n```\n\nThe query string should contain only text that is e

In [33]:
# Print the answer again on its own, without the debug trace of the previous cell.
print(response)

1. Interstellar
- Justification: Interstellar is a highly acclaimed movie about outer space exploration with a vote average of 8.1, making it the highest-rated movie in the list. It explores the concept of interstellar travel and the challenges faced by a group of explorers. The high vote average indicates that it is a well-regarded movie in this genre.
- Score: 8.1

2. Gravity
- Justification: Gravity is another popular movie about outer space exploration with a vote average of 7.3. It tells the story of two astronauts stranded in space after their shuttle is destroyed. The movie received critical acclaim for its stunning visuals and intense storytelling, making it a good choice for someone interested in outer space exploration.
- Score: 7.3

3. 2001: A Space Odyssey
- Justification: 2001: A Space Odyssey is a classic movie in the genre of outer space exploration. With a vote average of 7.9, it is highly regarded for its groundbreaking visuals and thought-provoking storyline. It delve