# Imports

In [18]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import DirectoryLoader
from langchain.indexes import SQLRecordManager, index
from pinecone import Pinecone, PodSpec
import os

from dotenv import load_dotenv
load_dotenv()

True

# Loading Data

In [19]:
loader = DirectoryLoader(
    path="./data", 
    glob="*.csv", 
    loader_cls=CSVLoader,
    show_progress=True)

docs = loader.load()

print(docs[10].page_content[:1000])

100%|██████████| 2/2 [00:00<00:00, 10.44it/s]

The Voyagers: Top Gun: Maverick
108: 131
it: en
Time jump to 1939 with teenager Max and his friends as they try to navigate Mussolini's Rome and find Max's missing brother.: After more than thirty years of service as one of the Navy’s top aviators, and dodging the advancement in rank that would ground him, Pete “Maverick” Mitchell finds himself training a detachment of TOP GUN graduates for a specialized mission the likes of which no living pilot has ever seen.
2022-11-21: 2022-05-24
Fantasy, Adventure, TV Movie: Action, Drama
time travel, zeitreise: fighter pilot, u.s. navy, sequel, nuclear weapons, military
: Top Gun: Maverick is a 2022 American action drama film directed by Joseph Kosinski and written by Ehren Kruger, Eric Warren Singer, and Christopher McQuarrie from stories by Peter Craig and Justin Marks. The film is a sequel to the 1986 film Top Gun. Tom Cruise reprises his starring role as the naval aviator Maverick. It is based on the characters of the original film created by




# Splitting Data

In [20]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [21]:
print(splits[0].page_content)

The Voyagers: The OctoGames
108: 88
it: en
Time jump to 1939 with teenager Max and his friends as they try to navigate Mussolini's Rome and find Max's missing brother.: Eight contestants compete in eight deadly, classic children's games. They seek fame beyond their wildest dreams, competing for the chance to take over the YouTube channel of the famous yet elusive masked content creator known only as "JaxPro".
2022-11-21: 2022-10-07
Fantasy, Adventure, TV Movie: Thriller, Action, Horror
time travel, zeitreise: 遊戲, 挑戰極限, 組團打怪
:


# Creating Embeddings and Uploading to Pinecone

In [22]:
index_name = "film-bot-index"

# Create empty index
PINECONE_KEY, PINECONE_INDEX_NAME = os.getenv(
    'PINECONE_API_KEY'), os.getenv('PINECONE_INDEX_NAME')

pc = Pinecone(api_key=PINECONE_KEY)

# Uncomment if index is not created already
# pc.create_index(
#     name="film-bot-index",
#     dimension=1536,
#     metric="cosine",
#     spec=PodSpec(
#         environment="gcp-starter"
#     )
# )

# Target index and check status
pc_index = pc.Index(index_name)
print(pc_index.describe_index_stats())

embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

vectorstore = PineconeVectorStore(
    pc_index, embeddings
)

# Create record manager
namespace = f"pinecone/{index_name}"
record_manager = SQLRecordManager(
    namespace, db_url="sqlite:///record_manager_cache.sql"
)

record_manager.create_schema()

{'dimension': 1536,
 'index_fullness': 0.00218,
 'namespaces': {'': {'vector_count': 218}},
 'total_vector_count': 218}


In [23]:
def _clear():
    """
    Hacky helper method to clear content.
    """
    index([], record_manager, vectorstore,
          cleanup="full", source_id_key="source")

# Uncomment this line if you want to clear the Pinecone vectorstore
# _clear()

index(splits, record_manager, vectorstore,
      cleanup="full", source_id_key="source")

{'num_added': 5316, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [24]:
query = "Movies based on novels or books."
docs = vectorstore.similarity_search(query)
print(docs[0].page_content)

Films
Batman (1966 film), based on the 1966 television series
Batman (1989 film)
Batman (serial), a 1943 film serial
The Batman (film), a 2022 film

Television
Batman (TV series), 1966–1968 television series
The Adventures of Batman, a 1968–1969 animated series (aka: The Batman/Superman Hour and Batman with Robin the Boy Wonder)
Batman: The Animated Series, a 1992–1995 animated series
The Batman (TV series), a 2000s animated series
Batman: The Brave and the Bold, a 2008–2011 animated series


# Creating a Retriever

In [25]:
retriever = vectorstore.as_retriever(
    search_type="similarity", search_kwargs={"k": 6})

In [26]:
retrieved_docs = retriever.invoke(
    "What are some films with about a person who becomes a tyrannical president of Panem?")

In [27]:
len(retrieved_docs)

6

In [28]:
print(retrieved_docs[0].page_content)

Plot
In dystopian, war-ravaged Panem, Snow family patriarch, General Crassus Snow, dies in the First Rebellion between the Capitol and thirteen Districts. Years later, Crassus' son, Coriolanus, is one of twenty-four Academy students selected to mentor a 10th Annual Hunger Games tribute. Coriolanus hopes for the Plinth Prize scholarship to restore his family's prosperity. Games creator and Academy dean, Casca Highbottom, advises the mentors to focus on entertainment rather than tributes winning. Coriolanus' wealthy friend and fellow mentor, Sejanus Plinth, resents the Games' cruelty.


# Creating an LLM with Retriever

In [29]:
llm = ChatOpenAI(model_name="gpt-4-0613", temperature=0)

prompt_template = PromptTemplate.from_template(
    """
    You are an assistant for question-answering tasks. Use the following pieces 
    of retrieved context to answer the question. If you don't know the answer, 
    just say that you don't know. Use four to five sentences maximum and keep 
    the answer concise.

    Question: {question} 

    Context: {context} 

    Answer:

    """
)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm
    | StrOutputParser()
)

In [40]:
for chunk in rag_chain.stream("Can you recommend films that deal with womanhood?"):
    print(chunk, end="", flush=True)

I'm sorry, I don't have specific information on films that deal with womanhood based on the provided context.