# Imports

In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import DirectoryLoader
from langchain.indexes import SQLRecordManager, index
from pinecone import Pinecone, PodSpec
import os

from dotenv import load_dotenv
load_dotenv()

True

# Loading Data

In [2]:
# Load every CSV file found directly under ./data through CSVLoader.
# The encoding is pinned to UTF-8 because the rows contain non-ASCII text
# (curly quotes, CJK keywords); without it the files are decoded with the
# platform default encoding, which is not UTF-8 everywhere.
# NOTE(review): in the recorded output some documents use another row's
# values as field names (e.g. "The Voyagers: Top Gun: Maverick"), which
# suggests one of the CSV files has no header row — confirm against the data.
loader = DirectoryLoader(
    path="./data",
    glob="*.csv",
    loader_cls=CSVLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,
)

docs = loader.load()

# Peek at one loaded document (capped at 1000 characters) as a sanity check.
print(docs[10].page_content[:1000])

100%|██████████| 2/2 [00:00<00:00, 234.36it/s]

The Voyagers: Top Gun: Maverick
108: 131
it: en
Time jump to 1939 with teenager Max and his friends as they try to navigate Mussolini's Rome and find Max's missing brother.: After more than thirty years of service as one of the Navy’s top aviators, and dodging the advancement in rank that would ground him, Pete “Maverick” Mitchell finds himself training a detachment of TOP GUN graduates for a specialized mission the likes of which no living pilot has ever seen.
2022-11-21: 2022-05-24
Fantasy, Adventure, TV Movie: Action, Drama
time travel, zeitreise: fighter pilot, u.s. navy, sequel, nuclear weapons, military





# Splitting Data

In [3]:
# Split the loaded documents into chunks of at most 1000 characters,
# with 200 characters of overlap between neighbouring chunks.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
splits = text_splitter.split_documents(docs)

In [4]:
# Show the text of the first chunk to check what the splitter produced.
first_split = splits[0]
print(first_split.page_content)

The Voyagers: The OctoGames
108: 88
it: en
Time jump to 1939 with teenager Max and his friends as they try to navigate Mussolini's Rome and find Max's missing brother.: Eight contestants compete in eight deadly, classic children's games. They seek fame beyond their wildest dreams, competing for the chance to take over the YouTube channel of the famous yet elusive masked content creator known only as "JaxPro".
2022-11-21: 2022-10-07
Fantasy, Adventure, TV Movie: Thriller, Action, Horror
time travel, zeitreise: 遊戲, 挑戰極限, 組團打怪


# Creating Embeddings and Uploading to Pinecone

In [5]:
# Pinecone settings come from the environment (populated by load_dotenv()).
PINECONE_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")

# Use the configured index name; fall back to the notebook's original
# default when the variable is unset or empty.
index_name = PINECONE_INDEX_NAME or "film-bot-index"

# Vector size used when creating the index; it matches the 'dimension'
# reported by describe_index_stats() for the existing index.
EMBEDDING_DIMENSION = 1536

pc = Pinecone(api_key=PINECONE_KEY)

# Create the index only when it is missing, so this cell can be re-run on a
# fresh kernel without uncommenting or editing code.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=PodSpec(environment="gcp-starter"),
    )

# Target index and check status
pc_index = pc.Index(index_name)
print(pc_index.describe_index_stats())

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

vectorstore = PineconeVectorStore(pc_index, embeddings)

# Create the SQLite-backed record manager that is passed to index() in the
# following cells; its namespace is derived from the index name.
namespace = f"pinecone/{index_name}"
record_manager = SQLRecordManager(
    namespace, db_url="sqlite:///record_manager_cache.sql"
)

record_manager.create_schema()

{'dimension': 1536,
 'index_fullness': 0.00217,
 'namespaces': {'': {'vector_count': 217}},
 'total_vector_count': 217}


In [14]:
def _clear():
    """
    Clear the vector store by running an indexing pass over zero documents.

    Nothing is added. The call relies on the ``cleanup="full"`` mode of
    LangChain's ``index`` API to remove previously indexed content; see the
    `full` cleanup mode section of the LangChain indexing documentation for
    why this works.
    """
    no_documents = []
    index(
        no_documents,
        record_manager,
        vectorstore,
        cleanup="full",
        source_id_key="source",
    )

# Uncomment this line if you want to clear the Pinecone vectorstore
# _clear()

# Index the chunks into the vector store via the record manager; the
# returned summary is displayed as the cell's output.
indexing_summary = index(
    splits,
    record_manager,
    vectorstore,
    cleanup="full",
    source_id_key="source",
)
indexing_summary

{'num_added': 218, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [7]:
# Smoke-test the vector store with a plain similarity search.
# The hits are bound to their own name so that `docs` (the documents loaded
# from the CSV files and consumed by the splitting cell) is not overwritten;
# otherwise re-running the splitting cell would split search results instead.
query = "Movies based on novels or books."
similar_docs = vectorstore.similarity_search(query)
print(similar_docs[0].page_content)

Movie_Name: The Fabelmans
Runtime: 151
Original_Language: en
Overview: Growing up in post-World War II era Arizona, young Sammy Fabelman aspires to become a filmmaker as he reaches adolescence, but soon discovers a shattering family secret and explores how the power of films can help him see the truth.
Release Date: 2022-11-11
Genre: Drama
Keywords: high school, husband wife relationship, arizona, bullying, family relationships, coming of age, anti-semitism, teenage boy, filmmaking, childhood, semi autobiographical, marital problem, big dreams, father son relationship, mother son relationship


# Creating a Retriever

In [8]:
# Wrap the vector store as a retriever: similarity search returning 6 hits.
retriever_search_kwargs = {"k": 6}
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_search_kwargs,
)

In [9]:
# Run a sample question through the retriever.
panem_question = "What are some films with about a person who becomes a tyrannical president of Panem?"
retrieved_docs = retriever.invoke(panem_question)

In [10]:
# Number of documents returned; the retriever was configured with k=6.
retrieved_count = len(retrieved_docs)
retrieved_count

6

In [11]:
# Show the text of the first retrieved document.
top_hit = retrieved_docs[0]
print(top_hit.page_content)

Migration: The Hunger Games: The Ballad of Songbirds & Snakes
83: 157
en: en
After a migrating duck family alights on their pond with thrilling tales of far-flung places, the Mallard family embarks on a family road trip, from New England, to New York City, to tropical Jamaica.: 64 years before he becomes the tyrannical president of Panem, Coriolanus Snow sees a chance for a change in fortunes when he mentors Lucy Gray Baird, the female tribute from District 12.
2023-12-06: 2023-11-15
Animation, Action, Adventure, Comedy, Family: Drama, Science Fiction, Action
duck, migration, flight, anthropomorphism, family, animal, chef, overprotective father, bird: daughter, based on novel or book, arena, fight, mentor, dystopia, riot, exploration, president, prequel, survival, murder, snow, creature, battle, fight to the death, young adult, origin story, based on young adult novel, death game, teenager


# Creating an LLM with Retriever

In [12]:
# Prompt text for the RAG chain; it exposes the {question} and {context}
# placeholders filled in by the chain built in this cell.
qa_template = """
    You are an assistant for question-answering tasks. Use the following pieces 
    of retrieved context to answer the question. If you don't know the answer, 
    just say that you don't know. Use four to five sentences maximum and keep 
    the answer concise.

    Question: {question} 

    Context: {context} 

    Answer:

    """
prompt_template = PromptTemplate.from_template(qa_template)

# Chat model used to generate the answers, with temperature set to 0.
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-0125")

def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents in input order, separated by a blank line
        (``"\\n\\n"``); an empty string when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# Chain input mapping: "context" is the retriever output passed through
# format_docs, "question" is the incoming value passed through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Inputs -> prompt -> chat model -> plain string output.
rag_chain = chain_inputs | prompt_template | llm | StrOutputParser()

In [13]:
# Stream the answer and print each piece as it arrives, without newlines.
summary_question = "Give me a summary of Teenage Mutant Ninja Turtles: Mutant Mayhem"
answer_stream = rag_chain.stream(summary_question)
for piece in answer_stream:
    print(piece, end="", flush=True)

Teenage Mutant Ninja Turtles: Mutant Mayhem follows the Turtle brothers as they venture into the human world to be accepted as normal teenagers through heroic acts with the help of their friend April O'Neil. However, they face a challenge when a crime syndicate unleashes an army of mutants against them. The movie combines elements of animation, comedy, action, and science fiction to create an exciting and nostalgic adventure for fans of the Teenage Mutant Ninja Turtles franchise. The story revolves around sibling relationships, crime-fighting, and the journey of the Turtles to prove themselves as heroes in New York City.