In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file into the process environment so the
# API key lookup in the following cell can find them.
load_dotenv()

True

In [3]:
# Look the Groq key up in the environment; the result is None when it is not defined.
Groq_API_key = os.environ.get("Groq_API_key")

In [4]:
# Never truncate cell contents when a DataFrame is displayed (plots are long strings).
pd.options.display.max_colwidth = None

In [5]:
# Fail fast with a readable message: assigning None into os.environ would
# otherwise raise an opaque "str expected, not NoneType" TypeError.
if Groq_API_key is None:
    raise RuntimeError(
        "Groq_API_key is not set; define it in your .env file or shell environment."
    )

os.environ["Groq_API_key"] = Groq_API_key
# langchain_groq's ChatGroq looks the key up under the upper-case name
# GROQ_API_KEY, and environment variable names are case-sensitive outside
# Windows, so export it under that name too (without overriding a value that
# is already set).
os.environ.setdefault("GROQ_API_KEY", Groq_API_key)

In [6]:
# Read the movie-plots CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../Data/wiki_movie_plots_deduped.csv")

In [7]:
# Peek at a single row to see which columns are available.
df.head(1)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers,"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"


In [8]:
# Keep only the first 500 rows so embedding and indexing stay quick.
df = df.head(500)

In [9]:
# Show the (rows, columns) size of the working frame.
df.shape

(500, 8)

In [10]:
# df.head()

In [11]:
# Only the title and the plot text are needed to build the retrieval corpus.
df_selected = df.loc[:, ["Title", "Plot"]]

In [12]:
# Character-based splitter for long plots. Sizes are in characters; the word
# counts below assume roughly 5 characters per word.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=1500,    # ~300 words per chunk
    chunk_overlap=250,  # ~50 words shared between neighbouring chunks
)

In [13]:
def row_to_text(row):
    """Build the text that gets embedded for one movie.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the
            ``"Title"`` and ``"Plot"`` keys.

    Returns:
        A single string of the form
        ``"Movie Name: <title> | Description: <plot>"``.
    """
    title, plot = row["Title"], row["Plot"]
    return "Movie Name: {} | Description: {}".format(title, plot)

In [14]:
# Split every movie's text into overlapping chunks and collect one record per
# chunk. Record ids have the form "<row index>_<chunk number>", so they are
# unique as long as the DataFrame index is unique.
all_chunks = []
# The loop variable is deliberately not called `id`: at notebook top level that
# would overwrite the builtin id() for every later cell.
for row_idx, row in df_selected.iterrows():
    pieces = splitter.split_text(row_to_text(row))
    all_chunks.extend(
        {
            "id": f"{row_idx}_{piece_no}",
            "text": piece,
            # Keep the title alongside each chunk: only the first chunk of a
            # movie starts with the "Movie Name: ..." prefix.
            "metadata": {"title": row["Title"]},
        }
        for piece_no, piece in enumerate(pieces)
    )

In [15]:
# Turn the list of chunk records into three parallel lists (same order as
# all_chunks), ready to be handed to the vector store.
texts, metadatas, ids = (
    [record[field] for record in all_chunks]
    for field in ("text", "metadata", "id")
)

In [16]:
# Name the embedding model explicitly instead of relying on the library
# default. The vectors persisted in the Chroma directory are only comparable
# with query vectors from the same model, so an implicit default that changed
# between library versions would silently break retrieval. This is the model
# HuggingFaceEmbeddings() uses when no name is given.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

  embeddings = HuggingFaceEmbeddings()
  embeddings = HuggingFaceEmbeddings()


In [17]:
# Directory (relative to the working directory) where Chroma keeps the index.
persistance_directly = 'movie_db'

# Embed all chunks and store them, with their ids and title metadata, in a
# Chroma collection backed by the directory above.
vectordb = Chroma.from_texts(
    texts=texts,
    ids=ids,
    metadatas=metadatas,
    embedding=embeddings,
    persist_directory=persistance_directly,
)

In [18]:
# Expose the vector store as a retriever; no search arguments are passed, so
# the library's default search settings apply.
retriever  = vectordb.as_retriever()

In [19]:
# Groq-hosted chat model; temperature 0 keeps answers as repeatable as possible.
llm = ChatGroq(
    temperature=0,
    model="llama-3.3-70b-versatile",
)

In [20]:
# Prompt for the QA chain. {context} and {question} are the two template
# variables; the doubled braces ({{ and }}) are escapes that produce the
# literal braces of the JSON skeleton shown to the model.
prompt = ChatPromptTemplate.from_template("""
Answer the question based on the following context:

Context: {context}

Question: {question}

Provide your response in the following JSON format:
{{
  "answer": "Your complete answer to the question",
  "contexts": ["Relevant excerpt from context 1", "Relevant excerpt from context 2"],
  "reasoning": "Explanation of how you used the context to answer the question"
}}

Important: 
- Return ONLY valid JSON, no additional text or markdown formatting.
- In the contexts array, include the most relevant excerpts from the provided context.
- Make the reasoning clear about which parts of the context were used.
""")

In [21]:
def _format_docs(docs):
    """Render retrieved documents as plain text for the prompt's {context} slot.

    Without this step the list of Document objects is interpolated into the
    prompt via its Python repr (escaped newlines, ids, raw metadata dicts).
    Each chunk is prefixed with its movie title from the metadata, because
    only the first chunk of a movie carries the "Movie Name: ..." prefix in
    its text.

    Args:
        docs: Iterable of retrieved documents exposing ``page_content`` and a
            ``metadata`` dict.

    Returns:
        One string with the chunks separated by blank lines.
    """
    return "\n\n".join(
        f"[{doc.metadata.get('title', 'Unknown title')}] {doc.page_content}"
        for doc in docs
    )


# Create the chain: the incoming question is sent to the retriever (whose
# results are formatted as text) and is also passed through unchanged; both
# fill the prompt, which goes to the LLM, and the reply is returned as a string.
qa_chain = (
    {
        "context": retriever | _format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [29]:
# Run the whole chain on one question. print() is used so the newlines in the
# returned JSON string are rendered instead of shown as escaped "\n".
answer = qa_chain.invoke("What is the plot of The Night Before Christmas?")
print(answer)

{
  "answer": "The plot of The Night Before Christmas involves Santa Claus preparing for Christmas by feeding his reindeer and finishing his work in the workshop. Meanwhile, children in a city household hang their stockings and go to bed, but they have trouble sleeping and engage in a pillow fight. Santa then leaves his home on a sleigh with his reindeer, enters the children's house through the chimney, and leaves them presents. The children wake up and enjoy their presents.",
  "contexts": [
    "Movie Name: The Night Before Christmas | Description: Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop.",
    "Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents."
  ],
  

In [25]:
# The Martyred Presidents
# The Night Before Christmas
