In [157]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader # Importing PDF loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
from langchain_openai import OpenAIEmbeddings # Importing OpenAI embeddings from Langchain
from langchain.schema import Document # Importing Document schema from Langchain
from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain
from dotenv import load_dotenv # Importing dotenv to get API key from .env file
from langchain_openai import ChatOpenAI # Import OpenAI LLM
from langchain_core.vectorstores import InMemoryVectorStore
from langchain import hub
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.document_loaders import PyPDFLoader

import pandas as pd
from pathlib import Path
import json
from dotenv import load_dotenv
import os # Importing os module for operating system functionalities
import shutil # Importing shutil module for high-level file operations
from IPython.display import display, Markdown

In [158]:
# %pip install pypdf

In [159]:
# Load environment variables from a local .env file into the process environment.
# A later cell reads OPENAI_MODEL via os.environ, so that variable must be defined
# in the .env file or already be set in the environment.
load_dotenv()

True

In [160]:
# Path to the movie dataset (JSON), relative to the notebook's working directory.
input_datapath = "../semantic-search/dataset.json"

# Pass the encoding explicitly: without it, open() falls back to the platform's
# locale encoding, so non-ASCII characters in the JSON could be misread or fail to decode.
with open(input_datapath, 'r', encoding='utf-8') as f:
    movie_data = json.load(f)

# Build a DataFrame from the parsed JSON, then show its shape and a preview of the first rows.
df = pd.DataFrame(movie_data)
print(df.shape)
df.head()

(10, 10)


Unnamed: 0,title,release_date,genres,original_language,vote_average,overview,tagline,combined,n_tokens,embedding
0,The Pope's Exorcist,2023-04-05,"['Horror', 'Mystery', 'Thriller']",English,7.4,"Father Gabriele Amorth, Chief Exorcist of the ...",Inspired by the actual files of Father Gabriel...,"The Pope's Exorcist Father Gabriele Amorth, Ch...",67,"[0.0099146804, -0.0019374829, -0.0009720114, -..."
1,Ant-Man and the Wasp: Quantumania,2023-02-15,"['Action', 'Adventure', 'Science Fiction']",English,6.6,Super-Hero partners Scott Lang and Hope van Dy...,Witness the beginning of a new dynasty.,Ant-Man and the Wasp: Quantumania Super-Hero p...,84,"[0.0057371012, -0.017788643, 0.0122131966, -0...."
2,Ghosted,2023-04-18,"['Action', 'Comedy', 'Romance']",English,7.2,Salt-of-the-earth Cole falls head over heels f...,Finding that special someone can be a real adv...,Ghosted Salt-of-the-earth Cole falls head over...,65,"[0.0011030367, -0.018147951, 0.0101844044, -0...."
3,Shazam! Fury of the Gods,2023-03-15,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,"Billy Batson and his foster siblings, who tran...",Oh. My. Gods.,Shazam! Fury of the Gods Billy Batson and his ...,62,"[-0.0110116974, -0.0404475406, -0.0210291967, ..."
4,Avatar: The Way of Water,2022-12-14,"['Science Fiction', 'Adventure', 'Action']",English,7.7,Set more than a decade after the events of the...,Return to Pandora.,Avatar: The Way of Water Set more than a decad...,72,"[-0.0010760396, -0.0292616803, -0.0164514501, ..."


In [161]:
# List the column names to see which fields are available for building documents.
df.columns 

Index(['title', 'release_date', 'genres', 'original_language', 'vote_average',
       'overview', 'tagline', 'combined', 'n_tokens', 'embedding'],
      dtype='object')

In [162]:
import ast

# Turn every movie row into a LangChain Document: the text is "<title> - <overview>",
# the metadata carries language, primary genre, release date and the row index.
documents = []
for index, row in df.iterrows():
    # `genres` is stored as the string form of a Python list (e.g. "['Horror', 'Mystery']");
    # literal_eval parses it without evaluating arbitrary code.
    genres = ast.literal_eval(row['genres'])
    md_dict = {
        "language": row['original_language'],
        # Only the first listed genre is kept. Fall back to a placeholder when the list
        # is empty instead of failing with IndexError on genres[0].
        "genre": genres[0] if genres else "Unknown",
        "release_date": row['release_date'],
        "source": index
    }
    # Document ids are strings, so convert the integer row index explicitly.
    doc = Document(id=str(index), page_content=row['title']+" - "+row['overview'], metadata=md_dict)
    documents.append(doc)
print(len(documents), "documents")

# Initialize text splitter with specified parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300, # Size of each chunk in characters
    chunk_overlap=100, # Overlap between consecutive chunks
)

# Split the movie documents into chunks. Chunks that come from the same movie keep
# that movie's metadata (see the two chunks with 'source': 1 in the displayed output).
chunks = text_splitter.split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

10 documents
Split 10 documents into 13 chunks.


In [163]:
# Display the split chunks to inspect their metadata and page_content.
chunks

[Document(metadata={'language': 'English', 'genre': 'Horror', 'release_date': '2023-04-05', 'source': 0}, page_content="The Pope's Exorcist - Father Gabriele Amorth, Chief Exorcist of the Vatican, investigates a young boy's terrifying possession and ends up uncovering a centuries-old conspiracy the Vatican has desperately tried to keep hidden."),
 Document(metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-02-15', 'source': 1}, page_content="Ant-Man and the Wasp: Quantumania - Super-Hero partners Scott Lang and Hope van Dyne, along with with Hope's parents Janet van Dyne and Hank Pym, and Scott's daughter Cassie Lang, find themselves exploring the Quantum Realm, interacting with strange new creatures and embarking on an adventure that"),
 Document(metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-02-15', 'source': 1}, page_content='the Quantum Realm, interacting with strange new creatures and embarking on an adventure that will push them b

In [164]:
# Create an in-memory vector store that uses OpenAIEmbeddings, then add the movie chunks to it.
vectorstore = InMemoryVectorStore(OpenAIEmbeddings())
# The return value of add_documents is not used afterwards, so it is bound to `_`.
_ = vectorstore.add_documents(documents=chunks)



In [165]:
def _filter_function(doc: Document) -> bool:
    """Return True only for documents whose ``genre`` metadata equals ``'Horror'``.

    Written as a ``filter`` callable for the vector store's similarity search.
    A document without a ``genre`` key yields False, because ``.get`` returns None.
    """
    genre = doc.metadata.get("genre")
    return genre == 'Horror'
    
query = "Something about religion"
# Active call: unfiltered top-2 search that returns each hit together with its score.
# The commented-out lines are alternatives: a plain search without scores, and a
# scored search restricted by _filter_function.
# results = vectorstore.similarity_search(query, k=2)
results = vectorstore.similarity_search_with_score(query, k=2)
# results = vectorstore.similarity_search_with_score(query, k=2, filter=_filter_function)

results

[(Document(id='cad165b6-4775-47e5-ac6d-638708990a91', metadata={'language': 'English', 'genre': 'Horror', 'release_date': '2023-04-05', 'source': 0}, page_content="The Pope's Exorcist - Father Gabriele Amorth, Chief Exorcist of the Vatican, investigates a young boy's terrifying possession and ends up uncovering a centuries-old conspiracy the Vatican has desperately tried to keep hidden."),
  0.7915740511648672),
 (Document(id='d06a19d3-1c34-49e2-951e-93da5a9fa694', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-02-15', 'source': 1}, page_content='the Quantum Realm, interacting with strange new creatures and embarking on an adventure that will push them beyond the limits of what they thought possible.'),
  0.7628955232654321)]

In [166]:
# Wrap the vector store as a retriever; 'k': 3 asks for three documents per query.
retriever = vectorstore.as_retriever(
    search_kwargs={
        'k': 3
    }
)

# Sanity check: run the retriever on the same `query` used for the similarity search.
retriever.invoke(input=query)

[Document(id='cad165b6-4775-47e5-ac6d-638708990a91', metadata={'language': 'English', 'genre': 'Horror', 'release_date': '2023-04-05', 'source': 0}, page_content="The Pope's Exorcist - Father Gabriele Amorth, Chief Exorcist of the Vatican, investigates a young boy's terrifying possession and ends up uncovering a centuries-old conspiracy the Vatican has desperately tried to keep hidden."),
 Document(id='d06a19d3-1c34-49e2-951e-93da5a9fa694', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-02-15', 'source': 1}, page_content='the Quantum Realm, interacting with strange new creatures and embarking on an adventure that will push them beyond the limits of what they thought possible.'),
 Document(id='70ac3d1e-a5ce-4503-86e8-59fe099322bd', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-03-15', 'source': 3}, page_content='Shazam! Fury of the Gods - Billy Batson and his foster siblings, who transform into superheroes by saying "Shazam!", are f

In [167]:
# Repeats the previous cell's retrieval with the same `query`; the displayed results are identical.
retriever.invoke(input=query)

[Document(id='cad165b6-4775-47e5-ac6d-638708990a91', metadata={'language': 'English', 'genre': 'Horror', 'release_date': '2023-04-05', 'source': 0}, page_content="The Pope's Exorcist - Father Gabriele Amorth, Chief Exorcist of the Vatican, investigates a young boy's terrifying possession and ends up uncovering a centuries-old conspiracy the Vatican has desperately tried to keep hidden."),
 Document(id='d06a19d3-1c34-49e2-951e-93da5a9fa694', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-02-15', 'source': 1}, page_content='the Quantum Realm, interacting with strange new creatures and embarking on an adventure that will push them beyond the limits of what they thought possible.'),
 Document(id='70ac3d1e-a5ce-4503-86e8-59fe099322bd', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-03-15', 'source': 3}, page_content='Shazam! Fury of the Gods - Billy Batson and his foster siblings, who transform into superheroes by saying "Shazam!", are f

In [168]:
# The chat model name comes from the OPENAI_MODEL environment variable;
# os.environ[...] raises KeyError if it is not set.
llm_model = os.environ["OPENAI_MODEL"]
print(llm_model)
# temperature=0.1: low sampling temperature (presumably for more focused, repeatable answers).
llm = ChatOpenAI(model=llm_model, temperature=0.1)


gpt-4o-mini


In [169]:
# Example for a public prompt (https://smith.langchain.com/hub/rlm/rag-prompt)
# Pull the shared RAG prompt from the LangChain Hub.
rag_prompt = hub.pull("rlm/rag-prompt", include_model=True)
# Show the template behind the first message; it expects the variables `context` and `question`.
rag_prompt.messages[0].prompt



PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:")

In [170]:
import pprint  # only needed for the plain-text alternative: pprint.pprint(result)

# Map each prompt variable to its producer: `context` is filled by the movie retriever,
# `question` is the input handed to the chain.
retrieval_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Pipeline: gather inputs -> fill the RAG prompt -> call the chat model -> extract the text.
rag_chain = retrieval_inputs | rag_prompt | llm | StrOutputParser()

query = "I want to get a movie about religion"
result = rag_chain.invoke(query)

# Render the answer as Markdown in the notebook output.
display(Markdown(result))

You might consider watching "The Pope's Exorcist," which revolves around Father Gabriele Amorth investigating a young boy's possession and uncovering a hidden Vatican conspiracy. This film explores themes of faith and the supernatural within a religious context.

In [171]:
# Location of the sample PDF, relative to the notebook's working directory.
path = "./data/Understanding_Climate_Change.pdf"

# Load the PDF into a list of Document objects.
loader = PyPDFLoader(path)
pdf_documents = loader.load()

In [172]:
# Display the loaded documents (page text plus metadata such as source, page and total_pages).
pdf_documents

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'author': 'Nir', 'moddate': '2024-07-13T20:17:34+03:00', 'source': './data/Understanding_Climate_Change.pdf', 'total_pages': 33, 'page': 0, 'page_label': '1'}, page_content='Understanding Climate Change \nChapter 1: Introduction to Climate Change \nClimate change refers to significant, long-term changes in the global climate. The term \n"global climate" encompasses the planet\'s overall weather patterns, including temperature, \nprecipitation, and wind patterns, over an extended period. Over the past century, human \nactivities, particularly the burning of fossil fuels and deforestation, have significantly \ncontributed to climate change. \nHistorical Context \nThe Earth\'s climate has changed throughout history. Over the past 650,000 years, there have \nbeen seven cycles of glacial advance and retreat, with the abrupt end of the last ice age about \n1

In [175]:
def replace_t_with_space(list_of_documents):
    """
    Swap every tab character for a single space in each document's page content.

    The documents are updated in place; the list that was passed in is returned
    so the call can be used in an assignment.

    Args:
        list_of_documents: A list of document objects, each with a string
            'page_content' attribute.

    Returns:
        The same list object, whose documents no longer contain tab characters.
    """
    # One translation table, reused for every document: tab -> space.
    tab_to_space = str.maketrans("\t", " ")

    for document in list_of_documents:
        document.page_content = document.page_content.translate(tab_to_space)

    return list_of_documents
    
# Split documents into chunks (chunk_size=1000, chunk_overlap=200, length measured with len).
# Note: this rebinds `text_splitter`, which an earlier cell created with different settings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, length_function=len)
texts = text_splitter.split_documents(pdf_documents)
# replace_t_with_space edits the chunks in place and returns the same list,
# so `cleaned_texts` and `texts` refer to the same (now cleaned) objects.
cleaned_texts = replace_t_with_space(texts)

In [177]:
# Embed the cleaned PDF chunks with OpenAIEmbeddings and index them in a Chroma vector store.
# NOTE: this rebinds `vectorstore`, which earlier referred to the InMemoryVectorStore of movie
# chunks; the `retriever` object created from that earlier store is not changed by the rebinding.
vectorstore = Chroma.from_documents(cleaned_texts, OpenAIEmbeddings())
# Retriever over the PDF index; "k": 2 asks for two chunks per query.
my_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

In [178]:
def retrieve_context_per_question(question, chunks_query_retriever):
    """
    Fetch the text of the documents that the retriever returns for a question.

    Args:
        question: The question to look up.
        chunks_query_retriever: Object with an ``invoke(question)`` method that
            returns an iterable of documents, each with a 'page_content' attribute.

    Returns:
        A list with the 'page_content' of every retrieved document, in the order
        the retriever returned them.
    """
    return [hit.page_content for hit in chunks_query_retriever.invoke(question)]

def show_context(context):
    """
    Print every context item under a numbered heading.

    Args:
        context (list): Context items to print, in order.

    Each item is written as a "Context <n>:" heading (numbered from 1), followed by
    the item on its own line and two blank lines.
    """
    for position, item in enumerate(context, start=1):
        # Heading, item text, then two blank lines; print() supplies the final newline.
        print(f"Context {position}:\n{item!s}\n\n")



# Try retrieval on the PDF index with a sample question and print the retrieved chunks.
test_query = "What is the main cause of climate change?"
context = retrieve_context_per_question(test_query, my_retriever)
show_context(context)

Context 1:
Chapter 2: Causes of Climate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increase in greenhouse gases in the 
atmosphere. Greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitrous 
oxide (N2O), trap heat from the sun, creating a "greenhouse effect." This effect is essential 
for life on Earth, as it keeps the planet warm enough to support life. However, human 
activities have intensified this natural process, leading to a warmer climate. 
Fossil Fuels 
Burning fossil fuels for energy releases large amounts of CO2. This includes coal, oil, and 
natural gas used for electricity, heating, and transportation. The industrial revolution marked 
the beginning of a significant increase in fossil fuel consumption, which continues to rise 
today. 
Coal


Context 2:
Most of these climate changes are attributed to very small variations in Earth's orbit that 
change the amount of solar energy our planet receives. During the Holocene epoch,

In [179]:
import pprint  # only needed for the plain-text alternative: pprint.pprint(result)

# Map each prompt variable to its producer: `context` is filled by the PDF retriever,
# `question` is the input handed to the chain.
retrieval_inputs = {"context": my_retriever, "question": RunnablePassthrough()}

# Pipeline: gather inputs -> fill the RAG prompt -> call the chat model -> extract the text.
rag_chain = retrieval_inputs | rag_prompt | llm | StrOutputParser()

result = rag_chain.invoke(test_query)

# Render the answer as Markdown in the notebook output.
display(Markdown(result))

The main cause of climate change is the increase in greenhouse gases in the atmosphere, primarily due to human activities. These gases, such as carbon dioxide, methane, and nitrous oxide, trap heat from the sun, intensifying the natural greenhouse effect. The burning of fossil fuels for energy is a significant contributor to this increase.

In [None]:
# TODO: Return citations with the answer, e.g. the `source` and `page` metadata of each retrieved document.