#### RAG APP

##### IMPORTS

In [1]:
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
import os

In [2]:
load_dotenv()

True

In [3]:
# os.environ['CURL_CA_BUNDLE'] = ''
# os.environ['REQUESTS_CA_BUNDLE'] = ''

##### MODELS LOADING

In [4]:
from langchain_openai import OpenAIEmbeddings
embedding_function = OpenAIEmbeddings(model="text-embedding-3-small")

In [5]:
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

##### DATA PREPARATION

In [6]:
PDF_FILE_PATH = r"report2\report2.pdf"

##### Load PDF and Create vector store

In [7]:
from langchain_community.document_loaders import PyMuPDFLoader
loader = PyMuPDFLoader(PDF_FILE_PATH)

In [8]:
docs = loader.load()

In [9]:
docs[0]

Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.0 (Macintosh)', 'creationdate': '2023-04-03T11:42:59-07:00', 'source': 'report2\\report2.pdf', 'file_path': 'report2\\report2.pdf', 'total_pages': 8, 'format': 'PDF 1.7', 'title': 'Guidance Note on The Implications of Cyber Threats for Humanitarians', 'author': 'UN OCHA Centre for Humanitarian Data', 'subject': '', 'keywords': '', 'moddate': '2023-04-05T14:07:56-04:00', 'trapped': '', 'modDate': "D:20230405140756-04'00'", 'creationDate': "D:20230403114259-07'00'", 'page': 0}, page_content='OCHA CENTRE FOR HUMANITARIAN DATA\n1\nMARCH 2023\nINTRODUCTION\nHumanitarian organizations rely more than ever on digital technologies to assist and protect people in \ncrisis.1 These technologies enable humanitarians to gather data to understand and respond to the needs of \naffected people, and offer new channels to deliver aid through unprecedented digital proximity.2 However, \nthis increased digitalization 

In [10]:
docs

[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.0 (Macintosh)', 'creationdate': '2023-04-03T11:42:59-07:00', 'source': 'report2\\report2.pdf', 'file_path': 'report2\\report2.pdf', 'total_pages': 8, 'format': 'PDF 1.7', 'title': 'Guidance Note on The Implications of Cyber Threats for Humanitarians', 'author': 'UN OCHA Centre for Humanitarian Data', 'subject': '', 'keywords': '', 'moddate': '2023-04-05T14:07:56-04:00', 'trapped': '', 'modDate': "D:20230405140756-04'00'", 'creationDate': "D:20230403114259-07'00'", 'page': 0}, page_content='OCHA CENTRE FOR HUMANITARIAN DATA\n1\nMARCH 2023\nINTRODUCTION\nHumanitarian organizations rely more than ever on digital technologies to assist and protect people in \ncrisis.1 These technologies enable humanitarians to gather data to understand and respond to the needs of \naffected people, and offer new channels to deliver aid through unprecedented digital proximity.2 However, \nthis increased digitalization

In [11]:
docs[0].page_content

'OCHA CENTRE FOR HUMANITARIAN DATA\n1\nMARCH 2023\nINTRODUCTION\nHumanitarian organizations rely more than ever on digital technologies to assist and protect people in \ncrisis.1 These technologies enable humanitarians to gather data to understand and respond to the needs of \naffected people, and offer new channels to deliver aid through unprecedented digital proximity.2 However, \nthis increased digitalization is not without risks—chief among them is the growing risk of cyber threats.3 \nCyber threats are one of the most pressing issues facing the humanitarian sector today.4 Digital \ntransformation, increasing dependence on information and communications technology (ICT), and the \nprevalence of cyber threats create a new array of risks for humanitarian agencies and the people they serve. \nA few examples include:\n1     NetHope (2022). Humanitarians (and data) #NotATarget.\n2    Massimo Marelli and Adrian Perrig (2020). Hacking Humanitarians: Mapping The Cyber Environment And Threa

In [12]:
import pprint
pprint.pp(docs[0].metadata)

{'producer': 'Adobe PDF Library 15.0',
 'creator': 'Adobe InDesign CC 13.0 (Macintosh)',
 'creationdate': '2023-04-03T11:42:59-07:00',
 'source': 'report2\\report2.pdf',
 'file_path': 'report2\\report2.pdf',
 'total_pages': 8,
 'format': 'PDF 1.7',
 'title': 'Guidance Note on The Implications of Cyber Threats for '
          'Humanitarians',
 'author': 'UN OCHA Centre for Humanitarian Data',
 'subject': '',
 'keywords': '',
 'moddate': '2023-04-05T14:07:56-04:00',
 'trapped': '',
 'modDate': "D:20230405140756-04'00'",
 'creationDate': "D:20230403114259-07'00'",
 'page': 0}


In [13]:
all_content = ""

for doc in docs:
    all_content += doc.page_content


In [14]:
all_content

'OCHA CENTRE FOR HUMANITARIAN DATA\n1\nMARCH 2023\nINTRODUCTION\nHumanitarian organizations rely more than ever on digital technologies to assist and protect people in \ncrisis.1 These technologies enable humanitarians to gather data to understand and respond to the needs of \naffected people, and offer new channels to deliver aid through unprecedented digital proximity.2 However, \nthis increased digitalization is not without risks—chief among them is the growing risk of cyber threats.3 \nCyber threats are one of the most pressing issues facing the humanitarian sector today.4 Digital \ntransformation, increasing dependence on information and communications technology (ICT), and the \nprevalence of cyber threats create a new array of risks for humanitarian agencies and the people they serve. \nA few examples include:\n1     NetHope (2022). Humanitarians (and data) #NotATarget.\n2    Massimo Marelli and Adrian Perrig (2020). Hacking Humanitarians: Mapping The Cyber Environment And Threa

In [15]:
with open(r"report2/all_content.txt", "w") as f:
    f.write(all_content)

In [16]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

In [17]:
# Split documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(docs)

In [18]:
texts

[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.0 (Macintosh)', 'creationdate': '2023-04-03T11:42:59-07:00', 'source': 'report2\\report2.pdf', 'file_path': 'report2\\report2.pdf', 'total_pages': 8, 'format': 'PDF 1.7', 'title': 'Guidance Note on The Implications of Cyber Threats for Humanitarians', 'author': 'UN OCHA Centre for Humanitarian Data', 'subject': '', 'keywords': '', 'moddate': '2023-04-05T14:07:56-04:00', 'trapped': '', 'modDate': "D:20230405140756-04'00'", 'creationDate': "D:20230403114259-07'00'", 'page': 0}, page_content='OCHA CENTRE FOR HUMANITARIAN DATA\n1\nMARCH 2023\nINTRODUCTION\nHumanitarian organizations rely more than ever on digital technologies to assist and protect people in \ncrisis.1 These technologies enable humanitarians to gather data to understand and respond to the needs of \naffected people, and offer new channels to deliver aid through unprecedented digital proximity.2 However, \nthis increased digitalization

In [19]:
# Initialize Chroma vector store with a persist_directory
persist_directory = r"report2/report2_chroma_db"
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding_function,
    persist_directory=persist_directory
)


In [20]:
# Persist the Chroma database to disk
vectordb.persist()

  vectordb.persist()


##### Load Vector DB

In [21]:
# Load the persisted Chroma database
persist_directory = r"report2/report2_chroma_db"
loaded_vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_function
)

  loaded_vectordb = Chroma(


In [22]:
retriever = loaded_vectordb.as_retriever(search_type="mmr", search_kwargs = {"k": 3})

In [23]:
# Test Retriever
retriever.invoke("What does Cyber warfare mean?")

[Document(metadata={'moddate': '2023-04-05T14:07:56-04:00', 'title': 'Guidance Note on The Implications of Cyber Threats for Humanitarians', 'file_path': 'sample-data\\report2\\report2.pdf', 'creationDate': "D:20230403114259-07'00'", 'page': 2, 'trapped': '', 'subject': '', 'modDate': "D:20230405140756-04'00'", 'author': 'UN OCHA Centre for Humanitarian Data', 'format': 'PDF 1.7', 'source': 'sample-data\\report2\\report2.pdf', 'creationdate': '2023-04-03T11:42:59-07:00', 'creator': 'Adobe InDesign CC 13.0 (Macintosh)', 'producer': 'Adobe PDF Library 15.0', 'total_pages': 8, 'keywords': ''}, page_content='OCHA CENTRE FOR HUMANITARIAN DATA\n3\nMARCH 2023\n15  Denial-of-service (DoS) and Distributed denial-of-service (DDoS) attacks flood a system’s resources, overwhelming them and preventing responses to \n     service requests, which reduces the system’s ability to perform. (Source: IBM).\n16  Malware is malicious software that can render infected systems inoperable. Most malware variant

In [24]:
def format_docs(docs): 
    return "\n\n".join(doc.page_content for doc in docs)

In [25]:
question = "What does Cyber warfare mean?"

In [26]:
retrieved_docs = retriever.invoke(question)

In [27]:
retrieved_docs

[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'source': 'sample-data\\report2\\report2.pdf', 'subject': '', 'title': 'Guidance Note on The Implications of Cyber Threats for Humanitarians', 'format': 'PDF 1.7', 'creationDate': "D:20230403114259-07'00'", 'creator': 'Adobe InDesign CC 13.0 (Macintosh)', 'creationdate': '2023-04-03T11:42:59-07:00', 'modDate': "D:20230405140756-04'00'", 'author': 'UN OCHA Centre for Humanitarian Data', 'keywords': '', 'page': 2, 'file_path': 'sample-data\\report2\\report2.pdf', 'moddate': '2023-04-05T14:07:56-04:00', 'trapped': '', 'total_pages': 8}, page_content='OCHA CENTRE FOR HUMANITARIAN DATA\n3\nMARCH 2023\n15  Denial-of-service (DoS) and Distributed denial-of-service (DDoS) attacks flood a system’s resources, overwhelming them and preventing responses to \n     service requests, which reduces the system’s ability to perform. (Source: IBM).\n16  Malware is malicious software that can render infected systems inoperable. Most malware variant

In [28]:
retrieval_results = format_docs(retrieved_docs)

In [29]:
retrieval_results

'OCHA CENTRE FOR HUMANITARIAN DATA\n3\nMARCH 2023\n15  Denial-of-service (DoS) and Distributed denial-of-service (DDoS) attacks flood a system’s resources, overwhelming them and preventing responses to \n     service requests, which reduces the system’s ability to perform. (Source: IBM).\n16  Malware is malicious software that can render infected systems inoperable. Most malware variants destroy data by deleting or wiping files critical to \n    the operating system’s ability to run. (Source: IBM).\n17  Ransomware is sophisticated malware that takes advantage of system weaknesses, using strong encryption to hold data or system functionality \n     hostage. Cybercriminals use ransomware to demand payment in exchange for releasing the system. A recent development with ransomware is the \n     add-on of extortion tactics. (Source: IBM).\n18  International Committee of the Red Cross (November 2019). International Humanitarian Law and the Challenges of Contemporary Armed \n    Conflicts: Re

In [30]:
# from rag_evaluations.rag_evaluator.rag_eval import RAGEvaluator

In [31]:
context = retrieval_results
question = question

In [32]:
from langchain_core.prompts import ChatPromptTemplate

template = f""" 
Answer the question based only on the following context: {context}
Question: {question}
"""

In [33]:
template

' \nAnswer the question based only on the following context: OCHA CENTRE FOR HUMANITARIAN DATA\n3\nMARCH 2023\n15  Denial-of-service (DoS) and Distributed denial-of-service (DDoS) attacks flood a system’s resources, overwhelming them and preventing responses to \n     service requests, which reduces the system’s ability to perform. (Source: IBM).\n16  Malware is malicious software that can render infected systems inoperable. Most malware variants destroy data by deleting or wiping files critical to \n    the operating system’s ability to run. (Source: IBM).\n17  Ransomware is sophisticated malware that takes advantage of system weaknesses, using strong encryption to hold data or system functionality \n     hostage. Cybercriminals use ransomware to demand payment in exchange for releasing the system. A recent development with ransomware is the \n     add-on of extortion tactics. (Source: IBM).\n18  International Committee of the Red Cross (November 2019). International Humanitarian Law 

In [34]:
# prompt = ChatPromptTemplate.from_template(template)
# prompt

In [35]:
response = llm.invoke(template)

In [36]:
response

AIMessage(content='Cyber warfare refers to operations against a computer, a computer system or network, or another connected device, conducted through a data stream, when used as means or methods of warfare in the context of an armed conflict.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 42, 'prompt_tokens': 2567, 'total_tokens': 2609, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_62a23a81ef', 'id': 'chatcmpl-BcwN7dtkzS8hyZdbJH7TiCioMH7pF', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--329cc4e0-f096-4632-b7f5-6c2b7e9bae97-0', usage_metadata={'input_tokens': 2567, 'output_tokens': 42, 'total_tokens': 2609, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details':

In [37]:
generated_output = response.content
generated_output

'Cyber warfare refers to operations against a computer, a computer system or network, or another connected device, conducted through a data stream, when used as means or methods of warfare in the context of an armed conflict.'

In [38]:
question

'What does Cyber warfare mean?'

In [39]:
generated_output

'Cyber warfare refers to operations against a computer, a computer system or network, or another connected device, conducted through a data stream, when used as means or methods of warfare in the context of an armed conflict.'

In [40]:
context

'OCHA CENTRE FOR HUMANITARIAN DATA\n3\nMARCH 2023\n15  Denial-of-service (DoS) and Distributed denial-of-service (DDoS) attacks flood a system’s resources, overwhelming them and preventing responses to \n     service requests, which reduces the system’s ability to perform. (Source: IBM).\n16  Malware is malicious software that can render infected systems inoperable. Most malware variants destroy data by deleting or wiping files critical to \n    the operating system’s ability to run. (Source: IBM).\n17  Ransomware is sophisticated malware that takes advantage of system weaknesses, using strong encryption to hold data or system functionality \n     hostage. Cybercriminals use ransomware to demand payment in exchange for releasing the system. A recent development with ransomware is the \n     add-on of extortion tactics. (Source: IBM).\n18  International Committee of the Red Cross (November 2019). International Humanitarian Law and the Challenges of Contemporary Armed \n    Conflicts: Re