In [1]:
from langchain_community.document_loaders import PyPDFLoader

loader=PyPDFLoader(r'docs\Recent_Wars_Extended_Research_Report.pdf')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
texts=loader.load()

In [3]:
from langchain_core.prompts import ChatMessagePromptTemplate,SystemMessagePromptTemplate,ChatPromptTemplate


In [4]:
template = """
You are a highly skilled research agent.

I will provide you with a topic related to ongoing wars.
Your task is to research the topic in depth and provide a well-structured, factual, and analytical response.

Guidelines:
- Length: 400–500 words
- Tone: Academic and neutral
- Structure the answer with clear paragraphs
- Cover background, key developments, military aspects, economic impact, and social/crime-related consequences
- Focus on recent and ongoing conflicts
- Avoid opinions; rely on objective analysis

Topic:
{topic}
"""

system_prompt=SystemMessagePromptTemplate.from_template(template=template)


In [5]:
from langchain_core.prompts import HumanMessagePromptTemplate

human_prompt = HumanMessagePromptTemplate.from_template(
    """Please write the response in a {tone} tone. and provide me a response in 
    Respond ONLY in valid JSON format.

    JSON schema:
    {{
        "title": "string",
        "content": "400-500 word text"
    }}

    Tone: {tone}
    """
)


final_prompt=ChatPromptTemplate(messages=[
    system_prompt,human_prompt
])

In [6]:
from langchain_core.output_parsers import StrOutputParser,PydanticOutputParser
from langchain_groq.chat_models import ChatGroq
from dotenv import load_dotenv



In [7]:
import os
api_key=os.getenv('GROQ_API_KEY')
load_dotenv()

model=ChatGroq(api_key=api_key,model="openai/gpt-oss-120b") #type: ignore

In [8]:
from pydantic import BaseModel
class OutputResponse(BaseModel):
    title:str
    content:str

In [9]:
parser=PydanticOutputParser(pydantic_object=OutputResponse)
chain=final_prompt|model|parser

In [10]:
response=chain.invoke({
    "topic":"Russia-Ukraine War",
    "tone":"Informative and academic"

})

In [11]:
response

OutputResponse(title='The Russia‑Ukraine War: Background, Developments, and Consequences', content='The Russia‑Ukraine war, which escalated into a full‑scale invasion on 24 February 2022, represents the most extensive conventional conflict in Europe since World\u202fII. Its origins lie in the 2014 annexation of Crimea by the Russian Federation and the subsequent support of separatist entities in the Donetsk and Luhansk oblasts. These events created a frozen conflict that persisted for eight years, during which diplomatic efforts such as the Minsk agreements failed to produce a durable cease‑fire. The 2022 invasion was justified by Russian authorities as a “special military operation” aimed at “demilitarising” and “denazifying” Ukraine, while the Ukrainian government and the international community condemned it as an unlawful act of aggression.\n\nKey developments since February 2022 include the rapid initial advance of Russian forces toward Kyiv, Kharkiv, and the southern port cities, 

In [12]:
# Response contains title and content so i have to extract them for each topic in the pdf provided

response

OutputResponse(title='The Russia‑Ukraine War: Background, Developments, and Consequences', content='The Russia‑Ukraine war, which escalated into a full‑scale invasion on 24 February 2022, represents the most extensive conventional conflict in Europe since World\u202fII. Its origins lie in the 2014 annexation of Crimea by the Russian Federation and the subsequent support of separatist entities in the Donetsk and Luhansk oblasts. These events created a frozen conflict that persisted for eight years, during which diplomatic efforts such as the Minsk agreements failed to produce a durable cease‑fire. The 2022 invasion was justified by Russian authorities as a “special military operation” aimed at “demilitarising” and “denazifying” Ukraine, while the Ukrainian government and the international community condemned it as an unlawful act of aggression.\n\nKey developments since February 2022 include the rapid initial advance of Russian forces toward Kyiv, Kharkiv, and the southern port cities, 

In [13]:
titles=[]
for page in texts:
    title=page.page_content.split('\n')[0]
    titles.append(title)

In [14]:
titles

['Recent Wars and Ongoing Conflicts: Extended Global',
 '1. Russia–Ukraine War',
 '2. Israel–Gaza War',
 '3. Sudan Civil War',
 '4. Ethiopia Internal Conflicts',
 '5. Democratic Republic of Congo',
 '6. Myanmar Civil War']

In [15]:
contents=[]
for title in titles:
    response=chain.invoke({
        "topic":title,
        "tone":"Academic and Informative "
    })
    contents.append(response.content)


In [16]:
final_content = {}

for title, content in zip(titles, contents):
    final_content[title] = content


In [17]:
final_content=dict(zip(titles, contents))

In [18]:
final_content

{'Recent Wars and Ongoing Conflicts: Extended Global': 'The first decade of the twenty‑first century has been marked by a proliferation of protracted armed confrontations that span several continents. In Eastern Europe, the Russian invasion of Ukraine, launched in February 2022, escalated a frozen Cold‑War‑era tension into a full‑scale conventional war. In the Middle East and North Africa, the civil war in Syria entered its twelfth year, while Yemen’s Houthi‑backed insurgency and the Saudi‑UAE coalition have produced a humanitarian disaster since 2015. In the Horn of Africa, Ethiopia’s Tigray conflict, reignited in November 2020, merged with broader ethnic clashes in Amhara and Oromia. Sub‑Saharan Africa also witnessed a renewed war in Sudan after the April 2023 power struggle between the Rapid Support Forces and the Sudanese Armed Forces. In South Asia, the insurgency in Myanmar’s border regions intensified following the February 2021 coup. These wars share common drivers—state fragme

In [19]:
def clean_text(text: str) -> str:
    return (
        text
        .replace("\u00a0", " ")   # non-breaking space
        .replace("\u202f", " ")   # narrow no-break space
        .replace("–", "-")        # en dash
        .replace("—", "-")        # em dash
    )


In [20]:
from reportlab.platypus import (
    SimpleDocTemplate,
    Paragraph,
    Spacer,
    PageBreak
)
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.pagesizes import A4


import re

import re

def clean_text_hard(text: str) -> str:
    if not text:
        return ""

    replacements = {
        "\u25A0": " ",   # black square ■
        "\u00a0": " ",
        "\u202f": " ",
        "\u2009": " ",
        "\u2007": " ",
        "\u200b": "",
        "\u2060": "",
        "–": "-",
        "—": "-",
        "‐": "-",
    }

    for bad, good in replacements.items():
        text = text.replace(bad, good)

    # Fix broken words like counter offensives
    text = re.sub(r"\s+", " ", text)

    return text.strip()




def regenerate_pdf_from_dict(content_dict, output_path):
    doc = SimpleDocTemplate(
        output_path,
        pagesize=A4,
        rightMargin=50,
        leftMargin=50,
        topMargin=50,
        bottomMargin=50
    )

    styles = getSampleStyleSheet()

    styles.add(ParagraphStyle(
        name="SectionTitle",
        fontSize=15,
        leading=18,
        spaceAfter=12
    ))

    styles.add(ParagraphStyle(
        name="SectionBody",
        fontSize=11,
        leading=15,
        spaceAfter=10
    ))

    story = []

    for i, (heading, text) in enumerate(content_dict.items()):
        cleaned_heading = clean_text(heading)
        cleaned_text = clean_text(text)

        story.append(
            Paragraph(f"<b>{cleaned_heading}</b>", styles["SectionTitle"])
        )
        story.append(Spacer(1, 10))

        story.append(
            Paragraph(
                cleaned_text.replace("\n", "<br/>"),
                styles["SectionBody"]
            )
        )

        if i != len(content_dict) - 1:
            story.append(PageBreak())

    doc.build(story)


In [21]:
regenerate_pdf_from_dict(
    content_dict=final_content,
    output_path="docs/Recent_Wars_Final_Report_modified.pdf"
)


In [22]:
from langchain_community.document_loaders import PyPDFLoader
loader=PyPDFLoader(r'docs\Recent_Wars_Final_Report_modified.pdf')
docs=loader.load()

In [23]:
docs

[Document(metadata={'producer': 'ReportLab PDF Library - (opensource)', 'creator': '(unspecified)', 'creationdate': '2026-02-04T10:51:35+05:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2026-02-04T10:51:35+05:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'docs\\Recent_Wars_Final_Report_modified.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}, page_content='Recent Wars and Ongoing Conflicts: Extended Global\nThe first decade of the twenty■first century has been marked by a proliferation of protracted armed\nconfrontations that span several continents. In Eastern Europe, the Russian invasion of Ukraine,\nlaunched in February 2022, escalated a frozen Cold■War■era tension into a full■scale\nconventional war. In the Middle East and North Africa, the civil war in Syria entered its twelfth year,\nwhile Yemen’s Houthi■backed insurgency and the Saudi■UAE coalition have produced a\nhumanitarian disaster since 2015. In the Horn of Africa

In [24]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=200)

final_chunk_docs=text_splitter.split_documents(documents=docs)


In [25]:
len(final_chunk_docs)

89

In [26]:
final_docs=[]
for doc in final_chunk_docs:
    cleaned_text = clean_text_hard(doc.page_content)
    if cleaned_text.strip():   # empty check
        final_docs.append(cleaned_text)

In [27]:
final_docs = [
    clean_text_hard(doc.page_content)
    for doc in final_chunk_docs
    if doc.page_content.strip()
]


In [28]:
from sentence_transformers import SentenceTransformer
from typing import List

model = SentenceTransformer("all-MiniLM-L6-v2")

def generate_embeddings(model,final_docs:List[str]):
    embeddings = model.encode(final_docs)
    return embeddings

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 157.39it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [29]:
from pinecone import Pinecone

load_dotenv()


pinecone_api_key=os.getenv('PINECONE_API_KEY')
pc=Pinecone(
    api_key=pinecone_api_key
)

In [30]:
index = pc.Index(
    name="vectordb",
    pool_threads=50,             # <-- make sure to set these
    connection_pool_maxsize=50,  # <-- make sure to set these
)

In [32]:
# vectors = []

# for i, (text, embedding) in enumerate(zip(final_docs, embeddings)):
#     if not text.strip():
#         continue

#     vectors.append(
#         (
#             f"doc-{i}",
#             embedding.tolist(),
#             {
#                 "text": text,
#                 "source": "pdf",
#                 "chunk_id": i
#             }
#         )
#     )


In [33]:
def batch_upsert(index, vectors, batch_size=100):
    for i in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[i:i+batch_size])


In [35]:
# batch_upsert(index, vectors)


In [36]:
# index.describe_index_stats()


In [37]:
# query = "impact of wars on global economy"
# q_vec = model.encode([query])[0]

# res = index.query(
#     vector=q_vec.tolist(),
#     top_k=5,
#     include_metadata=True
# )

# for m in res["matches"]:
#     print(m["score"], m["metadata"]["text"][:100])


In [38]:
def retrieve_query(query:str):
    vectors=model.encode([query])[0]
    res = index.query(
    vector=vectors.tolist(),
    top_k=5,
    include_metadata=True
)
    return res['matches'] #type: ignore

In [39]:
def join_context(matches):
    texts = []

    for match in matches:
        metadata = match.get("metadata", {})
        text = metadata.get("text")
        if text:
            texts.append(text)

    return "\n\n".join(texts)


In [40]:
import os
api_key=os.getenv('GROQ_API_KEY')
load_dotenv()

llm=ChatGroq(api_key=api_key,model="openai/gpt-oss-120b") #type: ignore

In [41]:
template="""

You are a smart and honest student.

You must answer questions using ONLY the information provided in the context.
You are NOT allowed to use any external knowledge, assumptions, or guesses.

If the answer is NOT explicitly present in the context,
you must reply exactly with:
"I don't know the answer."

Do not add anything else.

"""
system_prompt=SystemMessagePromptTemplate.from_template(template=template)

In [42]:
template="""

Context:
{context}

Question:
{question}

"""

human_prompt=HumanMessagePromptTemplate.from_template(template=template)

In [43]:
final_prompt=ChatPromptTemplate(
    messages=[
        system_prompt,human_prompt
    ]
)

In [44]:
chain=final_prompt|llm|StrOutputParser()

### Generating Pipeline

In [45]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


def document_loader(path:str):
    loader=PyPDFLoader(path)
    docs=loader.load()
    return docs

In [46]:
from typing import List
from langchain_core.documents import Document
def splitting_the_text(docs:List[Document]):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=200)
    final_chunk_docs=text_splitter.split_documents(documents=docs)
    return final_chunk_docs




In [47]:
def formatting(final_chunk_docs:List[Document]):
    final_docs=[]
    for doc in final_chunk_docs:
        cleaned_text = clean_text_hard(doc.page_content)
        if cleaned_text.strip():   # empty check
            final_docs.append(cleaned_text)
    return final_docs


In [48]:
def generate_embeddings(model,final_docs:List[str]):
    embeddings = model.encode(final_docs)
    return embeddings


def insertion_in_pinecone(final_docs:List[str],embeddings):
    vectors = []

    for i, (text, embedding) in enumerate(zip(final_docs, embeddings)):
        if not text.strip():
            continue

        vectors.append(
            (
                f"doc-{i}",
                embedding.tolist(),
                {
                    "text": text,
                    "source": "pdf",
                    "chunk_id": i
                }
            )
        )
    return vectors
        

# index will be given by init_pinecone function  and insertion of vectors will be given by above function 

def batch_upsert(index, vectors, batch_size=100):
    for i in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[i:i+batch_size])

### Retrieval Pipeline

In [90]:
def retrieve_query(query:str):
    vectors=model.encode([query])[0]
    res = index.query(
    vector=vectors.tolist(),
    top_k=10,
    include_metadata=True
)
    return res['matches'] #type: ignore

In [91]:
def join_context(matches):
    texts = []

    for match in matches:
        metadata = match.get("metadata", {})
        text = metadata.get("text")
        if text:
            texts.append(text)

    return "\n\n".join(texts)


In [92]:
from sentence_transformers import SentenceTransformer
from typing import List

model = SentenceTransformer("all-MiniLM-L6-v2")

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 271.25it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [93]:
docs=document_loader(r"docs\Recent_Wars_Final_Report_modified.pdf")
final_chunk_docs=splitting_the_text(docs=docs)
final_docs=formatting(final_chunk_docs=final_chunk_docs)


embeddings=generate_embeddings(model=model,final_docs=final_docs)
vectors=insertion_in_pinecone(final_docs=final_docs,embeddings=embeddings)

# Insertion in vector db 
batch_upsert(index, vectors=vectors, batch_size=100)



In [94]:
retrieved=retrieve_query(query="What are the global impacts of war particularly in Ukraine?")
context=join_context(retrieved)


chain=final_prompt|llm|StrOutputParser()

In [95]:
response=chain.invoke({
    "question":"What are the global impacts of war particularly in Ukraine?",
    "context":context
})

In [96]:
print(response)

The war in Ukraine has had wide‑reaching global effects. Economically, it has strained national budgets, deterred investment and pushed up commodity prices; the blockade of Ukrainian grain has raised food costs and created food‑security concerns across the Middle East and Africa, while sanctions on Russian banks, energy exports and high‑technology imports have reshaped markets and reduced Russia’s access to finance. Ukraine’s GDP fell by an estimated 30 % in 2022, and the disruption of its agricultural exports has reverberated through global supply chains. Socially, the conflict has displaced more than 30 million people, spurred gender‑based violence, child‑soldier recruitment, a surge in organized crime, illicit mining and smuggling, and has strained mental‑health services with a marked rise in post‑traumatic stress disorder among civilians and combatants. These economic and societal impacts extend far beyond the battlefield, influencing security calculations and policy responses acro

In [97]:
def rag_pipeline(
    pdf_path: str,
    query: str,
    model,
    index,
    llm,
    batch_size: int = 100
):
    # 1. Load
    docs = document_loader(pdf_path)

    # 2. Split
    final_chunk_docs = splitting_the_text(docs=docs)

    # 3. Format
    final_docs = formatting(final_chunk_docs=final_chunk_docs)

    # 4. Embeddings
    embeddings = generate_embeddings(
        model=model,
        final_docs=final_docs
    )

    # 5. Insert
    vectors = insertion_in_pinecone(
        final_docs=final_docs,
        embeddings=embeddings
    )

    batch_upsert(index, vectors=vectors, batch_size=batch_size)

    # 6. Retrieve
    retrieved = retrieve_query(query=query)

    # 7. Context
    context = join_context(retrieved)

    # 8. LLM Chain
    chain = final_prompt | llm | StrOutputParser()
    response = chain.invoke({"context": context, "question": query})

    return response


In [98]:
from pinecone import Pinecone

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

load_dotenv()

index_name=os.getenv("INDEX_NAME")
index = pc.Index(
    name="vectordb",
    pool_threads=50,             # <-- make sure to set these
    connection_pool_maxsize=50,  # <-- make sure to set these
)

answer = rag_pipeline(
    pdf_path=r"docs\Recent_Wars_Final_Report_modified.pdf",
    query="What are the global impacts of war particularly in Ukraine?",
    model=model,
    index=index,
    llm=llm
)


In [99]:
print(answer)

The war in Ukraine has produced worldwide economic and social repercussions.  It has strained national budgets, discouraged investment and pushed up commodity prices, while Russia‑focused sanctions have reshaped global markets.  Ukraine’s grain blockade has lifted food costs and threatened food security in the Middle East and Africa.  Ukraine’s GDP fell about 30 % in 2022, and the disruption of agricultural exports has reverberated through international supply chains.  Socially, the conflict has generated more than 30 million displaced people, increased gender‑based violence, child‑soldier recruitment, organized crime, illicit mining and a surge in PTSD among civilians and combatants, all of which affect regional stability beyond Ukraine’s borders.


In [100]:
answer = rag_pipeline(
    pdf_path=r"docs\Recent_Wars_Final_Report_modified.pdf",
    query="",
    model=model,
    index=index,
    llm=llm
)


In [101]:
rag_evaluation_set = {
    "Q1": {
        "question": "What common structural drivers link the conflicts in Ukraine, Syria, Yemen, Ethiopia, Sudan, and Myanmar, and how do these drivers manifest differently across regions?",
        "answer": (
            "The conflicts share common structural drivers including state fragmentation, external patronage, "
            "and competition over resources. In Ukraine, this manifests through external military aid and sanctions "
            "shaping the battlefield. In Syria and Yemen, external patronage sustains proxy warfare. Ethiopia and "
            "Sudan reflect fragmentation through ethnic and military factionalism, while Myanmar combines political "
            "fragmentation after a coup with long-standing ethnic autonomy struggles."
        )
    },

    "Q2": {
        "question": "Which conflicts experienced significant escalation after 2020, and what triggering events caused this escalation?",
        "answer": (
            "Ukraine escalated in February 2022 following Russia’s full-scale invasion. Ethiopia’s Tigray conflict "
            "reignited in November 2020 due to tensions between the federal government and the TPLF. Myanmar’s "
            "civil war escalated after the February 1, 2021 military coup. Sudan’s conflict intensified after the "
            "April 2023 power struggle between the RSF and SAF. The DRC saw renewed violence between 2020 and 2024 "
            "due to the resurgence of M23 and expansion of the ADF."
        )
    },

    "Q3": {
        "question": "Which advanced military systems are explicitly mentioned in the Ukraine and Syria conflicts, and how do they differ in operational purpose?",
        "answer": (
            "In Ukraine, Russia employs long-range precision artillery such as the 9M729, while Ukraine relies on "
            "mobile drone fleets and Western-supplied air defense systems like Patriot and SAMP/T. In Syria, "
            "Russian S-300 air defense batteries integrated with Iranian-backed forces provide layered deterrence. "
            "Ukraine’s systems emphasize mobility and defense, while Syria’s focus on airspace denial."
        )
    },

    "Q4": {
        "question": "What numerical estimates are provided for displacement and economic loss across the conflicts?",
        "answer": (
            "Global displacement exceeds 30 million people. In the DRC, over 5 million people are internally "
            "displaced, mining export revenues declined by about 15% between 2020 and 2023, and armed groups divert "
            "around US$200 million annually. In Myanmar, GDP contracted by roughly 6% in 2023, about 7 million "
            "people face food insecurity, and over 1.2 million are internally displaced."
        )
    },

    "Q5": {
        "question": "How do armed conflicts disrupt resource-based economies in Ukraine, the DRC, and Myanmar?",
        "answer": (
            "In Ukraine, the war disrupted grain exports, contributing to higher global food prices. In the DRC, "
            "armed groups compete for control over coltan, tin, and gold, diverting revenues through illicit "
            "taxation. In Myanmar, jade and gemstone industries are increasingly controlled by armed groups, "
            "fueling a shadow economy and reducing state revenue."
        )
    },

    "Q6": {
        "question": "Compare the effectiveness and limitations of international mediation or intervention efforts in Yemen and the DRC.",
        "answer": (
            "In Yemen, the UN-mediated Stockholm Agreement remains largely unenforced with repeated ceasefire "
            "violations. In the DRC, MONUSCO conducts joint operations with the FARDC but is constrained by limited "
            "mandates, poor coordination, and criticism over civilian protection. Both cases show diplomatic "
            "fatigue and weak enforcement."
        )
    },

    "Q7": {
        "question": "How do asymmetric tactics interact with conventional military strategies in at least three conflicts discussed?",
        "answer": (
            "In Ukraine, Russian conventional combined-arms operations contrast with Ukraine’s asymmetric drone "
            "warfare. In Ethiopia, conventional heavy armor is combined with militia-based infantry tactics. In "
            "Myanmar, the Tatmadaw uses conventional artillery and airpower, while PDFs and EAOs rely on guerrilla "
            "tactics, ambushes, and IEDs, creating hybrid warfare environments."
        )
    },

    "Q8": {
        "question": "Identify two distinct social consequences unique to the DRC and Myanmar conflicts respectively.",
        "answer": (
            "In the DRC, severe gender-based violence, child soldier recruitment, and disease outbreaks in "
            "displacement camps are prominent. In Myanmar, education disruption for millions of children, forced "
            "labor, and rising refugee flows to Thailand and India are emphasized."
        )
    },

    "Q9": {
        "question": "Explain how sanctions indirectly affect battlefield dynamics in at least two conflicts.",
        "answer": (
            "Sanctions on Russia reshaped markets and constrained logistics, affecting battlefield sustainability. "
            "In Myanmar, sanctions targeting military leaders and revenue streams worsened logistical constraints, "
            "contributed to defections, lowered morale, and reduced the Tatmadaw’s operational effectiveness."
        )
    },

    "Q10": {
        "question": "Does the text indicate that MONUSCO conducts offensive operations without restrictions?",
        "answer": (
            "No. The text explicitly states that MONUSCO’s operations are constrained by mandates that limit "
            "offensive actions, which has reduced its effectiveness in protecting civilians."
        )
    }
}


In [102]:
answers = {}

for q_id, qa in rag_evaluation_set.items():
    question_text = qa["question"]
    gold_answer = qa["answer"]

    rag_answer = rag_pipeline(
        pdf_path=r"docs\Recent_Wars_Final_Report_modified.pdf",
        query=question_text,
        model=model,
        index=index,
        llm=llm
    )

    answers[q_id] = {
        "question": question_text,
        "original_answer": gold_answer,
        "rag_answer": rag_answer
    }


In [103]:
answers['Q1']

{'question': 'What common structural drivers link the conflicts in Ukraine, Syria, Yemen, Ethiopia, Sudan, and Myanmar, and how do these drivers manifest differently across regions?',
 'original_answer': 'The conflicts share common structural drivers including state fragmentation, external patronage, and competition over resources. In Ukraine, this manifests through external military aid and sanctions shaping the battlefield. In Syria and Yemen, external patronage sustains proxy warfare. Ethiopia and Sudan reflect fragmentation through ethnic and military factionalism, while Myanmar combines political fragmentation after a coup with long-standing ethnic autonomy struggles.',
 'rag_answer': 'The conflicts are linked by three overarching structural drivers that appear repeatedly in the text:\n\n* **State fragmentation** – the breakdown or weakening of central authority.  \n* **External patronage** – the involvement of foreign powers that supply weapons, troops or political support.  \n* 

In [104]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

evaluation_set = {}

for q_id, det in answers.items():
    original_answer = det["original_answer"]
    rag_answer = det["rag_answer"]

    # Encode answers
    oa_emb = model.encode(original_answer)
    rg_emb = model.encode(rag_answer)

    # Make embeddings 2D
    oa_emb = np.array(oa_emb).reshape(1, -1)
    rg_emb = np.array(rg_emb).reshape(1, -1)

    # Cosine similarity
    similarity_score = cosine_similarity(oa_emb, rg_emb)[0][0]

    # Store evaluation result
    evaluation_set[q_id] = {
        "question": det["question"],
        "original_answer": original_answer,
        "rag_answer": rag_answer,
        "cosine_similarity": float(similarity_score)
    }


In [105]:
for q_id, det in evaluation_set.items():
    score = det["cosine_similarity"]
    print(f"{q_id} → cosine similarity: {score:.4f}")


Q1 → cosine similarity: 0.7098
Q2 → cosine similarity: 0.8428
Q3 → cosine similarity: 0.7773
Q4 → cosine similarity: 0.7135
Q5 → cosine similarity: 0.6744
Q6 → cosine similarity: 0.0368
Q7 → cosine similarity: 0.7708
Q8 → cosine similarity: 0.0311
Q9 → cosine similarity: 0.7229
Q10 → cosine similarity: 0.8438


In [None]:
q=['Q6','Q8']
a=[]
for qu in q:
    rag_answer = rag_pipeline(
            pdf_path=r"docs\Recent_Wars_Final_Report_modified.pdf",
            query=evaluation_set[qu]['question'],
            model=model,
            index=index,
            llm=llm
        )
    a.append(rag_answer)


In [115]:
rag_answer = rag_pipeline(
            pdf_path=r"docs\Recent_Wars_Final_Report_modified.pdf",
            query=evaluation_set['Q6']['question'],
            model=model,
            index=index,
            llm=llm
        )

In [116]:
rag_answer

"I don't know the answer."