In [None]:
# pip install --upgrade pymupdf langchain langchain-text-splitters pymupdf4llm langchain-huggingface sentence-transformers langchain-community faiss-cpu 

## EMBEDDING

In [1]:
import pymupdf4llm
md_text = pymupdf4llm.to_markdown('test/Honors_Research_Survey.pdf')
md_text

Consider using the pymupdf_layout package for a greatly improved page layout analysis.


'# **Architectures, Generalization, and Future Frontiers: A Survey of** **Machine Learning for Deepfake Detection**\n\nAryakumar Ajay Mishra\n\n22108A0040\nProgramme: B.Tech – Honours\nProfessor Uma Jaishankar, Electronics and Computer Science\n\nVidyalankar Institute of Technology\n\nDate: July 25, 2025\n\n## **Abstract**\n\n\nThis survey provides a comprehensive overview of the landscape of machine learning for deepfake\ndetection. The rapid spread of hyper-realistic synthetic media created with advanced deep learning\nmodels like Generative Adversarial Networks (GANs) poses a threat to personal security, political\nstability, and social trust. This paper tracks the progression of detection methods from beginning\nwith root methods based on spatial artifacts to advancing to more complex architectures that\ninvestigate temporal inconsistencies. The central theme of this review is the vital challenge of\ngeneralization—the inability of most detectors to work well on novel manipulation 

In [2]:
import pathlib
pathlib.Path("extracted_text.md").write_bytes(md_text.encode())

22196

In [3]:
from langchain_text_splitters import MarkdownTextSplitter

splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=20)
md_text_split = splitter.create_documents([md_text])

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vectorstore = FAISS.from_documents(
    documents=md_text_split,
    embedding=embeddings,
    docstore=InMemoryDocstore()
)

vectorstore.save_local("faiss_index")

## RETRIEVAL

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

loaded_vectorstore = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True
)

In [None]:
query = "What is this paper about?"

docs = loaded_vectorstore.similarity_search(query, k=3)

for i, doc in enumerate(docs):
    print(f"Result {i+1}:")
    print(doc.page_content)

Result 1:
**6.2 Summary of Insights Gained**
Result 2:
## **2. Background and Fundamentals**


**2.1 Basic Definitions and Concepts**
Result 3:
journals were selected from academic repositories. The papers were read in order to synthesize the


#### Section-wise Retrieval

In [22]:
SECTION_QUERIES = {
    "problem_statement": "research problem, challenge addressed, research gap",
    "motivation": "motivation, importance, why this problem matters",
    "methodology": "proposed method, approach, system architecture",
    "algorithms": "algorithm, model, pipeline, framework",
    "limitations": "limitations, drawbacks, assumptions, failure cases"
}

In [23]:
retriever = loaded_vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 5
    }
)


docs = retriever.invoke(
    SECTION_QUERIES["methodology"]
)

for i, doc in enumerate(docs):
    print(f"Result {i+1}:")
    print(doc.page_content)

Result 1:
with root methods based on spatial artifacts to advancing to more complex architectures that
Result 2:
practical application.
Result 3:
Detection architecture has evolved reactively based on generation techniques. This expansion can
Result 4:
evolutionary trajectory of detection architectures, determine the foremost technical challenges,
Result 5:
|Model/Method|Key Reference|Core Methodology|Key Contribution|
|---|---|---|---|


## TESTING LLM AND QA CHAT

In [24]:
PROBLEM_STATEMENT_PROMPT = """
You are an AI research assistant analyzing a computer science research paper.

TASK:
Extract the PROBLEM STATEMENT of the paper.

RULES (STRICT):
- Use ONLY the provided context.
- Do NOT use external knowledge.
- Do NOT infer or assume anything.
- If the problem is not explicitly stated, say:
  "The paper does not clearly state the problem."

STYLE:
- 3–5 concise sentences
- Neutral academic tone

CONTEXT:
{context}

PROBLEM STATEMENT:
"""


In [25]:
MOTIVATION_PROMPT = """
You are an AI research assistant analyzing a computer science research paper.

TASK:
Explain the MOTIVATION of the paper (why the problem matters).

RULES (STRICT):
- Use ONLY the provided context.
- Do NOT introduce background knowledge.
- If motivation is not clearly discussed, say:
  "The paper does not clearly explain the motivation."

STYLE:
- 3–4 concise sentences
- Academic tone

CONTEXT:
{context}

MOTIVATION:
"""


In [48]:
METHODOLOGY_STATEMENT_PROMPT = """
You are an AI research assistant analyzing a computer science research paper.

TASK:
Extract the METHODOLOGY of the paper.

RULES (STRICT):
- Use ONLY the provided context.
- Do NOT use external knowledge.
- Do NOT infer or assume anything.
- If the problem is not explicitly stated, say:
  "The paper does not clearly state the problem."

STYLE:
- 3–5 concise sentences
- Neutral academic tone

CONTEXT:
{context}

METHODOLOGY:
"""

In [49]:
ALGORITHMS_STATEMENT_PROMPT = """
You are an AI research assistant analyzing a computer science research paper.

TASK:
Extract the ALGORITHMS of the paper.

RULES (STRICT):
- Use ONLY the provided context.
- Do NOT use external knowledge.
- Do NOT infer or assume anything.
- If the problem is not explicitly stated, say:
  "The paper does not clearly state the problem."

STYLE:
- 3–5 concise sentences
- Neutral academic tone

CONTEXT:
{context}

ALGORITHMS:
"""

In [50]:
LIMITATIONS_STATEMENT_PROMPT = """
You are an AI research assistant analyzing a computer science research paper.

TASK:
Extract the LIMITATIONS of the paper.

RULES (STRICT):
- Use ONLY the provided context.
- Do NOT use external knowledge.
- Do NOT infer or assume anything.
- If the problem is not explicitly stated, say:
  "The paper does not clearly state the problem."

STYLE:
- 3–5 concise sentences
- Neutral academic tone

CONTEXT:
{context}

LIMITATIONS:
"""

In [42]:
def build_context(docs):
    return "\n\n".join(
        f"[Chunk {i+1}]\n{doc.page_content}"
        for i, doc in enumerate(docs)
    )


def run_llama(llm, prompt, max_tokens=200):
    return llm.invoke(prompt)

In [43]:
from langchain_community.llms import Ollama

llm = Ollama(
    model="llama3:8b",
    temperature=0.2,
    num_predict=200
)

def generate_section(
    section_name: str,
    prompt_template: str,
    retriever,
    llm,
    k: int = 5
):
    # 1. Retrieve section-specific chunks
    docs = retriever.invoke(SECTION_QUERIES[section_name])

    # 2. Build context
    context = build_context(docs)

    # 3. Fill prompt
    prompt = prompt_template.format(context=context)

    # 4. Generate output
    response = run_llama(llm, prompt)

    return response, docs


In [44]:
problem_output, problem_docs = generate_section(
    section_name="problem_statement",
    prompt_template=PROBLEM_STATEMENT_PROMPT,
    retriever=retriever,
    llm=llm
)

print(problem_output)


Based on the provided context, the problem statement is:

The paper does not clearly state the problem. However, it appears to be related to the "generalization gap" in a specific area of computer science, which is exacerbated by several factors and requires improved reliability and future research to address current limitations.


In [45]:
motivation_output, motivation_docs = generate_section(
    section_name="motivation",
    prompt_template=MOTIVATION_PROMPT,
    retriever=retriever,
    llm=llm
)

print(motivation_output)


Based on the provided context, it appears that the motivation for this paper is to address biases in machine learning models. The authors mention that recent work suggests a need to quantify and reduce such biases, implying that current approaches are inadequate or incomplete.
