In [None]:
!pip install -q sentence-transformers hnswlib

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/; the index and metadata paths configured below live under it.
drive.mount("/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# **CONFIGURATION**

In [None]:
import os

# Locations on the mounted Drive: the HNSW index file and its JSONL metadata share one folder.
BASE_DIR = "/content/drive/MyDrive/NLP/codes/data"
INDEX_DIR = "/".join((BASE_DIR, "index"))
INDEX_PATH = "/".join((INDEX_DIR, "hnsw_index.bin"))
META_PATH = "/".join((INDEX_DIR, "metadatas.jsonl"))

# Embedding model settings; EMBEDDING_DIM is the dimension handed to hnswlib when the index is loaded.
EMBEDDING_MODEL_ID = "BAAI/bge-m3"
EMBEDDING_DIM = 1024
MAX_SEQ_LENGTH = 8192

# Generator model. Alternative kept for reference:
# LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
LLM_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

print("Paths configured:", f" Index: {INDEX_PATH}", f" Meta: {META_PATH}", sep="\n")

Paths configured:
 Index: /content/drive/MyDrive/NLP/codes/data/index/hnsw_index.bin
 Meta: /content/drive/MyDrive/NLP/codes/data/index/metadatas.jsonl


# **RETRIEVAL ENGINE LOADING**

In [None]:
import json
import hnswlib
import numpy as np
from sentence_transformers import SentenceTransformer

# Shared retrieval state; every slot stays None until init_retrieval_system() fills it.
rag_components = dict.fromkeys(("hnsw_index", "metadatas", "emb_model"))

def init_retrieval_system():
    """
    Load the vector database and the embedding model into `rag_components`.

    Fills the three slots in this order: "metadatas" (parsed from the JSONL
    file at META_PATH), "hnsw_index" (loaded from INDEX_PATH) and "emb_model"
    (the SentenceTransformer named by EMBEDDING_MODEL_ID, placed on CUDA).

    Raises:
        FileNotFoundError: if META_PATH or INDEX_PATH does not exist.
    """

    # Metadata (JSONL): one JSON object per non-blank line.
    print("Loading metadata...")
    if not os.path.exists(META_PATH):
        raise FileNotFoundError(f"Metadata file not found at {META_PATH}")

    with open(META_PATH, 'r', encoding='utf-8') as meta_file:
        records = [json.loads(raw_line) for raw_line in meta_file if raw_line.strip()]

    rag_components["metadatas"] = records
    print(f"-> Loaded {len(records)} metadata entries")

    # HNSW index: cosine space, dimension taken from the configuration cell.
    print(f"Loading HNSW index (Dim={EMBEDDING_DIM})...")
    if not os.path.exists(INDEX_PATH):
        raise FileNotFoundError(f"Index file not found at {INDEX_PATH}")

    ann_index = hnswlib.Index(space="cosine", dim=EMBEDDING_DIM)
    ann_index.load_index(INDEX_PATH)
    ann_index.set_ef(128)  # query-time ef parameter
    rag_components["hnsw_index"] = ann_index
    print(f"-> Index loaded with {ann_index.get_current_count()} elements.")

    # Embedding model used to encode queries at search time.
    print(f"Loading embedding model: {EMBEDDING_MODEL_ID}...")
    encoder = SentenceTransformer(
        EMBEDDING_MODEL_ID,
        device="cuda",
        trust_remote_code=True,
    )
    encoder.max_seq_length = MAX_SEQ_LENGTH
    rag_components["emb_model"] = encoder
    print("-> Embedding model ready.")

    print("\n RETRIEVAL SYSTEM READY!")


In [None]:
import google.generativeai as genai
from google.colab import userdata

# Read the key from Colab's secret store so it never appears in the notebook source.
API_KEY = userdata.get('GEMINI_API_KEY')
# Register the key globally for subsequent google.generativeai calls.
genai.configure(api_key=API_KEY)

def optimize_query_with_gemini(user_query):
    """
    Ask Gemini for three more technical variations of `user_query`.

    The (Turkish) prompt requests only the bullet items. The raw response
    text is returned unparsed, so callers receive all variations in a
    single string.
    """
    prompt = f"""
    Sen uzman bir araştırmacısın. Kullanıcının şu sorusu için vektör veritabanında arama yapacağız:
    Soru: "{user_query}"

    Bu soruya en iyi cevabı bulabilmek için veritabanında aratabileceğimiz 3 farklı, daha teknik varyasyon yaz.
    Sadece maddeleri yaz.
    """

    # Relies on the API key registered earlier via genai.configure().
    gemini = genai.GenerativeModel('gemini-1.5-flash')
    return gemini.generate_content(prompt).text

In [None]:
def retrieve_documents(query: str, top_k: int = 5):
    """
    Return the `top_k` indexed chunks nearest to `query`.

    The query is embedded (normalized) with the model in `rag_components`
    and searched in the HNSW index; each returned label is used as a
    position into the metadata list.

    Returns:
        A list of dicts, in the order the index returned them, with keys
        idx, distance, chunk_id, text, title, authors, year, url,
        references and section.
    """
    encoder = rag_components["emb_model"]
    ann_index = rag_components["hnsw_index"]
    metadatas = rag_components["metadatas"]

    query_vec = encoder.encode([query], normalize_embeddings=True)
    neighbour_labels, neighbour_dists = ann_index.knn_query(query_vec, k=top_k)

    def build_hit(label, dist):
        # Labels double as row numbers into the metadata list.
        row = int(label)
        meta = metadatas[row]
        return {
            "idx": row,
            "distance": float(dist),
            "chunk_id": meta.get("chunk_id"),
            "text": meta.get("text"),
            "title": meta.get("title", "Unknown Title"),
            "authors": meta.get("authors", []),
            "year": meta.get("year", ""),
            "url": meta.get("url", ""),
            "references": meta.get("references", []),
            "section": meta.get("section_title", "")
        }

    # A single query was encoded, so only the first result row is relevant.
    return [
        build_hit(label, dist)
        for label, dist in zip(neighbour_labels[0], neighbour_dists[0])
    ]

In [None]:
# Populate rag_components (metadata, HNSW index, embedding model) before any retrieval call.
init_retrieval_system()

Loading metadata...
-> Loaded 602123 metadata entries
Loading HNSW index (Dim=1024)...
-> Index loaded with 602123 elements.
Loading embedding model: BAAI/bge-m3...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


-> Embedding model ready.

 RETRIEVAL SYSTEM READY!


# **LLM LOADING**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Shared generator state; both slots stay None until init_llm_system() fills them.
llm_components = dict.fromkeys(("model", "tokenizer"))

def init_llm_system():
    """
    Load the tokenizer and causal LM named by LLM_MODEL_ID into `llm_components`.

    The model is requested in bfloat16 with `device_map="auto"`. Both
    entries are stored only after both loads have succeeded.
    """
    print(f"Loading LLM: {LLM_MODEL_ID}...")

    loaded_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, trust_remote_code=True)

    load_options = dict(
        dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    loaded_model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_ID, **load_options)

    llm_components.update(model=loaded_model, tokenizer=loaded_tokenizer)

    print(f"\n {LLM_MODEL_ID} READY")

In [None]:
# Load the tokenizer and model for LLM_MODEL_ID into llm_components.
init_llm_system()

Loading LLM: Qwen/Qwen2.5-7B-Instruct...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]


 Qwen/Qwen2.5-7B-Instruct READY


# **GENERATION LOGIC WITH CITATIONS**

In [None]:
def get_full_author_list(authors_list):
    """
    Format authors as a single 'First Last, First Last' string.

    Args:
        authors_list: Iterable of dicts with optional "firstname" and
            "surname" keys (may be None or empty).

    Returns:
        The comma-separated full names, or "Unknown Authors" when the list
        is empty/None or when no entry yields a non-empty name.
    """

    if not authors_list:
        return "Unknown Authors"

    names = []
    for author in authors_list:
        # `or ""` also covers keys that are present but hold None,
        # which would otherwise crash on .strip().
        first = (author.get("firstname") or "").strip()
        last = (author.get("surname") or "").strip()
        full_name = f"{first} {last}".strip()
        if full_name:
            names.append(full_name)

    # Entries without any usable name must not produce an empty author line.
    return ", ".join(names) if names else "Unknown Authors"

In [None]:
def clean_ref_text(text):
    """
    Normalize a raw reference string from the JSON metadata.

    Tabs and newlines become spaces, every run of whitespace collapses to
    one space, and the result is trimmed. Falsy input yields "".
    """
    if not text:
        return ""

    flattened = text.replace('\t', ' ').replace('\n', ' ')
    collapsed = re.sub(r'\s+', ' ', flattened)
    return collapsed.strip()

In [None]:
# @title
def generate_academic_paper(user_query: str, top_k: int = 6):
    """
    Generate a cited academic review for `user_query` with retrieval augmentation.

    Steps:
      1. Ask Gemini to rewrite the query for retrieval; on any failure
         (including an empty reply) fall back to the original query.
      2. Retrieve the `top_k` nearest chunks with `retrieve_documents`.
      3. Format the chunks as numbered `--- SOURCE i ---` entries and embed
         them, together with the original query, in the instruction prompt.
      4. Generate the review with the model stored in `llm_components`.

    The prompt is a raw `[INST] ... [/INST]` string that is tokenized
    directly; no chat template is applied here.

    Args:
        user_query: The question to answer; this (not the rewritten query)
            is what the prompt shows to the model.
        top_k: Number of chunks to retrieve.

    Returns:
        Tuple `(answer, contexts)`: the stripped generated text and the list
        of retrieved chunk dicts, in the order they were numbered.
    """

    # --- Gemini query optimization ---
    print(f"Original Query: {user_query}")
    print("Optimizing query with Gemini...")

    try:
        optimized_search_query = optimize_query_with_gemini(user_query)
        # An empty reply would make retrieval search for nothing; treat it as a failure.
        if not optimized_search_query or not optimized_search_query.strip():
            raise ValueError("Gemini returned an empty query")
        print(f"Search Query Optimized: {optimized_search_query}")
    except Exception as e:
        print(f"Gemini Optimization failed, using original query. Error: {e}")
        optimized_search_query = user_query

    # Retrieval uses the optimized query; the prompt below keeps the original one.
    print(f"Retrieving top {top_k} contexts...")
    contexts = retrieve_documents(optimized_search_query, top_k=top_k)
    context_block = ""
    for i, ctx in enumerate(contexts, 1):
        auth_str = get_full_author_list(ctx['authors'])
        year = ctx.get('year') or "n.d."
        title = ctx.get('title', 'Unknown Title')
        section = ctx.get('section', 'General')

        internal_refs_text = ""
        raw_refs = ctx.get('references', [])

        if raw_refs:
            internal_refs_text = "\n    > Studies cited within this text:\n"

            for ref in raw_refs:
                rid = ref.get('id')
                clean_text = clean_ref_text(ref.get('text'))
                if clean_text:
                    internal_refs_text += f"    * [Ref ID: {rid}] {clean_text}\n"

        # Label matches the `--- SOURCE n ---` format the prompt describes
        # (and the one query_qwen emits).
        context_block += f"--- SOURCE {i} ---\n"
        context_block += f"Primary Work: {title}\n"
        context_block += f"Authors: {auth_str} ({year})\n"
        context_block += f"Content (from {section}):\n{ctx['text']}\n"
        context_block += f"{internal_refs_text}\n"

    # `\\%` yields the same text as the former `\%` without the invalid-escape warning.
    system_prompt = f"""[INST] You are an expert Academic Literature Reviewer and Research Assistant.
    Your goal is to synthesize the provided academic papers into a coherent, objective, scientifically accurate, and highly readable review.

    ### I. CITATION & INDEXING PROTOCOLS (STRICTLY FOLLOW):
    1.  **SEQUENTIAL RE-INDEXING RULE (CRITICAL):**
        * You will receive sources labeled with various IDs (e.g., `--- SOURCE 5 ---`, `--- SOURCE 12 ---`).
        * **IGNORE** these original numbers for your citations.
        * **RE-NUMBER** them based on their order of appearance in the provided context:
            * The **1st** source listed in the context becomes **[1]**.
            * The **2nd** source listed in the context becomes **[2]**.
            * And so on.
        * *Example:* If the context shows `Source 10` followed by `Source 5`, cite the first one as [1] and the second as [2].

    2.  **QUALITY FILTER:**
        * If a provided source is empty, irrelevant, or lacks specific findings, **DO NOT USE IT**. Do not force a citation just to fill a quota. Only cite sources that contribute meaningful information.

    3.  **SECONDARY SOURCES:**
        * If referencing a study cited *within* a source (e.g., Smith, 2020), state: "Smith (2020, cited in [1])..."

    ### II. FORMATTING & STYLE GUIDELINES:
    * **Tone:** Objective, formal, and academic. No conversational filler.
    * **Structure:** Use **Headings (`##`)** for themes, **Bolding** for key terms, and **Bullet Points** for lists.
    * **LaTeX:** Use `$...$` for inline math (e.g., $p < 0.05$) and `$$...$$` for block equations. Do NOT use LaTeX for simple units (e.g., write "15%", not $15\\%$).

    ### III. CRITICAL NEGATIVE CONSTRAINTS:
    1.  **NO HALLUCINATIONS:** If the answer is not in the sources, do not invent it.
    2.  **NO SOURCE CONFLATION:** Keep findings distinct.
    3.  **NO META-TALK:** Do not write "The provided text says...". Start the review directly.
    4.  **NO REFERENCE LIST:** DO NOT generate a "References" section at the end.

    ### IV. ONE-SHOT EXAMPLE (EMULATE THIS STYLE):

    **Context Provided:**
    --- SOURCE 25 --- (First in list)
    Content: Method A achieves 90% accuracy.
    --- SOURCE 8 --- (Second in list)
    Content: Method B is faster but less accurate.

    **Ideal Response:**
    ## Performance Comparison
    Recent studies highlight a trade-off between accuracy and speed. Method A demonstrates superior precision, achieving **90% accuracy** [1]. In contrast, Method B prioritizes computational efficiency over raw performance [2].

    ### V. EXECUTION:
    AVAILABLE CONTEXT SOURCES:
    {context_block}
    ---

    USER QUERY:
    {user_query}

    Generate the academic review response body now.
    At the very end, add a single, short, italicized "Next Step" asking if the user wants to explore a specific aspect further. [/INST]
    """

    tokenizer = llm_components["tokenizer"]
    model = llm_components["model"]

    # Follow the model's own device instead of assuming "cuda" (same as query_qwen).
    inputs = tokenizer(system_prompt, return_tensors="pt").to(model.device)

    input_length = inputs.input_ids.shape[1]

    print("Generating academic text...")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=8192,
            temperature=0.2,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Keep only the newly generated tokens; the prompt occupies the first input_length positions.
    generated_tokens = outputs[0][input_length:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return answer.strip(), contexts

  answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)


In [None]:
def query_qwen(user_query: str, use_rag: bool = True, top_k: int = 6):
    """
    Answer `user_query` with the loaded chat model, with or without retrieval.

    Used to compare a baseline (model knowledge only) against RAG. In RAG
    mode the retrieved chunks are formatted as `--- SOURCE i ---` entries
    and embedded in the system prompt; in baseline mode a short generic
    system prompt is used. Either way the conversation is rendered with the
    tokenizer's chat template before generation.

    Args:
        user_query: The question to answer. It is retrieved against as-is
            (no Gemini rewriting in this function).
        use_rag: True to retrieve context and use the citation prompt,
            False for the baseline prompt.
        top_k: Number of chunks to retrieve in RAG mode.

    Returns:
        Tuple `(answer, contexts)`: the stripped generated text and the list
        of retrieved chunk dicts (empty in baseline mode).
    """

    tokenizer = llm_components["tokenizer"]
    model = llm_components["model"]

    contexts = []
    context_block = ""

    # --- RAG mode: retrieve chunks and format them as numbered sources ---
    if use_rag:
        print(f"Retrieving top {top_k} contexts for: '{user_query}'...")
        contexts = retrieve_documents(user_query, top_k=top_k)

        for i, ctx in enumerate(contexts, 1):
            auth_str = get_full_author_list(ctx['authors'])
            year = ctx.get('year') or "n.d."
            title = ctx.get('title', 'Unknown Title')
            section = ctx.get('section', 'General')

            # References cited inside the chunk, listed under the source.
            internal_refs_text = ""
            raw_refs = ctx.get('references', [])

            if raw_refs:
                internal_refs_text = "\n    > Studies cited within this text:\n"
                for ref in raw_refs:
                    rid = ref.get('id')
                    clean_text = clean_ref_text(ref.get('text'))
                    if clean_text:
                        internal_refs_text += f"    * [Ref ID: {rid}] {clean_text}\n"

            # Build this source's entry in the context block.
            context_block += f"--- SOURCE {i} ---\n"
            context_block += f"Primary Work: {title}\n"
            context_block += f"Authors: {auth_str} ({year})\n"
            context_block += f"Content (from {section}):\n{ctx['text']}\n"
            context_block += f"{internal_refs_text}\n"

        # System prompt for RAG mode.
        # NOTE(review): the text keeps its `[INST]` tags and already embeds the
        # user query, which is then sent again as the user message below.
        # `\\%` yields the same text as the former `\%` without the
        # invalid-escape warning.
        system_instruction = f"""[INST] You are an expert Academic Literature Reviewer and Research Assistant.
    Your goal is to synthesize the provided academic papers into a coherent, objective, scientifically accurate, and highly readable review.

    ### I. CITATION & INDEXING PROTOCOLS (STRICTLY FOLLOW):
    1.  **SEQUENTIAL RE-INDEXING RULE (CRITICAL):**
        * You will receive sources labeled with various IDs (e.g., `--- SOURCE 5 ---`, `--- SOURCE 12 ---`).
        * **IGNORE** these original numbers for your citations.
        * **RE-NUMBER** them based on their order of appearance in the provided context:
            * The **1st** source listed in the context becomes **[1]**.
            * The **2nd** source listed in the context becomes **[2]**.
            * And so on.
        * *Example:* If the context shows `Source 10` followed by `Source 5`, cite the first one as [1] and the second as [2].

    2.  **QUALITY FILTER:**
        * If a provided source is empty, irrelevant, or lacks specific findings, **DO NOT USE IT**. Do not force a citation just to fill a quota. Only cite sources that contribute meaningful information.

    3.  **SECONDARY SOURCES:**
        * If referencing a study cited *within* a source (e.g., Smith, 2020), state: "Smith (2020, cited in [1])..."

    ### II. FORMATTING & STYLE GUIDELINES:
    * **Tone:** Objective, formal, and academic. No conversational filler.
    * **Structure:** Use **Headings (`##`)** for themes, **Bolding** for key terms, and **Bullet Points** for lists.
    * **LaTeX:** Use `$...$` for inline math (e.g., $p < 0.05$) and `$$...$$` for block equations. Do NOT use LaTeX for simple units (e.g., write "15%", not $15\\%$).

    ### III. CRITICAL NEGATIVE CONSTRAINTS:
    1.  **NO HALLUCINATIONS:** If the answer is not in the sources, do not invent it.
    2.  **NO SOURCE CONFLATION:** Keep findings distinct.
    3.  **NO META-TALK:** Do not write "The provided text says...". Start the review directly.
    4.  **NO REFERENCE LIST:** DO NOT generate a "References" section at the end.

    ### IV. ONE-SHOT EXAMPLE (EMULATE THIS STYLE):

    **Context Provided:**
    --- SOURCE 25 --- (First in list)
    Content: Method A achieves 90% accuracy.
    --- SOURCE 8 --- (Second in list)
    Content: Method B is faster but less accurate.

    **Ideal Response:**
    ## Performance Comparison
    Recent studies highlight a trade-off between accuracy and speed. Method A demonstrates superior precision, achieving **90% accuracy** [1]. In contrast, Method B prioritizes computational efficiency over raw performance [2].

    ### V. EXECUTION:
    AVAILABLE CONTEXT SOURCES:
    {context_block}
    ---

    USER QUERY:
    {user_query}

    Generate the academic review response body now.
    At the very end, add a single, short, italicized "Next Step" asking if the user wants to explore a specific aspect further. [/INST]
    """
    else:
        # --- Baseline mode: no retrieved context ---
        system_instruction = """You are an expert Academic Researcher.
Answer the user's question using your internal knowledge base.
Maintain a formal, objective, and scientific tone."""

    # --- Chat format ---
    # Instead of a hand-built [INST] prompt, pass a messages list through apply_chat_template.
    messages = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": user_query}
    ]

    # Render the conversation to text; it is tokenized in the next step.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    print(f"Generating Qwen response ({'RAG' if use_rag else 'BASELINE'})...")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=4096,  # room for a long academic text
            temperature=0.2,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Keep only the generated part (cut off the prompt tokens).
    generated_ids = outputs[0][inputs.input_ids.shape[1]:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True)

    return answer.strip(), contexts

  * **Strictly Avoid** LaTeX for simple numbers or units (e.g., write "15%" or "200 km", NOT $15\%$).
  


# **MAIN EXECUTION**

In [None]:
def sanitize_table_cell(text):
    """
    Make a value safe to place inside a Markdown table cell.

    Falsy input becomes "N/A". Otherwise the value is stringified, line
    breaks become spaces, '|' becomes the HTML entity '&#124;', whitespace
    runs collapse to one space and the result is trimmed.
    """
    if not text:
        return "N/A"

    cell = str(text)
    for needle, substitute in (('\n', ' '), ('\r', ' '), ('|', '&#124;')):
        cell = cell.replace(needle, substitute)
    return re.sub(r'\s+', ' ', cell).strip()

In [None]:
import re
from collections import defaultdict
from IPython.display import Markdown, display

def clean_and_display_report(query, top_k=5):
    """
    Generate a review for `query` and render it with a table of the cited sources.

    Calls `generate_academic_paper`, trims a trailing reference-style
    heading and leading indentation from the generated text, collects the
    `[n]` citation numbers used in it, and displays a Markdown report that
    lists only the retrieved chunks whose number was cited, grouped by title.

    Args:
        query: The question passed to `generate_academic_paper`.
        top_k: Number of chunks to retrieve.

    Returns:
        None. The report is shown through `IPython.display`.
    """
    generated_text, used_sources = generate_academic_paper(query, top_k=top_k)

    # Drop a reference-style heading the model may have appended. Without
    # DOTALL the pattern only matches when that heading is on the last line.
    split_pattern = r'(?i)\n\s*(References|Bibliography|Sources|Studies cited within).*$'
    parts = re.split(split_pattern, generated_text)
    clean_text = parts[0] if parts else generated_text
    # Strip leading indentation from the trimmed text. This previously re-read
    # `generated_text`, which silently undid the trimming above.
    clean_text = re.sub(r'^[ \t]+', '', clean_text, flags=re.MULTILINE)

    # Citation numbers that actually appear in the text, e.g. "[3]".
    cited_indices = {int(m) for m in re.findall(r'\[(\d+)]', clean_text)}

    markdown_report = "## Generated Academic Text\n\n"
    markdown_report += clean_text.strip() + "\n\n"
    markdown_report += "---\n\n"

    # Group cited chunks by paper title; ids follow the 1-based order used in the prompt.
    grouped_sources = defaultdict(list)
    for i, src in enumerate(used_sources, 1):
        if i not in cited_indices:
            continue

        title = src.get('title', 'Unknown Title')
        grouped_sources[title].append({
            "id": i,
            "section": src.get('section', 'General'),
            "distance": src.get('distance'),
            "authors": src.get('authors'),
            "year": src.get("year"),
            "url": src.get("url"),
            "references": src.get('references', [])
        })

    if not grouped_sources:
        markdown_report += "> *No sources were directly cited in the text.*"
    else:
        markdown_report += "## Bibliography & Source References\n\n"

        for title, chunks in grouped_sources.items():
            # Paper-level fields are taken from the first chunk of the group.
            first_chunk = chunks[0]
            full_authors = get_full_author_list(first_chunk['authors'])

            safe_title = sanitize_table_cell(title)

            year = first_chunk['year'] or "n.d."
            url = first_chunk['url'] or "n.d."

            source_ids = ", ".join([str(c['id']) for c in chunks])

            markdown_report += f"### [Source {source_ids}] {safe_title}\n"
            markdown_report += f"**Authors:** *{full_authors}* ({year})\n"
            markdown_report += f"**Url:** *{url}*\n\n"

            markdown_report += "| Ref ID | Section Used | Key Citations Inside | Score |\n"
            markdown_report += "| :---: | :--- | :--- | :---: |\n"

            for c in chunks:
                # Shown as 1 - distance, so higher means closer to the query.
                score = 1 - c['distance']
                safe_section = sanitize_table_cell(c['section'])

                inner_refs_display = "-"
                if c['references']:
                    refs_list = []
                    for r in c['references']:
                        rid = r.get('id')
                        rtext = clean_ref_text(r.get('text', ''))
                        safe_rtext = sanitize_table_cell(rtext)
                        refs_list.append(f"• [{rid}] {safe_rtext}")

                    inner_refs_display = "<br>".join(refs_list)

                markdown_report += f"| **[{c['id']}]** | {safe_section} | {inner_refs_display} | **{score:.2f}** |\n"

            # The blank line after <br> closes the HTML block; without it the
            # next "###" heading is treated as raw HTML text, not a heading.
            markdown_report += "\n<br>\n\n"

    display(Markdown(markdown_report))

In [None]:
# @title
# Example question for the Gemini-optimized RAG report.
query = "What is the primary research function of the BacPrep platform and what model does it currently employ?"

# The second positional argument is top_k: retrieve 10 chunks for this report.
clean_and_display_report(query, 10)





In [None]:
import re
from collections import defaultdict
from IPython.display import Markdown, display

def clean_and_display_report_qwen(query, use_rag=True, top_k=5):
    """
    Generate an answer with `query_qwen` and display it as a Markdown report.

    Args:
        query: The question passed to `query_qwen`.
        use_rag: True renders the answer followed by a table of the cited
            sources; False renders only the baseline answer.
        top_k: Number of chunks to retrieve in RAG mode.

    Returns:
        None. The report is shown through `IPython.display`.
    """

    generated_text, used_sources = query_qwen(query, use_rag=use_rag, top_k=top_k)

    # Text cleanup: drop a reference-style heading the model may have appended
    # (without DOTALL the pattern only matches on the last line), then strip
    # leading indentation.
    split_pattern = r'(?i)\n\s*(References|Bibliography|Sources|Studies cited within).*$'
    parts = re.split(split_pattern, generated_text)
    clean_text = parts[0] if parts else generated_text
    clean_text = re.sub(r'^[ \t]+', '', clean_text, flags=re.MULTILINE)

    # Report heading.
    mode_title = "RAG Augmented Response (Qwen)" if use_rag else "Baseline Response (Qwen)"
    markdown_report = f"## {mode_title}\n\n"
    markdown_report += clean_text.strip() + "\n\n"
    markdown_report += "---\n\n"

    # --- Baseline mode (or nothing retrieved): stop after the answer ---
    if not use_rag or not used_sources:
        display(Markdown(markdown_report))
        return

    # --- RAG mode: build the source table ---
    # Citation numbers that actually appear in the text, e.g. "[3]".
    cited_indices = {int(m) for m in re.findall(r'\[(\d+)]', clean_text)}

    grouped_sources = defaultdict(list)
    for i, src in enumerate(used_sources, 1):
        # List only the sources that were cited in the text.
        if i not in cited_indices:
            continue

        title = src.get('title', 'Unknown Title')
        grouped_sources[title].append({
            "id": i,
            "section": src.get('section', 'General'),
            "distance": src.get('distance'),
            "authors": src.get('authors'),
            "year": src.get("year"),
            "url": src.get("url"),
            "references": src.get('references', [])
        })

    if not grouped_sources:
        markdown_report += "> *No sources were directly cited in the text although RAG was active.*"
    else:
        markdown_report += "## References\n\n"

        for title, chunks in grouped_sources.items():
            # Paper-level fields are taken from the first chunk of the group.
            first_chunk = chunks[0]
            full_authors = get_full_author_list(first_chunk['authors'])

            safe_title = sanitize_table_cell(title)
            year = first_chunk['year'] or "n.d."
            url = first_chunk['url'] or "n.d."
            source_ids = ", ".join([str(c['id']) for c in chunks])

            markdown_report += f"### [Source {source_ids}] {safe_title}\n"
            markdown_report += f"**Authors:** *{full_authors}* ({year})\n"
            markdown_report += f"**Url:** *{url}*\n\n"

            markdown_report += "| Ref ID | Section Used | Key Citations Inside | Score |\n"
            markdown_report += "| :---: | :--- | :--- | :---: |\n"

            for c in chunks:
                # Shown as 1 - distance, so higher means closer to the query.
                score = 1 - c['distance']
                safe_section = sanitize_table_cell(c['section'])

                inner_refs_display = "-"
                if c['references']:
                    refs_list = []
                    for r in c['references']:
                        rid = r.get('id')
                        rtext = clean_ref_text(r.get('text', ''))
                        safe_rtext = sanitize_table_cell(rtext)
                        refs_list.append(f"• [{rid}] {safe_rtext}")
                    inner_refs_display = "<br>".join(refs_list)

                markdown_report += f"| **[{c['id']}]** | {safe_section} | {inner_refs_display} | **{score:.2f}** |\n"

            # The blank line after <br> closes the HTML block; without it the
            # next "###" heading is treated as raw HTML text, not a heading.
            markdown_report += "\n<br>\n\n"

    display(Markdown(markdown_report))

In [None]:
# Define the query in this cell so it does not depend on whichever `query`
# an earlier cell left in the kernel.
# query = "what is the advantages of transformers to RNN's"
query = "Are LLM's better GNN's"

# 1. Baseline Qwen (no vector database; the model's own knowledge only)
# print(">>> RUNNING BASELINE MODE...")
# clean_and_display_report_qwen(query, use_rag=False)

# 2. RAG Qwen (retrieved sources plus the reference table)
# print("\n>>> RUNNING RAG MODE...")
clean_and_display_report_qwen(query, use_rag=True, top_k=20)

Retrieving top 20 contexts for: 'Are LLM's better GNN's'...
Generating Qwen response (RAG)...


## RAG Augmented Response (Qwen)

## LLMs vs. GNNs in Graph Learning

Recent studies have explored the integration of Large Language Models (LLMs) with Graph Neural Networks (GNNs) to enhance robust graph learning under various conditions. The empirical analysis conducted in the work by Wang et al. (2025) reveals that LLM augmentations often fall behind simpler GNN counterparts under modest deficiencies. Specifically, both LLM-as-Encoder and LLM-as-Enhancer paradigms exhibit comparable or worse accuracy than traditional GNN-based methods (Sources [1], [2]). This observation underscores the limitations of LLMs in providing semantically diverse and coherent augmentations, which are crucial for improving the discriminative capacity of GNNs. The authors attribute these limitations to the inherent semantic homogeneity in LLM-generated content, leading to high intra-class variance and small inter-class margins in learned representations (Source [17]).

In contrast, GraphLM, as described in another study, integrates structural features with broader semantic understanding, thereby addressing some of the limitations of GNNs and GraphTransformers. GraphLM is noted for its ability to handle long-range dependencies and hierarchical structures more comprehensively, making it a promising alternative for tasks requiring deep graph comprehension (Source [15]).

While the current evidence suggests that simpler GNN-based methods often outperform LLM-enhanced approaches under less severe deficiencies, the potential of LLMs in more complex scenarios remains an open area of research. Further investigation is needed to explore how LLMs can be optimized to better complement GNNs in specific applications.

*Next Step*: Would you like to explore the specific limitations of LLMs in detail or delve into the potential synergies between LLMs and GNNs in more complex tasks?

---

## References

### [Source 1, 2, 17] Are LLMs Better GNN Helpers? Rethinking Robust Graph Learning under Deficiencies with Iterative Refinement
**Authors:** *Zhaoyan Wang, Zheng Gao, Arogya Kharel, In-Young Ko* (2025)
**Url:** *https://arxiv.org//pdf/2510.01910*

| Ref ID | Section Used | Key Citations Inside | Score |
| :---: | :--- | :--- | :---: |
| **[1]** | Empirical Analysis | - | **0.63** |
| **[2]** | Empirical Analysis | - | **0.63** |
| **[17]** | Methodology 4.1 System Overview | - | **0.59** |

<br>
### [Source 15] Beyond Textual Context: Structural Graph Encoding with Adaptive Space Alignment to alleviate the hallucination of LLMs
**Authors:** *Yifang Zhang, Pengfei Duan, Yiwen Yang, Shengwu Xiong* (2025)
**Url:** *https://arxiv.org//pdf/2509.22251*

| Ref ID | Section Used | Key Citations Inside | Score |
| :---: | :--- | :--- | :---: |
| **[15]** | Why structural information of KGs is important and must not be overlooked? | - | **0.59** |

<br>


# **METRICS**

In [None]:
!pip install ragas datasets langchain

Collecting ragas
  Downloading ragas-0.4.1-py3-none-any.whl.metadata (22 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting instructor (from ragas)
  Downloading instructor-1.13.0-py3-none-any.whl.metadata (11 kB)
Collecting scikit-network (from ragas)
  Downloading scikit_network-0.33.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting langchain-community (from ragas)
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain_openai (from ragas)
  Downloading langchain_openai-1.1.5-py3-none-any.whl.metadata (2.6 kB)
Collecting jiter<1,>=0.10.0 (from openai>=1.0.0->ragas)
  Downloading jiter-0.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting pre-commit>=4.3.0 (from instructor->ragas)
  Downloading pre_co

In [None]:
!pip install langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Downloading langchain_huggingface-1.2.0-py3-none-any.whl (30 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-1.2.0


In [None]:
import json
from tqdm import tqdm
import contextlib
import io

# Load the test set, reusing `test_data` if an earlier cell already defined it.
try:
    test_data
except NameError:
    with open("/content/drive/MyDrive/NLP/codes/data/test_data.json", "r", encoding="utf-8") as f:
        test_data = json.load(f)

# --- TOKEN AND FORMAT CONSTRAINT ---
# Appended to every question so the generator is told to answer in exactly 3 paragraphs.
constraint_suffix = "\n\nCRITICAL INSTRUCTION: Answer this question in EXACTLY 3 paragraphs. Ensure the answer is concise and directly addresses the prompt."

rag_results = []
baseline_results = []
# One (mode, question, error) tuple per query that raised, so failures are
# reported after the loop instead of being silently dropped.
generation_failures = []

print("🚀 Veri üretimi (Generator: Qwen, Kısıt: 3 Paragraf) başlıyor...")

# Hide everything written to stdout inside the loop. Failures are therefore
# collected in `generation_failures` and printed once stdout is restored.
with contextlib.redirect_stdout(io.StringIO()):
    for item in tqdm(test_data, desc="Sorular İşleniyor"):
        original_query = item["question"]

        # The constrained prompt is sent to the model; the stored record keeps
        # the original question text.
        modified_query = f"{original_query}{constraint_suffix}"

        # --- A) RAG mode ---
        try:
            rag_answer, rag_contexts_dicts = query_qwen(modified_query, use_rag=True, top_k=5)
            rag_context_texts = [ctx['text'] for ctx in rag_contexts_dicts]

            rag_results.append({
                "question": original_query,
                "answer": rag_answer,
                "contexts": rag_context_texts,
                "ground_truth": item.get("ground_truth", "")
            })
        except Exception as e:
            # Best effort: keep processing the remaining questions, but remember the error.
            generation_failures.append(("RAG", original_query, repr(e)))

        # --- B) Baseline mode (no retrieval, hence empty contexts) ---
        try:
            base_answer, _ = query_qwen(modified_query, use_rag=False)

            baseline_results.append({
                "question": original_query,
                "answer": base_answer,
                "contexts": [],
                "ground_truth": item.get("ground_truth", "")
            })
        except Exception as e:
            generation_failures.append(("Baseline", original_query, repr(e)))

# Surface the failures that were previously swallowed by `except: pass`.
if generation_failures:
    print(f"\n⚠️ {len(generation_failures)} sorgu başarısız oldu:")
    for failed_mode, failed_question, failed_error in generation_failures:
        print(f"  [{failed_mode}] {str(failed_question)[:80]} -> {failed_error[:200]}")

# Save both result sets as JSON. The path must be the one the evaluation cell
# reads as INPUT_FILE (under .../codes/data/); it used to be written one
# directory higher, so the evaluation cell could not find the fresh file.
output_file = "/content/drive/MyDrive/NLP/codes/data/qwen_constrained_results_for_mistral_eval.json"
combined_data = {
    "rag": rag_results,
    "baseline": baseline_results
}

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(combined_data, f, ensure_ascii=False, indent=4)

print(f"\n✅ Veri üretimi tamamlandı: {output_file}")

Sorular İşleniyor:   0%|          | 0/50 [00:00<?, ?it/s]

🚀 Veri üretimi (Generator: Qwen, Kısıt: 3 Paragraf) başlıyor...


Sorular İşleniyor:  56%|█████▌    | 28/50 [12:31<09:50, 26.83s/it]


KeyboardInterrupt: 

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy
import pandas as pd
import json
import gc

# --- 1. LOAD THE MISTRAL JUDGE ---
print("⚖️ Jüri Modeli (Mistral 7B) Yükleniyor...")

# The judge is Mistral (not Qwen, which generated the answers being scored).
JUDGE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

try:
    judge_tokenizer = AutoTokenizer.from_pretrained(JUDGE_MODEL_ID, trust_remote_code=True)
    judge_model = AutoModelForCausalLM.from_pretrained(
        JUDGE_MODEL_ID,
        dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )
except Exception as e:
    print(f"Model yüklenirken hata oluştu: {e}")
    # If this is a gated-model error, check the HF_TOKEN secret.
    # Re-raise: continuing would only fail below with a misleading NameError
    # on `judge_model` / `judge_tokenizer`.
    raise

# Text-generation pipeline handed to Ragas as the judge LLM.
judge_pipe = pipeline(
    "text-generation",
    model=judge_model,
    tokenizer=judge_tokenizer,
    max_new_tokens=1024,          # 512 sometimes cut the JSON output off midway
    do_sample=False,              # critical: greedy decoding for stable JSON formatting
    temperature=0.0,
    top_p=1.0,
    repetition_penalty=1.0,       # 1.0 is neutral, i.e. no repetition penalty is applied
    eos_token_id=judge_tokenizer.eos_token_id,
    pad_token_id=judge_tokenizer.eos_token_id,  # the EOS token id doubles as the pad id
    return_full_text=False        # return only the completion, not the prompt
)

mistral_judge = HuggingFacePipeline(pipeline=judge_pipe)
print("✅ Mistral Judge Hazır.")

# --- 2. EMBEDDING MODEL ---
# Passed to Ragas as `embeddings`; vectors are L2-normalised.
embeddings_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    model_kwargs={'device': 'cuda'},
    encode_kwargs={'normalize_embeddings': True}
)

# --- 3. LOAD THE DATA ---
# Adjust these paths to your own Drive layout.
INPUT_FILE = "/content/drive/MyDrive/NLP/codes/data/qwen_constrained_results_for_mistral_eval.json"
OUTPUT_FILE = "/content/drive/MyDrive/NLP/codes/data/final_metrics_judge_mistral.csv"

with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# The JSON holds two record lists: "rag" and "baseline".
rag_dataset = Dataset.from_list(data["rag"])
baseline_dataset = Dataset.from_list(data["baseline"])

# --- 4. DEĞERLENDİRME FONKSİYONU ---
def evaluate_with_mistral(dataset, metric_list, name):
    """Score `dataset` one sample at a time with the Mistral judge.

    Each row is evaluated in its own `evaluate` call so that a failure on one
    sample is reported and skipped instead of aborting the whole run.

    Args:
        dataset: Object supporting `len()` and `.select([i])`.
        metric_list: Metrics forwarded to `evaluate` as `metrics`.
        name: Label used in the progress and error messages.

    Returns:
        A DataFrame concatenating the per-sample score frames, or an empty
        DataFrame when no sample could be scored.
    """
    print(f"\n--- {name} Değerlendirmesi Başlıyor (Jüri: Mistral) ---")
    per_sample_frames = []

    for row_idx in range(len(dataset)):
        single_row = dataset.select([row_idx])
        try:
            # raise_exceptions=True so a failure surfaces here as an exception.
            sample_score = evaluate(
                dataset=single_row,
                metrics=metric_list,
                llm=mistral_judge,
                embeddings=embeddings_model,
                raise_exceptions=True,
            )
            per_sample_frames.append(sample_score.to_pandas())
            print(".", end="", flush=True)
        except Exception as err:
            print("x", end="", flush=True)
            # Truncate the message so a long error does not flood the output.
            print(f"\n[HATA - {name} Index {row_idx}]: {str(err)[:100]}...")

    if not per_sample_frames:
        return pd.DataFrame()
    return pd.concat(per_sample_frames, ignore_index=True)

# --- 5. RUN ---

# RAG evaluation: faithfulness and answer relevancy.
rag_df = evaluate_with_mistral(rag_dataset, [faithfulness, answer_relevancy], "RAG")
if not rag_df.empty:
    rag_df["mode"] = "RAG"

# Baseline evaluation: answer relevancy only (baseline records have no contexts).
baseline_df = evaluate_with_mistral(baseline_dataset, [answer_relevancy], "Baseline")
if not baseline_df.empty:
    baseline_df["mode"] = "Baseline"

# --- 6. REPORT ---
# Only non-empty frames are concatenated; each of them already has a "mode" column.
scored_frames = [frame for frame in (rag_df, baseline_df) if not frame.empty]

if scored_frames:
    final_df = pd.concat(scored_frames, ignore_index=True)
    final_df.to_csv(OUTPUT_FILE, index=False)

    print("\n\n=== SONUÇ ÖZETİ (Jüri: Mistral) ===")
    # Select only the metric columns that exist: if every RAG sample failed,
    # "faithfulness" is missing and a fixed column list would raise KeyError.
    metric_cols = [col for col in ("faithfulness", "answer_relevancy") if col in final_df.columns]
    if metric_cols:
        summary = final_df.groupby("mode")[metric_cols].mean()
    else:
        # Neither expected metric column is present; average whatever is numeric.
        summary = final_df.groupby("mode").mean(numeric_only=True)
    display(summary)

    print(f"\n✅ Sonuçlar '{OUTPUT_FILE}' dosyasına kaydedildi.")
else:
    print("❌ Hiçbir değerlendirme tamamlanamadı.")