!pip install PyPDF2 rank_bm25 sentence-transformers faiss-cpu scikit-learn openai

In [None]:
# Dependencies
from google.colab import files
import PyPDF2, re, numpy as np, faiss
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
import os
from itertools import chain
import openai

In [None]:
# Upload PDFs through the Colab file picker; the keys of the returned
# mapping are taken as the uploaded filenames.
uploaded = files.upload()
pdf_files = [name for name in uploaded.keys()]
print(f"A total of {len(pdf_files)} PDF(s) uploaded")

# 1. Document-level segmentation

In [None]:
# ================== 1. Locate PDFs =================
# Folder that holds the PDFs to index
folder_path = "/content"

# Collect all PDF filenames. os.listdir() returns entries in arbitrary order,
# so sort them to keep document order (and therefore index positions and
# tie-breaking during retrieval) identical from run to run.
pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.pdf'))
print(f"A total of {len(pdf_files)} PDF file(s) found")

In [None]:
# ============ 2. Text Cleaning + Document-level Concatenation =============
def clean_line(s: str) -> str:
    """Drop hyphen-plus-newline breaks, then squeeze whitespace runs to one space."""
    dehyphenated = re.sub(r'-\s*\n', '', s)
    squeezed = re.sub(r'\s+', ' ', dehyphenated)
    return squeezed.strip()

def is_author_line(line: str) -> bool:
    """Return True for lines that look like an author list.

    A line qualifies when it contains at least two "Capitalised Capitalised"
    word pairs and none of the predicate words is/was/were/are/has/have.
    """
    name_pairs = re.findall(r'[A-Z][a-z]+\s+[A-Z][a-z]+', line)
    if len(name_pairs) < 2:
        return False
    has_predicate = re.search(r'\b(is|was|were|are|has|have)\b', line, re.I)
    return has_predicate is None

def is_metadata_line(line: str) -> bool:
    """Return True when *line* should be dropped as non-body text.

    Checks, in order: publisher/journal/affiliation keywords, front-matter
    headings, author-list lines (via is_author_line), and finally lines of
    eight or more words that contain none of a fixed list of verbs.
    """
    publisher_pattern = (
        r'(Elsevier|Springer|doi|ISSN|eISSN|Published|Available online|ScienceDirect|'
        r'Correspondence|Open Access|Author information|Received|Accepted|All rights reserved|'
        r'Journal|Volume|Issue|Editor|University|Department|Faculty|Copyright)'
    )
    front_matter_pattern = r'(ARTICLE INFO|Keywords|ABSTRACT|Article history|Resources Policy)'
    verb_pattern = (
        r'\b(is|was|were|are|has|have|using|used|based|conducted|shows|analyze|explore|assess|'
        r'estimate|report|evaluate|demonstrate)\b'
    )

    for pattern in (publisher_pattern, front_matter_pattern):
        if re.search(pattern, line, re.I):
            return True

    if is_author_line(line):
        return True

    # Long lines without any recognised verb are treated as non-sentences.
    if len(line.split()) < 8:
        return False
    return re.search(verb_pattern, line, re.I) is None

def merge_lines(lines):
    """Join consecutive lines into paragraphs.

    A paragraph is closed once its accumulated text ends in sentence-final
    punctuation (ASCII or full-width); otherwise the next line is appended
    with a single space.
    """
    paragraphs = []
    current = ''
    for piece in lines:
        if not current:
            current = piece
        elif re.search(r'[.!?。！？]$', current):
            paragraphs.append(current)
            current = piece
        else:
            current = current + ' ' + piece
    if current:
        paragraphs.append(current)
    return paragraphs


# One cleaned text per PDF (doc_texts) with the matching filename (doc_files).
doc_texts, doc_files = [], []

for file in pdf_files:
    # pdf_files holds bare names from os.listdir(folder_path); resolve them
    # against that folder so the loop does not depend on the current working
    # directory. An absolute name passes through os.path.join unchanged.
    pdf_path = os.path.join(folder_path, file)
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        raw_lines = []

        for pg in reader.pages:
            # extract_text() may yield nothing for a page; treat that as empty
            raw = pg.extract_text() or ''
            for ln in raw.split('\n'):
                ln = clean_line(ln)
                if ln and not is_metadata_line(ln):
                    raw_lines.append(ln)

        # Concatenate all natural paragraphs into one long text
        paragraphs = merge_lines(raw_lines)
        long_text  = ' '.join(paragraphs).strip()

        if long_text:          # filter out empty documents
            doc_texts.append(long_text)
            doc_files.append(file)

assert doc_texts, " No main text extracted, please check the PDFs."
print(" Cleaning complete, number of documents:", len(doc_texts))


In [None]:
# ================== 3. Building Index ==================
# --- A. TF-IDF + Cosine ---
# With a single document, max_df=0.95 resolves to fewer than one document and
# scikit-learn rejects it ("max_df corresponds to < documents than min_df"),
# so disable the upper document-frequency cut-off in that case.
tfidf_max_df = 0.95 if len(doc_texts) > 1 else 1.0
vectorizer = TfidfVectorizer(stop_words='english', max_df=tfidf_max_df)
tfidf_mat  = vectorizer.fit_transform(doc_texts)
print(" A: TF-IDF index is ready")

# --- B. BM25 ---
# Lower-cased whitespace tokens; queries are tokenised the same way in retrieve_B
bm25 = BM25Okapi([doc.lower().split() for doc in doc_texts])
print(" B: BM25 index is ready")

# --- C. SBERT + FAISS ---
# Embeddings are L2-normalised, so the inner-product index ranks by cosine similarity
sbert = SentenceTransformer('all-MiniLM-L6-v2')
embs  = sbert.encode(doc_texts, normalize_embeddings=True, show_progress_bar=False)
index = faiss.IndexFlatIP(embs.shape[1])
index.add(embs.astype('float32'))
print(" C: SBERT embeddings + FAISS index is ready")

In [None]:
# ================== 4. Retrieval Functions ==================
def retrieve_A(q, k=3):
    """Return the top-*k* documents for *q* by TF-IDF cosine similarity.

    Each hit is a (filename, text, score) tuple, highest score first.
    """
    query_vec = vectorizer.transform([q])
    sims = cosine_similarity(query_vec, tfidf_mat).flatten()
    ranked = sims.argsort()[::-1]
    hits = []
    for i in ranked[:k]:
        hits.append((doc_files[i], doc_texts[i], float(sims[i])))
    return hits

def retrieve_B(q, k=3):
    """Return the top-*k* documents for *q* by BM25 score.

    The query is lower-cased and whitespace-split before scoring. Each hit is
    a (filename, text, score) tuple, highest score first.
    """
    query_tokens = q.lower().split()
    scores = bm25.get_scores(query_tokens)
    ranked = np.argsort(scores)[::-1]
    hits = []
    for i in ranked[:k]:
        hits.append((doc_files[i], doc_texts[i], float(scores[i])))
    return hits

def retrieve_C(q, k=3):
    """Return up to *k* documents for *q* by SBERT embedding similarity.

    Each hit is a (filename, text, score) tuple in the order FAISS returns
    them. Fewer than *k* hits are returned when the index holds fewer vectors.
    """
    # FAISS pads missing neighbours with label -1 when k exceeds the index
    # size; indexing doc_files[-1] would then silently return the last
    # document with a sentinel score. Clamp k and drop any -1 labels.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    q_emb = sbert.encode([q], normalize_embeddings=True)
    sims, idx = index.search(q_emb.astype('float32'), k)
    return [(doc_files[i], doc_texts[i], float(sims[0][j]))
            for j, i in enumerate(idx[0]) if i >= 0]


## 1.1 GPT-3.5

In [None]:
# ================== 5. GPT Generation ==================
# Read the key from the environment so it is never saved inside the notebook;
# the empty-string fallback keeps the previous placeholder behaviour.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

def gen_with_ctx(query, docs, max_tokens=12000):
    """Answer *query* with gpt-3.5-turbo, using retrieved texts as context.

    docs: iterable of 3-tuples; only the second element (the text) is used.
    max_tokens: context budget. It is converted to a character budget by
        multiplying by 4 and is not sent to the API.
    Returns the content of the first completion choice.
    """
    remaining = max_tokens * 4
    pieces = []
    for _fn, text, _score in docs:
        if remaining <= 0:
            break
        # Truncate the last document so the total never exceeds the budget
        piece = text[:remaining]
        pieces.append(piece)
        remaining -= len(piece)

    context = "\n\n".join(pieces)

    system_prompt = (
        "You are an expert assistant in environmental policy research. "
        "When answering questions, do not refer to specific papers using phrases like 'this study' or 'the paper'. "
        "Instead, synthesize the content in an abstract, generalized manner, describing methods and findings without attributing them to individual sources."
    )
    user_prompt = (
        "The following are excerpts from multiple environmental policy documents:\n\n"
        f"{context}\n\n"
        "Based on the information above, answer the following question in clear and concise academic English:\n\n"
        f"{query}"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
    )
    return completion.choices[0].message.content


def gen_no_rag(query):
    """Answer *query* with gpt-3.5-turbo alone, without retrieved context."""
    messages = [{"role": "user", "content": query}]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
# ========= 6. Hybrid-RAG Construction =========
def merge_docs(*doc_lists, top_k=6, max_chars=1200):
    """Merge several retrieval result lists and keep the best *top_k* entries.

    doc_lists: any number of iterables of (filename, text, score) tuples.
    Entries sharing a filename and the same first 256 characters are treated
    as duplicates; the highest score among them is kept. Scores are compared
    as given, with no normalisation across lists.
    Returns (filename, text[:max_chars], score) tuples, highest score first.
    """
    best = {}
    for docs in doc_lists:
        for fn, txt, sc in docs:
            key = (fn, txt[:256])
            # Store the full text next to the score: rebuilding the text from
            # the key would cap every excerpt at the 256-character prefix and
            # make max_chars ineffective. Comparing against the stored score
            # (rather than a -1 default) also preserves scores below -1.
            if key not in best or sc > best[key][1]:
                best[key] = (txt, sc)

    merged = sorted(
        ((fn, txt[:max_chars], sc) for (fn, _prefix), (txt, sc) in best.items()),
        key=lambda item: item[2],
        reverse=True,
    )
    return merged[:top_k]


def gen_hybrid_rag(query, *doc_lists):
    """Produce a Hybrid-RAG answer: a No-RAG draft revised against evidence.

    doc_lists: retrieval result lists, forwarded to merge_docs.
    Returns (answer_text, merged_docs).
    """
    # Step 1: unaided draft
    draft = gen_no_rag(query)

    # Step 2: merged, numbered evidence excerpts
    docs = merge_docs(*doc_lists)
    numbered = [f"[{n}] {text}" for n, (_fn, text, _sc) in enumerate(docs, 1)]
    evidence_txt = "\n\n".join(numbered)

    # Step 3: ask the model to revise the draft and cite evidence numbers
    system_prompt = (
        "You are an expert environmental-policy assistant. "
        "Take the DRAFT answer the user already wrote, KEEP its structure, "
        "but augment it with precise facts drawn from the EVIDENCE below. "
        "Cite the evidence numbers (e.g. [1]) at relevant places. "
        "If draft statements conflict with evidence, correct them."
    )
    user_prompt = (
        f"DRAFT ANSWER:\n{draft}\n\n"
        f"EVIDENCE:\n{evidence_txt}\n\n"
        "Please return the enhanced answer."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
    )
    return completion.choices[0].message.content, docs


In [None]:
# ================== 7. Example Run ==================
query = "What monitoring techniques are suitable for measuring PM2.5?"

# Retrieve once per method and reuse the hits for generation and display
# (previously every retriever was executed twice).
docs_A = retrieve_A(query)
docs_B = retrieve_B(query)
docs_C = retrieve_C(query)

ans_A = gen_with_ctx(query, docs_A)
ans_B = gen_with_ctx(query, docs_B)
ans_C = gen_with_ctx(query, docs_C)
ans_D = gen_no_rag(query)

print("—— Experiment A (TF-IDF) ——\n", ans_A, "\n")
print("—— Experiment B (BM25) ——\n", ans_B, "\n")
print("—— Experiment C (SBERT+FAISS) ——\n", ans_C, "\n")
print("—— Experiment D (No-RAG) ——\n", ans_D)

# —— Experiment E (Hybrid-RAG) ——
ans_E, docs_E = gen_hybrid_rag(query, docs_A, docs_B, docs_C)
print("—— Experiment E (Hybrid-RAG) ——\n", ans_E)

# show_sources() is only defined in the following cell, so calling it here
# raises NameError on a fresh top-to-bottom run. Print the merged evidence
# inline in the same format instead.
print("\n===== Source Excerpts E =====")
for i, (fn, txt, sc) in enumerate(docs_E, 1):
    print(f"\n[{i}] {fn} | Score: {sc:.3f}\n{txt}\n")


In [None]:
# ================== 8. Display Source Excerpts ==================
def show_sources(docs, label):
    """Print every (filename, text, score) hit in *docs* under a *label* banner."""
    print(f"\n===== Source Excerpts {label} =====")
    rank = 0
    for source, excerpt, score in docs:
        rank += 1
        print(f"\n[{rank}] {source} | Score: {score:.3f}\n{excerpt}\n")

# Print the retrieved excerpts for experiments A, B and C in turn
for _label, _hits in (("A", docs_A), ("B", docs_B), ("C", docs_C)):
    show_sources(_hits, _label)


## 1.2 DeepSeek-CHAT

In [None]:
# ================== 5. DeepSeek Generation ==================
# Read the key from the environment so it is never saved inside the notebook;
# the empty-string fallback keeps the previous placeholder behaviour.
client = OpenAI(api_key=os.environ.get("DEEPSEEK_API_KEY", ""),
                base_url="https://api.deepseek.com")

def gen_with_ctx(query, docs, max_tokens=12000):
    """Answer *query* with deepseek-chat, using retrieved texts as context.

    docs: iterable of 3-tuples; only the second element (the text) is used.
    max_tokens: context budget. It is converted to a character budget by
        multiplying by 4 and is not sent to the API.
    Returns the content of the first completion choice.
    """
    remaining = max_tokens * 4
    pieces = []
    for _fn, text, _score in docs:
        if remaining <= 0:
            break
        # Truncate the last document so the total never exceeds the budget
        piece = text[:remaining]
        pieces.append(piece)
        remaining -= len(piece)

    context = "\n\n".join(pieces)

    system_prompt = (
        "You are an expert assistant in environmental policy research. "
        "When answering questions, do not refer to specific papers using phrases like 'this study' or 'the paper'. "
        "Instead, synthesize the content in an abstract, generalized manner, describing methods and findings without attributing them to individual sources."
    )
    user_prompt = (
        "The following are excerpts from multiple environmental policy documents:\n\n"
        f"{context}\n\n"
        "Based on the information above, answer the following question in clear and concise academic English:\n\n"
        f"{query}"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        temperature=0,
    )
    return completion.choices[0].message.content


def gen_no_rag(query):
    """Answer *query* with deepseek-chat alone, without retrieved context."""
    messages = [{"role": "user", "content": query}]
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# ========= 6. Hybrid-RAG Construction =========
def merge_docs(*doc_lists, top_k=6, max_chars=1200):
    """Merge several retrieval result lists and keep the best *top_k* entries.

    doc_lists: any number of iterables of (filename, text, score) tuples.
    Entries sharing a filename and the same first 256 characters are treated
    as duplicates; the highest score among them is kept. Scores are compared
    as given, with no normalisation across lists.
    Returns (filename, text[:max_chars], score) tuples, highest score first.
    """
    best = {}
    for docs in doc_lists:
        for fn, txt, sc in docs:
            key = (fn, txt[:256])
            # Store the full text next to the score: rebuilding the text from
            # the key would cap every excerpt at the 256-character prefix and
            # make max_chars ineffective. Comparing against the stored score
            # (rather than a -1 default) also preserves scores below -1.
            if key not in best or sc > best[key][1]:
                best[key] = (txt, sc)

    merged = sorted(
        ((fn, txt[:max_chars], sc) for (fn, _prefix), (txt, sc) in best.items()),
        key=lambda item: item[2],
        reverse=True,
    )
    return merged[:top_k]


def gen_hybrid_rag(query, *doc_lists):
    """Produce a Hybrid-RAG answer: a No-RAG draft revised against evidence.

    doc_lists: retrieval result lists, forwarded to merge_docs.
    Returns (answer_text, merged_docs).
    """
    # Step 1: unaided draft
    draft = gen_no_rag(query)

    # Step 2: merged, numbered evidence excerpts
    docs = merge_docs(*doc_lists)
    numbered = [f"[{n}] {text}" for n, (_fn, text, _sc) in enumerate(docs, 1)]
    evidence_txt = "\n\n".join(numbered)

    # Step 3: ask the model to revise the draft and cite evidence numbers
    system_prompt = (
        "You are an expert environmental-policy assistant. "
        "Take the DRAFT answer the user already wrote, KEEP its structure, "
        "but augment it with precise facts drawn from the EVIDENCE below. "
        "Cite the evidence numbers (e.g. [1]) at relevant places. "
        "If draft statements conflict with evidence, correct them."
    )
    user_prompt = (
        f"DRAFT ANSWER:\n{draft}\n\n"
        f"EVIDENCE:\n{evidence_txt}\n\n"
        "Please return the enhanced answer."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        temperature=0,
    )
    return completion.choices[0].message.content, docs


In [None]:
# ================== 7. Example Run ==================
# Same query text as the other PM2.5 runs: the stray leading space and
# full-width question mark were removed so all models see an identical query.
query = "What monitoring techniques are suitable for measuring PM2.5?"

# Retrieve once per method and reuse the hits for generation and display
# (previously every retriever was executed twice).
docs_A = retrieve_A(query)
docs_B = retrieve_B(query)
docs_C = retrieve_C(query)

ans_A = gen_with_ctx(query, docs_A)
ans_B = gen_with_ctx(query, docs_B)
ans_C = gen_with_ctx(query, docs_C)
ans_D = gen_no_rag(query)

# Labels carry the A-D experiment letters, matching the other sections
print("—— Experiment A (TF-IDF) ——\n", ans_A, "\n")
print("—— Experiment B (BM25) ——\n", ans_B, "\n")
print("—— Experiment C (SBERT+FAISS) ——\n", ans_C, "\n")
print("—— Experiment D (No-RAG) ——\n", ans_D)

# —— Experiment E (Hybrid-RAG) ——
ans_E, docs_E = gen_hybrid_rag(query, docs_A, docs_B, docs_C)
print("—— Experiment E (Hybrid-RAG) ——\n", ans_E)

# show_sources() is only defined in the following cell, so calling it here
# fails with NameError when this section is run on its own. Print the merged
# evidence inline in the same format instead.
print("\n===== Source Excerpts E =====")
for i, (fn, txt, sc) in enumerate(docs_E, 1):
    print(f"\n[{i}] {fn} | Score: {sc:.3f}\n{txt}\n")


In [None]:
# ================== 8. Display Source Excerpts ==================
def show_sources(docs, label):
    """Print every (filename, text, score) hit in *docs* under a *label* banner."""
    print(f"\n===== Source Excerpts {label} =====")
    for rank, hit in enumerate(docs, start=1):
        source, excerpt, score = hit
        print(f"\n[{rank}] {source} | Score: {score:.3f}\n{excerpt}\n")

# Print the retrieved excerpts for experiments A, B and C in turn
for _label, _hits in (("A", docs_A), ("B", docs_B), ("C", docs_C)):
    show_sources(_hits, _label)


## 1.3 LLaMA-3-8b

In [None]:
# ================== 5. LLaMA Generation ==================

# Read the key from the environment so it is never saved inside the notebook;
# the empty-string fallback keeps the previous placeholder behaviour.
client = openai.OpenAI(
    api_key=os.environ.get("OPENROUTER_API_KEY", ""),
    base_url="https://openrouter.ai/api/v1"
)

def gen_with_ctx(query, docs, max_tokens=12000):
    """Answer *query* with llama-3-8b-instruct, using retrieved texts as context.

    docs: iterable of 3-tuples; only the second element (the text) is used.
    max_tokens: context budget. It is converted to a character budget by
        multiplying by 4 and is not sent to the API.
    Returns the content of the first completion choice.
    """
    remaining = max_tokens * 4
    pieces = []
    for _fn, text, _score in docs:
        if remaining <= 0:
            break
        # Truncate the last document so the total never exceeds the budget
        piece = text[:remaining]
        pieces.append(piece)
        remaining -= len(piece)

    context = "\n\n".join(pieces)

    system_prompt = (
        "You are an expert assistant in environmental policy research. "
        "When answering questions, do not refer to specific papers using phrases like 'this study' or 'the paper'. "
        "Instead, synthesize the content in an abstract, generalized manner, describing methods and findings without attributing them to individual sources."
    )
    user_prompt = (
        "The following are excerpts from multiple environmental policy documents:\n\n"
        f"{context}\n\n"
        "Based on the information above, answer the following question in clear and concise academic English:\n\n"
        f"{query}"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="meta-llama/llama-3-8b-instruct",
        messages=messages,
        temperature=0.2,
    )
    return completion.choices[0].message.content


def gen_no_rag(query):
    """Answer *query* with llama-3-8b-instruct alone, without retrieved context."""
    messages = [{"role": "user", "content": query}]
    completion = client.chat.completions.create(
        model="meta-llama/llama-3-8b-instruct",
        messages=messages,
        temperature=0.2,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
# ========= 6. Hybrid-RAG Construction =========

def merge_docs(*doc_lists, top_k=6, max_chars=1200):
    """Merge several retrieval result lists and keep the best *top_k* entries.

    doc_lists: any number of iterables of (filename, text, score) tuples.
    Entries sharing a filename and the same first 256 characters are treated
    as duplicates; the highest score among them is kept. Scores are compared
    as given, with no normalisation across lists.
    Returns (filename, text[:max_chars], score) tuples, highest score first.
    """
    best = {}
    for docs in doc_lists:
        for fn, txt, sc in docs:
            key = (fn, txt[:256])
            # Store the full text next to the score: rebuilding the text from
            # the key would cap every excerpt at the 256-character prefix and
            # make max_chars ineffective. Comparing against the stored score
            # (rather than a -1 default) also preserves scores below -1.
            if key not in best or sc > best[key][1]:
                best[key] = (txt, sc)

    merged = sorted(
        ((fn, txt[:max_chars], sc) for (fn, _prefix), (txt, sc) in best.items()),
        key=lambda item: item[2],
        reverse=True,
    )
    return merged[:top_k]


def gen_hybrid_rag(query, *doc_lists):
    """Produce a Hybrid-RAG answer: a No-RAG draft revised against evidence.

    doc_lists: retrieval result lists, forwarded to merge_docs.
    Returns (answer_text, merged_docs).
    """
    model_name = "meta-llama/llama-3-8b-instruct"  # or llama-3-70b-instruct

    # Step 1: unaided draft, requested directly from the model
    draft_completion = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": query}],
        temperature=0.2,
    )
    draft = draft_completion.choices[0].message.content

    # Step 2: merged, numbered evidence excerpts
    docs = merge_docs(*doc_lists)
    numbered = [f"[{n}] {text}" for n, (_fn, text, _sc) in enumerate(docs, 1)]
    evidence_txt = "\n\n".join(numbered)

    # Step 3: ask the model to revise the draft and cite evidence numbers
    system_prompt = (
        "You are an expert environmental-policy assistant. "
        "Take the DRAFT answer the user already wrote, KEEP its structure, "
        "but augment it with precise facts drawn from the EVIDENCE below. "
        "Cite the evidence numbers (e.g. [1]) at relevant places. "
        "If draft statements conflict with evidence, correct them."
    )
    user_prompt = (
        f"DRAFT ANSWER:\n{draft}\n\n"
        f"EVIDENCE:\n{evidence_txt}\n\n"
        "Please return the enhanced answer."
    )
    final_completion = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
    )
    return final_completion.choices[0].message.content, docs


In [None]:
# ================== 7. Example Run ==================
query = "What is a Clean Air Zone and how is it implemented in the UK?"

# Retrieve once per method and reuse the hits for generation and display
# (previously every retriever was executed twice).
docs_A = retrieve_A(query)
docs_B = retrieve_B(query)
docs_C = retrieve_C(query)

ans_A = gen_with_ctx(query, docs_A)
ans_B = gen_with_ctx(query, docs_B)
ans_C = gen_with_ctx(query, docs_C)
ans_D = gen_no_rag(query)

print("—— Experiment A (TF-IDF) ——\n", ans_A, "\n")
print("—— Experiment B (BM25) ——\n", ans_B, "\n")
print("—— Experiment C (SBERT+FAISS) ——\n", ans_C, "\n")
print("—— Experiment D (No-RAG) ——\n", ans_D)

# —— Experiment E (Hybrid-RAG) ——
ans_E, docs_E = gen_hybrid_rag(query, docs_A, docs_B, docs_C)
print("—— Experiment E (Hybrid-RAG) ——\n", ans_E)

# show_sources() is only defined in the following cell, so calling it here
# fails with NameError when this section is run on its own. Print the merged
# evidence inline in the same format instead.
print("\n===== Source Excerpts E =====")
for i, (fn, txt, sc) in enumerate(docs_E, 1):
    print(f"\n[{i}] {fn} | Score: {sc:.3f}\n{txt}\n")


In [None]:
# ================== 8. Display Source Excerpts ==================
def show_sources(docs, label):
    """Print every (filename, text, score) hit in *docs* under a *label* banner."""
    banner = f"\n===== Source Excerpts {label} ====="
    print(banner)
    position = 1
    for source, excerpt, score in docs:
        print(f"\n[{position}] {source} | Score: {score:.3f}\n{excerpt}\n")
        position += 1

# Print the retrieved excerpts for experiments A, B and C in turn
for _label, _hits in (("A", docs_A), ("B", docs_B), ("C", docs_C)):
    show_sources(_hits, _label)

# 2. Paragraph-level segmentation

In [None]:
# ============ 1. Set folder path =============
folder_path = "/content"

# Collect all PDF file names. os.listdir() returns entries in arbitrary order,
# so sort them to keep paragraph order (and therefore index positions and
# tie-breaking during retrieval) identical from run to run.
pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.pdf'))
print(f"Found {len(pdf_files)} PDF files in total")

In [None]:
# ============ 2. Text Cleaning + Paragraph Segmentation =============

def clean_line(s: str) -> str:
    """Drop hyphen-plus-newline breaks, then squeeze whitespace runs to one space."""
    without_breaks = re.sub(r'-\s*\n', '', s)
    return re.sub(r'\s+', ' ', without_breaks).strip()

def is_author_line(line: str) -> bool:
    """Return True for lines that look like an author list.

    A line qualifies when it contains at least two "Capitalised Capitalised"
    word pairs and none of the predicate words is/was/were/are/has/have.
    """
    pair_count = len(re.findall(r'[A-Z][a-z]+\s+[A-Z][a-z]+', line))
    if pair_count < 2:
        return False
    if re.search(r'\b(is|was|were|are|has|have)\b', line, re.I):
        return False
    return True

def is_metadata_line(line: str) -> bool:
    """Return True when *line* should be dropped as non-body text.

    Checks, in order: publisher/journal/affiliation keywords, front-matter
    headings, author-list lines (via is_author_line), and finally lines of
    eight or more words that contain none of a fixed list of verbs.
    """
    publisher_pattern = (
        r'(Elsevier|Springer|doi|ISSN|eISSN|Published|Available online|ScienceDirect|'
        r'Correspondence|Open Access|Author information|Received|Accepted|All rights reserved|'
        r'Journal|Volume|Issue|Editor|University|Department|Faculty|Copyright)'
    )
    front_matter_pattern = r'(ARTICLE INFO|Keywords|ABSTRACT|Article history|Resources Policy)'
    verb_pattern = r'\b(is|was|were|are|has|have|using|used|based|conducted|shows|analyze|explore|assess|estimate|report|evaluate|demonstrate)\b'

    if re.search(publisher_pattern, line, re.I) or re.search(front_matter_pattern, line, re.I):
        return True
    if is_author_line(line):
        return True

    # Lines with many words but no recognised verb
    word_count = len(line.split())
    if word_count < 8:
        return False
    return re.search(verb_pattern, line, re.I) is None

def merge_lines(lines):
    """Join consecutive lines into paragraphs.

    A paragraph is closed once its accumulated text ends in sentence-final
    punctuation (ASCII or full-width); otherwise the next line is appended
    with a single space.
    """
    paragraphs = []
    pending = ''
    for piece in lines:
        if pending and re.search(r'[.!?。！？]$', pending):
            # Previous text is a finished paragraph; start a new one
            paragraphs.append(pending)
            pending = piece
        elif pending:
            pending = f"{pending} {piece}"
        else:
            pending = piece
    if pending:
        paragraphs.append(pending)
    return paragraphs

# One entry per kept paragraph (para_texts) with its source filename (para_files).
para_texts, para_files = [], []

for file in pdf_files:
    # pdf_files holds bare names from os.listdir(folder_path); resolve them
    # against that folder so the loop does not depend on the current working
    # directory. An absolute name passes through os.path.join unchanged.
    pdf_path = os.path.join(folder_path, file)
    with open(pdf_path, 'rb') as f:
        rd, raw_lines = PyPDF2.PdfReader(f), []
        for pg in rd.pages:
            # extract_text() may yield nothing for a page; treat that as empty
            raw = pg.extract_text() or ''
            for ln in raw.split('\n'):
                ln = clean_line(ln)
                if ln and not is_metadata_line(ln):
                    raw_lines.append(ln)
        for para in merge_lines(raw_lines):
            if len(para.split()) >= 20:        # Filter very short paragraphs
                para_texts.append(para)
                para_files.append(file)

assert para_texts, " No valid paragraphs extracted"
print(" Cleaning complete, number of paragraphs:", len(para_texts))


In [None]:
# ================== 3. Build Index ==================

# --- A. TF-IDF + Cosine ---
# With a single paragraph, max_df=0.95 resolves to fewer than one document and
# scikit-learn rejects it ("max_df corresponds to < documents than min_df"),
# so disable the upper document-frequency cut-off in that case.
tfidf_max_df = 0.95 if len(para_texts) > 1 else 1.0
vectorizer = TfidfVectorizer(stop_words='english', max_df=tfidf_max_df)
tfidf_mat  = vectorizer.fit_transform(para_texts)
print(" A: TF-IDF index is ready")

# --- B. BM25 ---
# Lower-cased whitespace tokens; queries are tokenised the same way in retrieve_B
bm25 = BM25Okapi([p.lower().split() for p in para_texts])
print(" B: BM25 index is ready")

# --- C. SBERT + FAISS ---
# Embeddings are L2-normalised, so the inner-product index ranks by cosine similarity
sbert = SentenceTransformer('all-MiniLM-L6-v2')
embs  = sbert.encode(para_texts, normalize_embeddings=True, show_progress_bar=False)
index = faiss.IndexFlatIP(embs.shape[1])
index.add(embs.astype('float32'))
print("C: SBERT embeddings + FAISS index is ready")


In [None]:
# ================== 4. Retrieval Functions ==================
def retrieve_A(q, k=3):
    """Return the top-*k* paragraphs for *q* by TF-IDF cosine similarity.

    Each hit is a (filename, text, score) tuple, highest score first.
    """
    query_vec = vectorizer.transform([q])
    sims = cosine_similarity(query_vec, tfidf_mat).flatten()
    ranked = sims.argsort()[::-1]
    hits = []
    for i in ranked[:k]:
        hits.append((para_files[i], para_texts[i], float(sims[i])))
    return hits

def retrieve_B(q, k=3):
    """Return the top-*k* paragraphs for *q* by BM25 score.

    The query is lower-cased and whitespace-split before scoring. Each hit is
    a (filename, text, score) tuple, highest score first.
    """
    query_tokens = q.lower().split()
    scores = bm25.get_scores(query_tokens)
    ranked = np.argsort(scores)[::-1]
    hits = []
    for i in ranked[:k]:
        hits.append((para_files[i], para_texts[i], float(scores[i])))
    return hits

def retrieve_C(q, k=3):
    """Return up to *k* paragraphs for *q* by SBERT embedding similarity.

    Each hit is a (filename, text, score) tuple in the order FAISS returns
    them. Fewer than *k* hits are returned when the index holds fewer vectors.
    """
    # FAISS pads missing neighbours with label -1 when k exceeds the index
    # size; indexing para_files[-1] would then silently return the last
    # paragraph with a sentinel score. Clamp k and drop any -1 labels.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    q_emb = sbert.encode([q], normalize_embeddings=True)
    sims, idx = index.search(q_emb.astype('float32'), k)
    return [(para_files[i], para_texts[i], float(sims[0][j]))
            for j, i in enumerate(idx[0]) if i >= 0]


## 2.1 GPT-3.5

In [None]:
# ================== 5. GPT Generation ==================
# Read the key from the environment so it is never saved inside the notebook;
# the empty-string fallback keeps the previous placeholder behaviour.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

def gen_with_ctx(query, docs, max_tokens=12000):
    """Answer *query* with gpt-3.5-turbo, using retrieved texts as context.

    docs: iterable of 3-tuples; only the second element (the text) is used.
    max_tokens: context budget. It is converted to a character budget by
        multiplying by 4 and is not sent to the API.
    Returns the content of the first completion choice.
    """
    remaining = max_tokens * 4
    pieces = []
    for _fn, text, _score in docs:
        if remaining <= 0:
            break
        # Truncate the last paragraph so the total never exceeds the budget
        piece = text[:remaining]
        pieces.append(piece)
        remaining -= len(piece)

    context = "\n\n".join(pieces)

    system_prompt = (
        "You are an expert assistant in environmental policy research. "
        "When answering questions, do not refer to specific papers using phrases like 'this study' or 'the paper'. "
        "Instead, synthesize the content in an abstract, generalized manner, describing methods and findings without attributing them to individual sources."
    )
    user_prompt = (
        "The following are excerpts from multiple environmental policy documents:\n\n"
        f"{context}\n\n"
        "Based on the information above, answer the following question in clear and concise academic English:\n\n"
        f"{query}"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
    )
    return completion.choices[0].message.content


def gen_no_rag(query):
    """Answer *query* with gpt-3.5-turbo alone, without retrieved context."""
    messages = [{"role": "user", "content": query}]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
# ========= 6. Hybrid-RAG Construction =========
def merge_docs(*doc_lists, top_k=6):
    """Merge several retrieval result lists and keep the best *top_k* entries.

    doc_lists: any number of iterables of (filename, text, score) tuples.
    Entries with the same filename and identical text are deduplicated,
    keeping the highest score. Scores are compared as given, with no
    normalisation across lists.
    Returns (filename, text, score) tuples, highest score first.
    """
    best = {}
    for docs in doc_lists:
        for fn, txt, sc in docs:
            key = (fn, txt)
            # Compare against the stored score instead of a -1 default, which
            # would silently raise any score below -1 to -1.
            if key not in best or sc > best[key]:
                best[key] = sc
    merged = sorted(((fn, txt, sc) for (fn, txt), sc in best.items()),
                    key=lambda item: item[2], reverse=True)
    return merged[:top_k]

def gen_hybrid_rag(query, *doc_lists):
    """Produce a Hybrid-RAG answer: a No-RAG draft revised against evidence.

    doc_lists: retrieval result lists, forwarded to merge_docs.
    Returns (answer_text, merged_docs).
    """
    # Step 1: unaided draft
    draft = gen_no_rag(query)

    # Step 2: merged, numbered evidence excerpts
    docs = merge_docs(*doc_lists)
    numbered = [f"[{n}] {text}" for n, (_fn, text, _sc) in enumerate(docs, 1)]
    evidence_txt = "\n\n".join(numbered)

    # Step 3: ask the model to revise the draft and cite evidence numbers
    system_prompt = (
        "You are an expert environmental-policy assistant. "
        "Take the DRAFT answer the user already wrote, KEEP its structure, "
        "but augment it with precise facts drawn from the EVIDENCE below. "
        "Cite the evidence numbers (e.g. [1]) at relevant places. "
        "If draft statements conflict with evidence, correct them."
    )
    user_prompt = (
        f"DRAFT ANSWER:\n{draft}\n\n"
        f"EVIDENCE:\n{evidence_txt}\n\n"
        "Please return the enhanced answer."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
    )
    return completion.choices[0].message.content, docs


In [None]:
# ================== 7. Example Run ==================
query = "What monitoring techniques are suitable for measuring PM2.5?"

# Retrieve once per method and reuse the hits for generation and display
# (previously every retriever was executed twice).
docs_A = retrieve_A(query)
docs_B = retrieve_B(query)
docs_C = retrieve_C(query)

ans_A = gen_with_ctx(query, docs_A)
ans_B = gen_with_ctx(query, docs_B)
ans_C = gen_with_ctx(query, docs_C)
ans_D = gen_no_rag(query)

print("—— Experiment A (TF-IDF) ——\n", ans_A, "\n")
print("—— Experiment B (BM25) ——\n", ans_B, "\n")
print("—— Experiment C (SBERT+FAISS) ——\n", ans_C, "\n")
print("—— Experiment D (No-RAG) ——\n", ans_D)

# —— Experiment E (Hybrid-RAG) ——
ans_E, docs_E = gen_hybrid_rag(query, docs_A, docs_B, docs_C)
print("—— Experiment E (Hybrid-RAG) ——\n", ans_E)

# show_sources() is only defined in the following cell, so calling it here
# fails with NameError when this section is run on its own. Print the merged
# evidence inline in the same format instead.
print("\n===== Source Excerpts E =====")
for i, (fn, txt, sc) in enumerate(docs_E, 1):
    print(f"\n[{i}] {fn} | Score: {sc:.3f}\n{txt}\n")


In [None]:
# ================== 8. Display Source Excerpts ==================
def show_sources(docs, label):
    """Print every (filename, text, score) hit in *docs* under a *label* banner."""
    print(f"\n===== Source Excerpts {label} =====")
    rank = 0
    for source, excerpt, score in docs:
        rank += 1
        print(f"\n[{rank}] {source} | Score: {score:.3f}\n{excerpt}\n")

# Print the retrieved excerpts for experiments A, B and C in turn
for _label, _hits in (("A", docs_A), ("B", docs_B), ("C", docs_C)):
    show_sources(_hits, _label)


## 2.2 DeepSeek-CHAT

In [None]:
# ================== 5. DeepSeek Generation ==================
# Read the key from the environment so it is never saved inside the notebook;
# the empty-string fallback keeps the previous placeholder behaviour.
client = OpenAI(api_key=os.environ.get("DEEPSEEK_API_KEY", ""),
                base_url="https://api.deepseek.com")

def gen_with_ctx(query, docs, max_tokens=12000):
    """Answer *query* with deepseek-chat, using retrieved texts as context.

    docs: iterable of 3-tuples; only the second element (the text) is used.
    max_tokens: context budget. It is converted to a character budget by
        multiplying by 4 and is not sent to the API.
    Returns the content of the first completion choice.
    """
    remaining = max_tokens * 4
    pieces = []
    for _fn, text, _score in docs:
        if remaining <= 0:
            break
        # Truncate the last paragraph so the total never exceeds the budget
        piece = text[:remaining]
        pieces.append(piece)
        remaining -= len(piece)

    context = "\n\n".join(pieces)

    system_prompt = (
        "You are an expert assistant in environmental policy research. "
        "When answering questions, do not refer to specific papers using phrases like 'this study' or 'the paper'. "
        "Instead, synthesize the content in an abstract, generalized manner, describing methods and findings without attributing them to individual sources."
    )
    user_prompt = (
        "The following are excerpts from multiple environmental policy documents:\n\n"
        f"{context}\n\n"
        "Based on the information above, answer the following question in clear and concise academic English:\n\n"
        f"{query}"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        temperature=0,
    )
    return completion.choices[0].message.content


def gen_no_rag(query):
    """Answer *query* with deepseek-chat alone, without retrieved context."""
    messages = [{"role": "user", "content": query}]
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
# ========= 6. Hybrid-RAG Construction =========
def merge_docs(*doc_lists, top_k=6, max_chars=1200):
    """Merge several retrieval result lists and keep the best *top_k* entries.

    doc_lists: any number of iterables of (filename, text, score) tuples.
    Entries sharing a filename and the same first 256 characters are treated
    as duplicates; the highest score among them is kept. Scores are compared
    as given, with no normalisation across lists.
    Returns (filename, text[:max_chars], score) tuples, highest score first.
    """
    best = {}
    for docs in doc_lists:
        for fn, txt, sc in docs:
            key = (fn, txt[:256])
            # Store the full text next to the score: rebuilding the text from
            # the key would cap every excerpt at the 256-character prefix and
            # make max_chars ineffective. Comparing against the stored score
            # (rather than a -1 default) also preserves scores below -1.
            if key not in best or sc > best[key][1]:
                best[key] = (txt, sc)

    merged = sorted(
        ((fn, txt[:max_chars], sc) for (fn, _prefix), (txt, sc) in best.items()),
        key=lambda item: item[2],
        reverse=True,
    )
    return merged[:top_k]


def gen_hybrid_rag(query, *doc_lists):
    """Produce a Hybrid-RAG answer: a No-RAG draft revised against evidence.

    doc_lists: retrieval result lists, forwarded to merge_docs.
    Returns (answer_text, merged_docs).
    """
    # Step 1: unaided draft
    draft = gen_no_rag(query)

    # Step 2: merged, numbered evidence excerpts
    docs = merge_docs(*doc_lists)
    numbered = [f"[{n}] {text}" for n, (_fn, text, _sc) in enumerate(docs, 1)]
    evidence_txt = "\n\n".join(numbered)

    # Step 3: ask the model to revise the draft and cite evidence numbers
    system_prompt = (
        "You are an expert environmental-policy assistant. "
        "Take the DRAFT answer the user already wrote, KEEP its structure, "
        "but augment it with precise facts drawn from the EVIDENCE below. "
        "Cite the evidence numbers (e.g. [1]) at relevant places. "
        "If draft statements conflict with evidence, correct them."
    )
    user_prompt = (
        f"DRAFT ANSWER:\n{draft}\n\n"
        f"EVIDENCE:\n{evidence_txt}\n\n"
        "Please return the enhanced answer."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        temperature=0,
    )
    return completion.choices[0].message.content, docs


In [None]:
# ================== 7. Example Run ==================
query = "What monitoring techniques are suitable for measuring PM2.5?"

# Retrieve once per method and reuse the hits for generation and display
# (previously every retriever was executed twice).
docs_A = retrieve_A(query)
docs_B = retrieve_B(query)
docs_C = retrieve_C(query)

ans_A = gen_with_ctx(query, docs_A)
ans_B = gen_with_ctx(query, docs_B)
ans_C = gen_with_ctx(query, docs_C)
ans_D = gen_no_rag(query)

print("—— Experiment A (TF-IDF) ——\n", ans_A, "\n")
print("—— Experiment B (BM25) ——\n", ans_B, "\n")
print("—— Experiment C (SBERT+FAISS) ——\n", ans_C, "\n")
print("—— Experiment D (No-RAG) ——\n", ans_D)

# —— Experiment E (Hybrid-RAG) ——
ans_E, docs_E = gen_hybrid_rag(query, docs_A, docs_B, docs_C)
print("—— Experiment E (Hybrid-RAG) ——\n", ans_E)

# show_sources() is only defined in the following cell, so calling it here
# fails with NameError when this section is run on its own. Print the merged
# evidence inline in the same format instead.
print("\n===== Source Excerpts E =====")
for i, (fn, txt, sc) in enumerate(docs_E, 1):
    print(f"\n[{i}] {fn} | Score: {sc:.3f}\n{txt}\n")


In [None]:
# ================== 8. Display Source Excerpts ==================
def show_sources(docs, label):
    """Print every (filename, text, score) hit in *docs* under a *label* banner."""
    print(f"\n===== Source Excerpts {label} =====")
    for rank, hit in enumerate(docs, start=1):
        source, excerpt, score = hit
        print(f"\n[{rank}] {source} | Score: {score:.3f}\n{excerpt}\n")

# Print the retrieved excerpts for experiments A, B and C in turn
for _label, _hits in (("A", docs_A), ("B", docs_B), ("C", docs_C)):
    show_sources(_hits, _label)

## 2.3 LLaMA-3-8b

In [None]:
# ================== 5. LLaMA Generation ==================

# Read the key from the environment so it is never saved inside the notebook;
# the empty-string fallback keeps the previous placeholder behaviour.
client = openai.OpenAI(
    api_key=os.environ.get("OPENROUTER_API_KEY", ""),
    base_url="https://openrouter.ai/api/v1"
)

def gen_with_ctx(query, docs, max_tokens=12000):
    """Answer ``query`` from retrieved excerpts using the chat model.

    Excerpt texts are taken in the order given until a character budget of
    ``max_tokens * 4`` is used up; the excerpt that crosses the budget is cut
    short and any later ones are dropped. The "\\n\\n" separators between
    excerpts are not counted against the budget.

    NOTE(review): the default budget may be larger than the selected model's
    context window — confirm against the provider's limits.

    Args:
        query: The user's question.
        docs: Iterable of ``(filename, text, score)`` triples; only the text
            is used here.
        max_tokens: Context budget, converted to characters at 4 per token.

    Returns:
        The content of the first choice in the model's response.
    """
    remaining = max_tokens * 4
    pieces = []
    for _, text, _ in docs:
        if remaining <= 0:
            break
        piece = text[:remaining]
        pieces.append(piece)
        remaining -= len(piece)

    ctx_joined = "\n\n".join(pieces)

    system_prompt = (
        "You are an expert assistant in environmental policy research. "
        "When answering questions, do not refer to specific papers using phrases like 'this study' or 'the paper'. "
        "Instead, synthesize the content in an abstract, generalized manner, describing methods and findings without attributing them to individual sources."
    )

    user_prompt = (
        f"The following are excerpts from multiple environmental policy documents:\n\n"
        f"{ctx_joined}\n\n"
        f"Based on the information above, answer the following question in clear and concise academic English:\n\n{query}"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="meta-llama/llama-3-8b-instruct",
        messages=messages,
        temperature=0.2,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def gen_no_rag(query):
    """Answer ``query`` with the chat model alone, without retrieved context.

    The question is sent as a single user message with no system prompt.

    Args:
        query: The user's question.

    Returns:
        The content of the first choice in the model's response.
    """
    messages = [{"role": "user", "content": query}]
    completion = client.chat.completions.create(
        model="meta-llama/llama-3-8b-instruct",
        messages=messages,
        temperature=0.2,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
# ========= 6. Hybrid-RAG Construction =========

def merge_docs(*doc_lists, top_k=6, max_chars=1200):
    """Merge retrieval results from several sources into one ranked list.

    Entries are treated as duplicates when they share the same filename and
    the same first 256 characters of text. For each group of duplicates the
    highest score is kept together with the text it came from (the first
    occurrence wins on ties). The survivors are sorted by score, highest
    first, and each text is truncated to ``max_chars``.

    NOTE(review): scores are compared as-is across all input lists; if the
    lists come from retrievers with different score scales, the ranking will
    favour the larger scale — confirm whether normalisation is wanted.

    Args:
        *doc_lists: Any number of iterables of ``(filename, text, score)``.
        top_k: Maximum number of merged entries to return.
        max_chars: Maximum number of characters kept from each text.

    Returns:
        A list of at most ``top_k`` ``(filename, text, score)`` tuples.
    """
    best = {}
    for docs in doc_lists:
        for fn, txt, sc in docs:
            # Only a prefix is used for identity so near-identical chunks
            # collapse; the full text is stored as the value so truncation
            # to max_chars happens on the real text, not on the 256-char key.
            key = (fn, txt[:256])
            prev = best.get(key)
            # Compare against the stored score instead of a numeric
            # sentinel, so scores of any magnitude or sign are preserved.
            if prev is None or sc > prev[1]:
                best[key] = (txt, sc)

    merged = sorted(
        ((fn, txt[:max_chars], sc) for (fn, _), (txt, sc) in best.items()),
        key=lambda item: item[2],
        reverse=True,
    )
    return merged[:top_k]


def gen_hybrid_rag(query, *doc_lists):
    """Draft an answer without retrieval, then revise it against evidence.

    The chat model is called twice: first with the bare ``query`` to obtain
    a draft, then with that draft plus numbered evidence paragraphs and an
    instruction to keep the draft's structure, cite evidence numbers and
    correct statements that conflict with the evidence.

    Args:
        query: The user's question.
        *doc_lists: Retrieval result lists of ``(filename, text, score)``
            triples, forwarded unchanged to ``merge_docs``.

    Returns:
        An ``(enhanced_answer, docs)`` tuple, where ``docs`` is the merged
        evidence list; its 1-based positions match the ``[n]`` labels used
        in the prompt.
    """
    model_name = "meta-llama/llama-3-8b-instruct"  # or llama-3-70b-instruct

    # Step 1: draft produced from the question alone.
    draft_completion = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": query}],
        temperature=0.2,
    )
    draft = draft_completion.choices[0].message.content

    # Step 2: merge the retrieval lists and number the evidence paragraphs.
    docs = merge_docs(*doc_lists)
    numbered = []
    for i, (_, d, _) in enumerate(docs, 1):
        numbered.append(f"[{i}] {d}")
    evidence_txt = "\n\n".join(numbered)

    # Step 3: ask the model to revise the draft using the evidence.
    system_prompt = (
        "You are an expert environmental-policy assistant. "
        "Take the DRAFT answer the user already wrote, KEEP its structure, "
        "but augment it with precise facts drawn from the EVIDENCE below. "
        "Cite the evidence numbers (e.g. [1]) at relevant places. "
        "If draft statements conflict with evidence, correct them."
    )
    user_prompt = (
        f"DRAFT ANSWER:\n{draft}\n\n"
        f"EVIDENCE:\n{evidence_txt}\n\n"
        f"Please return the enhanced answer."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    revised = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0.2,
    )

    return revised.choices[0].message.content, docs


In [None]:
# ================== 7. Example Run ==================
query = "What is a Clean Air Zone and how is it implemented in the UK?"

# Run each retriever exactly once and hand the same hits to the generator:
# the excerpts kept in docs_X are then the ones the answer was built from,
# and no retrieval work is repeated.
docs_A = retrieve_A(query)
ans_A = gen_with_ctx(query, docs_A)
docs_B = retrieve_B(query)
ans_B = gen_with_ctx(query, docs_B)
docs_C = retrieve_C(query)
ans_C = gen_with_ctx(query, docs_C)
# Baseline: the model answers the bare question without retrieved context.
ans_D = gen_no_rag(query)

print("—— Experiment A (TF-IDF) ——\n", ans_A, "\n")
print("—— Experiment B (BM25) ——\n", ans_B, "\n")
print("—— Experiment C (SBERT+FAISS) ——\n", ans_C, "\n")
print("—— Experiment D (No-RAG) ——\n", ans_D)

# —— Experiment E (Hybrid-RAG) ——
# All three hit lists are passed together; the call returns the answer and
# the evidence list that was actually shown to the model.
ans_E, docs_E = gen_hybrid_rag(query, docs_A, docs_B, docs_C)
print("—— Experiment E (Hybrid-RAG) ——\n", ans_E)

show_sources(docs_E, "E")


In [None]:
# ================== 8. Display Source Excerpts ==================
def show_sources(docs, label):
    """Print a banner for ``label`` followed by a numbered list of excerpts.

    Args:
        docs: Iterable of ``(filename, text, score)`` triples. The score is
            rendered with three decimal places.
        label: Tag inserted into the banner line.
    """
    print(f"\n===== Source Excerpts {label} =====")
    for i, entry in enumerate(docs, start=1):
        # Each entry must unpack into exactly three fields.
        fn, txt, sc = entry
        print(f"\n[{i}] {fn} | Score: {sc:.3f}\n{txt}\n")

# List the excerpts behind experiments A, B and C, in that order.
show_sources(docs=docs_A, label="A")
show_sources(docs=docs_B, label="B")
show_sources(docs=docs_C, label="C")
