In [4]:
# =========================
# IR2025 ‚Äî Phase 1 Baseline (ElasticSearch + BM25) ‚Äî ALL IN ONE
# Includes:
#  - Preprocess documents + queries
#  - Create index (BM25)
#  - BULK indexing
#  - Run files for k=20/30/50 (trec format)
#  - Optional trec_eval
# =========================

# !pip -q install elasticsearch tqdm pandas

from pathlib import Path
import pandas as pd
import re
import os
import subprocess
from tqdm import tqdm
from elasticsearch import Elasticsearch, helpers
from elasticsearch.exceptions import NotFoundError

Œë1: Œ†œÅŒøŒµœÄŒµŒæŒµœÅŒ≥Œ±œÉŒØŒ± Œ¥ŒµŒ¥ŒøŒºŒ≠ŒΩœâŒΩ (documents + queries)

In [5]:
# -------------------------
# 1) Paths
# -------------------------
DATA_DIR = Path(input("ŒîœéœÉŒµ path œÑŒøœÖ œÜŒ±Œ∫Œ≠ŒªŒøœÖ IR2025 (documents.csv, queries.csv, qrels.txt): ").strip().strip('"').strip("'"))
OUT_DIR  = Path(input("ŒîœéœÉŒµ path œÜŒ±Œ∫Œ≠ŒªŒøœÖ output: ").strip().strip('"').strip("'"))
OUT_DIR.mkdir(parents=True, exist_ok=True)

DOCS_CSV    = DATA_DIR / "documents.csv"
QUERIES_CSV = DATA_DIR / "queries.csv"
QRELS_TXT   = DATA_DIR / "qrels.txt"

print("\n[INFO] DATA_DIR:", DATA_DIR.resolve())
print("[INFO] OUT_DIR :", OUT_DIR.resolve())

# -------------------------
# TEXT PREPROCESS
# -------------------------
def clean_text(s):
    s = "" if pd.isna(s) else str(s)
    s = re.sub(r"[\r\n\t]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# -------------------------
# Load + PREPROCESS documents
# -------------------------
df_docs = pd.read_csv(DOCS_CSV, encoding="utf-8")
df_docs = df_docs.rename(columns={"ID": "doc_id", "Text": "text"})
assert "doc_id" in df_docs.columns and "text" in df_docs.columns, f"Docs columns: {df_docs.columns}"

df_docs["doc_id"] = df_docs["doc_id"].astype(str).str.strip()
df_docs["text"]   = df_docs["text"].map(clean_text)

print("\n[DOCS] Sample after preprocess:")
display(df_docs.head(3))
print("[DOCS] Count:", len(df_docs))

# -------------------------
# Load + PREPROCESS queries
# -------------------------
df_q = pd.read_csv(QUERIES_CSV, encoding="utf-8-sig")
df_q = df_q.rename(columns={"ID": "qid", "Text": "query"})
assert "qid" in df_q.columns and "query" in df_q.columns, f"Queries columns: {df_q.columns}"

df_q["qid"]   = df_q["qid"].astype(str).str.strip()
df_q["query"] = df_q["query"].map(clean_text)

print("\n[QUERIES] Sample after preprocess:")
display(df_q.head(5))
print("[QUERIES] Count:", len(df_q))



[INFO] DATA_DIR: C:\Users\ArisK\Desktop\IR20252026
[INFO] OUT_DIR : C:\Users\ArisK\Desktop\dawdawdawd

[DOCS] Sample after preprocess:


Unnamed: 0,doc_id,text
0,193157,Support towards the Europe PMC initiative-Cont...
1,193158,Support to the Vice-Presidents of the ERC Scie...
2,193159,Implementation of activities described in the ...


[DOCS] Count: 18316

[QUERIES] Sample after preprocess:


Unnamed: 0,qid,query
0,Q01,EUTRAVEL Optimodal European Travel Ecosystem E...
1,Q02,Track And Know Big Data for Mobility Tracking ...
2,Q03,"SELIS, Towards a Shared European Logistics Int..."
3,Q04,TYPHON Polyglot and Hybrid Persistence Archite...
4,Q05,CHARIOT Cognitive Heterogeneous Architecture f...


[QUERIES] Count: 10


Œë2: ŒîŒ∑ŒºŒπŒøœÖœÅŒ≥ŒØŒ± ŒµœÖœÅŒµœÑŒ∑œÅŒØŒøœÖ ElasticSearch (BM25) + Bulk indexing

In [6]:
# -------------------------
# Connect ES (HTTP default)
# -------------------------
ES_URL = os.getenv("ES_URL", "http://localhost:9200")
es = Elasticsearch(ES_URL)

info = es.info()
print("\n[OK] Elasticsearch:", info.get("cluster_name", "(unknown)"), "|", info.get("version", {}).get("number", ""))

# -------------------------
# Create index (BM25)
# -------------------------
INDEX_NAME = os.getenv("INDEX_NAME", "ir2025_phase1_bm25")
BM25_K1 = float(os.getenv("BM25_K1", "1.2"))
BM25_B  = float(os.getenv("BM25_B",  "0.75"))

index_body = {
    "settings": {
        "analysis": {
            "analyzer": {
                "english_analyzer": {
                    "type": "standard",
                    "stopwords": "_english_"
                }
            }
        },
        "similarity": {
            "my_bm25": {"type": "BM25", "k1": BM25_K1, "b": BM25_B}
        }
    },
    "mappings": {
        "properties": {
            "doc_id": {"type": "keyword"},
            "text":   {"type": "text", "analyzer": "english_analyzer", "similarity": "my_bm25"}
        }
    }
}

try:
    es.indices.delete(index=INDEX_NAME)
    print(f"\n[i] Deleted existing index: {INDEX_NAME}")
except NotFoundError:
    pass

es.indices.create(index=INDEX_NAME, body=index_body)
print(f"[OK] Created index '{INDEX_NAME}' with BM25(k1={BM25_K1}, b={BM25_B})")

# -------------------------
# BULK INDEXING
# -------------------------
def doc_actions(df, index_name):
    for _, r in df.iterrows():
        yield {
            "_op_type": "index",
            "_index": index_name,
            "_id": r["doc_id"],
            "_source": {"doc_id": r["doc_id"], "text": r["text"]}
        }

print("\n[BULK] Indexing documents...")
helpers.bulk(es, doc_actions(df_docs, INDEX_NAME), chunk_size=1000, request_timeout=120)
es.indices.refresh(index=INDEX_NAME)

count = es.count(index=INDEX_NAME)["count"]
print(f"[OK] Indexed docs in ES: {count}")



[OK] Elasticsearch: elasticsearch | 9.2.1

[i] Deleted existing index: ir2025_phase1_bm25
[OK] Created index 'ir2025_phase1_bm25' with BM25(k1=1.2, b=0.75)

[BULK] Indexing documents...


  helpers.bulk(es, doc_actions(df_docs, INDEX_NAME), chunk_size=1000, request_timeout=120)


[OK] Indexed docs in ES: 18316


Œë3: ŒïŒ∫œÑŒ≠ŒªŒµœÉŒ∑ queries + Œ¥Œ∑ŒºŒπŒøœÖœÅŒ≥ŒØŒ± run files (k=20/30/50)

In [7]:
# -------------------------
# Search + write run files (k=20,30,50)
# -------------------------
def search_topk(query_text: str, k: int):
    res = es.search(
        index=INDEX_NAME,
        size=k,
        query={"match": {"text": query_text}}
    )
    return [(h["_id"], h["_score"]) for h in res["hits"]["hits"]]

def write_run(k: int) -> Path:
    run_path = OUT_DIR / f"run_bm25_k{k}.txt"
    run_tag  = f"bm25_k{k}"
    lines = 0

    with open(run_path, "w", encoding="utf-8") as f:
        for _, row in tqdm(df_q.iterrows(), total=len(df_q), desc=f"Queries k={k}"):
            qid = row["qid"]
            qtext = row["query"]

            hits = search_topk(qtext, k)
            for rank, (docid, score) in enumerate(hits, start=1):
                f.write(f"{qid} Q0 {docid} {rank} {score:.6f} {run_tag}\n")
                lines += 1

    print(f"[OK] Wrote {run_path.name} ({lines} lines)")
    return run_path

run20 = write_run(20)
run30 = write_run(30)
run50 = write_run(50)


Queries k=20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 13.06it/s]


[OK] Wrote run_bm25_k20.txt (200 lines)


Queries k=30: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 34.77it/s]


[OK] Wrote run_bm25_k30.txt (300 lines)


Queries k=50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 37.98it/s]

[OK] Wrote run_bm25_k50.txt (500 lines)





Œë4: ŒëŒæŒπŒøŒªœåŒ≥Œ∑œÉŒ∑ ŒºŒµ trec_eval + ŒµŒæŒ±Œ≥œâŒ≥ŒÆ Œ±œÄŒøœÑŒµŒªŒµœÉŒºŒ¨œÑœâŒΩ œÉŒµ CSV

In [None]:
import subprocess
import pandas as pd
import re
from pathlib import Path

pd.options.display.float_format = '{:.5f}'.format

treceval_dir_input = input("ŒîœéœÉŒµ œÑŒø path œÑŒøœÖ trec_eval.exe: ").strip().strip('"').strip("'")
TREC_EVAL_EXE = Path(treceval_dir_input)
if not TREC_EVAL_EXE.exists():
    raise FileNotFoundError(f"ŒîŒµŒΩ Œ≤œÅŒ≠Œ∏Œ∑Œ∫Œµ: {TREC_EVAL_EXE}")

RUN_FILES = [
    OUT_DIR / "run_bm25_k20.txt",
    OUT_DIR / "run_bm25_k30.txt",
    OUT_DIR / "run_bm25_k50.txt",
]

KS = [5, 10, 15, 20]

def run_trec_eval_for_k(trec_eval_exe, qrels, run_path, k):
    cmd = [str(trec_eval_exe), "-q", "-m", "map", "-m", f"P.{k}", str(qrels), str(run_path)]
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    return result.stdout

def parse_output_to_df(output):
    rows = []
    for line in output.strip().splitlines():
        parts = re.split(r"\s+", line.strip())
        if len(parts) != 3:
            continue
        metric, qid, val = parts

        m = re.search(r"^P[._]?(\d+)$", metric)
        if m:
            metric = f"P@{m.group(1)}"

        rows.append({"Query": qid.strip().replace("\ufeff", ""), "Metric": metric, "Value": float(val)})
    return pd.DataFrame(rows)

summary_rows = []

for run_path in RUN_FILES:
    print(f"\nŒïŒ∫œÑŒµŒªŒµŒØœÑŒ±Œπ Œ±ŒæŒπŒøŒªœåŒ≥Œ∑œÉŒ∑ Œ≥ŒπŒ±: {run_path.name}")
    combined_df = pd.DataFrame()

    for k in KS:
        out = run_trec_eval_for_k(TREC_EVAL_EXE, QRELS_TXT, run_path, k)
        df_k = parse_output_to_df(out)
        pivot_k = df_k.pivot(index="Query", columns="Metric", values="Value")
        combined_df = pd.concat([combined_df, pivot_k], axis=1)

    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

    ordered_cols = ["map"] + [f"P@{k}" for k in KS]
    final = combined_df.reindex(columns=ordered_cols)

    display(final)

    per_query_csv = OUT_DIR / f"{run_path.stem}_per_query.csv"
    final.to_csv(per_query_csv)
    print("üíæ Saved:", per_query_csv)

    if "all" in final.index:
        row = final.loc["all"].to_dict()
        row["run"] = run_path.name
        summary_rows.append(row)

summary_df = pd.DataFrame(summary_rows).set_index("run")
summary_df = summary_df.reindex(columns=["map", "P@5", "P@10", "P@15", "P@20"])

print("\n=== Œ£Œ•ŒìŒöŒïŒùŒ§Œ°Œ©Œ§ŒôŒöŒë (all) Œ≥ŒπŒ± œåŒªŒ± œÑŒ± runs ===")
display(summary_df)

summary_csv = OUT_DIR / "phase1_trec_eval_summary_all.csv"
summary_df.to_csv(summary_csv)
print("üíæ Saved:", summary_csv)

print("\nDONE ‚Äî per query + all summary")



ŒïŒ∫œÑŒµŒªŒµŒØœÑŒ±Œπ Œ±ŒæŒπŒøŒªœåŒ≥Œ∑œÉŒ∑ Œ≥ŒπŒ±: run_bm25_k20.txt


Metric,map,P@5,P@10,P@15,P@20
Query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q01,0.6228,0.8,0.8,0.7333,0.65
Q02,0.3261,0.6,0.3,0.2667,0.3
Q03,0.7097,0.8,0.7,0.6667,0.6
Q04,0.2975,0.6,0.4,0.2667,0.3
Q05,0.7066,1.0,0.8,0.7333,0.65
Q06,0.6926,1.0,1.0,0.8,0.7
Q07,0.6413,1.0,0.8,0.7333,0.6
Q08,0.4694,0.8,0.7,0.4667,0.45
Q09,0.6769,1.0,0.9,0.8667,0.75
Q10,0.3198,0.6,0.3,0.2667,0.25


üíæ Saved: C:\Users\ArisK\Desktop\dawdawdawd\run_bm25_k20_per_query.csv

ŒïŒ∫œÑŒµŒªŒµŒØœÑŒ±Œπ Œ±ŒæŒπŒøŒªœåŒ≥Œ∑œÉŒ∑ Œ≥ŒπŒ±: run_bm25_k30.txt


Metric,map,P@5,P@10,P@15,P@20
Query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q01,0.6969,0.8,0.8,0.7333,0.65
Q02,0.3261,0.6,0.3,0.2667,0.3
Q03,0.7407,0.8,0.7,0.6667,0.6
Q04,0.3184,0.6,0.4,0.2667,0.3
Q05,0.7378,1.0,0.8,0.7333,0.65
Q06,0.7651,1.0,1.0,0.8,0.7
Q07,0.7136,1.0,0.8,0.7333,0.6
Q08,0.5703,0.8,0.7,0.4667,0.45
Q09,0.7426,1.0,0.9,0.8667,0.75
Q10,0.3993,0.6,0.3,0.2667,0.25


üíæ Saved: C:\Users\ArisK\Desktop\dawdawdawd\run_bm25_k30_per_query.csv

ŒïŒ∫œÑŒµŒªŒµŒØœÑŒ±Œπ Œ±ŒæŒπŒøŒªœåŒ≥Œ∑œÉŒ∑ Œ≥ŒπŒ±: run_bm25_k50.txt


Metric,map,P@5,P@10,P@15,P@20
Query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q01,0.7291,0.8,0.8,0.7333,0.65
Q02,0.3827,0.6,0.3,0.2667,0.3
Q03,0.7407,0.8,0.7,0.6667,0.6
Q04,0.3488,0.6,0.4,0.2667,0.3
Q05,0.7904,1.0,0.8,0.7333,0.65
Q06,0.7849,1.0,1.0,0.8,0.7
Q07,0.7429,1.0,0.8,0.7333,0.6
Q08,0.59,0.8,0.7,0.4667,0.45
Q09,0.8487,1.0,0.9,0.8667,0.75
Q10,0.4266,0.6,0.3,0.2667,0.25


üíæ Saved: C:\Users\ArisK\Desktop\dawdawdawd\run_bm25_k50_per_query.csv

=== Œ£Œ•ŒìŒöŒïŒùŒ§Œ°Œ©Œ§ŒôŒöŒë (all) Œ≥ŒπŒ± œåŒªŒ± œÑŒ± runs ===


Unnamed: 0_level_0,map,P@5,P@10,P@15,P@20
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
run_bm25_k20.txt,0.5463,0.82,0.67,0.58,0.525
run_bm25_k30.txt,0.6011,0.82,0.67,0.58,0.525
run_bm25_k50.txt,0.6385,0.82,0.67,0.58,0.525


üíæ Saved: C:\Users\ArisK\Desktop\dawdawdawd\phase1_trec_eval_summary_all.csv

‚úÖ DONE ‚Äî per query + all summary
