In [2]:
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import subprocess
import os
import re
import time


es = Elasticsearch("http://localhost:9200")

file_docs = '../data/documents.csv'
file_queries = '../data/queries.csv'
qrels_file = "../data/qrels.txt"
trec_eval_path = "../../trec_eval/trec_eval.exe"

param_grid = {
    "b": [0.5, 0.6, 0.75, 0.9, 1.0],      
    "k1": [1.2, 1.6, 2.0, 3.0]
}

k_targets = [20, 30, 50]

print(f"Status: {es.ping()}")

Status: True


In [3]:
def preprocess_text(text):
    if not isinstance(text, str): return ""
    
    text = text.replace("\n", " ").replace("•", " ")
    
    stop_phrases = [
        "aims to", 
        "is aimed at",
        "the aim of",
        "the project objectives will be realised by",
        "will research and demonstrate",
        "will research develop and exploit",
        "brings together research partners",
        "consortium is composed",
        "integrates multidisciplinary research teams",
        "proven track record",
        "industrial advisory board",
        "support the eu agenda",
        "real-world pilots",
        "market-relevant outcomes",
        "significant exploitation potential",
        "the main objective of this project", 
        "this project aims to", 
        "this proposal aims to", 
        "the proposed research",
        "will be carried out", 
        "state of the art", 
        "beyond the state of the art",
        "the overall objective",
        "will be implemented", 
        "will be demonstrated", 
        "will be validated",
        "proof of concept", 
        "feasibility study", 
        "clinical validation of a",
        "marie sklodowska-curie", "marie curie", 
        "european union", "horizon 2020", "h2020", "fp7", 
        "grant agreement", "work package", "consortium members", 
        "research and innovation", "career development"
    ]
    
    custom_stopwords = [
        "project", "proposal", "consortium", "partner", "deliverable", 
        "workpackage", "methodology" 
    ]
    
    text_lower = text.lower()
    
    for phrase in stop_phrases:
        if phrase in text_lower:
            text = re.sub(re.escape(phrase), " ", text, flags=re.IGNORECASE)
            
    for word in custom_stopwords:
        pattern = r'\b' + re.escape(word) + r'\b'
        text = re.sub(pattern, " ", text, flags=re.IGNORECASE)
            
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

print("Εφαρμογή βελτιωμένου καθαρισμού (Prefix & Stopword Removal)...")


df_docs = pd.read_csv(file_docs).dropna(subset=['Text'])
df_docs['Text'] = df_docs['Text'].astype(str).apply(preprocess_text)
if 'ID' in df_docs.columns: df_docs['ID'] = df_docs['ID'].astype(str)


df_queries = pd.read_csv(file_queries)
df_queries.columns = df_queries.columns.str.strip()
df_queries['Text'] = df_queries['Text'].astype(str).apply(preprocess_text)

print(f"Ολοκληρώθηκε! Τα κείμενα είναι τώρα απαλλαγμένα από boilerplate.")

Εφαρμογή βελτιωμένου καθαρισμού (Prefix & Stopword Removal)...
Ολοκληρώθηκε! Τα κείμενα είναι τώρα απαλλαγμένα από boilerplate.


In [4]:

best_results = {k: {"map": 0.0, "params": None} for k in k_targets}


grid_index_name = "ir2025_grid_search_temp"

total_combinations = len(param_grid['b']) * len(param_grid['k1'])
current_run = 0

for b in param_grid['b']:
    for k1 in param_grid['k1']:
        current_run += 1
        print(f"\n[{current_run}/{total_combinations}] Testing: b={b}, k1={k1}")
        
        if es.indices.exists(index=grid_index_name):
            es.indices.delete(index=grid_index_name)
            
        settings = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0,
                "index": {
                    "similarity": {
                        "custom_bm25": { 
                            "type": "BM25",
                            "b": b,
                            "k1": k1
                        }
                    }
                },
                "analysis": {
                    "analyzer": {
                        "research_analyzer": {
                            "type": "custom",
                            "tokenizer": "standard",
                            "filter": ["lowercase", "stop", "porter_stem"]
                        }
                    },
                    "filter": {
                        "stop": { "type": "stop", "stopwords": "_english_" },
                        "porter_stem": { "type": "stemmer", "language": "english" }
                    }
                }
            },
            "mappings": {
                "properties": {
                    "text": {
                        "type": "text",
                        "analyzer": "research_analyzer",
                        "similarity": "custom_bm25"
                    }
                }
            }
        }
        es.indices.create(index=grid_index_name, body=settings)
        
        def generate_docs():
            for _, row in df_docs.iterrows():
                yield {
                    "_index": grid_index_name,
                    "_source": {"doc_id": row['ID'], "text": row['Text']}
                }
        bulk(es, generate_docs())
        es.indices.refresh(index=grid_index_name)
        
        for k in k_targets:
            results_file = f"../results/grid_b{b}_k1{k1}_k{k}.txt"
            
            with open(results_file, 'w') as f:
                for _, row in df_queries.iterrows():
                    if not row['Text']: continue
                    
                    response = es.search(
                        index=grid_index_name,
                        body={"query": {"match": {"text": row['Text']}}},
                        size=k
                    )
                    
                    for rank, hit in enumerate(response['hits']['hits']):
                        doc_id = hit['_source']['doc_id']
                        score = hit['_score']
                        f.write(f"{row['ID']}\tQ0\t{doc_id}\t{rank+1}\t{score:.4f}\tgrid_run\n")
            
            try:
                cmd = [trec_eval_path, "-m", "map", qrels_file, results_file]
                res = subprocess.run(cmd, capture_output=True, text=True)
                
                map_score = float(res.stdout.split()[-1])
                
                if map_score > best_results[k]['map']:
                    best_results[k]['map'] = map_score
                    best_results[k]['params'] = (b, k1)
                    print(f"New Best for k={k}: MAP = {map_score:.4f}")
                    
                os.remove(results_file)
                
            except Exception as e:
                print(f"Error evaluating k={k}: {e}")

if es.indices.exists(index=grid_index_name):
    es.indices.delete(index=grid_index_name)


[1/20] Testing: b=0.5, k1=1.2


  response = es.search(


New Best for k=20: MAP = 0.6707
New Best for k=30: MAP = 0.7451
New Best for k=50: MAP = 0.7607

[2/20] Testing: b=0.5, k1=1.6
New Best for k=20: MAP = 0.6952
New Best for k=30: MAP = 0.7659
New Best for k=50: MAP = 0.7802

[3/20] Testing: b=0.5, k1=2.0
New Best for k=20: MAP = 0.6967
New Best for k=50: MAP = 0.7815

[4/20] Testing: b=0.5, k1=3.0

[5/20] Testing: b=0.6, k1=1.2

[6/20] Testing: b=0.6, k1=1.6
New Best for k=20: MAP = 0.7073
New Best for k=50: MAP = 0.7843

[7/20] Testing: b=0.6, k1=2.0
New Best for k=20: MAP = 0.7155
New Best for k=30: MAP = 0.7694
New Best for k=50: MAP = 0.7880

[8/20] Testing: b=0.6, k1=3.0
New Best for k=30: MAP = 0.7716
New Best for k=50: MAP = 0.7885

[9/20] Testing: b=0.75, k1=1.2

[10/20] Testing: b=0.75, k1=1.6
New Best for k=20: MAP = 0.7283
New Best for k=30: MAP = 0.7737
New Best for k=50: MAP = 0.7952

[11/20] Testing: b=0.75, k1=2.0
New Best for k=30: MAP = 0.7795

[12/20] Testing: b=0.75, k1=3.0
New Best for k=20: MAP = 0.7295
New Best for

In [5]:
print("\n" + "="*60)
print("ΤΕΛΙΚΑ ΑΠΟΤΕΛΕΣΜΑΤΑ GRID SEARCH (ME PREPROCESSING)")
print("="*60)

for k in k_targets:
    res = best_results[k]
    print(f"\nΓια k = {k}:")
    print(f"Best MAP: {res['map']:.4f}")
    print(f"Params:   b = {res['params'][0]}, k1 = {res['params'][1]}")
    


ΤΕΛΙΚΑ ΑΠΟΤΕΛΕΣΜΑΤΑ GRID SEARCH (ME PREPROCESSING)

Για k = 20:
Best MAP: 0.7373
Params:   b = 1.0, k1 = 2.0

Για k = 30:
Best MAP: 0.7851
Params:   b = 1.0, k1 = 2.0

Για k = 50:
Best MAP: 0.8039
Params:   b = 1.0, k1 = 2.0
