In [None]:
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import re
import html
import unicodedata
import subprocess
import os
import nltk
from nltk.corpus import stopwords

# Make sure every NLTK resource this notebook relies on is installed.
# Each resource is probed on its own: the stopword corpus can be missing even
# when the punkt tokenizer is already present, and `stopwords.words(...)`
# would then raise LookupError further down.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# Client for the Elasticsearch node used for indexing and searching
# (plain HTTP on localhost, port 9200).
es = Elasticsearch("http://localhost:9200")

# Input locations, given as relative paths (resolved against the current
# working directory when the notebook runs).
file_docs = '../data/documents.csv'
file_queries = '../data/queries.csv'
qrels_file = "../data/qrels.txt"
# Path of the trec_eval binary invoked through subprocess.
# NOTE(review): the ".exe" suffix suggests a Windows build — adjust on other platforms.
trec_eval_path = "../../trec_eval/trec_eval.exe"

# Name of the Elasticsearch index that is (re)created and queried below.
index_name = "ir_phase1_index"

In [None]:
# Stopword vocabulary handed to the Elasticsearch "stop" token filter:
# NLTK's English stopwords plus extra terms treated as noise for this
# collection (presumably proposal/report boilerplate — confirm with the data).
nltk_stops = set(stopwords.words('english'))
safe_noise = {
    "project", "aim", "proposal", "consortium", "summary",
    "objective", "work", "package", "action", "activities",
    "main", "specific", "within", "during", "result", "presented",
    "paper", "study", "report", "deliverable", "task", "partners",
}
# Sorted so the list (and therefore the index-settings payload) is identical
# on every run; the iteration order of a set of strings is not stable across
# interpreter sessions.
full_stopwords_list = sorted(nltk_stops.union(safe_noise))

def analyze_and_clean_safe(text):
    """Normalise raw text into space-separated ASCII alphanumeric tokens.

    Steps, in order:
      1. decode HTML entities (``&amp;`` -> ``&``);
      2. Unicode compatibility decomposition (NFKD) followed by removal of
         combining marks, so accented letters keep their base letter
         ("é" -> "e") instead of being turned into a space that would split
         the word in two;
      3. replace every character that is not an ASCII letter, digit or
         whitespace with a space;
      4. collapse whitespace runs and trim both ends.

    Args:
        text: the raw value; anything that is not a ``str`` (e.g. NaN/None)
            is treated as missing.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = html.unescape(text)
    decomposed = unicodedata.normalize('NFKD', text)
    # Combining marks have a non-zero canonical combining class; dropping them
    # leaves the bare base letter behind.
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

def _load_and_clean(csv_path):
    """Read a two-column (ID, Text) CSV and add a ``clean_text`` column.

    The file is read with ``header=None`` and every value as ``str``. Rows
    lacking an ID or a text are discarded (a missing ID would otherwise slip
    through the header filter below and surface later as the id "nan"). A row
    whose ID is literally "id" (any casing) is taken to be a header line and
    dropped as well.

    Args:
        csv_path: path of the CSV file to load.

    Returns:
        A DataFrame with columns ``ID``, ``Text`` and ``clean_text``, the
        latter produced by ``analyze_and_clean_safe``.
    """
    frame = pd.read_csv(csv_path, header=None, names=['ID', 'Text'], dtype=str)
    frame = frame.dropna(subset=['ID', 'Text'])
    # .copy() so the column assignment below works on an independent frame
    # rather than on a filtered view of the raw one.
    frame = frame[frame['ID'].str.lower() != 'id'].copy()
    frame['clean_text'] = frame['Text'].apply(analyze_and_clean_safe)
    return frame


df_docs = _load_and_clean(file_docs)
df_queries = _load_and_clean(file_queries)

print(f"Έτοιμα δεδομένα: {len(df_docs)} docs, {len(df_queries)} queries.")

In [None]:
# Token filters referenced by the custom analyzer below.
token_filters = {
    # Removes every token found in the combined stopword list.
    "my_custom_stop": {"type": "stop", "stopwords": full_stopwords_list},
    # Drops tokens shorter than two characters.
    "my_length_filter": {"type": "length", "min": 2},
    # Equivalent-term groups; every entry lists interchangeable expressions.
    "my_synonyms": {
        "type": "synonym",
        "synonyms": [
            "uk, united kingdom",
            "eu, european union",
            "us, usa, united states",
            "ai, artificial intelligence",
            "sme, small medium enterprise, smes, startup, start up",
            "r d, research development, research and development",
            "h2020, horizon 2020",
            "fp7, framework programme 7",
            "res, renewable energy sources, renewables",
            "pv, photovoltaic, solar, solar energy",
            "wind, wind energy, wind power, wind farm",
            "co2, carbon dioxide, greenhouse gas, ghg",
            "ngos, ngo, non governmental organization",
            "cancer, tumor, oncology",
            "hiv, aids",
        ],
    },
    "my_stemmer": {"type": "stemmer", "language": "english"},
}

# Analysis chain: whitespace tokenisation, then lowercase -> synonyms ->
# length filter -> stopword removal -> stemming, in that order.
analyzer_chain = {
    "type": "custom",
    "tokenizer": "whitespace",
    "filter": [
        "lowercase",
        "my_synonyms",
        "my_length_filter",
        "my_custom_stop",
        "my_stemmer",
    ],
}

# BM25 parameters installed as the index's default similarity.
bm25_similarity = {"type": "BM25", "b": 0.9, "k1": 2.0}

# Complete index definition: single shard, no replicas, the analysis setup
# above, and one analysed "text" field.
settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "filter": token_filters,
            "analyzer": {"final_analyzer": analyzer_chain},
        },
        "index": {"similarity": {"default": bm25_similarity}},
    },
    "mappings": {
        "properties": {
            "text": {"type": "text", "analyzer": "final_analyzer"},
        },
    },
}

# Start from a clean slate: remove an index of the same name left over from
# an earlier run, then create it from the `settings` definition.
index_already_there = es.indices.exists(index=index_name)
if index_already_there:
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, body=settings)


def _bulk_actions():
    """Yield one bulk action per document, carrying its id and cleaned text."""
    for doc_id, cleaned in zip(df_docs['ID'], df_docs['clean_text']):
        yield {
            "_index": index_name,
            "_source": {"doc_id": doc_id, "text": cleaned},
        }


bulk(es, _bulk_actions())
# Refresh so the freshly indexed documents are visible to the searches below.
es.indices.refresh(index=index_name)

print("Indexing ολοκληρώθηκε.")

In [None]:
# Result-list cut-offs to evaluate; one run file is produced per value.
k_scenarios = [20, 30, 50]


def _build_query(query_text, size):
    """Build the search body for one query.

    The mandatory ``match`` clause selects candidates; the two optional
    ``match_phrase`` clauses add score to documents containing the query as
    an exact phrase (slop 0, boost 10) or as a near phrase (slop 2, boost 2).

    Args:
        query_text: the cleaned query string.
        size: number of hits to request.

    Returns:
        The request body as a dict.
    """
    return {
        "query": {
            "bool": {
                "must": {"match": {"text": query_text}},
                "should": [
                    {"match_phrase": {"text": {"query": query_text, "slop": 0, "boost": 10.0}}},
                    {"match_phrase": {"text": {"query": query_text, "slop": 2, "boost": 2.0}}},
                ],
            }
        },
        "size": size,
    }


for k in k_scenarios:
    output_file = f"../results/phase1_results_k{k}.txt"
    run_id = f"phase1_final_k{k}"

    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)

    # Explicit encoding so the run file does not depend on the platform default.
    with open(output_file, 'w', encoding='utf-8') as f:
        for _, row in df_queries.iterrows():
            # Queries that cleaned down to an empty string cannot be searched.
            if not row['clean_text']:
                continue

            search_result = es.search(index=index_name, body=_build_query(row['clean_text'], k))

            # One line per hit: query id, Q0, doc id, 1-based rank, score, run id.
            # NOTE(review): scores are written with 4 decimals; if the evaluator
            # orders by score, near-equal scores may become ties — confirm.
            for rank, hit in enumerate(search_result['hits']['hits'], start=1):
                f.write(f"{row['ID']}\tQ0\t{hit['_source']['doc_id']}\t{rank}\t{hit['_score']:.4f}\t{run_id}\n")

    print(f" -> Αποτελέσματα για k={k} αποθηκεύτηκαν.")

    print(f"Metrics για k={k}:")
    cmd = [trec_eval_path, "-m", "map", "-m", "P.5,10,15,20", "-m", "recall.5,10,15,20,50", qrels_file, output_file]
    try:
        res = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as e:
        # Raised when the trec_eval binary is missing or cannot be executed.
        print(f"Error: {e}")
    else:
        print(res.stdout)
        # Surface evaluator failures instead of silently printing nothing.
        if res.returncode != 0:
            print(f"Error: trec_eval exited with code {res.returncode}")
        if res.stderr:
            print(res.stderr)
