In [None]:
%pip install pandas elasticsearch

In [None]:
import os
import subprocess

import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk


# Shared client handle for the Elasticsearch node on localhost; the later
# cells issue all index and search requests through it.
es = Elasticsearch(hosts="http://localhost:9200")

# Print the cluster metadata returned by the node.
print(es.info())

In [None]:
file_path = '../data/documents.csv'

# Load the document collection. Nothing below can run without it, so report
# the missing file and re-raise instead of carrying on with `df` undefined.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Πρόσοχή: Το αρχείο δεν βρέθηκε στη διαδρομή: {file_path}")
    raise
else:
    print("Το αρχείο φορτώθηκε επιτυχώς!")

# Header names may carry stray whitespace; normalise them so the column
# checks below match.
df.columns = df.columns.str.strip()

if 'Text' in df.columns:
    initial_rows = len(df)
    # Rows without text have nothing to index; drop them and report the count.
    df = df.dropna(subset=['Text'])
    print(f"\nΑφαιρέθηκαν {initial_rows - len(df)} γραμμές με κενό κείμενο.")

    df['Text'] = df['Text'].astype(str)

# Identifiers are handled as strings from here on.
if 'ID' in df.columns:
    df['ID'] = df['ID'].astype(str)


In [None]:
index_name = "ir2025_index"

# Index definition: one shard, no replicas, the built-in "english" analyzer as
# the index default, and two fields -- a keyword identifier and a full-text
# field scored with BM25.
settings = dict(
    settings=dict(
        number_of_shards=1,
        number_of_replicas=0,
        analysis=dict(
            analyzer=dict(
                default=dict(type="english"),
            ),
        ),
    ),
    mappings=dict(
        properties=dict(
            doc_id=dict(type="keyword"),
            text=dict(type="text", similarity="BM25"),
        ),
    ),
)

# Rebuild from scratch: remove an index left over from a previous run first.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

es.indices.create(index=index_name, body=settings)

In [None]:
index_name = "ir2025_index"

def generate_data(dataframe):
    """Yield one bulk-index action per row of ``dataframe``.

    Args:
        dataframe: pandas DataFrame with an ``ID`` and a ``Text`` column.

    Yields:
        dict: an action targeting ``index_name`` whose ``_source`` holds the
        stringified ``ID`` as ``doc_id`` and the ``Text`` value as ``text``.
    """
    # Iterate the two columns directly instead of using iterrows(): iterrows()
    # builds a Series per row (slow on large collections) and upcasts
    # all-numeric rows to float, which would turn an integer ID into "1.0".
    for doc_id, text in zip(dataframe['ID'], dataframe['Text']):
        yield {
            "_index": index_name,
            "_source": {
                "doc_id": str(doc_id),
                "text": text
            }
        }

print("Ξεκινάει η εισαγωγή των κειμένων...")

# With its default arguments bulk() returns (number_of_successes, list_of_errors),
# so `failed` is a list; report its length to show an actual failure count.
success, failed = bulk(es, generate_data(df))

print(f"Ολοκληρώθηκε! Επιτυχίες: {success}, Αποτυχίες: {len(failed)}")

# Refresh so the documents just indexed are visible to the searches that follow.
es.indices.refresh(index=index_name)
print("Το ευρετήριο ανανεώθηκε και είναι έτοιμο για αναζήτηση.")

In [None]:
queries_path = '../data/queries.csv'
queries_df = pd.read_csv(queries_path)

# Header names may carry stray whitespace; normalise before looking columns up.
queries_df.columns = queries_df.columns.str.strip()

qid_col = 'ID'
qtext_col = 'Text'

output_file = "../results/phase1_results.txt"
run_id = "my_elastic_bm25"

# Create the results directory on first run so open() below cannot fail on a
# missing folder.
results_dir = os.path.dirname(output_file)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

print("Ξεκινάει η αναζήτηση...")

# Pin the encoding so the bytes written do not depend on the platform default.
with open(output_file, 'w', encoding='utf-8') as f:
    for raw_qid, q_text in zip(queries_df[qid_col], queries_df[qtext_col]):
        # A query with no text cannot be searched; skip it.
        if pd.isna(q_text):
            continue

        q_id = str(raw_qid)

        # Retrieve the 50 best-scoring documents for this query.
        response = es.search(
            index="ir2025_index",
            body={
                "query": {
                    "match": {
                        "text": q_text
                    }
                },
                "size": 50
            }
        )

        # One tab-separated line per hit:
        # query id, literal "Q0", document id, 1-based rank, score, run id.
        for rank, hit in enumerate(response['hits']['hits'], start=1):
            doc_id = hit['_source']['doc_id']
            score = hit['_score']
            f.write(f"{q_id}\tQ0\t{doc_id}\t{rank}\t{score:.4f}\t{run_id}\n")

print(f"Η αναζήτηση ολοκληρώθηκε! Τα αποτελέσματα είναι στο: {output_file}")

In [None]:
# Evaluate the run file against the relevance judgements with the external
# trec_eval binary. This cell needs the standard-library `subprocess` module,
# which must be imported in the notebook's import cell.
path_to_trec_eval = "../../trec_eval/trec_eval.exe"
qrels_file = "../data/qrels.txt"
results_file = "../results/phase1_results.txt"

# Measures requested: MAP, and precision at cutoffs 5, 10, 15 and 20.
command = [
    path_to_trec_eval,
    "-m", "map",
    "-m", "P.5,10,15,20",
    qrels_file,
    results_file
]

# Echo the exact command line so the evaluation can be repeated by hand.
print(' '.join(command))
print("-" * 40)

try:
    # check=True turns a non-zero exit status into CalledProcessError;
    # capture_output/text collect stdout and stderr as strings.
    result = subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=True
    )

    print(result.stdout)

except FileNotFoundError:
    # Raised when the executable itself cannot be found at the given path.
    print("ΣΦΑΛΜΑ: Δεν βρέθηκε το trec_eval.exe! Έλεγξε τη διαδρομή (path_to_trec_eval).")
except subprocess.CalledProcessError as e:
    # trec_eval started but exited with an error; show what it wrote to stderr.
    print("ΣΦΑΛΜΑ κατά την εκτέλεση του trec_eval:")
    print(e.stderr)