In [1]:
import pandas as pd
import numpy as np
import faiss
import os
import sys
import subprocess
import re
import html
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler


# --- Input / output locations (relative to the notebook's working directory) ---
# Documents CSV; read without a header row, column 0 = document id, column 1 = text.
DOCS_PATH = "../data/documents.csv"
# Queries CSV; the first two columns are used as (query id, query text).
QUERIES_PATH = "../data/queries.csv"
# Relevance judgements file handed to trec_eval.
QRELS_PATH = "../data/qrels.txt"
# trec_eval executable; evaluation is skipped when this path does not exist.
TREC_EVAL_PATH = "../../trec_eval/trec_eval.exe" 
# Directory that receives the generated run files (created by run_pipeline if missing).
RESULTS_DIR = "../results_phase3"

# Name passed to SentenceTransformer to load the embedding model.
MODEL_NAME = 'all-MiniLM-L6-v2'
# Elasticsearch index queried in stage 1.
ES_INDEX = "ir_phase1_showcase"  

# Number of hits requested from Elasticsearch in stage 1 (the search "size").
N_STAGE1 = 200  
# Maximum number of documents kept per query after stage-2 re-ranking.
K_STAGE2 = 50  

# Weights of the final score: ALPHA * normalised BM25 + BETA * embedding similarity.
ALPHA = 0.5 
BETA = 0.5  

In [2]:
def clean_text_phase2(text):
    """Normalise a piece of text for indexing / embedding.

    HTML entities are decoded, the text is lower-cased, every character that
    is not ``a-z``, ``0-9`` or whitespace becomes a space, and runs of
    whitespace are collapsed to a single space with the ends trimmed.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned string.
    """
    if isinstance(text, str):
        lowered = html.unescape(text).lower()
        # Non-alphanumerics turn into spaces (not removed) so that tokens
        # separated only by punctuation do not get glued together.
        alnum_only = re.sub(r'[^a-z0-9\s]', ' ', lowered)
        return re.sub(r'\s+', ' ', alnum_only).strip()
    return ""

def load_documents_map(path):
    """Load the document collection into a ``{doc_id: cleaned_text}`` dict.

    The CSV at ``path`` is read without a header row and with every cell as a
    string. Column 0 is the document id and column 1 the raw text, which is
    passed through ``clean_text_phase2``. Rows whose id is missing or blank
    are skipped; if an id occurs more than once the last row wins.

    Args:
        path: Location of the documents CSV.

    Returns:
        dict mapping the stripped document id to its cleaned text.

    Raises:
        SystemExit: if the file cannot be read or parsed. The exit message
            contains the path and the underlying error.
    """
    try:
        df = pd.read_csv(path, header=None, dtype=str)
        doc_map = {}
        for raw_id, raw_text in zip(df[0], df[1]):
            if pd.isna(raw_id):
                continue
            doc_id = str(raw_id).strip()
            if doc_id == "":
                continue

            # A missing text cell arrives as NaN. Passing it through unchanged
            # lets clean_text_phase2 map it to "" instead of the literal
            # word "nan" that str(NaN) would produce.
            doc_map[doc_id] = clean_text_phase2(raw_text)

        return doc_map
    except Exception as e:
        # Report what actually went wrong; the previous message was the
        # literal letter "e".
        sys.exit(f"Failed to load documents from {path!r}: {e}")


In [3]:
def save_and_eval(all_results, k_levels=(20, 30, 50)):
    """Write TREC-format run files at several cut-offs and score them.

    For each ``k`` in ``k_levels`` a file ``phase3_results_k{k}.txt`` is
    written to ``RESULTS_DIR`` containing only the rows whose rank is at most
    ``k``. If the trec_eval executable exists at ``TREC_EVAL_PATH`` it is run
    against ``QRELS_PATH`` for that file and its output is printed.

    Args:
        all_results: Iterable of ``(qid, docid, score, rank)`` tuples.
        k_levels: Rank cut-offs to export and evaluate. Defaults to the
            original fixed set ``(20, 30, 50)``.

    Returns:
        None. Results are written to disk and printed.
    """
    # Materialise once so a generator argument is not exhausted by the first k.
    rows = list(all_results)

    # Make the function usable on its own, not only after run_pipeline()
    # has created the directory.
    os.makedirs(RESULTS_DIR, exist_ok=True)

    have_trec_eval = os.path.exists(TREC_EVAL_PATH)
    if not have_trec_eval:
        # Previously the evaluation was skipped without any message.
        print(f"trec_eval not found at {TREC_EVAL_PATH!r}; "
              "run files are written but not evaluated.")

    for k in k_levels:
        run_id = f"phase3_results_k{k}"
        filename = os.path.join(RESULTS_DIR, f"{run_id}.txt")

        with open(filename, 'w', encoding='utf-8') as f:
            for qid, docid, score, rank in rows:
                if rank <= k:
                    f.write(f"{qid}\tQ0\t{docid}\t{rank}\t{score:.4f}\t{run_id}\n")

        if not have_trec_eval:
            continue

        print(f"\nΑποτελέσματα για k={k}")
        cmd = [TREC_EVAL_PATH, "-m", "map", "-m", "P.5,10,15,20", "-m", "recall.5,10,15,20,50", QRELS_PATH, filename]
        res = subprocess.run(cmd, capture_output=True, text=True)
        print(res.stdout.strip())

        # Surface failures instead of showing an empty result block.
        if res.returncode != 0:
            print(f"trec_eval failed (exit code {res.returncode}): {res.stderr.strip()}")

In [4]:
def _resolve_hit_id(hit):
    """Return the document id of one Elasticsearch hit as a stripped string."""
    src = hit.get('_source') or {}
    # Explicit None / blank checks: an `or` chain would discard valid but
    # falsy ids such as the integer 0. "id" is requested in `_source`, so it
    # is consulted as well before falling back to the hit's own `_id`.
    for field in ("ID", "doc_id", "id"):
        value = src.get(field)
        if value is not None and str(value).strip() != "":
            return str(value).strip()
    return str(hit['_id']).strip()


def stage1_retrieve(es, query_text, doc_lookup):
    """Stage 1: fetch lexical candidates for a query from Elasticsearch.

    The query is cleaned with ``clean_text_phase2`` and sent as a fuzzy
    ``match`` query on the ``text`` field of ``ES_INDEX``, requesting up to
    ``N_STAGE1`` hits.

    Args:
        es: Elasticsearch client (any object exposing a compatible ``search``).
        query_text: Raw query string.
        doc_lookup: dict mapping document id to cleaned document text.

    Returns:
        List of dicts with keys ``'id'``, ``'text'`` (``""`` when the id is
        not in ``doc_lookup``) and ``'bm25_score'`` (the hit's ``_score``),
        in the order Elasticsearch returned them. An empty list is returned
        if the search raises; the error is printed, not propagated.
    """
    cleaned_query = clean_text_phase2(query_text)

    try:
        res = es.search(
            index=ES_INDEX,
            body={
                "query": {
                    "match": {
                        "text": {
                            "query": cleaned_query,
                            "fuzziness": "AUTO"
                        }
                    }
                },
                "size": N_STAGE1,
                "_source": ["ID", "doc_id", "id"]
            }
        )
    except Exception as e:
        # Best effort: a failing query is reported and skipped by the caller.
        print(e)
        return []

    candidates = []
    for h in res['hits']['hits']:
        # Ids are stripped so they match the keys built by load_documents_map.
        real_id = _resolve_hit_id(h)
        candidates.append({
            'id': real_id,
            'text': doc_lookup.get(real_id, ""),
            'bm25_score': h['_score']
        })

    return candidates

In [None]:
def stage2_rerank(model, query_text, candidates):
    """Stage 2: re-rank stage-1 candidates with a BM25 + embedding hybrid score.

    Candidate texts and the cleaned query are embedded with ``model``,
    L2-normalised and compared by inner product through a flat FAISS index
    (i.e. cosine similarity). BM25 scores are min-max scaled over the
    candidate set (a lone candidate gets 1.0). The final score is
    ``ALPHA * bm25_scaled + BETA * similarity``.

    Args:
        model: Object with an ``encode(texts, convert_to_numpy=True)`` method.
        query_text: Raw query string.
        candidates: List of dicts with keys ``'text'``, ``'id'`` and
            ``'bm25_score'``.

    Returns:
        Up to ``K_STAGE2`` dicts ``{'doc_id', 'score', 'rank'}`` ordered by
        descending final score, ranks starting at 1. ``[]`` when there are no
        candidates.
    """
    if not candidates:
        return []

    texts = []
    for cand in candidates:
        texts.append(cand['text'])
    ids = []
    for cand in candidates:
        ids.append(cand['id'])
    lexical_raw = []
    for cand in candidates:
        lexical_raw.append(cand['bm25_score'])
    n_cands = len(texts)

    query_clean = clean_text_phase2(query_text)

    doc_vectors = model.encode(texts, convert_to_numpy=True)
    query_vector = model.encode([query_clean], convert_to_numpy=True)

    # Unit-length vectors make the inner product below a cosine similarity.
    for matrix in (doc_vectors, query_vector):
        faiss.normalize_L2(matrix)

    ip_index = faiss.IndexFlatIP(doc_vectors.shape[1])
    ip_index.add(doc_vectors)

    # Ask for every candidate: `order` lists candidate positions by descending
    # similarity and `sims` holds the matching similarities.
    sims_2d, order_2d = ip_index.search(query_vector, n_cands)
    sims = sims_2d[0]
    order = order_2d[0]

    if n_cands > 1:
        lexical = MinMaxScaler().fit_transform(
            np.array(lexical_raw).reshape(-1, 1)
        ).flatten()
    else:
        lexical = np.array([1.0] * n_cands)

    # Reorder the BM25 values into FAISS result order before mixing, so both
    # terms of the sum refer to the same document.
    combined = (ALPHA * lexical[order]) + (BETA * sims)
    best_first = np.argsort(combined)[::-1][:K_STAGE2]

    # `pos` indexes the FAISS-ordered arrays; order[pos] maps it back to the
    # candidate's position in the input list.
    return [
        {
            'doc_id': ids[order[pos]],
            'score': combined[pos],
            'rank': place
        }
        for place, pos in enumerate(best_first, start=1)
    ]

In [6]:
def run_pipeline():
    """Run the two-stage retrieval for every query, then export and evaluate.

    Steps: make sure ``RESULTS_DIR`` exists, load the embedding model, connect
    to Elasticsearch on ``http://localhost:9200``, load the documents, read
    the queries (first two columns = id, text), run ``stage1_retrieve`` and
    ``stage2_rerank`` per query, and pass all ``(qid, doc_id, score, rank)``
    rows to ``save_and_eval``.

    Queries for which stage 1 returns no candidates contribute no rows.
    """
    if not os.path.exists(RESULTS_DIR):
        os.makedirs(RESULTS_DIR)

    encoder = SentenceTransformer(MODEL_NAME)
    es_client = Elasticsearch("http://localhost:9200")
    documents = load_documents_map(DOCS_PATH)

    query_frame = pd.read_csv(QUERIES_PATH, header=0, dtype=str)
    if query_frame.empty:
        # Nothing below the first line: re-read treating that line as data.
        query_frame = pd.read_csv(QUERIES_PATH, header=None, dtype=str)

    run_rows = []
    for query_id, query_text in zip(query_frame.iloc[:, 0], query_frame.iloc[:, 1]):
        shortlist = stage1_retrieve(es_client, query_text, documents)
        if shortlist:
            run_rows.extend(
                (query_id, hit['doc_id'], hit['score'], hit['rank'])
                for hit in stage2_rerank(encoder, query_text, shortlist)
            )

    save_and_eval(run_rows)

In [7]:
# Entry point: run the full retrieve -> re-rank -> export/evaluate pipeline.
# The guard keeps the pipeline from starting if this code is imported as a module.
if __name__ == "__main__":
    run_pipeline()


Αποτελέσματα για k=20
map                   	all	0.6374
P_5                   	all	0.8800
P_10                  	all	0.7600
P_15                  	all	0.6800
P_20                  	all	0.5700
recall_5              	all	0.2956
recall_10             	all	0.5039
recall_15             	all	0.6764
recall_20             	all	0.7510
recall_50             	all	0.7510

Αποτελέσματα για k=30
map                   	all	0.6943
P_5                   	all	0.8800
P_10                  	all	0.7600
P_15                  	all	0.6800
P_20                  	all	0.5700
recall_5              	all	0.2956
recall_10             	all	0.5039
recall_15             	all	0.6764
recall_20             	all	0.7510
recall_50             	all	0.8545

Αποτελέσματα για k=50
map                   	all	0.7341
P_5                   	all	0.8800
P_10                  	all	0.7600
P_15                  	all	0.6800
P_20                  	all	0.5700
recall_5              	all	0.2956
recall_10             	all	0.5039
recall_15    