In [1]:
import sys
sys.path.append('/work/utilis')
from lib_to_use import *
from general_functions import *
from paths import *

Torch version:  2.6.0+cu124
  from .autonotebook import tqdm as notebook_tqdm
Pyterrier version:  0.13.0
  demoji.download_codes()


In [2]:
# load qrels_val
with open (val_processed_path, 'rb') as f:
    df_val = pickle.load(f)
qrels_val = load_qrels(qrels_val_path)
qrels_val_df = qrels_to_df(qrels_val)

# load df_val
with open(val_processed_path, "rb") as f:
    df_val = pickle.load(f)


In [3]:
# Specify the same index path used during indexing
index_path = "./pyterrier_index"

# Load the index previously built (using the IndexFactory)
index_ref = pt.IndexFactory.of(index_path)


Java started (triggered by IndexFactory.of) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


    Ottimizza i parametri BM25 usando grid search.
    Args:
        queries_df: DataFrame con colonne ['qid', 'query']
        qrels_df: DataFrame con colonne ['qid', 'docno', 'label']
        index_ref: Riferimento all'indice PyTerrier
        k1_range: Range di valori per k1 (term frequency scaling)
        b_range: Range di valori per b (length normalization)
        metric: Metrica da ottimizzare
        num_results: Numero di risultati da recuperare per query
    
    Returns:
        best_params: Dizionario con i migliori parametri {'k1': val, 'b': val}
        best_score: Punteggio ottenuto con i migliori parametri
    

In [11]:
def optimize_bm25_parameters(
    queries_df: pd.DataFrame,
    qrels_df: pd.DataFrame,
    index_ref,
    k1_range=np.linspace(0.1, 3.0, 10),
    b_range=np.linspace(0.1, 1.0, 10),
    metric='P_1',
    num_results=100
) -> Tuple[Dict[str, float], float]:
    
    best_score = 0
    best_params = None
    results = []
    
    for k1 in k1_range:
        for b in b_range:
            # Configura BM25 con i parametri correnti
            bm25 = pt.terrier.Retriever(
                index_ref,
                wmodel="BM25",
                controls={"k_1": k1, "b": b},
                num_results=num_results
            )
            
            # Esegui il retrieval
            run_results = bm25.transform(queries_df)
            
            # Calcola il punteggio
            score = pt.Evaluate(
                run_results,
                qrels_df,
                metrics=[metric]
            )[metric]
            
            results.append({
                'k_1': k1,
                'b': b,
                metric: score
            })
            
            if score > best_score:
                best_score = score
                best_params = {'k_1': k1, 'b': b}
    
    # Crea un DataFrame con tutti i risultati per analisi
    results_df = pd.DataFrame(results)
    
    print(f"\nMigliori parametri trovati:")
    print(f"k_1 = {best_params['k_1']:.3f}")
    print(f"b  = {best_params['b']:.3f}")
    print(f"{metric} = {best_score:.3f}")
    
    return best_params, best_score

In [12]:
# Esempio di utilizzo:
def run_bm25_optimization():
    # Carica i dati di validation
    
    # Ottimizza i parametri
    best_params, best_score = optimize_bm25_parameters(
        queries_df=df_val[['qid', 'query']],
        qrels_df=qrels_val_df,
        index_ref=index_ref,
        metric='P_1'
    )
    
    # Crea il modello BM25 finale con i parametri ottimizzati
    optimized_bm25 = pt.terrier.Retriever(
        index_ref,
        wmodel="BM25",
        controls={
            "k_1": best_params['k_1'],
            "b": best_params['b']
        }
    )
    
    return optimized_bm25

In [13]:
start_time = time.time()
bm25 = run_bm25_optimization()
print(f"Time to optimize BM25: {time.time() - start_time}")


Migliori parametri trovati:
k_1 = 0.100
b  = 0.100
P_1 = 0.714
Time to optimize BM25: 54.85717797279358


In [14]:
start_time = time.time()
bm25_results = bm25.transform(df_val[['qid', 'query']])
print(f"Time to run BM25: {time.time() - start_time}")
bm25_results

Time to run BM25: 3.035609245300293


Unnamed: 0,qid,docid,docno,rank,score,query
0,academia_143743,66,academia_28111,0,24.197326,answer question one ask histor research came a...
1,academia_143743,14,academia_12035,1,23.709126,answer question one ask histor research came a...
2,academia_143743,147,academia_76526,2,21.791199,answer question one ask histor research came a...
3,academia_143743,3441,law_5712,3,21.178890,answer question one ask histor research came a...
4,academia_143743,6525,scifi_117801,4,18.967244,answer question one ask histor research came a...
...,...,...,...,...,...,...
101443,politics_53742,3173,judaism_48851,995,4.306271,u polic treat use counterfeit money emerg one ...
101444,politics_53742,8995,english_284054,996,4.305917,u polic treat use counterfeit money emerg one ...
101445,politics_53742,665,boardgames_6033,997,4.304795,u polic treat use counterfeit money emerg one ...
101446,politics_53742,3285,judaism_6672,998,4.304599,u polic treat use counterfeit money emerg one ...


In [22]:
# Per valutare i risultati BM25
eval_bm25 = pt.Evaluate(bm25_results, qrels_val_df, metrics=["P_1", "recall_100", "map_cut_100", "ndcg_cut_3"])
print("BM25 Evaluation:", eval_bm25)

BM25 Evaluation: {'P_1': 0.7142857142857143, 'recall_100': 0.9285714285714286, 'map_cut_100': 0.7731338936073157, 'ndcg_cut_3': 0.7722282426749296}


In [24]:
with open("/work/PIR_data_unzip/PIR_data/answer_retrieval/saved_results/bm25_optimized_results.pkl", "wb") as f:
    pickle.dump(bm25_results, f)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=144ec01f-394f-474f-b507-b786ab13b472' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>