In [66]:
import elasticsearch
import elasticsearch.helpers
from elasticsearch.helpers import bulk
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import json
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
es = elasticsearch.Elasticsearch('http://localhost:9200')

FEATURE_NAMES = [
    'tf_title', 
    'tf_abstract',
    'idf_title',
    'idf_abstract',
    'bm25_combined',
    'bm25_title',
    'bm25_abstract',
    'query_length',
    'title_coverage',
    'abstract_coverage',
    'title_length',
    'abstract_length',
    'title_proximity',
    'abstract_proximity',
    'title_first_match',
    'abstract_first_match',
    'tf_title_normalized',
    'tf_abstract_normalized'
]

In [67]:
basic_index_config = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 1
    },
    "mappings": {
        "properties": {
            "AB": {"type": "text"},
            "TI": {"type": "text"},
            "PMID": {"type": "keyword"},
        }
    }
}

def read_documents(file_name):
    with open(file_name, 'r') as documents:
        for line in documents:
            doc_line = json.loads(line)
            if ('PMID' in doc_line):
                doc_line['_id'] = doc_line['PMID']
                yield doc_line
            elif not ('index' in doc_line):
                raise ValueError('Woops, error in index file')

def create_index(es, index_name, body={}):
    # Delete index if it exists
    es.indices.delete(index=index_name, ignore=[400, 404])
    # Create new index
    es.indices.create(index=index_name, body=body)

def index_documents(es, collection_file_name, index_name, body={}):
    create_index(es, index_name, body)
    return elasticsearch.helpers.bulk(
        es,
        read_documents(collection_file_name),
        index=index_name,
        chunk_size=2000,
        request_timeout=30
    )

try:
    result = index_documents(es, 'data/trec-medline.json', 'genomics-base', body=basic_index_config)
    print(f"Indexed {result[0]} documents")
except Exception as e:
    print(f"Error during indexing: {e}")

  es.indices.delete(index=index_name, ignore=[400, 404])
  return elasticsearch.helpers.bulk(


Indexed 525937 documents


In [68]:
def extract_features(es, query, doc_id):
    doc = es.get(index="genomics-base", id=doc_id)
    title = doc['_source'].get('TI', '')
    abstract = doc['_source'].get('AB', '')
    
    term_vectors = es.termvectors(
        index="genomics-base",
        id=doc_id,
        fields=["TI", "AB"],
        term_statistics=True,
        field_statistics=True
    )
    
    tf_title = 0
    tf_abstract = 0
    idf_title = 0
    idf_abstract = 0
    
    if 'term_vectors' in term_vectors:
        if 'TI' in term_vectors['term_vectors']:
            title_vectors = term_vectors['term_vectors']['TI']['terms']
            for term in query.lower().split():
                if term in title_vectors:
                    tf_title += title_vectors[term]['term_freq']
                    idf_title += np.log(1 + title_vectors[term]['doc_freq'])
                    
        if 'AB' in term_vectors['term_vectors']:
            abstract_vectors = term_vectors['term_vectors']['AB']['terms']
            for term in query.lower().split():
                if term in abstract_vectors:
                    tf_abstract += abstract_vectors[term]['term_freq']
                    idf_abstract += np.log(1 + abstract_vectors[term]['doc_freq'])
    
    bm25_title = es.search(
        index="genomics-base",
        body={
            "query": {
                "match": {
                    "TI": query
                }
            }
        },
        size=1
    )['hits']['max_score'] or 0

    bm25_abstract = es.search(
        index="genomics-base",
        body={
            "query": {
                "match": {
                    "AB": query
                }
            }
        },
        size=1
    )['hits']['max_score'] or 0

    query_terms = query.lower().split()
    query_length = len(query_terms)
    
    title_terms = set(title.lower().split())
    abstract_terms = set(abstract.lower().split())
    query_terms_set = set(query_terms)
    
    title_coverage = len(query_terms_set.intersection(title_terms)) / len(query_terms_set)
    abstract_coverage = len(query_terms_set.intersection(abstract_terms)) / len(query_terms_set)
    
    title_length = len(title.split())
    abstract_length = len(abstract.split())
    
    def compute_min_distance(text, query_terms):
        words = text.lower().split()
        min_distance = float('inf')
        
        term_positions = defaultdict(list)
        for i, word in enumerate(words):
            if word in query_terms:
                term_positions[word].append(i)
        
        if len(term_positions) < 2:
            return 0  # If less than 2 query terms found
            
        for term1 in term_positions:
            for term2 in term_positions:
                if term1 != term2:
                    for pos1 in term_positions[term1]:
                        for pos2 in term_positions[term2]:
                            distance = abs(pos1 - pos2)
                            min_distance = min(min_distance, distance)
                            
        return min_distance if min_distance != float('inf') else 0
    
    title_proximity = compute_min_distance(title, query_terms_set)
    abstract_proximity = compute_min_distance(abstract, query_terms_set)
    
    def first_match_position(text, query_terms):
        words = text.lower().split()
        for i, word in enumerate(words):
            if word in query_terms:
                return i / len(words)  # Normalize by text length
        return 1.0  # No match found
    
    title_first_match = first_match_position(title, query_terms_set)
    abstract_first_match = first_match_position(abstract, query_terms_set)
    
    # Combine all features
    features = [
        tf_title, tf_abstract,
        idf_title, idf_abstract,
        bm25_title + bm25_abstract,        
        bm25_title,
        bm25_abstract,
        query_length,
        title_coverage,
        abstract_coverage,
        title_length,
        abstract_length,
        title_proximity,
        abstract_proximity,
        title_first_match,
        abstract_first_match,
        tf_title / (title_length + 1),
        tf_abstract / (abstract_length + 1)
    ]
    
    return features

In [69]:
def load_queries(queries_file):
    queries = {}
    with open(queries_file, 'r') as f:
        for line in f:
            qid, query = line.strip().split('\t')
            queries[qid] = query
    return queries

def load_qrels(qrels_file):
    qrels = []
    with open(qrels_file, 'r') as f:
        for line in f:
            qid, _, doc_id, rel = line.strip().split()
            qrels.append({
                'qid': qid,
                'doc_id': doc_id,
                'relevance': int(rel)
            })
    return pd.DataFrame(qrels)

queries = load_queries('data/training-queries-simple.txt')
qrels_df = load_qrels('data/training-qrels.txt')

print(f"Loaded {len(queries)} queries and {len(qrels_df)} relevance judgments")



Loaded 50 queries and 335 relevance judgments


In [70]:
def prepare_training_data(es, queries, qrels_df, batch_size=50, negative_positive_ratio=1.0):
    training_data = []
    
    for i in range(0, len(qrels_df), batch_size):
        batch = qrels_df.iloc[i:i+batch_size]
        
        for idx, row in batch.iterrows():
            if row['qid'] in queries:
                try:
                    features = extract_features(
                        es,
                        queries[row['qid']], 
                        str(row['doc_id'])
                    )
                    training_data.append({
                        'features': features,
                        'relevance': 1,
                        'qid': row['qid'],
                        'doc_id': row['doc_id']
                    })
                except Exception as e:
                    print(f"Error processing document {row['doc_id']}: {str(e)}")
                    continue
                    
        print(f"Processed {len(training_data)} positive documents so far...")
    
    positive_examples_per_query = qrels_df.groupby('qid').size()
    avg_positives = positive_examples_per_query.mean()
    n_negative_samples = int(avg_positives * negative_positive_ratio)
    
    print(f"\nAverage positive examples per query: {avg_positives:.1f}")
    print(f"Using {n_negative_samples} negative examples per query")
    
    for qid, query in queries.items():
        try:
            search_results = es.search(
                index="genomics-base",
                body={
                    "query": {
                        "function_score": {
                            "query": {"match_all": {}},
                            "random_score": {}
                        }
                    }
                },
                size=n_negative_samples
            )
            
            positive_docs = set(qrels_df[qrels_df['qid'] == qid]['doc_id'].astype(str))
            for hit in search_results['hits']['hits']:
                doc_id = hit['_id']
                if doc_id not in positive_docs:
                    try:
                        features = extract_features(es, query, doc_id)
                        training_data.append({
                            'features': features,
                            'relevance': 0,
                            'qid': qid,
                            'doc_id': doc_id
                        })
                    except Exception as e:
                        continue
                        
        except Exception as e:
            print(f"Error processing query {qid}: {str(e)}")
            continue            
    
    return pd.DataFrame(training_data)

training_data = prepare_training_data(es, queries, qrels_df)
print(f"\nPrepared training data: {training_data.shape}")

Error processing document 11642719: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11642719', 'found': False}")


  bm25_title = es.search(


Error processing document 11695244: Connection timed out
Error processing document 11700040: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11700040', 'found': False}")
Error processing document 11733969: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11733969', 'found': False}")
Error processing document 11741909: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11741909', 'found': False}")
Error processing document 11748297: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11748297', 'found': False}")
Error processing document 11751405: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11751405', 'found': False}")


  bm25_abstract = es.search(


Error processing document 11781193: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11781193', 'found': False}")
Error processing document 11809712: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11809712', 'found': False}")
Error processing document 11842244: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11842244', 'found': False}")
Processed 40 positive documents so far...
Error processing document 11731410: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11731410', 'found': False}")
Error processing document 11580237: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11580237', 'found': False}")
Error processing document 11781706: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11781706', 'found': False}")
Processed 87 positive documents so far...
Error processing document 11686318: NotFoundError(404, "{'_index': 'genomics-base', '_id': '11686318', 'found': False}")
Error processing document 11685227: NotFoundError(404, "{'_index': 'g

  search_results = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  search_results = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  search_results = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  search_results = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  search_results = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  search_results = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  search_results = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  search_results = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  search_results = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  search_results = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  search_results = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  search_results = es.search(
  bm25_title = es.search(
  bm25_ab


Prepared training data: (596, 4)


In [76]:
def train_and_validate_model(training_data, feature_names, test_size=0.2):
    unique_queries = training_data['qid'].unique()
    train_queries, test_queries = train_test_split(
        unique_queries, 
        test_size=test_size,
    )
    
    # Split data based on queries
    train_mask = training_data['qid'].isin(train_queries)
    train_data = training_data[train_mask]
    test_data = training_data[~train_mask]
    
    # Prepare features and labels
    X_train = np.array(train_data['features'].tolist())
    y_train = train_data['relevance'].values
    
    X_test = np.array(test_data['features'].tolist())
    y_test = test_data['relevance'].values
    
    # Print shapes for debugging
    print(f"Training data shape: {X_train.shape}")
    print(f"Number of features: {len(feature_names)}")
    
    # Define parameter grid for GridSearchCV
    param_grid = {
        'n_estimators': [200, 300, 400],
        'max_depth': [15, 20, 25],      
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],  
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True],
        'max_samples': [0.7, 0.8, 0.9]}
    
    base_model = RandomForestRegressor(random_state=42)
    
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=5,                     
        n_jobs=-1,               
        scoring='neg_mean_squared_error',
        verbose=2
    )
    
    grid_search.fit(X_train, y_train)
    model = grid_search.best_estimator_
    print(grid_search.best_params_)
    
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    
    print("\nModel Performance:")
    print(f"Training R² score: {train_score:.3f}")
    print(f"Test R² score: {test_score:.3f}")
    
    importances = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nTop 10 Most Important Features:")
    print(importances.head(10))
    
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    print("\nDetailed Metrics:")
    print("Train MSE:", mean_squared_error(y_train, y_pred_train))
    print("Test MSE:", mean_squared_error(y_test, y_pred_test))
    print("Train MAE:", mean_absolute_error(y_train, y_pred_train))
    print("Test MAE:", mean_absolute_error(y_test, y_pred_test))
    
    return model, (X_test, y_test), test_queries

model, (X_test, y_test), test_queries = train_and_validate_model(training_data, FEATURE_NAMES)

X = np.array(training_data['features'].tolist())
y = training_data['relevance'].values

print("Training data shape:", X.shape)
print("\nFeature statistics:")
for i, feature_name in enumerate(FEATURE_NAMES):
    print(f"{feature_name}:")
    print(f"  Mean: {X[:, i].mean():.2f}")
    print(f"  Std: {X[:, i].std():.2f}")
    print(f"  Max: {X[:, i].max():.2f}")
    print(f"  Min: {X[:, i].min():.2f}")
    print()

print("\nTarget variable statistics:")
print(f"Unique values in y: {np.unique(y)}")
print(f"Mean: {y.mean():.2f}")
print(f"Std: {y.std():.2f}")

Training data shape: (462, 18)
Number of features: 18
Starting GridSearchCV...
Fitting 5 folds for each of 486 candidates, totalling 2430 fits
[CV] END bootstrap=True, max_depth=15, max_features=sqrt, max_samples=0.7, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=15, max_features=sqrt, max_samples=0.7, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.5s
[CV] END bootstrap=True, max_depth=15, max_features=sqrt, max_samples=0.7, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=15, max_features=sqrt, max_samples=0.7, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=15, max_features=sqrt, max_samples=0.7, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=15, max_features=sqrt, max_samples=0.7, min_samples_lea

  _data = np.array(data, dtype=dtype, copy=copy,



Best parameters found:
{'bootstrap': True, 'max_depth': 15, 'max_features': 'sqrt', 'max_samples': 0.7, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}

Model Performance:
Training R² score: 0.721
Test R² score: 0.411

Top 10 Most Important Features:
                   feature  importance
3             idf_abstract    0.132199
17  tf_abstract_normalized    0.125570
13      abstract_proximity    0.123375
5               bm25_title    0.071086
9        abstract_coverage    0.067891
4            bm25_combined    0.066392
1              tf_abstract    0.065147
6            bm25_abstract    0.051146
11         abstract_length    0.046797
10            title_length    0.042919

Detailed Metrics:
Train MSE: 0.06976068604261863
Test MSE: 0.14570103610552163
Train MAE: 0.19471796574990805
Test MAE: 0.2923701533139088
Training data shape: (596, 18)

Feature statistics:
tf_title:
  Mean: 1.02
  Std: 1.19
  Max: 7.00
  Min: 0.00

tf_abstract:
  Mean: 7.07
  Std: 6.70
  Max: 43

In [78]:
def load_test_queries(file_path, selected_queries):
    queries = {}
    with open(file_path, 'r') as f:
        for line in f:
            # Split on first tab only
            parts = line.strip().split('\t', 1)
            if len(parts) == 2:
                qid, query = parts
                if qid not in selected_queries:
                    continue
                queries[qid] = query
    return queries

def get_initial_results(es, query, size=100):
    results = es.search(
        index="genomics-base",
        body={
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": ["TI", "AB"],
                    "type": "best_fields",
                    "tie_breaker": 0.3
                }
            }
        },
        size=size
    )
    return [(hit['_id'], hit['_score']) for hit in results['hits']['hits']]

def rerank_documents(es, model, query, doc_pairs, top_k=10):
    scores = []
    for doc_id, initial_score in doc_pairs:
        try:
            features = extract_features(es, query, doc_id)
            prediction = model.predict([features])[0]
            # No scaling by 50 - use raw prediction
            scores.append((doc_id, prediction))
        except Exception as e:
            continue
    
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k]

def generate_trec_run(es, model, queries, run_id="project", top_k=100):
    run_entries = []
    total_queries = len(queries)
        
    for i, (qid, query) in enumerate(queries.items(), 1):
        try:
            initial_results = get_initial_results(es, query)
            reranked_results = rerank_documents(es, model, query, initial_results, top_k)
            
            for rank, (doc_id, score) in enumerate(reranked_results, 1):
                run_entry = f"{qid} Q0 {doc_id} {rank} {score:.6f} {run_id}"
                run_entries.append(run_entry)
                
            # Progress update
            if i % 10 == 0:
                print(f"Processed {i}/{total_queries} queries...")
                
        except Exception as e:
            print(f"Error processing query {qid}: {str(e)}")
            continue
            
    return "\n".join(run_entries)

def run_full_evaluation(es, model, test_queries, output_file="learning-to-rank.run"):
    test_queries = load_test_queries('data/training-queries-simple.txt', test_queries)
    run_content = generate_trec_run(es, model, test_queries)
    
    with open(output_file, "w") as f:
        f.write(run_content)
    return run_content

test_predictions = model.predict(X_test)

correlation = np.corrcoef(test_predictions, y_test)[0,1]
print(f"\nCorrelation between predictions and actual relevance: {correlation:.3f}")

run_content = run_full_evaluation(es, model, test_queries)


Training data shape: (462, 18)
Number of features: 18
Starting GridSearchCV...
Fitting 5 folds for each of 486 candidates, totalling 2430 fits

[CV] END bootstrap=True, max_depth=25, max_features=sqrt, max_samples=0.9, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=25, max_features=sqrt, max_samples=0.9, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   0.6s
[CV] END bootstrap=True, max_depth=25, max_features=sqrt, max_samples=0.9, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=25, max_features=sqrt, max_samples=0.9, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   0.8s
[CV] END bootstrap=True, max_depth=25, max_features=sqrt, max_samples=0.9, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.8s
[CV] END bootstrap=True, max_depth=25, max_features=sqrt, max_samples=0.9, min_samples_lea

  _data = np.array(data, dtype=dtype, copy=copy,



Best parameters found:
{'bootstrap': True, 'max_depth': 15, 'max_features': 'sqrt', 'max_samples': 0.7, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}

Model Performance:
Training R² score: 0.721
Test R² score: 0.411

Top 10 Most Important Features:
                   feature  importance
3             idf_abstract    0.132199
17  tf_abstract_normalized    0.125570
13      abstract_proximity    0.123375
5               bm25_title    0.071086
9        abstract_coverage    0.067891
4            bm25_combined    0.066392
1              tf_abstract    0.065147
6            bm25_abstract    0.051146
11         abstract_length    0.046797
10            title_length    0.042919

Detailed Metrics:
Train MSE: 0.06976068604261863
Test MSE: 0.14570103610552163
Train MAE: 0.19471796574990805
Test MAE: 0.2923701533139088

Correlation between predictions and actual relevance: 0.660
Processing 10 queries...


  results = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract = es.search(
  bm25_title = es.search(
  bm25_abstract 

Processed 10/10 queries...


  bm25_title = es.search(
  bm25_abstract = es.search(


In [79]:
def generate_baseline_run(es, queries, run_id="baseline", top_k=100):
    run_entries = []
    total_queries = len(queries)
    
    print(f"Processing {total_queries} queries...")
    
    for i, (qid, query) in enumerate(queries.items(), 1):
        try:
            results = es.search(
                index="genomics-base",
                body={
                    "query": {
                        "multi_match": {
                            "query": query,
                            "fields": ["TI", "AB"],
                            "type": "best_fields",
                            "tie_breaker": 0.3
                        }
                    }
                },
                size=top_k
            )
            
            # Format as TREC run entries, using the original BM25 scores
            for rank, hit in enumerate(results['hits']['hits'], 1):
                doc_id = hit['_id']
                score = hit['_score']
                run_entry = f"{qid} Q0 {doc_id} {rank} {score:.6f} {run_id}"
                run_entries.append(run_entry)
                
        except Exception as e:
            print(f"Error processing query {qid}: {str(e)}")
            continue
            
    return "\n".join(run_entries)

def run_baseline_evaluation(es, test_queries, output_file="baseline.run"):
    test_queries = load_test_queries('data/training-queries-simple.txt', test_queries)
    run_content = generate_baseline_run(es, test_queries)
    
    with open(output_file, "w") as f:
        f.write(run_content) 
    
    return run_content

run_baseline_evaluation(es, test_queries)

Processing 10 queries...


  results = es.search(


'14 Q0 11856731 1 47.235554 baseline\n14 Q0 11927599 2 43.825170 baseline\n14 Q0 12061551 3 43.315690 baseline\n14 Q0 12450387 4 41.689420 baseline\n14 Q0 11956181 5 41.479683 baseline\n14 Q0 12005430 6 39.694050 baseline\n14 Q0 12016229 7 37.560196 baseline\n14 Q0 11854463 8 36.678844 baseline\n14 Q0 12011422 9 34.958190 baseline\n14 Q0 12110594 10 34.537457 baseline\n14 Q0 11773626 11 34.443890 baseline\n14 Q0 12244330 12 34.160892 baseline\n14 Q0 12427973 13 32.535614 baseline\n14 Q0 11904177 14 30.395567 baseline\n14 Q0 12130658 15 29.079330 baseline\n14 Q0 12235211 16 28.537498 baseline\n14 Q0 11732605 17 28.485031 baseline\n14 Q0 12167152 18 28.355800 baseline\n14 Q0 12417586 19 27.774563 baseline\n14 Q0 12359246 20 27.669653 baseline\n14 Q0 12097643 21 27.531502 baseline\n14 Q0 11923317 22 26.828144 baseline\n14 Q0 12204379 23 26.660847 baseline\n14 Q0 12409460 24 26.621017 baseline\n14 Q0 12322895 25 26.389320 baseline\n14 Q0 12038980 26 26.358171 baseline\n14 Q0 12438642 27 26

In [80]:
import numpy as np
from collections import defaultdict

def load_run_file(run_file):
    run_dict = defaultdict(list)
    with open(run_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            qid, _, doc_id, rank, score, _ = parts
            run_dict[qid].append((doc_id, float(score), int(rank)))
    return run_dict

def load_qrels(qrels_file): # Override to have dictoinary instead of dataframe like above in document
    qrels_dict = defaultdict(dict)
    with open(qrels_file, 'r') as f:
        for line in f:
            qid, _, doc_id, rel = line.strip().split()
            qrels_dict[qid][doc_id] = int(rel)
    return qrels_dict

def precision_at_k(ranked_docs, relevant_docs, k):
    hits = sum(1 for doc_id, _, _ in ranked_docs[:k] if doc_id in relevant_docs)
    return hits / k if k > 0 else 0

def average_precision(ranked_docs, relevant_docs):
    precisions = []
    hits = 0
    
    for i, (doc_id, _, _) in enumerate(ranked_docs, 1):
        if doc_id in relevant_docs:
            hits += 1
            precisions.append(hits / i)
    
    return sum(precisions) / len(relevant_docs) if relevant_docs else 0

def ndcg_at_k(ranked_docs, relevance_dict, k):
    dcg = 0
    idcg = 0
    
    for i, (doc_id, _, _) in enumerate(ranked_docs[:k], 1):
        rel = relevance_dict.get(doc_id, 0)
        dcg += (2 ** rel - 1) / np.log2(i + 1)
    
    ideal_rels = sorted([rel for rel in relevance_dict.values()], reverse=True)
    for i, rel in enumerate(ideal_rels[:k], 1):
        idcg += (2 ** rel - 1) / np.log2(i + 1)
    
    return dcg / idcg if idcg > 0 else 0

def evaluate_run(run_dict, qrels_dict, metrics_at_k=[5, 10, 20]):
    metrics = defaultdict(list)
    
    for qid in run_dict:
        if qid not in qrels_dict:
            continue

            
        ranked_docs = run_dict[qid]
        relevant_docs = {doc_id for doc_id, rel in qrels_dict[qid].items() if rel > 0}
        
        ap = average_precision(ranked_docs, relevant_docs)
        metrics['map'].append(ap)
        
        for k in metrics_at_k:
            p_at_k = precision_at_k(ranked_docs, relevant_docs, k)
            ndcg = ndcg_at_k(ranked_docs, qrels_dict[qid], k)
            
            metrics[f'P@{k}'].append(p_at_k)
            metrics[f'NDCG@{k}'].append(ndcg)
    
    results = {}
    for metric, values in metrics.items():
        results[metric] = np.mean(values)
    
    return results

def compare_runs(baseline_file, ltr_file, qrels_file, metrics_at_k=[5, 10, 20]):
    baseline_run = load_run_file(baseline_file)
    ltr_run = load_run_file(ltr_file)
    qrels = load_qrels(qrels_file)
    
    baseline_metrics = evaluate_run(baseline_run, qrels, metrics_at_k)
    ltr_metrics = evaluate_run(ltr_run, qrels, metrics_at_k)
    
    print("Evaluation Results:")
    print("\nMetric      Baseline    LTR         Improvement")
    print("-" * 50)
    
    for metric in sorted(baseline_metrics.keys()):
        baseline_value = baseline_metrics[metric]
        ltr_value = ltr_metrics[metric]
        improvement = ((ltr_value - baseline_value) / baseline_value) * 100
        
        print(f"{metric:<11}    {baseline_value:.4f}   {ltr_value:.4f} {improvement:+.1f}%")

compare_runs(
    baseline_file="baseline.run",
    ltr_file="learning-to-rank.run",
    qrels_file="data/training-qrels.txt",
    metrics_at_k=[5, 10, 20]
)

Evaluation Results:

Metric      Baseline    LTR         Improvement
--------------------------------------------------
NDCG@10        0.1444   0.0729 -49.5%
NDCG@20        0.1524   0.0848 -44.3%
NDCG@5         0.1384   0.0767 -44.6%
P@10           0.0900   0.0400 -55.6%
P@20           0.0500   0.0300 -40.0%
P@5            0.1200   0.0600 -50.0%
map            0.1169   0.0558 -52.2%
