In [90]:
import elasticsearch
import elasticsearch.helpers
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
import pandas as pd
import json
from sklearn.model_selection import GridSearchCV
es = elasticsearch.Elasticsearch('http://localhost:9200')

FEATURE_NAMES = [
    'tf_title', 'tf_abstract',
    'idf_title', 'idf_abstract',
    'mesh_term_matches',
]



In [91]:
basic_index_config = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "analysis": {
            "analyzer": {
                "mesh_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "stop",
                        "word_delimiter_graph"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "AB": {
                "type": "text",
                "analyzer": "standard"
            },
            "TI": {
                "type": "text",
                "analyzer": "standard"
            },
            "MH": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword"
                    },
                    "analyzed": {
                        "type": "text",
                        "analyzer": "mesh_analyzer"
                    }
                }
            },
            "PMID": {
                "type": "keyword"
            }
        }
    }
}

def read_documents(file_name):
    with open(file_name, 'r') as documents:
        for line in documents:
            doc_line = json.loads(line)
            if ('PMID' in doc_line):
                doc_line['_id'] = doc_line['PMID']
                if 'MH' in doc_line:
                    if isinstance(doc_line['MH'], str):
                        doc_line['MH'] = [doc_line['MH']]
                else:
                    doc_line['MH'] = []
                yield doc_line
            elif not ('index' in doc_line):
                raise ValueError('Woops, error in index file')

def create_index(es, index_name, body={}):
    es.indices.delete(index=index_name, ignore=[400, 404])
    es.indices.create(index=index_name, body=body)

def index_documents(es, collection_file_name, index_name, body={}):
    create_index(es, index_name, body)
    return elasticsearch.helpers.bulk(
        es,
        read_documents(collection_file_name),
        index=index_name,
        chunk_size=2000,
        request_timeout=30
    )

try:
    result = index_documents(es, 'data/trec-medline.json', 'genomics-base', body=basic_index_config)
    print(f"Indexed {result[0]} documents")
except Exception as e:
    print(f"Error during indexing: {e}")

  es.indices.delete(index=index_name, ignore=[400, 404])
  return elasticsearch.helpers.bulk(


Indexed 525937 documents


In [79]:
def extract_features(es, query, doc_id):
    term_vectors = es.termvectors(
        index="genomics-base",
        id=doc_id,
        fields=["TI", "AB", "MH"],
        term_statistics=True,
        field_statistics=True,
        request_timeout=30
    )
    
    doc = es.get(index="genomics-base", id=doc_id)
    mesh_terms = doc['_source'].get('MH', [])
    
    bm25_query = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["TI", "AB"]
            }
        }
    }
    
    tf_title = 0
    tf_abstract = 0
    idf_title = 0
    idf_abstract = 0
    
    if 'term_vectors' in term_vectors:
        if 'TI' in term_vectors['term_vectors']:
            title_vectors = term_vectors['term_vectors']['TI']['terms']
            for term in query.lower().split():
                if term in title_vectors:
                    tf_title += title_vectors[term]['term_freq']
                    idf_title += np.log(1 + title_vectors[term]['doc_freq'])
                    
        if 'AB' in term_vectors['term_vectors']:
            abstract_vectors = term_vectors['term_vectors']['AB']['terms']
            for term in query.lower().split():
                if term in abstract_vectors:
                    tf_abstract += abstract_vectors[term]['term_freq']
                    idf_abstract += np.log(1 + abstract_vectors[term]['doc_freq'])
    
    query_terms = query.lower().split()
    mesh_term_matches = 0
    
    if mesh_terms:
        for term in query_terms:
            for mesh_term in mesh_terms:
                if term in mesh_term.lower():
                    mesh_term_matches += 1
        
      
    
    features = [
        tf_title, tf_abstract,
        idf_title, idf_abstract,
        mesh_term_matches,
    ]
    
    return features

In [80]:
def load_queries(queries_file):
    queries = {}
    with open(queries_file, 'r') as f:
        for line in f:
            qid, query = line.strip().split('\t')
            queries[qid] = query
    return queries

def load_qrels(qrels_file):
    qrels = []
    with open(qrels_file, 'r') as f:
        for line in f:
            qid, _, doc_id, rel = line.strip().split()
            qrels.append({
                'qid': qid,
                'doc_id': doc_id,
                'relevance': int(rel)
            })
    return pd.DataFrame(qrels)

queries = load_queries('data/training-queries-simple.txt')
qrels_df = load_qrels('data/training-qrels.txt')

print(f"Loaded {len(queries)} queries and {len(qrels_df)} relevance judgments")



Loaded 50 queries and 335 relevance judgments


In [81]:
def prepare_training_data(es, queries, qrels_df, batch_size=50, negative_samples_per_query=5):
 
    training_data = []
    query_groups = qrels_df.groupby('qid')
    
    for qid, query in queries.items():
        try:
            if qid in query_groups.groups:
                positive_docs = query_groups.get_group(qid)
                
                for _, row in positive_docs.iterrows():
                    try:
                        features = extract_features(es, query, str(row['doc_id']))
                        training_data.append({
                            'features': features,
                            'relevance': 1.0,
                            'qid': qid,
                            'doc_id': row['doc_id'],
                            'query': query
                        })
                    except Exception as e:
                        continue
                
                search_results = es.search(
                    index="genomics-base",
                    body={
                        "query": {
                            "multi_match": {
                                "query": query,
                                "fields": ["TI", "AB"],
                                "type": "best_fields"
                            }
                        }
                    },
                    size=50
                )
                
                relevant_docs = set(positive_docs['doc_id'].astype(str))
                negative_count = 0
                
                for hit in search_results['hits']['hits']:
                    doc_id = hit['_id']
                    if doc_id not in relevant_docs and negative_count < negative_samples_per_query:
                        try:
                            features = extract_features(es, query, doc_id)
                            training_data.append({
                                'features': features,
                                'relevance': 0.0,
                                'qid': qid,
                                'doc_id': doc_id,
                                'query': query
                            })
                            negative_count += 1
                        except Exception as e:
                            continue
                            
        except Exception as e:
            continue
            
    return pd.DataFrame(training_data)

training_data = prepare_training_data(es, queries, qrels_df)
print(f"\nPrepared training data: {training_data.shape}")

  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.


Prepared training data: (547, 5)


  search_results = es.search(
  term_vectors = es.termvectors(


In [82]:
def train_and_validate_model(training_data, test_size=0.2):
    unique_queries = training_data['qid'].unique()
    train_queries, test_queries = train_test_split(unique_queries, test_size=test_size, random_state=42)
    
    train_mask = training_data['qid'].isin(train_queries)
    train_data = training_data[train_mask]
    test_data = training_data[~train_mask]
    
    X_train = np.array(train_data['features'].tolist())
    y_train = train_data['relevance'].values
    
    X_test = np.array(test_data['features'].tolist())
    y_test = test_data['relevance'].values
    
    param_grid = {
        'n_estimators': [300],
        'max_depth': [10, 15],
        'min_samples_split': [5],
        'min_samples_leaf': [2],
        'max_features': ['sqrt'],
        'bootstrap': [True],
        'max_samples': [0.8]
    }
    
    base_model = RandomForestRegressor(
        n_jobs=-1,
        criterion='squared_error',
    )
    
    def ranking_score(y_true, y_pred):
        true_ranks = pd.Series(y_true).rank(ascending=False)
        pred_ranks = pd.Series(y_pred).rank(ascending=False)
        return -np.mean(np.abs(true_ranks - pred_ranks))
    
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        scoring=make_scorer(ranking_score),
        verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    model = grid_search.best_estimator_
    
    return model, (X_test, y_test), test_queries

model, (X_test, y_test), test_queries = train_and_validate_model(training_data)

X = np.array(training_data['features'].tolist())
y = training_data['relevance'].values

print("Training data shape:", X.shape)
print("\nFeature statistics:")
importances = pd.DataFrame({
    'feature': FEATURE_NAMES,
    'importance': model.feature_importances_
})
print("\nFeature importances:")
print(importances.sort_values('importance', ascending=False))

print("\nTarget variable statistics:")
print(f"Unique values in y: {np.unique(y)}")
print(f"Mean: {y.mean():.2f}")
print(f"Std: {y.std():.2f}")

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Training data shape: (547, 5)

Feature statistics:

Feature importances:
             feature  importance
3       idf_abstract    0.287383
4  mesh_term_matches    0.250175
2          idf_title    0.205978
1        tf_abstract    0.178134
0           tf_title    0.078330

Target variable statistics:
Unique values in y: [0. 1.]
Mean: 0.54
Std: 0.50


In [83]:
def load_test_queries(file_path, selected_queries):
    queries = {}
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split('\t', 1)
            if len(parts) == 2:
                qid, query = parts
                if qid not in selected_queries:
                    continue
                queries[qid] = query
    return queries

def get_initial_results(es, query, size=100):
    results = es.search(
        index="genomics-base",
        body={
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": ["TI", "AB"],
                    "type": "best_fields",
                    "tie_breaker": 0.3
                }
            }
        },
        size=size
    )
    return [(hit['_id'], hit['_score']) for hit in results['hits']['hits']]

def rerank_documents(es, model, query, doc_pairs, top_k=10):
    scores = []
    for doc_id, initial_score in doc_pairs:
        try:
            features = extract_features(es, query, doc_id)
            prediction = model.predict([features])[0]
            scores.append((doc_id, prediction))
        except Exception as e:
            continue
    
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k]

def generate_trec_run(es, model, queries, run_id="project", top_k=100):
    run_entries = []
    total_queries = len(queries)
        
    for i, (qid, query) in enumerate(queries.items(), 1):
        try:
            initial_results = get_initial_results(es, query)
            reranked_results = rerank_documents(es, model, query, initial_results, top_k)
            
            for rank, (doc_id, score) in enumerate(reranked_results, 1):
                run_entry = f"{qid} Q0 {doc_id} {rank} {score:.6f} {run_id}"
                run_entries.append(run_entry)
                
        except Exception as e:
            print(f"Error processing query {qid}: {str(e)}")
            continue
            
    return "\n".join(run_entries)

def run_full_evaluation(es, model, test_queries, output_file="learning-to-rank.run"):
    test_queries = load_test_queries('data/training-queries-simple.txt', test_queries)
    run_content = generate_trec_run(es, model, test_queries)
    
    with open(output_file, "w") as f:
        f.write(run_content)
    return run_content

test_predictions = model.predict(X_test)

correlation = np.corrcoef(test_predictions, y_test)[0,1]
print(f"\nCorrelation between predictions and actual relevance: {correlation:.3f}")

run_content = run_full_evaluation(es, model, test_queries)



Correlation between predictions and actual relevance: 0.446


  results = es.search(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.

In [84]:
def generate_baseline_run(es, queries, run_id="baseline", top_k=100):
    run_entries = []
    total_queries = len(queries)
    
    print(f"Processing {total_queries} queries...")
    
    for i, (qid, query) in enumerate(queries.items(), 1):
        try:
            results = es.search(
                index="genomics-base",
                body={
                    "query": {
                        "multi_match": {
                            "query": query,
                            "fields": ["TI", "AB"],
                            "type": "best_fields",
                            "tie_breaker": 0.3
                        }
                    }
                },
                size=top_k
            )
            
            for rank, hit in enumerate(results['hits']['hits'], 1):
                doc_id = hit['_id']
                score = hit['_score']
                run_entry = f"{qid} Q0 {doc_id} {rank} {score:.6f} {run_id}"
                run_entries.append(run_entry)
                
        except Exception as e:
            print(f"Error processing query {qid}: {str(e)}")
            continue
            
    return "\n".join(run_entries)

def run_baseline_evaluation(es, test_queries, output_file="baseline.run"):
    test_queries = load_test_queries('data/training-queries-simple.txt', test_queries)
    run_content = generate_baseline_run(es, test_queries)
    
    with open(output_file, "w") as f:
        f.write(run_content) 

run_baseline_evaluation(es, test_queries)

Processing 10 queries...


  results = es.search(


In [89]:
import numpy as np
from collections import defaultdict

def load_run_file(run_file):
    run_dict = defaultdict(list)
    with open(run_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            qid, _, doc_id, rank, score, _ = parts
            run_dict[qid].append((doc_id, float(score), int(rank)))
    return run_dict

def load_qrels(qrels_file): # Override to have dictoinary instead of dataframe like above in document
    qrels_dict = defaultdict(dict)
    with open(qrels_file, 'r') as f:
        for line in f:
            qid, _, doc_id, rel = line.strip().split()
            qrels_dict[qid][doc_id] = int(rel)
    return qrels_dict

def precision_at_k(ranked_docs, relevant_docs, k):
    hits = sum(1 for doc_id, _, _ in ranked_docs[:k] if doc_id in relevant_docs)
    return hits / k if k > 0 else 0

def average_precision(ranked_docs, relevant_docs):
    precisions = []
    hits = 0
    
    for i, (doc_id, _, _) in enumerate(ranked_docs, 1):
        if doc_id in relevant_docs:
            hits += 1
            precisions.append(hits / i)
    
    return sum(precisions) / len(relevant_docs) if relevant_docs else 0

def ndcg_at_k(ranked_docs, relevance_dict, k):
    dcg = 0
    idcg = 0
    
    for i, (doc_id, _, _) in enumerate(ranked_docs[:k], 1):
        rel = relevance_dict.get(doc_id, 0)
        dcg += (2 ** rel - 1) / np.log2(i + 1)
    
    ideal_rels = sorted([rel for rel in relevance_dict.values()], reverse=True)
    for i, rel in enumerate(ideal_rels[:k], 1):
        idcg += (2 ** rel - 1) / np.log2(i + 1)
    
    return dcg / idcg if idcg > 0 else 0

def evaluate_run(run_dict, qrels_dict, metrics_at_k=[5, 10, 20, 50, 75, 100]):
    metrics = defaultdict(list)
    
    for qid in run_dict:
        if qid not in qrels_dict:
            continue

            
        ranked_docs = run_dict[qid]
        relevant_docs = {doc_id for doc_id, rel in qrels_dict[qid].items() if rel > 0}
        
        ap = average_precision(ranked_docs, relevant_docs)
        metrics['map'].append(ap)
        
        for k in metrics_at_k:
            p_at_k = precision_at_k(ranked_docs, relevant_docs, k)
            ndcg = ndcg_at_k(ranked_docs, qrels_dict[qid], k)
            
            metrics[f'P@{k}'].append(p_at_k)
            metrics[f'NDCG@{k}'].append(ndcg)
    
    results = {}
    for metric, values in metrics.items():
        results[metric] = np.mean(values)
    
    return results

def compare_runs(baseline_file, ltr_file, qrels_file, metrics_at_k=[5, 10, 20, 50, 75, 100]):
    baseline_run = load_run_file(baseline_file)
    ltr_run = load_run_file(ltr_file)
    qrels = load_qrels(qrels_file)
    
    baseline_metrics = evaluate_run(baseline_run, qrels, metrics_at_k)
    ltr_metrics = evaluate_run(ltr_run, qrels, metrics_at_k)
    
    print("Evaluation Results:")
    print("\nMetric      Baseline    LTR         Improvement")
    print("-" * 50)
    
    for metric in sorted(baseline_metrics.keys()):
        baseline_value = baseline_metrics[metric]
        ltr_value = ltr_metrics[metric]
        improvement = ((ltr_value - baseline_value) / baseline_value) * 100
        
        print(f"{metric:<11}    {baseline_value:.4f}   {ltr_value:.4f} {improvement:+.1f}%")

compare_runs(
    baseline_file="baseline.run",
    ltr_file="learning-to-rank.run",
    qrels_file="data/training-qrels.txt",
    metrics_at_k=[5, 10, 20, 50, 75, 100]
)

Evaluation Results:

Metric      Baseline    LTR         Improvement
--------------------------------------------------
NDCG@10        0.0871   0.0398 -54.3%
NDCG@100       0.1495   0.1111 -25.7%
NDCG@20        0.1050   0.0480 -54.3%
NDCG@5         0.0898   0.0339 -62.2%
NDCG@50        0.1274   0.0674 -47.1%
NDCG@75        0.1432   0.1043 -27.2%
P@10           0.0500   0.0200 -60.0%
P@100          0.0140   0.0140 +0.0%
P@20           0.0350   0.0150 -57.1%
P@5            0.0800   0.0200 -75.0%
P@50           0.0200   0.0120 -40.0%
P@75           0.0173   0.0160 -7.7%
map            0.0728   0.0329 -54.9%
