In [6]:
import elasticsearch
import elasticsearch.helpers
from elasticsearch.helpers import bulk
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
import json
es = elasticsearch.Elasticsearch('http://localhost:9200')

FEATURE_NAMES = [
    'tf_title', 'tf_abstract',
    'idf_title', 'idf_abstract',
    'title_length', 'abstract_length',
    'bm25_score'
]

In [7]:
basic_index_config = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 1
    },
    "mappings": {
        "properties": {
            "AB": {"type": "text"},
            "TI": {"type": "text"},
            "PMID": {"type": "keyword"},
        }
    }
}

def read_documents(file_name):
    with open(file_name, 'r') as documents:
        for line in documents:
            doc_line = json.loads(line)
            if ('PMID' in doc_line):
                doc_line['_id'] = doc_line['PMID']
                yield doc_line
            elif not ('index' in doc_line):
                raise ValueError('Woops, error in index file')

def create_index(es, index_name, body={}):
    # Delete index if it exists
    es.indices.delete(index=index_name, ignore=[400, 404])
    # Create new index
    es.indices.create(index=index_name, body=body)

def index_documents(es, collection_file_name, index_name, body={}):
    create_index(es, index_name, body)
    return elasticsearch.helpers.bulk(
        es,
        read_documents(collection_file_name),
        index=index_name,
        chunk_size=2000,
        request_timeout=30
    )

try:
    result = index_documents(es, 'data/trec-medline.json', 'genomics-base', body=basic_index_config)
    print(f"Indexed {result[0]} documents")
except Exception as e:
    print(f"Error during indexing: {e}")

  es.indices.delete(index=index_name, ignore=[400, 404])
  return elasticsearch.helpers.bulk(


Indexed 263080 documents


In [18]:
def extract_features(es, query, doc_id):
    term_vectors = es.termvectors(
        index="genomics-base",
        id=doc_id,
        fields=["TI", "AB"],
        term_statistics=True,
        field_statistics=True,
        request_timeout=30
    )
    
    bm25_query = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["TI", "AB"]
            }
        }
    }
    
    bm25_score = es.explain(
        index="genomics-base",
        id=doc_id,
        body=bm25_query
    )["explanation"]["value"]
    
    tf_title = 0
    tf_abstract = 0
    idf_title = 0
    idf_abstract = 0
    
    if 'term_vectors' in term_vectors:
        if 'TI' in term_vectors['term_vectors']:
            title_vectors = term_vectors['term_vectors']['TI']['terms']
            for term in query.lower().split():
                if term in title_vectors:
                    tf_title += title_vectors[term]['term_freq']
                    idf_title += np.log(1 + title_vectors[term]['doc_freq'])
                    
        if 'AB' in term_vectors['term_vectors']:
            abstract_vectors = term_vectors['term_vectors']['AB']['terms']
            for term in query.lower().split():
                if term in abstract_vectors:
                    tf_abstract += abstract_vectors[term]['term_freq']
                    idf_abstract += np.log(1 + abstract_vectors[term]['doc_freq'])
    
    title_length = len(term_vectors.get('term_vectors', {}).get('TI', {}).get('terms', {}))
    abstract_length = len(term_vectors.get('term_vectors', {}).get('AB', {}).get('terms', {}))
    
    features = [
        tf_title, tf_abstract,
        idf_title, idf_abstract,
        title_length, abstract_length,
        bm25_score
    ]
    
    return features

In [19]:
def load_queries(queries_file):
    queries = {}
    with open(queries_file, 'r') as f:
        for line in f:
            qid, query = line.strip().split('\t')
            queries[qid] = query
    return queries

def load_qrels(qrels_file):
    qrels = []
    with open(qrels_file, 'r') as f:
        for line in f:
            qid, _, doc_id, rel = line.strip().split()
            qrels.append({
                'qid': qid,
                'doc_id': doc_id,
                'relevance': int(rel)
            })
    return pd.DataFrame(qrels)

queries = load_queries('data/training-queries-simple.txt')
qrels_df = load_qrels('data/training-qrels.txt')

print(f"Loaded {len(queries)} queries and {len(qrels_df)} relevance judgments")



Loaded 38 queries and 144 relevance judgments


In [29]:
def prepare_training_data(es, queries, qrels_df, batch_size=50, negative_positive_ratio=2.0):
    training_data = []
    
    for i in range(0, len(qrels_df), batch_size):
        batch = qrels_df.iloc[i:i+batch_size]
        
        for idx, row in batch.iterrows():
            if row['qid'] in queries:
                try:
                    features = extract_features(
                        es,
                        queries[row['qid']], 
                        str(row['doc_id'])
                    )
                    training_data.append({
                        'features': features,
                        'relevance': 1,
                        'qid': row['qid'],
                        'doc_id': row['doc_id']
                    })
                except Exception as e:
                    print(f"Error processing document {row['doc_id']}: {str(e)}")
                    continue
                    
        print(f"Processed {len(training_data)} positive documents so far...")
    
    positive_examples_per_query = qrels_df.groupby('qid').size()
    avg_positives = positive_examples_per_query.mean()
    n_negative_samples = int(avg_positives * negative_positive_ratio)
    
    print(f"\nAverage positive examples per query: {avg_positives:.1f}")
    print(f"Using {n_negative_samples} negative examples per query to achieve {negative_positive_ratio}:1 ratio")
    
    for qid, query in queries.items():
        try:
            # Get random documents that aren't in the qrels
            search_results = es.search(
                index="genomics-base",
                body={
                    "query": {
                        "function_score": {
                            "query": {"match_all": {}},
                            "random_score": {}
                        }
                    }
                },
                size=n_negative_samples
            )
            
            # Add negative examples to balance the dataset
            positive_docs = set(qrels_df[qrels_df['qid'] == qid]['doc_id'].astype(str))
            for hit in search_results['hits']['hits']:
                doc_id = hit['_id']
                if doc_id not in positive_docs:
                    try:
                        features = extract_features(es, query, doc_id)
                        training_data.append({
                            'features': features,
                            'relevance': 0,  # Negative example
                            'qid': qid,
                            'doc_id': doc_id
                        })
                    except Exception as e:
                        continue
                        
        except Exception as e:
            print(f"Error processing query {qid}: {str(e)}")
            continue            
    return pd.DataFrame(training_data)

training_data = prepare_training_data(es, queries, qrels_df)
print(f"\nPrepared training data: {training_data.shape}")

  term_vectors = es.termvectors(


Processed 50 positive documents so far...
Processed 100 positive documents so far...
Processed 144 positive documents so far...

Average positive examples per query: 3.8
Using 7 negative examples per query to achieve 2.0:1 ratio


  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termvectors(
  search_results = es.search(
  term_vectors = es.termv


Prepared training data: (410, 4)


In [31]:
# Load test queries
test_queries = {}
with open('data/test-queries-simple.txt', 'r') as f:
    for line in f:
        qid, query = line.strip().split('\t')
        test_queries[qid] = query

X = np.array(training_data['features'].tolist())
y = training_data['relevance'].values

model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42
)

model.fit(X, y)

importances = pd.DataFrame({
    'feature': FEATURE_NAMES,
    'importance': model.feature_importances_
})
print("\nFeature importances:")
print(importances.sort_values('importance', ascending=False))

print(f"\nLoaded {len(test_queries)} test queries for evaluation")


Feature importances:
           feature  importance
6       bm25_score    0.832993
5  abstract_length    0.061262
4     title_length    0.046657
1      tf_abstract    0.025574
3     idf_abstract    0.016077
0         tf_title    0.009407
2        idf_title    0.008030

Loaded 50 test queries for evaluation


In [32]:
# Check training data statistics
print("Training data shape:", X.shape)
print("\nFeature statistics:")
for i, feature_name in enumerate(FEATURE_NAMES):
    print(f"{feature_name}:")
    print(f"  Mean: {X[:, i].mean():.2f}")
    print(f"  Std: {X[:, i].std():.2f}")
    print(f"  Max: {X[:, i].max():.2f}")
    print(f"  Min: {X[:, i].min():.2f}")
    print()

# Check if we have any variation in the target variable
print("\nTarget variable statistics:")
print(f"Unique values in y: {np.unique(y)}")
print(f"Mean: {y.mean():.2f}")
print(f"Std: {y.std():.2f}")

Training data shape: (410, 7)

Feature statistics:
tf_title:
  Mean: 0.90
  Std: 1.10
  Max: 7.00
  Min: 0.00

tf_abstract:
  Mean: 6.54
  Std: 6.90
  Max: 43.00
  Min: 0.00

idf_title:
  Mean: 8.11
  Std: 8.54
  Max: 52.06
  Min: 0.00

idf_abstract:
  Mean: 14.99
  Std: 12.67
  Max: 78.20
  Min: 0.00

title_length:
  Mean: 12.78
  Std: 5.09
  Max: 32.00
  Min: 1.00

abstract_length:
  Mean: 87.93
  Std: 52.38
  Max: 220.00
  Min: 0.00

bm25_score:
  Mean: 5.92
  Std: 8.92
  Max: 43.36
  Min: 0.00


Target variable statistics:
Unique values in y: [0 1]
Mean: 0.35
Std: 0.48


In [34]:
def get_initial_results(es, query, size=100):
    results = es.search(
        index="genomics-base",
        body={
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": ["TI", "AB"],
                    "type": "best_fields",
                    "tie_breaker": 0.3
                }
            }
        },
        size=size
    )
    return [hit['_id'] for hit in results['hits']['hits']]

def rerank_documents(es, model, query, doc_ids, top_k=10):
    scores = []
    for doc_id in doc_ids:
        features = extract_features(es, query, doc_id)
        score = model.predict([features])[0]
        scores.append((doc_id, score))
    
    # Sort by score in descending order
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k]

In [35]:
test_query = "molecule structure"
initial_doc_ids = get_initial_results(es, test_query)
reranked_results = rerank_documents(es, model, test_query, initial_doc_ids)

print(f'Top 10 reranked results for query: {test_query}')
for doc_id, score in reranked_results:
    print(f"Doc ID: {doc_id}, Score: {score:.4f}")

  results = es.search(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.

Top 10 reranked results for query: molecule structure
Doc ID: 11732694, Score: 1.0000
Doc ID: 11718323, Score: 1.0000
Doc ID: 12149555, Score: 1.0000
Doc ID: 11851330, Score: 1.0000
Doc ID: 11807249, Score: 1.0000
Doc ID: 12357800, Score: 1.0000
Doc ID: 12136142, Score: 1.0000
Doc ID: 12502353, Score: 1.0000
Doc ID: 12175539, Score: 1.0000
Doc ID: 12475232, Score: 1.0000


  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
  term_vectors = es.termvectors(
