In [1]:
from elasticsearch import Elasticsearch    # elasticsearch will extract features from query-document pairs for us
from elasticsearch.helpers import bulk, parallel_bulk
from catboost import CatBoostRanker, Pool, MetricVisualizer
from copy import deepcopy
import ir_measures
from ir_measures import *
import pandas as pd
import numpy as np
import json
import requests
import re
import random
from tqdm import tqdm
from time import time


### Connection

In [2]:
es = Elasticsearch('http://localhost:9200')


### Index Configuration

In [3]:
index_name = 'wiki'


In [4]:
mappings = {
    'properties': {
        'text': {
            'type': 'text',
            'analyzer': 'white'
        }
    }
}

settings = {
    'analysis' : {
        'analyzer' : {
            'white' : {
                'tokenizer' : 'whitespace'
            }
        }
    }
}

if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'wiki'}

### WikiIR Collection

In [5]:
df = pd.read_csv('wikIR1k/documents.csv')

print(df.shape)
df.head()


(369721, 2)


Unnamed: 0,id_right,text_right
0,1781133,it was used in landing craft during world war ...
1,2426736,after rejecting an offer from cambridge univer...
2,2224122,mat zan coached kuala lumpur fa in 1999 and wo...
3,219642,a barcode is a machine readable optical label ...
4,1728654,since the subordination of the monarchy under ...


### Indexing documents

In [6]:
def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }


def es_action_generator(df):
    for doc_id, row in tqdm(df.iterrows(), total=df.shape[0], bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
        doc = {
            'text': row['text_right'],
        }
        yield create_es_action(index_name, row['id_right'], doc)


start = time()
for ok, result in parallel_bulk(es, es_action_generator(df), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)
        
es.indices.refresh(index=index_name)
es.cat.count(index=index_name, format='json')


100%|██████████████████████████████| 369721/369721 [00:27<00:00, 13438.43it/s]  


Indexing time: 27.649768114089966


[{'epoch': '1678134817', 'timestamp': '20:33:37', 'count': '369721'}]

### Train, Test queries

In [7]:
train_queries = pd.read_csv('wikIR1k/training/queries.csv')
train_queries

Unnamed: 0,id_left,text_left
0,123839,yanni
1,188629,k pop
2,13898,venice film festival
3,316959,downtown brooklyn
4,515031,pennsylvania house of representatives
...,...,...
1439,896124,british ceylon
1440,12319,scottish national party
1441,4421,cinema of china
1442,296526,gold mining


In [8]:
test_queries = pd.read_csv('wikIR1k/test/queries.csv')
test_queries

Unnamed: 0,id_left,text_left
0,158491,southern methodist university
1,5728,halakha
2,13554,chief justice of the united states
3,32674,patsy cline
4,406391,dierks bentley
...,...,...
95,679227,hiv aids
96,2136797,maren morris
97,5622,homer
98,1313598,south pole


In [9]:
def pretty_print_result(search_result, fields=[]):
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
    
def search(query, *args):
    return pretty_print_result(es.search(index=index_name, query=query, size=100), args)

def get_doc_by_id(doc_id):
    return es.get(index=index_name, id=doc_id)['_source']


In [10]:
def make_query(text):
    return {
        "bool": {
            'must': {
                'match': {
                    'text': text
                }                    
            },
            'should': {
                "match_phrase": {
                    "text": {
                        "query": text,
                        'slop': 10,
                    }
                }
            }
        }
    }

search(make_query(train_queries['text_left'][4]))


Total documents: 10000
Doc 887826, score is 39.313705
Doc 1176215, score is 37.12873
Doc 1376470, score is 35.247803
Doc 1569953, score is 34.36048
Doc 1842084, score is 34.359726
Doc 2369067, score is 33.912014
Doc 1233841, score is 33.671608
Doc 1382041, score is 33.65744
Doc 832086, score is 33.563896
Doc 1481340, score is 33.388405
Doc 422337, score is 33.386597
Doc 1315785, score is 33.376156
Doc 1481358, score is 33.144707
Doc 1481362, score is 33.026207
Doc 1449050, score is 32.510365
Doc 1125994, score is 31.858921
Doc 1337630, score is 31.466238
Doc 1376157, score is 31.149237
Doc 888195, score is 30.324703
Doc 854813, score is 29.80125
Doc 843996, score is 29.800423
Doc 837727, score is 29.799664
Doc 843406, score is 29.19878
Doc 1135269, score is 29.196968
Doc 2223594, score is 28.857979
Doc 836749, score is 28.844719
Doc 1283126, score is 27.984835
Doc 1275795, score is 27.766108
Doc 823209, score is 27.725216
Doc 2120091, score is 27.722628
Doc 1453940, score is 27.695242


### Feature extraction function

In [11]:
def extract_features(index_name, query_text, doc_id, verbose=False):
    
    # feature: query length
    query_len = len(query_text.split(' '))
    # feature: doc length
    doc_len = len(get_doc_by_id(doc_id)['text'].split(' '))


    # request with explain parameter
    headers = {
        'Content-type': 'application/json',
        'Accept': 'application/json',
    }
    json_data = {
        'query': make_query(query_text)
    }    
    res = requests.get(f'http://127.0.0.1:9200/wiki/_explain/{doc_id}', headers=headers, json=json_data).json()

    if verbose:
        print(json.dumps(res, indent=2))
    
    total_score = 0
    idfs = [0]
    tfs = [0]
    num_matched_terms = 0
    phrase_freq = 0
    phrase_match = 0
    
    if res['matched']:
        # BM25 score
        total_score = res['explanation']['value']
        
        terms_details = res['explanation']['details'][0]['details']
        freqs = []
        tfs = []
        idfs = []
        for term in terms_details:
            if len(set(query_text.split(' ')))==1:
                # features: terms' frequencies
                m = re.search('freq=(\d+)', term['description'])
                freqs.append(int(m.group(1)))
                # features: terms' idfs
                idfs.append(term['details'][1]['value'])
                # features: terms' tfs
                tfs.append(term['details'][2]['value'])
            else:
                m = re.search('freq=(\d+)', term['details'][0]['description'])
                freqs.append(int(m.group(1)))
                idfs.append(term['details'][0]['details'][1]['value'])
                tfs.append(term['details'][0]['details'][2]['value'])
            
        num_matched_terms = np.count_nonzero(freqs)
        
        # if the response includes the second detail info, it means that the phrase matched as well
        # (slop parameter might be included above, so phrase matches may not be strict)
        if len(res['explanation']['details']) > 1:
            phrase_details = res['explanation']['details'][1]
            m = re.search('freq=(\d+.\d+)', phrase_details['details'][0]['description'])
            phrase_freq = float(m.group(1))
            phrase_match = 1 if float(m.group(1))>1 else 0


    return {'bm25 score': total_score,
            'query length': query_len,
            'document length': doc_len,
            '# of matched q/d terms': num_matched_terms,
            'min idf': min(idfs),
            'max idf': max(idfs),
            'min tf': min(tfs),
            'max tf': max(tfs),
            'phrase frequency': phrase_freq
#            'phrase match': phrase_match
           }


In [12]:
# Checking the features extraction function
extract_features(index_name='wiki',
                 query_text=train_queries['text_left'][4],
                 doc_id=887826,
                 verbose=True)


{
  "_index": "wiki",
  "_id": "887826",
  "matched": true,
  "explanation": {
    "value": 39.313705,
    "description": "sum of:",
    "details": [
      {
        "value": 20.014194,
        "description": "sum of:",
        "details": [
          {
            "value": 7.7756023,
            "description": "weight(text:pennsylvania in 8063) [PerFieldSimilarity], result of:",
            "details": [
              {
                "value": 7.7756023,
                "description": "score(freq=6.0), computed as boost * idf * tf from:",
                "details": [
                  {
                    "value": 2.2,
                    "description": "boost",
                    "details": []
                  },
                  {
                    "value": 4.247406,
                    "description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                    "details": [
                      {
                        "value": 5287,
                      

{'bm25 score': 39.313705,
 'query length': 4,
 'document length': 200,
 '# of matched q/d terms': 4,
 'min idf': 0.009472072,
 'max idf': 4.4477615,
 'min tf': 0.76768476,
 'max tf': 0.9204185,
 'phrase frequency': 4.1}

## Costructing features dataframe

### Train data

#### Reading relevant pairs (original set)

In [13]:
train_relev_qd_pairs = pd.read_csv('wikIR1k/training/qrels', sep='\t', header=None)
train_relev_qd_pairs.columns = ['q_id', 'val', 'doc_id', 'relevance_label']
train_relev_qd_pairs

Unnamed: 0,q_id,val,doc_id,relevance_label
0,123839,0,123839,2
1,123839,0,1793430,1
2,123839,0,806300,1
3,123839,0,806075,1
4,123839,0,836567,1
...,...,...,...,...
47694,341793,0,1968690,1
47695,341793,0,1339149,1
47696,341793,0,1364202,1
47697,341793,0,1325652,1


#### Adding non-relevant pairs (extended set)

In [14]:
q_ids = train_relev_qd_pairs['q_id'].unique()
doc_ids = train_relev_qd_pairs['doc_id'].unique()

for q_id in tqdm(q_ids, total=len(q_ids)):
    add_doc_ids = random.sample(list(doc_ids), 10)
    doc_ids_to_add = []
    for doc_id in add_doc_ids:
        if doc_id not in train_relev_qd_pairs.loc[train_relev_qd_pairs['q_id']==q_id, 'doc_id'].values:
            doc_ids_to_add.append(doc_id)
    new_qid_df = pd.DataFrame({'q_id': [q_id]*len(doc_ids_to_add),
                               'val': [0]*len(doc_ids_to_add),
                               'doc_id': doc_ids_to_add,
                               'relevance_label': [0]*len(doc_ids_to_add)})
    train_relev_qd_pairs = pd.concat([train_relev_qd_pairs, new_qid_df]).sort_values(by=['q_id']).reset_index(drop=True)
    

100%|██████████████████████████████████████| 1444/1444 [00:08<00:00, 173.30it/s]


In [15]:
train_relev_qd_pairs

Unnamed: 0,q_id,val,doc_id,relevance_label
0,79,0,2415640,1
1,79,0,2193064,0
2,79,0,731995,0
3,79,0,1886924,0
4,79,0,2393413,0
...,...,...,...,...
62124,2433785,0,2433061,1
62125,2433785,0,730722,1
62126,2433785,0,2279766,0
62127,2433785,0,68013,0


#### Features dataframe (feature extraction applied on extended dataframe)

In [16]:
pairs = []

for i, row in tqdm(train_relev_qd_pairs.iterrows(), total=train_relev_qd_pairs.shape[0], bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
    query_text = train_queries.loc[train_queries['id_left']==row['q_id'], 'text_left'].item()
    curr_features = {
        'relevance': row['relevance_label'],
        'queryid': row['q_id'],
    }
    curr_features.update(extract_features(index_name='wiki', query_text=query_text, doc_id=row['doc_id']))
    pairs.append(curr_features)

train_df = pd.DataFrame(pairs)
train_df.to_csv('train_df.csv')


100%|██████████████████████████████| 62129/62129 [05:32<00:00, 186.66it/s]      


In [17]:
train_df.loc[9030:9050,:]

Unnamed: 0,relevance,queryid,bm25 score,query length,document length,# of matched q/d terms,min idf,max idf,min tf,max tf,phrase frequency
9030,1,4332,0.0,3,186,0,0.0,0.0,0.0,0.0,0.0
9031,0,4332,0.0,3,200,0,0.0,0.0,0.0,0.0,0.0
9032,0,4332,0.936617,3,200,1,0.941075,0.941075,0.452392,0.452392,0.0
9033,1,4332,0.0,3,200,0,0.0,0.0,0.0,0.0,0.0
9034,0,4332,0.0,3,200,0,0.0,0.0,0.0,0.0,0.0
9035,0,4332,1.475154,3,200,1,0.941075,0.941075,0.712509,0.712509,0.0
9036,0,4332,0.0,3,200,0,0.0,0.0,0.0,0.0,0.0
9037,1,4332,0.0,3,200,0,0.0,0.0,0.0,0.0,0.0
9038,1,4332,0.936617,3,200,1,0.941075,0.941075,0.452392,0.452392,0.0
9039,1,4332,0.0,3,200,0,0.0,0.0,0.0,0.0,0.0


### Test data

In [18]:
test_relev_qd_pairs = pd.read_csv('wikIR1k/test/qrels', sep='\t', header=None)
test_relev_qd_pairs.columns = ['q_id', 'val', 'doc_id', 'relevance_label']
test_relev_qd_pairs

Unnamed: 0,q_id,val,doc_id,relevance_label
0,158491,0,158491,2
1,158491,0,2130828,1
2,158491,0,730939,1
3,158491,0,1666627,1
4,158491,0,2102124,1
...,...,...,...,...
4430,712704,0,591264,1
4431,712704,0,908363,1
4432,712704,0,2004825,1
4433,712704,0,307988,1


In [19]:
q_ids = test_relev_qd_pairs['q_id'].unique()
doc_ids = test_relev_qd_pairs['doc_id'].unique()

for q_id in tqdm(q_ids, total=len(q_ids)):
    add_doc_ids = random.sample(list(doc_ids), 10)
    doc_ids_to_add = []
    for doc_id in add_doc_ids:
        if doc_id not in test_relev_qd_pairs.loc[test_relev_qd_pairs['q_id']==q_id, 'doc_id'].values:
            doc_ids_to_add.append(doc_id)
    new_qid_df = pd.DataFrame({'q_id': [q_id]*len(doc_ids_to_add),
                               'val': [0]*len(doc_ids_to_add),
                               'doc_id': doc_ids_to_add,
                               'relevance_label': [0]*len(doc_ids_to_add)})
    test_relev_qd_pairs = pd.concat([test_relev_qd_pairs, new_qid_df]).sort_values(by=['q_id']).reset_index(drop=True)
 

100%|████████████████████████████████████████| 100/100 [00:00<00:00, 480.62it/s]


In [20]:
pairs = []

for i, row in tqdm(test_relev_qd_pairs.iterrows(), total=test_relev_qd_pairs.shape[0], bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
    query_text = test_queries.loc[test_queries['id_left']==row['q_id'], 'text_left'].item()
    curr_features = {
        'relevance': row['relevance_label'],
        'queryid': row['q_id'],
    }
    curr_features.update(extract_features(index_name='wiki', query_text=query_text, doc_id=row['doc_id']))
    pairs.append(curr_features)

test_df = pd.DataFrame(pairs)
test_df.to_csv('test_df.csv')


100%|██████████████████████████████| 5427/5427 [00:28<00:00, 188.62it/s]        


In [21]:
test_df

Unnamed: 0,relevance,queryid,bm25 score,query length,document length,# of matched q/d terms,min idf,max idf,min tf,max tf,phrase frequency
0,1,720,0.000000,2,200,0,0.000000,0.000000,0.000000,0.000000,0.0
1,1,720,0.000000,2,200,0,0.000000,0.000000,0.000000,0.000000,0.0
2,2,720,10.585438,2,200,1,7.723694,7.723694,0.622961,0.622961,0.0
3,1,720,0.000000,2,200,0,0.000000,0.000000,0.000000,0.000000,0.0
4,1,720,0.000000,2,200,0,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...
5422,1,2136797,5.552353,2,200,1,5.578782,5.578782,0.452392,0.452392,0.0
5423,0,2136797,0.000000,2,200,0,0.000000,0.000000,0.000000,0.000000,0.0
5424,0,2136797,0.000000,2,200,0,0.000000,0.000000,0.000000,0.000000,0.0
5425,1,2136797,9.881150,2,200,1,5.578782,5.578782,0.805092,0.805092,0.0


## ML with CatBoost

In [22]:
X_train = train_df.iloc[:, 2:].values
y_train = train_df.iloc[:, 0].values
queries_train = train_df.iloc[:, 1].values

X_test = test_df.iloc[:, 2:].values
y_test = test_df.iloc[:, 0].values
queries_test = test_df.iloc[:, 1].values


### Dataset analysis

In [23]:
train_num_documents = X_train.shape[0]
print('Number of train documents:', train_num_documents)

test_num_documents = X_test.shape[0]
print('Number of test documents:', test_num_documents)

# print('Distribution of relevance scores:')
# Counter(y_train).items()

# Normalization of relevance scores
max_relevance = np.max(y_train)
y_train = np.float64(y_train) / max_relevance
y_test = np.float64(y_test) / max_relevance

train_num_queries = np.unique(queries_train).shape[0]
print('Number of train queries (number of train query groups):', train_num_queries)

test_num_queries = np.unique(queries_test).shape[0]
print('Number of test queries (number of test query groups):', test_num_queries)

num_features = X_train.shape[1]
print('Number of features:', num_features)


Number of train documents: 62129
Number of test documents: 5427
Number of train queries (number of train query groups): 1444
Number of test queries (number of test query groups): 100
Number of features: 9


### Creation of CatBoost pools

In [24]:
train = Pool(
    data=X_train,
    label=y_train,
    group_id=queries_train
)

test = Pool(
    data=X_test,
    label=y_test,
    group_id=queries_test
)


In [25]:
default_parameters = {
    'iterations': 2000,
    'custom_metric': ['NDCG'],
    'verbose': False,
    'random_seed': 0,
}

parameters = {}

def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function
    
    if additional_params is not None:
        parameters.update(additional_params)
        
    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=True)
    
    return model


### 1st Variant: PairLogit

In [26]:
pair_logit_model = fit_model('PairLogit')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [27]:
print('best iteration:', pair_logit_model.best_iteration_)
print(json.dumps(pair_logit_model.best_score_, indent=4))


best iteration: 434
{
    "learn": {
        "PairLogit": 0.2764341571462773
    },
    "validation": {
        "NDCG:type=Base": 0.9229695158201754,
        "PairLogit": 0.3584478230199964
    }
}


### 2nd Variant: YetiRank

In [28]:
yeti_rank_model = fit_model('YetiRank')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))