In [None]:
!pip install gensim
!pip install python-terrier
!python -m spacy download en_core_web_sm

In [1]:
import json
import numpy as np
from tools import read_data, text_constructor, data_cleaning
#we need to import the following libraries.
import pandas as pd
#to display the full text on the notebook without truncation
pd.set_option('display.max_colwidth', 150)
import gensim.downloader
w2vec = gensim.downloader.load('word2vec-google-news-300')
import tarfile
import urllib

In [2]:
import pyterrier as pt
if not pt.started():
    pt.init()

PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
[INFO] trec-robust04 is deprecated. Consider using disks45/nocr/trec-robust-2004 instead, which provides better parsing of the corpus.
[INFO] trec-robust04/fold1 is deprecated. Consider using disks45/nocr/trec-robust-2004/fold1 instead, which provides better parsing of the corpus.
[INFO] trec-robust04/fold2 is deprecated. Consider using disks45/nocr/trec-robust-2004/fold2 instead, which provides better parsing of the corpus.
[INFO] trec-robust04/fold3 is deprecated. Consider using disks45/nocr/trec-robust-2004/fold3 instead, which provides better parsing of the corpus.
[INFO] trec-robust04/fold4 is deprecated. Consider using disks45/nocr/trec-robust-2004/fold4 instead, which provides better parsing of the corpus.
[INFO] trec-robust04/fold5 is deprecated. Consider using disks45/nocr/trec-robust-2004/fold5 instead, 

### Download and read the dataset


In [None]:
link = "http://ir.dcs.gla.ac.uk/resources/test_collections/cisi/cisi.tar.gz"
downloaded_filename = 'cisi.tar.gz'
urllib.request.urlretrieve(link, downloaded_filename)
tar = tarfile.open(downloaded_filename, "r:gz")
tar.extractall("./data/")
tar.close()

### Text Preprocessing


    Documents:
        Lowercase the text
        Expand Contractions
        Clean the text
        Remove Stopwords
        Lemmatize words
    Queries:
        Lowercase the text
        Expand Contractions
        Clean the text


In [3]:

#reading and cleaning the documents
all_lines = read_data(path = './data/CISI.ALL')
doc_dict = text_constructor(all_lines)
cleaned_doc_df = data_cleaning(doc_dict)
cleaned_doc_df['lemmatized'][0]

#reading and cleaning the queries
query_lines = read_data(path = './data/CISI.QRY')
query_dict = text_constructor(query_lines, query=True)
cleaned_query_df = data_cleaning(query_dict)
cleaned_query_df['lemmatized'][0]
cleaned_query_df.rename(columns={"lemmatized":'query', "docno":'qid'}, inplace=True)


#reading and golden truth
### Processing QRELS
rel_set = {}
with open('data/CISI.REL') as f:
    for l in f.readlines():
        qry_id = int(l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[0]) -1
        doc_id = int(l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[-1])-1
        if qry_id in rel_set:
            rel_set[qry_id].append(doc_id)
        else:
            rel_set[qry_id] = []
            rel_set[qry_id].append(doc_id)
### TERC format
qrel = []
all_docs = cleaned_doc_df['docno'].apply(int).to_list()
for qid, docno in rel_set.items():
    irrelevant_doc = set(all_docs) - set(docno)
    for doc in all_docs:
        if doc in  docno:
            qrel.append({'qid': qid, 'docno': doc, 'label': 1})
        else:
            qrel.append({'qid': qid, 'docno': doc, 'label': 0})
qrel = pd.DataFrame(qrel)
qrel['qid'] = qrel['qid'].astype(str)
qrel['docno'] = qrel['docno'].astype(str)


Number of documents = 1460.

Number of documents = 112.



In [6]:
#
#BIO INDEXING (Positional)
index_dir = './INDEX'
indexer = pt.DFIndexer(index_dir, overwrite=True, blocks=True)
indexer.setProperty("stemmer", "")
index_ref = indexer.index(cleaned_doc_df['lemmatized'], cleaned_doc_df["docno"])


In [7]:
#Get Statistics
index = pt.IndexFactory.of(index_ref) #load the index
print(index.getCollectionStatistics().toString())

Number of documents: 1460
Number of terms: 6871
Number of postings: 70366
Number of fields: 0
Number of tokens: 95753
Field names: []
Positions:   true



In [8]:
BM25 = pt.BatchRetrieve(index, wmodel="BM25")
DPH  = pt.BatchRetrieve(index, wmodel="DPH")
PL2  = pt.BatchRetrieve(index, wmodel="PL2")
DLM  = pt.BatchRetrieve(index, wmodel="DirichletLM")
#Sequential
sdm_pipe = BM25 >> PL2

#rand_docs = pt.BatchRetrieve(index, wmodel=random_weighting) 
pt.Experiment(
    [BM25, DPH, PL2, DLM, sdm_pipe],
    cleaned_query_df,
    qrel,
    eval_metrics=["map", "P_10", "P_20", "ndcg_cut_10"],
    names=["BM25", "DPH", "PL2", "Dirichlet QL", "sdm"]
)


Unnamed: 0,name,map,P_10,P_20,ndcg_cut_10
0,BM25,0.219214,0.356579,0.274342,0.392456
1,DPH,0.201515,0.331579,0.271711,0.371899
2,PL2,0.221728,0.340789,0.269079,0.387944
3,Dirichlet QL,0.151678,0.231579,0.203289,0.258369
4,sdm,0.221645,0.340789,0.269079,0.387944


In [9]:
#Create BM25 retrieval model
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
tf_idf = pt.BatchRetrieve(index, wmodel="TF_IDF")
pl2 = pt.BatchRetrieve(index, wmodel="PL2")
#Search for 'analyst', merge user's original bios
bm25.search("mathematic").merge(cleaned_doc_df[['docno', 'lemmatized']], on="docno")[:5]

Unnamed: 0,qid,docid,docno,rank,score,query,lemmatized
0,1,461,461,0,8.03188,mathematic,information retrieval van rijsbergen c j material book aim advanced undergraduate information computer science student postgraduate library scienc...
1,1,1384,1384,1,7.653332,mathematic,structural model introduction theory direct graphs harary f purpose book present introduction body mathematic concern abstract notion structure pr...
2,1,27,27,2,7.472434,mathematic,note pseudo mathematic relevance taube m recently number article book report deal information system i e document retrieval system advance doctrin...
3,1,1241,1241,3,6.502652,mathematic,quantitative method hoadley i b institute design encourage use quantitative measurement teach technique necessary use hope emphasis statistical me...
4,1,1043,1043,4,6.501272,mathematic,mathematical taxonomy jardine n book mathematical account method datum simplification involve suggest practice biological taxonomy computable meth...


In [11]:
SEED=0
from sklearn.model_selection import train_test_split
tr_va_queries, test_queries = train_test_split(cleaned_query_df, test_size=0.2, random_state=SEED)
train_queries, valid_queries =  train_test_split(tr_va_queries, test_size=0.3, random_state=SEED)

valid_qrels = qrel[qrel['qid'].isin(list(valid_queries['qid']))]
test_qrels = qrel[qrel['qid'].isin(list(test_queries['qid']))]
train_qrels = qrel[qrel['qid'].isin(list(train_queries['qid']))]

In [12]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=400, n_jobs=2)
bm25 = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"})
featureset_01 = (bm25) >> (
    pt.transformer.IdentityTransformer()

    **
    pt.BatchRetrieve(index, wmodel="PL2")

    )
pipeline = pt.FeaturesBatchRetrieve(index, wmodel="BM25", features=["WMODEL:Tf", "WMODEL:PL2"])
rf_pipe = pipeline >> pt.ltr.apply_learned_model(rf)
rf_pipe.fit(train_queries, qrel)


In [13]:
import lightgbm as lgb
pipeline = pt.FeaturesBatchRetrieve(index, wmodel="BM25", features=["WMODEL:Tf", "WMODEL:PL2"])
# this configures LightGBM as LambdaMART
lmart_l = lgb.LGBMRanker(task="train",
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=50,
    max_bin=255,
    num_leaves=7,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[1, 3, 5, 10],
    learning_rate= .1,
    importance_type="gain",
    random_state=52)
lmart_l_pipe = pipeline >> pt.ltr.apply_learned_model(lmart_l, form="ltr")
lmart_l_pipe.fit(train_queries, train_qrels, valid_queries, valid_qrels)




[1]	valid_0's ndcg@1: 0.222222	valid_0's ndcg@3: 0.25512	valid_0's ndcg@5: 0.271397	valid_0's ndcg@10: 0.290063
[2]	valid_0's ndcg@1: 0.333333	valid_0's ndcg@3: 0.363956	valid_0's ndcg@5: 0.371144	valid_0's ndcg@10: 0.372548
[3]	valid_0's ndcg@1: 0.407407	valid_0's ndcg@3: 0.418373	valid_0's ndcg@5: 0.410473	valid_0's ndcg@10: 0.393804
[4]	valid_0's ndcg@1: 0.407407	valid_0's ndcg@3: 0.418373	valid_0's ndcg@5: 0.410473	valid_0's ndcg@10: 0.390178
[5]	valid_0's ndcg@1: 0.407407	valid_0's ndcg@3: 0.418373	valid_0's ndcg@5: 0.410473	valid_0's ndcg@10: 0.390539
[6]	valid_0's ndcg@1: 0.407407	valid_0's ndcg@3: 0.418373	valid_0's ndcg@5: 0.410473	valid_0's ndcg@10: 0.390539
[7]	valid_0's ndcg@1: 0.407407	valid_0's ndcg@3: 0.407407	valid_0's ndcg@5: 0.397138	valid_0's ndcg@10: 0.384817
[8]	valid_0's ndcg@1: 0.407407	valid_0's ndcg@3: 0.418373	valid_0's ndcg@5: 0.405063	valid_0's ndcg@10: 0.39039
[9]	valid_0's ndcg@1: 0.407407	valid_0's ndcg@3: 0.418373	valid_0's ndcg@5: 0.405063	valid_0's ndc

In [14]:
# Evaluate the solutions
res = pt.Experiment([BM25, rf_pipe, lmart_l_pipe], 
              test_queries, 
              test_qrels, 
              eval_metrics=["map", "ndcg", "ndcg_cut_10", "mrt", "P.10"], 
              names=["BM25", "Random Forest", "Lambdarank",], perquery=False)
res.sort_values('P.10', ascending=False)[['name', 'P.10', 'ndcg_cut_10', 'ndcg', 'map', 'mrt']]

Unnamed: 0,name,P.10,ndcg_cut_10,ndcg,map,mrt
0,BM25,0.288889,0.336563,0.506329,0.167434,15.347562
2,Lambdarank,0.183333,0.203143,0.438112,0.111442,5.417316
1,Random Forest,0.083333,0.076873,0.370742,0.060003,24.776144


In [None]:
# Perform grid search on BM25 to get c, k1, and k3
BM25 = pt.BatchRetrieve(index, wmodel="BM25", controls={"c" : 0.75, "bm25.k_1": 0.75, "bm25.k_3": 0.75})
gridsearch_bm25 = pt.GridSearch(
    BM25,
    {BM25: {"c" : [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1 ],
            "bm25.k_1": [0.3, 0.6, 0.9, 1.2, 1.4, 1.6, 2],
            "bm25.k_3": [0.5, 2, 4, 6, 8, 10, 12, 14, 20]
    }},
    cleaned_query_df,
    qrel,
    "P.10")

In [17]:
BM25_opt = pt.BatchRetrieve(index, wmodel="BM25", controls={"c" : 0.7, "bm25.k_1": 2, "bm25.k_3": 14})

#rand_docs = pt.BatchRetrieve(index, wmodel=random_weighting) 
pt.Experiment(
    [BM25_opt],
    cleaned_query_df,
    qrel,
    eval_metrics=["map", "P_10", "P_20", "ndcg_cut_10"],
    names=["BM25"]
)

Unnamed: 0,name,map,P_10,P_20,ndcg_cut_10
0,BM25,0.223387,0.355263,0.276316,0.395832


In [66]:

from random import randint
import random

labels = ['image', 'science', 'business', 'micrographic', 'retrieval',  'clustering', 'classification', 'automatic']

def generate_user_dataset(n_users, max_preferences, labels):
    users = {} # id, list of preferred categories
    if max_preferences > len(labels):
        max_preferences = len(labels)
    random.seed(62)
    for i in range(0, n_users):
        prefs = random.sample(range(0, 7), randint(1, max_preferences))
        prefs_mapped = [labels[k] for k in prefs]
        
        users[f'user_{i+1}'] = prefs_mapped

    return users
    

In [67]:
user_profils = {}
user_profils = generate_user_dataset(10, 5, labels)

In [68]:
user_profils

{'user_1': ['science', 'image', 'classification', 'micrographic', 'retrieval'],
 'user_2': ['science', 'retrieval', 'classification'],
 'user_3': ['image', 'business'],
 'user_4': ['classification', 'image'],
 'user_5': ['micrographic'],
 'user_6': ['retrieval', 'image', 'business', 'science'],
 'user_7': ['clustering', 'image'],
 'user_8': ['retrieval', 'micrographic', 'classification'],
 'user_9': ['retrieval'],
 'user_10': ['business', 'classification', 'clustering', 'micrographic']}

In [69]:


def profile_enrichment(user_name):
    offline_profile_enrichment = []
    for interest in user_profils[user_name]:
        offline_profile_enrichment.extend([w[0]for w in w2vec.most_similar(interest)[:2]])
    return offline_profile_enrichment

In [70]:
offline_profile_enrichment = {}
for user,_ in user_profils.items():
    user_profils[user].extend(profile_enrichment(user))

In [71]:
user_profils

{'user_1': ['science',
  'image',
  'classification',
  'micrographic',
  'retrieval',
  'faith_Jezierski',
  'sciences',
  'images',
  'visage',
  'classifications',
  'Classification',
  'networked_multifunction',
  'endoscopic_surgical_navigation',
  'retrieving',
  'storage_retrieval'],
 'user_2': ['science',
  'retrieval',
  'classification',
  'faith_Jezierski',
  'sciences',
  'retrieving',
  'storage_retrieval',
  'classifications',
  'Classification'],
 'user_3': ['image', 'business', 'images', 'visage', 'businesses', 'busines'],
 'user_4': ['classification',
  'image',
  'classifications',
  'Classification',
  'images',
  'visage'],
 'user_5': ['micrographic',
  'networked_multifunction',
  'endoscopic_surgical_navigation'],
 'user_6': ['retrieval',
  'image',
  'business',
  'science',
  'retrieving',
  'storage_retrieval',
  'images',
  'visage',
  'businesses',
  'busines',
  'faith_Jezierski',
  'sciences'],
 'user_7': ['clustering',
  'image',
  'Clustering',
  'cluster

### Example

User number 3 want to make a search 

In [73]:
BM25(cleaned_query_df.sample(1, random_state=5)[['qid', 'query']])

Unnamed: 0,qid,docid,docno,rank,score,query
0,37,211,211,0,13.484085,access word information retrieval system keep date word meaning usage change list dynamic current definition problem progress solution provide nec...
1,37,1214,1214,1,13.346277,access word information retrieval system keep date word meaning usage change list dynamic current definition problem progress solution provide nec...
2,37,314,314,2,11.196459,access word information retrieval system keep date word meaning usage change list dynamic current definition problem progress solution provide nec...
3,37,50,50,3,11.122845,access word information retrieval system keep date word meaning usage change list dynamic current definition problem progress solution provide nec...
4,37,566,566,4,10.940563,access word information retrieval system keep date word meaning usage change list dynamic current definition problem progress solution provide nec...
...,...,...,...,...,...,...
995,37,487,487,995,1.142931,access word information retrieval system keep date word meaning usage change list dynamic current definition problem progress solution provide nec...
996,37,1437,1437,996,1.142111,access word information retrieval system keep date word meaning usage change list dynamic current definition problem progress solution provide nec...
997,37,1,1,997,1.141191,access word information retrieval system keep date word meaning usage change list dynamic current definition problem progress solution provide nec...
998,37,1253,1253,998,1.139873,access word information retrieval system keep date word meaning usage change list dynamic current definition problem progress solution provide nec...


In [72]:
enrichment = " ".join(user_profils['user_3'])
expanded_query = pd.DataFrame()
expanded_query['query'] = pd.Series(" ".join([enrichment, str(cleaned_query_df.sample(1, random_state=5)['query'])]))
expanded_query['qid'] = pd.Series('1')
expanded_query['query']= expanded_query['query'].apply(lambda x: re.sub(r"[\.\,\#_\|\:\?\?\/\=]", ' ',x))
BM25(expanded_query)


Unnamed: 0,qid,docid,docno,rank,score,query
0,1,79,79,0,14.033388,image business images visage businesses busines 37 access word information retrieval system keep date word meaning usage change list dynamic cu...
1,1,566,566,1,13.627169,image business images visage businesses busines 37 access word information retrieval system keep date word meaning usage change list dynamic cu...
2,1,522,522,2,12.474405,image business images visage businesses busines 37 access word information retrieval system keep date word meaning usage change list dynamic cu...
3,1,314,314,3,12.243810,image business images visage businesses busines 37 access word information retrieval system keep date word meaning usage change list dynamic cu...
4,1,578,578,4,12.220191,image business images visage businesses busines 37 access word information retrieval system keep date word meaning usage change list dynamic cu...
...,...,...,...,...,...,...
995,1,155,155,995,0.924238,image business images visage businesses busines 37 access word information retrieval system keep date word meaning usage change list dynamic cu...
996,1,104,104,996,0.914303,image business images visage businesses busines 37 access word information retrieval system keep date word meaning usage change list dynamic cu...
997,1,1024,1024,997,0.895073,image business images visage businesses busines 37 access word information retrieval system keep date word meaning usage change list dynamic cu...
998,1,1037,1037,998,0.883325,image business images visage businesses busines 37 access word information retrieval system keep date word meaning usage change list dynamic cu...


## Comparison

In [61]:
# The query used in the evaluation
cleaned_query_df.sample(1, random_state=5)[['qid', 'query']]

Unnamed: 0,qid,query
37,37,access word information retrieval system keep date word meaning usage change list dynamic current definition problem progress solution provide nec...


In [74]:
#  document at rank 0 before query expansion
cleaned_doc_df[cleaned_doc_df['docno'] == "211"]

Unnamed: 0,raw,cleaned,lemmatized,docno
211,"Thesaural Problems in an On-Line System Cain, Alexander M. This paper describes the construction of a synonym thesaurus or entry vocabulary for th...","thesaural problems in an on-line system cain, alexander m. this paper describes the construction of a synonym thesaurus or entry vocabulary for th...",thesaural problem line system cain alexander m paper describe construction synonym thesaurus entry vocabulary suny biomedical communication networ...,211


In [75]:
#  document at rank 0 after query expansion
cleaned_doc_df[cleaned_doc_df['docno'] == "79"]

Unnamed: 0,raw,cleaned,lemmatized,docno
79,"A Graphic Catalog Card Index Lewis, Elizabeth M. To improve accessibility and maintenance of art slides in the United States Military Academy Libr...","a graphic catalog card index lewis, elizabeth m. to improve accessibility and maintenance of art slides in the united states military academy libr...",graphic catalog card index lewis elizabeth m improve accessibility maintenance art slide united states military academy library west point model c...,79
