#### Run this notebook as a sanity check for your implementation

### Creating search pipelines for two different ranking strategies.

* Pipeline 1: Initial ranking by BM25 with re-ranking by LambdaMART (Cross-encoder feature enabled)

* Pipeline 2: Initial ranking by Bi-Encoder vector ranker with re-ranking by LambdaMART (Cross-encoder feature enabled)

The corpus for the main index is augmented with queries generated by doc2query.

In [None]:
import csv
from collections import Counter, defaultdict
from tqdm import tqdm
import json
import numpy as np

# your modules are imported here
from indexing import Indexer, IndexType, BasicInvertedIndex
from document_preprocessor import RegexTokenizer, Doc2QueryAugmenter
from ranker import Ranker, BM25, CrossEncoderScorer
from vector_ranker import VectorRanker
from network_features import NetworkFeatures
from l2r import L2RFeatureExtractor, L2RRanker

In [None]:
# Paths and index names used throughout this notebook.
# change these to point to actual file paths
STOPWORD_PATH = '../data/stopwords.txt'  # read line by line; each stripped line becomes a stopword
DATASET_PATH = '../data/wikipedia_200k_dataset.jsonl'  # one JSON document per line ('docid', 'categories', 'text' are read below)
EDGELIST_PATH = '../data/edgelist.csv'  # passed to NetworkFeatures.load_network
NETWORK_STATS_PATH = '../data/network_stats.csv'  # CSV the network statistics are saved to / re-loaded from
DOC2QUERY_PATH = '../data/doc2query.csv'  # CSV with a header row; columns 0 and 2 are read below
MAIN_INDEX = 'main_index_augmented'  # name handed to main_index.save / load
TITLE_INDEX = 'title_index'  # name handed to title_index.save / load
RELEVANCE_TRAIN_DATA = '../data/hw2_relevance.train.csv'  # passed to L2RRanker.train
ENCODED_DOCUMENT_EMBEDDINGS_NPY_DATA = '../data/wiki-200k-vecs.msmarco-MiniLM-L12-cos-v5.npy'  # loaded with np.load for the VectorRanker

In [None]:
# Read the stopword file into a set, one whitespace-stripped entry per line

stopwords = set()
with open(STOPWORD_PATH, 'r', encoding='utf-8') as stopword_file:
    stopwords.update(entry.strip() for entry in stopword_file)
f'Stopwords collected {len(stopwords)}'

In [None]:
# Map every page's docid to the categories listed for it in the dataset
# (computed here from the raw dataset; a pre-computed list would work too)
docid_to_categories = {}
with open(DATASET_PATH, 'rt', encoding='utf-8') as dataset_file:
    # parse each JSON line lazily while tqdm reports progress over the file
    for page in map(json.loads, tqdm(dataset_file, total=200_000)):
        docid_to_categories[page['docid']] = page['categories']
f'Document categories collected'

In [None]:
# Count how often each category occurs, then keep only the categories that
# occur at least the minimum number of times (specified in the homework)
category_counts = Counter(
    category
    for page_categories in tqdm(docid_to_categories.values(), total=len(docid_to_categories))
    for category in page_categories)
recognized_categories = {
    name for name, occurrences in category_counts.items() if occurrences >= 1000}
print("saw %d categories" % len(recognized_categories))

# Map each document to the smaller set of its categories that occur frequently
doc_category_info = {
    docid: [category for category in page_categories if category in recognized_categories]
    for docid, page_categories in tqdm(docid_to_categories.items(), total=len(docid_to_categories))}

In [None]:
# Get or load the network statistics for the Wikipedia link network.
# The result maps docid -> {'pagerank': ..., 'hub_score': ..., 'authority_score': ...}.
#
# Leave this True to compute the statistics from the edge list and save them;
# set it to False to re-load the CSV previously written to NETWORK_STATS_PATH.
RECOMPUTE_NETWORK_STATS = True

network_features = {}
if RECOMPUTE_NETWORK_STATS:
    nf = NetworkFeatures()
    print('loading network')
    graph = nf.load_network(EDGELIST_PATH, total_edges=92650947)
    print('getting stats')
    net_feats_df = nf.get_all_network_statistics(graph)
    # drop the reference so the graph can be garbage-collected
    graph = None
    print('Saving')
    net_feats_df.to_csv(NETWORK_STATS_PATH, index=False)

    print("converting to dict format")
    network_features = defaultdict(dict)
    for i, row in tqdm(net_feats_df.iterrows(), total=len(net_feats_df)):
        for col in ['pagerank', 'hub_score', 'authority_score']:
            network_features[row['docid']][col] = row[col]
    # drop the reference so the frame can be garbage-collected
    net_feats_df = None
else:
    # newline='' is what the csv module expects for files it reads
    with open(NETWORK_STATS_PATH, 'r', encoding='utf-8', newline='') as file:
        # Read by column name (the same names the branch above writes) so the
        # result does not depend on the column order of the saved CSV
        for row in csv.DictReader(file):
            network_features[int(row['docid'])] = {
                'pagerank': float(row['pagerank']),
                'authority_score': float(row['authority_score']),
                'hub_score': float(row['hub_score'])
            }
f'Network stats collection {len(network_features)}'

In [None]:
# Try the doc2query augmenter on a short sample passage.
# NOTE(review): presumably the second argument of get_queries is the number of
# queries to generate — confirm against Doc2QueryAugmenter.get_queries
d2q = Doc2QueryAugmenter()
text = 'The Evil Within is a survival horror video game developed by Tango Gameworks and published by Bethesda Softworks. The game was directed by Resident Evil series creator Shinji Mikami and was released worldwide in October 2014 for PlayStation 3, PlayStation 4, Windows, Xbox 360, and Xbox One.'
d2q.get_queries(text, 5)

In [None]:
# Load the doc2query queries from DOC2QUERY_PATH, grouped by document id.
# Column 0 of each row is read as the docid and column 2 is appended to that
# document's list.
doc_augment_dict = defaultdict(list)
# newline='' lets the csv module handle line endings inside quoted fields itself
with open(DOC2QUERY_PATH, 'r', encoding='utf-8', newline='') as file:
    dataset = csv.reader(file)
    next(dataset, None)  # skip the header row (no-op on an empty file)
    for row in tqdm(dataset, total=600_000):
        doc_augment_dict[int(row[0])].append(row[2])

In [None]:
# Load or build Inverted Indices for the documents' main text and titles
#
# Estimated times:
#    Document text token counting: 4 minutes
#    Document text indexing: 5 minutes
#    Title text indexing: 30 seconds

# Raw string: '\w' in a plain string literal is an invalid escape sequence
preprocessor = RegexTokenizer(r'\w+')

# Creating and saving the index

# Main-text index, built with the doc2query queries loaded into doc_augment_dict
main_index = Indexer.create_index(
        IndexType.InvertedIndex, DATASET_PATH, preprocessor,
        stopwords, 50, doc_augment_dict=doc_augment_dict)
main_index.save(MAIN_INDEX)

# Title index, built from each document's 'title' field
title_index = Indexer.create_index(
        IndexType.InvertedIndex, DATASET_PATH, preprocessor,
        stopwords, 2, text_key='title')
title_index.save(TITLE_INDEX)

# Loading a preloaded index
# main_index = BasicInvertedIndex()
# main_index.load(MAIN_INDEX)

# title_index = BasicInvertedIndex()
# title_index.load(TITLE_INDEX)

In [None]:
# Build the docid -> raw text lookup by scanning the wiki dataset.
# Only the first 500 characters of each document's raw text are stored.

raw_text_dict = {}
with open(DATASET_PATH, 'r', encoding='utf-8') as dataset_file:
    for record in map(json.loads, dataset_file):
        raw_text_dict[int(record['docid'])] = record['text'][:500]

In [None]:
# Create the feature extractor. This will be used by both pipelines
# The cross-encoder scorer is built from raw_text_dict (docid -> first 500 characters of text)
cescorer = CrossEncoderScorer(raw_text_dict)
# A single extractor instance is shared: the same `fe` is passed to both L2RRanker pipelines
fe = L2RFeatureExtractor(main_index, title_index, doc_category_info,
                         preprocessor, stopwords, recognized_categories,
                         network_features, cescorer)

In [None]:
# Create an initial ranker for determining what to re-rank
# Use these with a L2RRanker and then train that L2RRanker model
#
# Estimated time (using 4 cores via n_jobs): 7 minutes

# An initial ranking with BM25 with reranking done by LambdaMART optimizing NDCG
bm25 = BM25(main_index)
ranker = Ranker(main_index, preprocessor, stopwords, bm25)

# Pipeline 1 wraps the BM25 ranker and the shared feature extractor
pipeline_1 = L2RRanker(main_index, title_index, preprocessor,
                       stopwords, ranker, fe)

pipeline_1.train(RELEVANCE_TRAIN_DATA)

# An initial ranking with VectorRanker with reranking done by LambdaMART optimizing NDCG
# Load the document embeddings from the .npy file
with open(ENCODED_DOCUMENT_EMBEDDINGS_NPY_DATA, 'rb') as file:
    encoded_docs = np.load(file)

# NOTE(review): the docid list comes from main_index.document_metadata in its
# iteration order; presumably it has to line up with the row order of
# encoded_docs — confirm against VectorRanker
vector_ranker = VectorRanker('sentence-transformers/msmarco-MiniLM-L12-cos-v5',
                             encoded_docs, list(main_index.document_metadata.keys()))

# Pipeline 2 is built like pipeline 1 but uses the vector ranker as its initial ranker
pipeline_2 = L2RRanker(main_index, title_index, preprocessor,
                       stopwords, vector_ranker, fe)

pipeline_2.train(RELEVANCE_TRAIN_DATA)

In [None]:
import requests

def get_wiki_title(page_id:int, timeout:float=10.0):
    """Look up the title of an English Wikipedia page by its page id.

    Sends one GET request to the MediaWiki ``action=query`` API endpoint.

    Args:
        page_id: The Wikipedia page id to look up.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The 'title' field reported for the page in the API response.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        KeyError: If the response holds no title entry for ``page_id``.
    """
    response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        # let requests build and encode the query string
        params={
            'action': 'query',
            'prop': 'info',
            'inprop': 'subjectid',
            'pageids': page_id,
            'format': 'json',
        },
        # without a timeout a stalled connection would hang the cell indefinitely
        timeout=timeout)
    # surface HTTP errors here rather than as a confusing JSON/KeyError below
    response.raise_for_status()
    json_response = response.json()
    # the API keys the 'pages' mapping by the page id as a string
    return json_response['query']['pages'][str(page_id)]['title']

After this point, students may have varying answers and observations depending on their implementation and their own features. So, your mileage may vary (YMMV)

### Example Query: 'How did the Mongols conquer China?'

This query should lead to pages about the different Mongolian invasions of China (there were multiple).

In [None]:
[(get_wiki_title(doc),score) for doc, score in pipeline_1.query('How did the Mongols conquer China?')[:10]]

In [None]:
[(get_wiki_title(doc),score) for doc, score in pipeline_2.query('How did the Mongols conquer China?')[:10]]

**The first result is pretty similar**

The difference shows up after roughly the second rank: the vector ranker tends to return better pages overall near the top of the fetched document list. (In our experience, the vector ranker pipeline focused more on conquests by the Mongols than on details about the Mongols themselves.)

### Example of a query where both the pipelines perform badly

**top 10 video games** 

In [None]:
[(get_wiki_title(doc),score) for doc, score in pipeline_1.query('top 10 video games')[:10]]

In [None]:
[(get_wiki_title(doc),score) for doc, score in pipeline_2.query('top 10 video games')[:10]]

Looking at the results, it doesn't seem that the search engine does very well on this query. Why might that be? Could you think of ways to handle these types of queries? What about other queries where the search engine might just not be very good?