# Document retrieval

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import xml.etree.ElementTree as ET

import utils.manage_files

## Download datasets

### Download and explore the corpus
First, we download the corpus that we need for the task.

In [6]:
# Remote archive holding the passage corpus, plus the local names used for the
# compressed download and for the extracted JSON-lines file.
url_corpus = "https://zenodo.org/record/6802592/files/touche-task2-passages-version-002.jsonl.gz?download=1"
zip_path_corpus = "corpus.jsonl.gz"
file_path_corpus = "corpus.jsonl"

# DownloadFile is a project helper (utils.manage_files). Calling the instance
# triggers the download; judging by the recorded output it also unzips the
# archive — see that module for the exact behaviour.
download_corpus = utils.manage_files.DownloadFile(file_path_corpus, zip_path_corpus, url_corpus)
download_corpus()

Downloading file: 100%|██████████| 286M/286M [03:28<00:00, 1.37MB/s]


'/content/downloads/zips/corpus.jsonl.gz' unzipped in '/content/downloads/corpus.jsonl'


In [7]:
# The corpus is stored as JSON lines (one record per line), hence lines=True.
corpus_path = download_corpus.file_name
corpus_df = pd.read_json(corpus_path, lines=True)
corpus_df.head()

Unnamed: 0,id,contents,chatNoirUrl
0,clueweb12-0000tw-14-21168___1,"Shuga: Love, Sex, Money MTV Shuga Home Swag Bl...",https://chatnoir.eu/cache?uuid=f338e91e-a3e9-5...
1,clueweb12-0000tw-14-21168___2,We LOVE sending #TeamShuga the exclusives. Ban...,https://chatnoir.eu/cache?uuid=f338e91e-a3e9-5...
2,clueweb12-0000tw-14-21168___3,Now take note.. because you will be seeing a w...,https://chatnoir.eu/cache?uuid=f338e91e-a3e9-5...
3,clueweb12-0000tw-22-19226___1,Sex and love: The modern matchmakers | The Eco...,https://chatnoir.eu/cache?uuid=2bf4b08d-2f65-5...
4,clueweb12-0000tw-22-19226___2,But have they? Feb 11th 2012 | from the print ...,https://chatnoir.eu/cache?uuid=2bf4b08d-2f65-5...


### Download other datasets

At this point we can download:
- the list of topics to retrieve all the titles;
- the relevance qrels file;
- the quality qrels file.

The last two files contain relevance and quality scores, respectively, for a list of documents with respect to a given topic.

The structure of a .qrels file is:   
TOPIC 0 DOC_ID SCORE

We can use these data to train our document retrieval model on labeled data from past years.

#### Topics

In [10]:
# Download and parse the xml files of the topics
url_topics_50 = "https://zenodo.org/record/6873559/files/topics-task-2.zip?download=1"
zip_path_topics_50 = "topics-task-2-50.zip"
file_path_topics_50 = "topics-task-2-50"

url_topics_100 = "https://zenodo.org/record/6873565/files/topics-task-2-2021.zip?download=1"
zip_path_topics_100 = "topics-task-2-100.zip"
file_path_topics_100 = "topics-task-2-100"

download_topics_50 = utils.manage_files.DownloadFile(file_path_topics_50, zip_path_topics_50, url_topics_50)
download_topics_50()

download_topics_100 = utils.manage_files.DownloadFile(file_path_topics_100, zip_path_topics_100, url_topics_100)
download_topics_100()


def read_topic_field(xml_path, field="title"):
    """Collect one field from every topic of a topics XML file.

    Args:
        xml_path: Path of the XML file to parse.
        field: Tag of the child element to extract from each topic
            (e.g. "title").

    Returns:
        A list with the stripped text of every `field` element that is a
        direct child of a child of the XML root, in document order. An
        element without text contributes an empty string, so the position
        of the remaining entries is not shifted.
    """
    root = ET.parse(xml_path).getroot()
    return [
        (child.text or "").strip()  # `text` is None for an empty element
        for topic in root
        for child in topic
        if child.tag == field
    ]


# Titles of the first topics file followed by the titles of the second one
topics = (
    read_topic_field(f"{download_topics_50.file_name}/topics-task-2.xml")
    + read_topic_field(f"{download_topics_100.file_name}/topics-task2-51-100.xml")
)

'/content/downloads/topics-task-2-50' already present
'/content/downloads/topics-task-2-100' already present


In [11]:
# Show how many topic titles were collected from the topic files, and list them
topics_summary = f"There are {len(topics)} topics.\n{topics}"
print(topics_summary)

There are 100 topics.
['What is the difference between sex and love?', 'Which is better, a laptop or a desktop?', 'Which is better, Canon or Nikon?', 'What are the best dish detergents?', 'What are the best cities to live in?', 'What is the longest river in the U.S.?', 'Which is healthiest: coffee, green tea or black tea and why?', 'What are the advantages and disadvantages of PHP over Python and vice versa?', 'Why is Linux better than Windows?', 'How to sleep better?', 'Should I buy an LCD TV or a plasma TV?', 'Train or plane? Which is the better choice?', 'What is the highest mountain on Earth?', 'Should one prefer Chinese medicine or Western medicine?', 'What are the best washing machine brands?', 'Should I buy or rent?', 'Do you prefer cats or dogs, and why?', 'What is the better way to grill outdoors: gas or charcoal?', 'Which is better, MAC or PC?', 'What is better: to use a brush or a sponge?', 'Which is better, Linux or Microsoft?', 'Which is better, Pepsi or Coke?', 'What is b

#### Documents relevance for each topic

In [14]:
# Download relevance qrels first 50 topics
# (a plain .qrels file: no zip path is passed, only the target file and the url)
url_relevance_50 = "https://zenodo.org/record/6873567/files/touche-task2-2022-relevance.qrels?download=1"
file_path_rel_50 = "relevance-50.qrels"

download_relevance_50 = utils.manage_files.DownloadFile(file_path_rel_50, url=url_relevance_50)
download_relevance_50()

# Download relevance qrels last 50 topics (file named after topics 51-100)
url_relevance_100 = "https://zenodo.org/record/6873565/files/touche-task2-51-100-relevance.qrels?download=1"
file_path_rel_100 = "relevance-100.qrels"

download_relevance_100 = utils.manage_files.DownloadFile(file_path_rel_100, url=url_relevance_100)
download_relevance_100()

'/content/downloads/relevance-50.qrels' already present
'/content/downloads/relevance-100.qrels' already present


In [35]:
# Each qrels line is "TOPIC 0 DOC_ID SCORE", separated by single spaces;
# the files carry no header row, so the column names are given explicitly.
rel_1 = pd.read_csv(download_relevance_50.file_name, index_col=None,
                    names=["topic", "0", "doc_id", "relevance"], sep=" ")
rel_2 = pd.read_csv(download_relevance_100.file_name, index_col=None,
                    names=["topic", "0", "doc_id", "relevance"], sep=" ")

# A judgement is identified by the (topic, doc_id) pair: the same document can
# be judged for several topics, so de-duplicating on doc_id alone would
# silently drop every judgement but the first one for such a document.
relevance_df = (
    pd.concat([rel_1, rel_2], axis=0, ignore_index=True)
    .drop(columns="0")  # constant placeholder column of the qrels format
    .drop_duplicates(["topic", "doc_id"])
    .reset_index(drop=True)
)

relevance_df.head()

Unnamed: 0,topic,doc_id,relevance
0,12,clueweb12-0002wb-18-34442___2,0
1,12,clueweb12-0004wb-69-30215___112,0
2,12,clueweb12-0004wb-78-20304___1,1
3,12,clueweb12-0004wb-78-20304___11,2
4,12,clueweb12-0008wb-62-05967___1,0


#### Documents quality for each topic

In [31]:
# Download quality qrels, 2022 file (stored locally as quality-50.qrels)
url_quality_50 = "https://zenodo.org/record/6873567/files/touche-task2-2022-quality.qrels?download=1"
file_path_qual_50 = "quality-50.qrels"

download_quality_50 = utils.manage_files.DownloadFile(file_path_qual_50, url=url_quality_50)
download_quality_50()

# Download quality qrels for topics 51-100 (stored locally as quality-100.qrels)
url_quality_100 = "https://zenodo.org/record/6873565/files/touche-task2-51-100-quality.qrels?download=1"
file_path_qual_100 = "quality-100.qrels"

download_quality_100 = utils.manage_files.DownloadFile(file_path_qual_100, url=url_quality_100)
download_quality_100()

Downloading file: 100%|██████████| 78.4k/78.4k [00:00<00:00, 511kB/s]
Downloading file: 100%|██████████| 68.5k/68.5k [00:00<00:00, 450kB/s]


In [36]:
# Each qrels line is "TOPIC 0 DOC_ID SCORE", separated by single spaces;
# the files carry no header row, so the column names are given explicitly.
qual_1 = pd.read_csv(download_quality_50.file_name, index_col=None,
                     names=["topic", "0", "doc_id", "quality"], sep=" ")
qual_2 = pd.read_csv(download_quality_100.file_name, index_col=None,
                     names=["topic", "0", "doc_id", "quality"], sep=" ")

# A judgement is identified by the (topic, doc_id) pair: the same document can
# be judged for several topics, so de-duplicating on doc_id alone would
# silently drop every judgement but the first one for such a document.
quality_df = (
    pd.concat([qual_1, qual_2], axis=0, ignore_index=True)
    .drop(columns="0")  # constant placeholder column of the qrels format
    .drop_duplicates(["topic", "doc_id"])
    .reset_index(drop=True)
)

quality_df.head()

Unnamed: 0,topic,doc_id,quality
0,12,clueweb12-0002wb-18-34442___2,2
1,12,clueweb12-0004wb-69-30215___112,2
2,12,clueweb12-0004wb-78-20304___1,2
3,12,clueweb12-0004wb-78-20304___11,2
4,12,clueweb12-0008wb-62-05967___1,0


#### Merge data

Now we merge the two dataframes so that the relevance and quality scores of each (topic, document) pair end up in the same dataframe.

In [42]:
# Inner join: keep only the (doc_id, topic) pairs present in both dataframes,
# so each remaining row carries a relevance and a quality score.
det_df = pd.merge(relevance_df, quality_df, how="inner", on=['doc_id', 'topic'])
det_df.head()

Unnamed: 0,topic,doc_id,relevance,quality
0,12,clueweb12-0002wb-18-34442___2,0,2
1,12,clueweb12-0004wb-69-30215___112,0,2
2,12,clueweb12-0004wb-78-20304___1,1,2
3,12,clueweb12-0004wb-78-20304___11,2,2
4,12,clueweb12-0008wb-62-05967___1,0,0


### Example to test our comprehension of the datasets

In [76]:
def retrieve_doc_url(topic, relevance, det_df, corp_df):
    """Return the corpus URLs of the documents judged with `relevance` for `topic`.

    Args:
        topic: Topic number to select in `det_df['topic']`.
        relevance: Relevance score to select in `det_df['relevance']`.
        det_df: Judgements dataframe with the columns 'topic', 'relevance'
            and 'doc_id'.
        corp_df: Corpus dataframe with the columns 'id' and 'chatNoirUrl'.

    Returns:
        A list with one URL per selected document that is present in the
        corpus, in the order the documents appear in `det_df`. Documents
        missing from the corpus are skipped. If a corpus id occurs more than
        once, the URL of its first occurrence is used.
    """
    print(f"Topic: {topic} - Relevance: {relevance}")
    selected = (det_df['topic'] == topic) & (det_df['relevance'] == relevance)
    id_list = det_df.loc[selected, 'doc_id']

    # Filter the corpus once with a vectorised membership test instead of
    # scanning the whole corpus twice for every single document.
    matches = corp_df.loc[corp_df['id'].isin(id_list), ['id', 'chatNoirUrl']]
    matches = matches.drop_duplicates('id')
    url_by_id = dict(zip(matches['id'], matches['chatNoirUrl']))

    return [url_by_id[doc] for doc in id_list if doc in url_by_id]

In [46]:
# Topic numbers that appear in the merged qrels data, in ascending order.
judged_topics = det_df["topic"].unique()
np.sort(judged_topics)

array([  2,   3,   8,   9,  12,  14,  17,  18,  19,  22,  23,  25,  26,
        27,  28,  30,  33,  34,  36,  37,  42,  43,  48,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100])

In this example we retrieve the documents judged with relevance 2 for topic 55, which asks whether chiropractic or physical therapy is better for back pain, and check that the topic number maps to the corresponding element in the topics list.

In [88]:
# Topic number to inspect; only documents judged with relevance label 2 are kept.
example_topic = 55
retrieve_doc_url(topic=example_topic, relevance=2, det_df=det_df, corp_df=corpus_df)

Topic: 55 - Relevance: 2


['https://chatnoir.eu/cache?uuid=00c926b3-aea6-5676-a05b-631183bc16d6&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=3ba792cf-65e0-50a7-8d2a-29dfe3450844&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=0cbebae4-a185-5630-9bbf-9df00049ff6d&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=0cbebae4-a185-5630-9bbf-9df00049ff6d&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=0cbebae4-a185-5630-9bbf-9df00049ff6d&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=0cbebae4-a185-5630-9bbf-9df00049ff6d&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=d09e9ab3-a8b4-5800-b926-52ae7e625e4c&index=cw12&raw&plain']

In [89]:
# List positions start at 0, hence the shift by one. This assumes the titles
# are stored in topic-number order starting from topic 1 — verify if the
# topic files change.
topic_position = example_topic - 1
topics[topic_position]

'What is better for back pain, chiropractic therapy or physical therapy?'