# Document retrieval

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local utils package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import xml.etree.ElementTree as ET

import utils.manage_files

## Download datasets

### Download and explore the corpus
First, we download the corpus that we need for the task.

In [4]:
# Remote location of the gzipped passage corpus, plus the local names used for
# the downloaded archive and for the extracted JSON Lines file.
url_corpus = "https://zenodo.org/record/6802592/files/touche-task2-passages-version-002.jsonl.gz?download=1"
zip_path_corpus = "corpus.jsonl.gz"
file_path_corpus = "corpus.jsonl"

# DownloadFile is a project helper from utils.manage_files; calling the instance
# triggers the download/extraction (the cell output reports the resulting paths).
download_corpus = utils.manage_files.DownloadFile(file_path_corpus, zip_path_corpus, url_corpus)
download_corpus()

Downloading file: 100%|██████████| 286M/286M [00:17<00:00, 16.3MB/s]


'/content/downloads/zips/corpus.jsonl.gz' unzipped in '/content/downloads/corpus.jsonl'


In [6]:
# The corpus file is read as JSON Lines (one record per line), hence lines=True.
corpus_df = pd.read_json(download_corpus.file_name, lines=True)
# Preview the first rows to check the available columns.
corpus_df.head()

Unnamed: 0,id,contents,chatNoirUrl
0,clueweb12-0000tw-14-21168___1,"Shuga: Love, Sex, Money MTV Shuga Home Swag Bl...",https://chatnoir.eu/cache?uuid=f338e91e-a3e9-5...
1,clueweb12-0000tw-14-21168___2,We LOVE sending #TeamShuga the exclusives. Ban...,https://chatnoir.eu/cache?uuid=f338e91e-a3e9-5...
2,clueweb12-0000tw-14-21168___3,Now take note.. because you will be seeing a w...,https://chatnoir.eu/cache?uuid=f338e91e-a3e9-5...
3,clueweb12-0000tw-22-19226___1,Sex and love: The modern matchmakers | The Eco...,https://chatnoir.eu/cache?uuid=2bf4b08d-2f65-5...
4,clueweb12-0000tw-22-19226___2,But have they? Feb 11th 2012 | from the print ...,https://chatnoir.eu/cache?uuid=2bf4b08d-2f65-5...


### Download other datasets

At this point we can download:
- the list of topics to retrieve all the titles;
- the relevance qrels file;
- the quality qrels file.

The last two files contain, respectively, the relevance and the quality scores assigned to a list of documents with respect to a given topic.

The structure of a .qrels file is:   
TOPIC 0 DOC_ID SCORE

#### Topics

In [74]:
# Fetch the topics archive (if needed) and collect every topic title from its XML file.
url_topics = "https://zenodo.org/record/6873559/files/topics-task-2.zip?download=1"
zip_path_topics = "topics-task-2.zip"
file_path_topics = "topics-task-2"

download_topics = utils.manage_files.DownloadFile(file_path_topics, zip_path_topics, url_topics)
download_topics()

# Parse the XML document and grab its root element.
mytree = ET.parse(f"{download_topics.file_name}/topics-task-2.xml")
myroot = mytree.getroot()

# Walk the children of every top-level element and keep only the text of the
# selected field, in document order.
topics = []
for item in myroot:
    for x in item:
        if x.tag != "title":  # Specify the field, e.g., title
            continue
        topics.append(x.text.strip())

'/content/downloads/topics-task-2' already present


In [75]:
# The 50 topics pre-selected by the team: report how many titles were parsed
# and list them.
print(f"There are {len(topics)} topics.\n{topics}")

There are 50 topics.
['What is the difference between sex and love?', 'Which is better, a laptop or a desktop?', 'Which is better, Canon or Nikon?', 'What are the best dish detergents?', 'What are the best cities to live in?', 'What is the longest river in the U.S.?', 'Which is healthiest: coffee, green tea or black tea and why?', 'What are the advantages and disadvantages of PHP over Python and vice versa?', 'Why is Linux better than Windows?', 'How to sleep better?', 'Should I buy an LCD TV or a plasma TV?', 'Train or plane? Which is the better choice?', 'What is the highest mountain on Earth?', 'Should one prefer Chinese medicine or Western medicine?', 'What are the best washing machine brands?', 'Should I buy or rent?', 'Do you prefer cats or dogs, and why?', 'What is the better way to grill outdoors: gas or charcoal?', 'Which is better, MAC or PC?', 'What is better: to use a brush or a sponge?', 'Which is better, Linux or Microsoft?', 'Which is better, Pepsi or Coke?', 'What is be

#### Documents relevance for each topic

In [28]:
# Download the relevance qrels file.
url_relevance = "https://zenodo.org/record/6873567/files/touche-task2-2022-relevance.qrels?download=1"
file_path_rel = "relevance.qrels"

# No archive path is passed here: only the target file name and the URL.
download_relevance = utils.manage_files.DownloadFile(file_path_rel, url=url_relevance)
download_relevance()

Downloading file: 100%|██████████| 78.4k/78.4k [00:00<00:00, 512kB/s]


In [29]:
# The qrels file is space-separated and follows the "TOPIC 0 DOC_ID SCORE"
# layout, so the column names are supplied explicitly.
relevance_df = pd.read_csv(
    download_relevance.file_name,
    sep=" ",
    names=["topic", "0", "doc_id", "relevance"],
)

relevance_df.head()

Unnamed: 0,topic,0,doc_id,relevance
0,12,0,clueweb12-0002wb-18-34442___2,0
1,12,0,clueweb12-0004wb-69-30215___112,0
2,12,0,clueweb12-0004wb-78-20304___1,1
3,12,0,clueweb12-0004wb-78-20304___11,2
4,12,0,clueweb12-0008wb-62-05967___1,0


#### Documents quality for each topic

In [30]:
# Download the quality qrels file.
url_quality = "https://zenodo.org/record/6873567/files/touche-task2-2022-quality.qrels?download=1"
file_path_qual = "quality.qrels"

# No archive path is passed here: only the target file name and the URL.
download_quality = utils.manage_files.DownloadFile(file_path_qual, url=url_quality)
download_quality()

Downloading file: 100%|██████████| 78.4k/78.4k [00:00<00:00, 493kB/s]


In [32]:
# The qrels file is space-separated and follows the "TOPIC 0 DOC_ID SCORE"
# layout, so the column names are supplied explicitly.
quality_df = pd.read_csv(
    download_quality.file_name,
    sep=" ",
    names=["topic", "0", "doc_id", "quality"],
)

quality_df.head()

Unnamed: 0,topic,0,doc_id,quality
0,12,0,clueweb12-0002wb-18-34442___2,2
1,12,0,clueweb12-0004wb-69-30215___112,2
2,12,0,clueweb12-0004wb-78-20304___1,2
3,12,0,clueweb12-0004wb-78-20304___11,2
4,12,0,clueweb12-0008wb-62-05967___1,0


### Example to test our comprehension of the datasets

In [99]:
def retrieve_doc_url(topic, relevance, rel_df, corp_df):
    """Return the ChatNoir URLs of the documents judged for a topic at a given score.

    Parameters
    ----------
    topic
        Value matched against ``rel_df['topic']``.
    relevance
        Value matched against ``rel_df['relevance']``.
    rel_df : pandas.DataFrame
        Qrels table with at least the columns ``topic``, ``relevance`` and ``doc_id``.
    corp_df : pandas.DataFrame
        Corpus table with at least the columns ``id`` and ``chatNoirUrl``.

    Returns
    -------
    list
        One URL per selected qrels row, in qrels row order. Several rows can
        yield the same URL, so the list may contain repeated entries.

    Raises
    ------
    ValueError
        If a selected ``doc_id`` is missing from ``corp_df`` or matches more
        than one corpus row.
    """
    print(f"Topic: {topic} - Relevance: {relevance}")
    selected = (rel_df['topic'] == topic) & (rel_df['relevance'] == relevance)
    id_list = rel_df.loc[selected, 'doc_id']
    if id_list.empty:
        return []

    # Scan the corpus once for all requested ids instead of building a
    # full-length boolean mask per document.
    matches = corp_df.loc[corp_df['id'].isin(id_list), ['id', 'chatNoirUrl']]

    ambiguous = matches.loc[matches['id'].duplicated(), 'id'].unique().tolist()
    if ambiguous:
        raise ValueError(f"Document ids matching several corpus rows: {ambiguous}")

    url_by_id = matches.set_index('id')['chatNoirUrl']
    missing = [doc for doc in id_list.unique() if doc not in url_by_id.index]
    if missing:
        raise ValueError(f"Document ids not found in the corpus: {missing}")

    # Label-based lookup keeps the qrels order and repeats URLs for repeated ids.
    return url_by_id.loc[id_list.tolist()].tolist()

In [106]:
# Distinct topic numbers that appear in the relevance qrels, in ascending order.
np.sort(relevance_df["topic"].unique())

array([  2,   3,   8,   9,  12,  14,  17,  18,  19,  22,  23,  25,  26,
        27,  28,  30,  33,  34,  36,  37,  42,  43,  48,  51,  53,  54,
        55,  56,  58,  59,  60,  61,  62,  67,  68,  69,  70,  72,  74,
        76,  77,  78,  84,  86,  88,  91,  92,  93,  95, 100])

In this example we retrieve the documents about Coke and Pepsi (topic 22), which match the corresponding element in the topics list.

In [114]:
# URLs of the documents listed in the relevance qrels for topic 22 with score 2.
retrieve_doc_url(22, 2, relevance_df, corpus_df)

Topic: 22 - Relevance: 2


['https://chatnoir.eu/cache?uuid=3bc329ee-561c-57a5-b3f8-aafeaedd8f2e&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=3bc329ee-561c-57a5-b3f8-aafeaedd8f2e&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=c025fcf1-a94c-527b-b1d4-6cc611158db7&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=866c99ee-bd46-5785-9adc-8ff91a461c97&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=7e56edda-fae8-5204-8405-bf82671782c8&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=c4bf5091-a6db-5e5d-9ed6-43328bd6451c&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=ce4a164e-d16b-5eb4-9316-2f0bab2b67b1&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=ce4a164e-d16b-5eb4-9316-2f0bab2b67b1&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=4ed41510-6cca-5c6b-a35c-d00d267a7c61&index=cw12&raw&plain']

In [116]:
# List position 21 holds the 22nd title parsed from the topics file.
topics[21]

'Which is better, Pepsi or Coke?'

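But if we take a topic number greater than 50, we cannot find the corresponding entry by position in the list of topics: the list holds only 50 titles, while the topic numbers in the qrels files go up to 100.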

In [117]:
# URLs of the documents listed in the relevance qrels for topic 68 with score 2.
retrieve_doc_url(68, 2, relevance_df, corpus_df)

Topic: 68 - Relevance: 2


['https://chatnoir.eu/cache?uuid=276071f3-3a6f-5c94-a9de-243e56c9020f&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=6919863e-05fa-587e-9fd9-1d1d3e450c40&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=af291c09-2f4e-546e-b78f-59d88645eb93&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=af291c09-2f4e-546e-b78f-59d88645eb93&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=6ce714c4-771b-545b-bba9-e9af68224de8&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=e6992587-24fb-5ebd-a534-b8d534b27bf1&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=810d0f49-ef8f-5f49-9961-4cf6832ac580&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=810d0f49-ef8f-5f49-9961-4cf6832ac580&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=2ed4492d-48ff-56bc-a5d8-de36d4e45f5e&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=2ed4492d-48ff-56bc-a5d8-de36d4e45f5e&index=cw12&raw&plain',
 'https://chatnoir.eu/cache?uuid=0ea29b80-b1cb-5454-b6e3-be769f9a90b9&