In [28]:
from tira_utils import get_input_directory_and_output_directory, normalize_run
import pyterrier as pt
import pandas as pd
import os
import json
from tqdm import tqdm
from glob import glob
from pathlib import Path

# System name: taken from the TIRA_SYSTEM_NAME environment variable when it is set,
# otherwise the default below is used.
SYSTEM_NAME = os.environ.get('TIRA_SYSTEM_NAME', 'my-retrieval-system')

# Importing tira_utils above should already have started PyTerrier. If it has not,
# start it here with the versions pinned in the environment and with downloads
# disabled, so that no internet connection is required (for reproducibility).
if not pt.started():
    pt.init(
        version=os.environ['PYTERRIER_VERSION'],
        helper_version=os.environ['PYTERRIER_HELPER_VERSION'],
        no_download=True,
    )

# Directory to read the dataset from and directory to write results to;
# the hardcoded example dataset is only the default input.
input_directory, output_directory = get_input_directory_and_output_directory(
    default_input='/workspace/dataset22/',
)


I will use a small hardcoded example located in /workspace/dataset22/.
The output directory is /tmp/


In [None]:
# Preview the raw topics. Read them from the resolved input directory rather than a
# hardcoded absolute path, so this cell also works when the notebook is run on
# another dataset location.
pd.read_xml(os.path.join(input_directory, 'topics-task3.xml'))

Unnamed: 0,number,title,description,narrative
0,51,Do we need sex education in schools?,An adult user now has a partner for the first ...,"On-topic images address, for example, the trea..."
1,52,Should stem cell research be expanded?,A user recently learned that furthering new li...,"On-topic images address, for example, the use ..."
2,53,Should blood donations be financially compensa...,A user reads a report that states that hospita...,"On-topic images address, for example, money an..."
3,54,Should suicide be a criminal offense?,A user is watching the news on TV and hears ab...,"On-topic images address, for example, the topi..."
4,55,Should agricultural subsidies be reduced?,A user hears that the EU pays billions of euro...,"On-topic images address, for example, agricult..."
5,56,Should vigilantism be legal?,A user was following the case of a criminal in...,"On-topic images address, for example, either t..."
6,57,Are gender or racial quotas effective?,A female user recently got a job and now learn...,"On-topic images address, for example, effects ..."
7,58,Should holders of public offices resign on bad...,"Recently, it became known that a minister wast...","On-topic images address, for example, the perf..."
8,59,Should nuclear weapons be abolished?,"With the war in Ukraine, nuclear weapons have ...","On-topic images address nuclear weapons, for e..."
9,60,Should the press be subsidized?,A user is annoyed by the advertising on the we...,"On-topic images address, for example, methods ..."


In [43]:
print('Step 2: Load the queries.')

def load_queries():
    """Read the topics file from the input directory and build the query frame.

    The file ``topics-task3.xml`` is looked up inside ``input_directory``. The
    path is built with ``os.path.join`` so that it is correct whether or not
    ``input_directory`` ends with a path separator.

    Returns:
        A DataFrame with two columns: ``query`` (the topic title with every
        non-alphanumeric character replaced by a space) and ``qid`` (the
        topic number).

    Raises:
        ValueError: if the topics file does not exist; the message lists what
            was found in the input directory instead.
    """
    file_name = os.path.join(input_directory, 'topics-task3.xml')

    if not os.path.exists(file_name):
        raise ValueError(f'Could not find the file "{file_name}". Got: {glob(input_directory + "/*")}')

    topics = pd.read_xml(file_name)

    ret = pd.DataFrame()
    # Replace everything that is not alphanumeric by a space before the query is
    # passed on, see https://github.com/terrier-org/pyterrier/issues/62
    ret['query'] = topics['title'].apply(
        lambda title: "".join(char if char.isalnum() else " " for char in title)
    )
    ret['qid'] = topics['number']
    return ret

queries = load_queries()  
queries.head(2)

Step 2: Load the queries.


Unnamed: 0,query,qid
0,Do we need sex education in schools,51
1,Should stem cell research be expanded,52


### Step 3: Index the images

In [40]:
print('Step 3: Create the Index.')

# We use some very baseline method to get a textual representation: we just use the text of the pages that contain the image.
def load_image_text(image_id, base_directory=None):
    """Return the concatenated page texts stored for one image.

    All files matching ``<base>/images/<first 3 chars of id>/<id>/*/*/*/text.txt``
    are read and joined with a blank line in between; the result is stripped of
    leading and trailing whitespace.

    Args:
        image_id: identifier of the image; its first three characters name the
            parent directory of the image directory.
        base_directory: directory that contains the ``images`` folder. When
            omitted, the notebook-level ``input_directory`` is used, which is
            the previous behavior.

    Returns:
        The combined text, or an empty string when no text file is found.
    """
    if base_directory is None:
        base_directory = input_directory

    pattern = os.path.join(base_directory, 'images', image_id[:3], image_id, '*', '*', '*', 'text.txt')

    texts = []
    # glob returns matches in a file-system dependent order; sort them so that the
    # combined text is the same on every run.
    for txt_file in sorted(glob(pattern)):
        # Close every file right after reading it instead of leaking the handle.
        with open(txt_file) as text_handle:
            texts.append(text_handle.read())

    return '\n\n'.join(texts).strip()

def all_images():
    """Yield one document per image directory found below the input directory.

    Every entry matching ``<input_directory>/images/*/*`` is treated as one image;
    the last path component is used as the image id.

    Yields:
        Dicts with the keys ``docno`` (the image id) and ``text`` (the text
        returned by ``load_image_text`` for that id).
    """
    # glob returns matches in a file-system dependent order; sort them so that the
    # documents are always produced (and therefore indexed) in the same order.
    for image_directory in sorted(glob(input_directory + '/images/*/*')):
        image_id = os.path.basename(image_directory)
        yield {'docno': image_id, 'text': load_image_text(image_id)}


# Remove an index directory left over from a previous run before building a new one.
!rm -Rf ./index
# Build the index in ./index from dicts with the keys 'docno' and 'text'.
# NOTE(review): the numbers in `meta` are presumably the maximum stored length of
# each metadata field -- confirm against the PyTerrier documentation.
iter_indexer = pt.IterDictIndexer("./index", meta={'docno': 20, 'text': 4096})
# tqdm only wraps the generator so that indexing progress is displayed.
index_ref = iter_indexer.index(tqdm(all_images()))


Step 3: Create the Index.


796it [00:07, 99.71it/s] 

In [50]:
print('Step 5: Define the Pipeline.')

# Retrieve from the index built above with the BM25 weighting model,
# requesting 50 results per query (num_results=50).
retrieval_pipeline = pt.BatchRetrieve(index_ref, wmodel="BM25", verbose=True, num_results=50)

Step 5: Define the Pipeline.


In [51]:
print('Step 6: Create Run.')
run = retrieval_pipeline(queries)

Step 6: Create Run.


BR(BM25): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 37.83q/s]


In [52]:
run

Unnamed: 0,qid,docid,docno,rank,score,query
0,51,23091,If7d63a15a4211a5a,0,7.471291,Do we need sex education in schools
1,51,15151,Ia2f5cc1208c1e1b5,1,7.398296,Do we need sex education in schools
2,51,12991,I8ba6d96fdccde470,2,7.254978,Do we need sex education in schools
3,51,8355,I5990ed8586db98b9,3,7.112157,Do we need sex education in schools
4,51,7792,I534bab561f11daff,4,7.093402,Do we need sex education in schools
...,...,...,...,...,...,...
2495,100,13325,I8f23cd947354c9b7,45,6.467451,Do we need cash
2496,100,6576,I463fab00c7efa7fd,46,6.451252,Do we need cash
2497,100,20930,Ie0ef788113c4bae7,47,6.451252,Do we need cash
2498,100,2902,I1ed98cb37676b90f,48,6.440203,Do we need cash
