# Einrichtung
- Installieren der Packages
- Initialisieren von PyTerrier
- Laden der Datensätze

In [1]:
!pip3 install --upgrade tira ir-datasets python-terrier 
!pip3 install --upgrade pyterrier-caching pyterrier_t5
# !pip3 install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git



In [2]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client
import pyterrier as pt

# Start PyTerrier's Java backend unless it is already running, so that
# re-executing this cell does not try to initialise it a second time.
if not pt.java.started():
    pt.java.init()

# TIRA helper; presumably verifies/prepares the PyTerrier setup for TIRA — TODO confirm.
ensure_pyterrier_is_loaded()
# TIRA REST client. NOTE(review): `tira` is not referenced again in the visible cells.
tira = Client()

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


In [3]:
from pyterrier import get_dataset

# All three collections are addressed through their `irds:` identifiers.
# Deep-learning training collection: used below for evaluation and cutoff tuning.
pt_dataset = get_dataset('irds:ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training')
# RAG training collection ("new" training set).
pt_dataset_new = get_dataset('irds:ir-lab-wise-2024/subsampled-ms-marco-rag-20250105-training')
# IR-lab test collection.
pt_dataset_test = get_dataset('irds:ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test')

# Indizes erstellen
- für alle drei Datensätze wird ein Index erstellt

In [4]:
from pyterrier import IterDictIndexer

# Index the deep-learning training corpus into ../data/index.
# `meta` names the per-document fields stored in the index together with the
# length reserved for each; `overwrite=True` replaces an index that already
# exists at that location, so the corpus is re-indexed on every run.
indexer = IterDictIndexer("../data/index", overwrite=True, meta={'docno': 50, 'text': 4096})
index = indexer.index(pt_dataset.get_corpus_iter())

ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:  36%|███▌      | 24291/68261 [00:02<00:03, 14632.68it/s]



ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents: 100%|██████████| 68261/68261 [00:04<00:00, 14525.80it/s]


22:05:56.529 [ForkJoinPool-1-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents


In [13]:
# Index the RAG training corpus into ../data/index_new.
# `meta` names the per-document fields stored in the index together with the
# length reserved for each; `overwrite=True` replaces an index that already
# exists at that location, so the corpus is re-indexed on every run.
indexer_new = IterDictIndexer("../data/index_new", overwrite=True, meta={'docno': 50, 'text': 4096})
index_new = indexer_new.index(pt_dataset_new.get_corpus_iter())

ir-lab-wise-2024/subsampled-ms-marco-rag-20250105-training documents: 100%|██████████| 113227/113227 [00:30<00:00, 3721.05it/s]


In [6]:
# Index the test corpus into ../data/index_test.
# `meta` names the per-document fields stored in the index together with the
# length reserved for each; `overwrite=True` replaces an index that already
# exists at that location, so the corpus is re-indexed on every run.
indexer_test = IterDictIndexer("../data/index_test", overwrite=True, meta={'docno': 50, 'text': 4096})
index_test = indexer_test.index(pt_dataset_test.get_corpus_iter())

ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test documents: 100%|██████████| 125112/125112 [00:27<00:00, 4575.25it/s]


# Pipelines definieren
- BM25 mit den jeweiligen Indizes
- MonoT5 in gecachter Variante
- DuoT5 in Kombination mit MonoT5 (gecacht)

In [14]:
# One BM25 first-stage retriever per index (standard, new training, test).
# Each retriever must only be combined with the dataset its index was built from.
bm25 = pt.terrier.Retriever(index, wmodel="BM25")
bm25_new = pt.terrier.Retriever(index_new, wmodel="BM25")
bm25_test = pt.terrier.Retriever(index_test, wmodel="BM25")

In [40]:
from pyterrier_t5 import MonoT5ReRanker, DuoT5ReRanker
from pyterrier_caching import SparseScorerCache
# Uncached monoT5 re-ranker built with the library defaults; it is only used
# through the SparseScorerCache wrappers created further down in this cell.
_monoT5 = MonoT5ReRanker()
# duoT5 re-ranker built with the library defaults; used by duo_factory.
duoT5 = DuoT5ReRanker()

def mono_factory(cutoff, bm25_base, mono_t5, dataset):
    """Compose a first-stage retriever with a monoT5-style re-ranker.

    Args:
        cutoff: Rank cutoff applied to ``bm25_base`` via ``%`` before re-ranking.
        bm25_base: First-stage retrieval transformer.
        mono_t5: Re-ranking transformer applied to the truncated result list.
        dataset: Dataset from which the ``"text"`` field of each result is loaded.

    Returns:
        The re-ranked head combined with the full ``bm25_base`` ranking through
        ``^`` (PyTerrier's concatenate operator), so results beyond ``cutoff``
        remain in the output.
    """
    candidates = bm25_base % cutoff
    # The re-ranker needs the document text, which the retriever output lacks.
    with_text = candidates >> pt.text.get_text(dataset, "text")
    reranked = with_text >> mono_t5
    return reranked ^ bm25_base

def duo_factory(cutoff, mono_pipeline, duo_t5=None):
    """Put a pairwise re-ranker on top of an existing ranking pipeline.

    Args:
        cutoff: Rank cutoff applied to ``mono_pipeline`` via ``%``; only these
            top results are handed to the re-ranker.
        mono_pipeline: Upstream pipeline whose ranking is refined.
        duo_t5: Re-ranking transformer to apply. When omitted, the notebook-level
            ``duoT5`` instance is used, which matches the previous behaviour.

    Returns:
        The re-ranked head combined with the full ``mono_pipeline`` ranking
        through ``^`` (PyTerrier's concatenate operator), so results beyond
        ``cutoff`` remain in the output.
    """
    # Resolve the global lazily so callers that pass their own re-ranker do not
    # depend on `duoT5` having been created.
    reranker = duoT5 if duo_t5 is None else duo_t5
    reranked_head = mono_pipeline % cutoff >> reranker
    return reranked_head ^ mono_pipeline

# Cached variants of the same monoT5 model, one cache file per dataset.
# NOTE(review): the recorded output shows the cache is keyed by query and docno,
# so separate files presumably avoid mixing scores across corpora — confirm.
monoT5 = SparseScorerCache('monoT5.cache', _monoT5, verbose=True)
monoT5_new = SparseScorerCache('monoT5_new.cache', _monoT5, verbose=True)
monoT5_test = SparseScorerCache('monoT5_test.cache', _monoT5, verbose=True)

# Testen auf dem Standard-Datensatz

In [9]:
from pyterrier import Experiment
# Baseline check on the standard dataset: monoT5 re-ranks the top 1000 BM25 results.
pipeline_mono_t5 = mono_factory(1000, bm25, monoT5, pt_dataset)
# The duoT5 variant is built but currently excluded from the experiment below
# (its entries in `retr_systems` and `names` are commented out).
pipeline_duo_t5 = duo_factory(5, pipeline_mono_t5)
# Run experiment
Experiment(
    retr_systems=[
        pipeline_mono_t5,
#        pipeline_duo_t5,
    ],
    names=[
        "BM25+monoT5",
#        "BM25+monoT5+duoT5",
    ],
    topics=pt_dataset.get_topics('text'),
    qrels=pt_dataset.get_qrels(),
    eval_metrics=["ndcg_cut_10", "P_10"],
    verbose=False,
)

Sqlite3ScorerCache('monoT5.cache', <pyterrier_t5.MonoT5ReRanker object at 0x1470fe9b0>, group='query', key='docno'): 91067 hit(s), 0 miss(es)


Unnamed: 0,name,ndcg_cut_10,P_10
0,BM25+monoT5,0.727375,0.787629


# Optimieren von MonoT5 auf dem Standard-Datensatz

In [39]:
def optimize_monot5(cutoffs=range(25, 1000, 25)):
    """Search for the monoT5 re-ranking cutoff with the best NDCG@10.

    For every candidate cutoff a pipeline is built with ``mono_factory`` from the
    notebook-level ``bm25``, ``monoT5`` and ``pt_dataset`` and evaluated on the
    topics and qrels of ``pt_dataset``.

    Args:
        cutoffs: Iterable of candidate cutoffs, tried in order. Defaults to
            25, 50, ..., 975, the grid that was previously hard-coded.

    Returns:
        The cutoff with the highest NDCG@10. Scores are compared after rounding
        to three decimals, and on a tie the earliest candidate wins.
        ``None`` if ``cutoffs`` is empty.
    """
    # Topics and qrels are the same for every cutoff, so load them once instead
    # of once per loop iteration.
    topics = pt_dataset.get_topics('text')
    qrels = pt_dataset.get_qrels()

    # Start below every reachable score so the first candidate is always
    # accepted; starting at 0 could return the invalid cutoff 0 when no
    # candidate scores above zero.
    max_ndcg = float('-inf')
    best_cutoff = None
    for mono_cutoff in cutoffs:
        pipeline_mono_t5 = mono_factory(mono_cutoff, bm25, monoT5, pt_dataset)
        exp = Experiment(
            [pipeline_mono_t5],
            topics=topics,
            qrels=qrels,
            eval_metrics=['ndcg_cut_10'],
            names=["monoT5"],
            round=3,
            # NOTE(review): with a single system this compares the system with
            # itself; kept so the reported scores stay exactly as before.
            baseline=0,
        )
        new_ndcg = exp['ndcg_cut_10'][0]
        if new_ndcg > max_ndcg:
            max_ndcg = new_ndcg
            best_cutoff = mono_cutoff
        print('Mono Cutoff:', mono_cutoff, ', NDCG@10:', new_ndcg)
    return best_cutoff

# Run the search; the result is stored in the notebook-level `mono_cutoff`.
mono_cutoff = optimize_monot5()
print("Der optimale Mono-Cutoff ist:", mono_cutoff)

Sqlite3ScorerCache('monoT5.cache', <pyterrier_t5.MonoT5ReRanker object at 0x324f68190>, group='query', key='docno'): 2405 hit(s), 0 miss(es)
Mono Cutoff: 25 , NDCG@10: 0.622
Sqlite3ScorerCache('monoT5.cache', <pyterrier_t5.MonoT5ReRanker object at 0x324f68190>, group='query', key='docno'): 4783 hit(s), 0 miss(es)
Mono Cutoff: 50 , NDCG@10: 0.662
Sqlite3ScorerCache('monoT5.cache', <pyterrier_t5.MonoT5ReRanker object at 0x324f68190>, group='query', key='docno'): 7158 hit(s), 0 miss(es)
Mono Cutoff: 75 , NDCG@10: 0.68
Sqlite3ScorerCache('monoT5.cache', <pyterrier_t5.MonoT5ReRanker object at 0x324f68190>, group='query', key='docno'): 9533 hit(s), 0 miss(es)
Mono Cutoff: 100 , NDCG@10: 0.684
Sqlite3ScorerCache('monoT5.cache', <pyterrier_t5.MonoT5ReRanker object at 0x324f68190>, group='query', key='docno'): 11908 hit(s), 0 miss(es)
Mono Cutoff: 125 , NDCG@10: 0.684
Sqlite3ScorerCache('monoT5.cache', <pyterrier_t5.MonoT5ReRanker object at 0x324f68190>, group='query', key='docno'): 14283 hit(s

# Optimierung von DuoT5 auf dem Standard-Datensatz
- es wird der beste ermittelte MonoT5-Cutoff (525) genutzt; damit werden verschiedene DuoT5-Cutoffs (3, 5, 10, 15) getestet
- der beste duoT5-Cutoff wird gewählt

In [35]:
def optimize_duot5(mono_cutoff = 525, duo_cutoffs=(3, 5, 10, 15)):
    """Search for the duoT5 re-ranking cutoff with the best NDCG@10.

    A monoT5 pipeline is built once with ``mono_factory`` from the notebook-level
    ``bm25``, ``monoT5`` and ``pt_dataset``. Each candidate then wraps it with
    ``duo_factory`` and is evaluated on the topics and qrels of ``pt_dataset``.

    Args:
        mono_cutoff: Cutoff used for the underlying monoT5 pipeline.
        duo_cutoffs: Iterable of candidate duoT5 cutoffs, tried in order.
            Defaults to the previously hard-coded grid 3, 5, 10, 15.

    Returns:
        The duoT5 cutoff with the highest NDCG@10. Scores are compared after
        rounding to three decimals, and on a tie the earliest candidate wins.
        ``None`` if ``duo_cutoffs`` is empty.
    """
    # Topics and qrels are the same for every cutoff, so load them once instead
    # of once per loop iteration.
    topics = pt_dataset.get_topics('text')
    qrels = pt_dataset.get_qrels()

    # Start below every reachable score so the first candidate is always
    # accepted; starting at 0 could return the invalid cutoff 0 when no
    # candidate scores above zero.
    max_ndcg = float('-inf')
    best_cutoff = None
    pipeline_mono_t5 = mono_factory(mono_cutoff, bm25, monoT5, pt_dataset)
    for duo_cutoff in duo_cutoffs:
        pipeline_duo_t5 = duo_factory(duo_cutoff, pipeline_mono_t5)
        exp = Experiment(
            [pipeline_duo_t5],
            topics=topics,
            qrels=qrels,
            eval_metrics=['ndcg_cut_10'],
            names=["duoT5"],
            round=3,
            # NOTE(review): with a single system this compares the system with
            # itself; kept so the reported scores stay exactly as before.
            baseline=0,
        )
        new_ndcg = exp['ndcg_cut_10'][0]
        if new_ndcg > max_ndcg:
            max_ndcg = new_ndcg
            best_cutoff = duo_cutoff
        print('Duo Cutoff:', duo_cutoff, ', NDCG@10:', new_ndcg)
    return best_cutoff

# Tune duoT5 on top of the notebook-level `mono_cutoff` (overrides the default of 525).
duo_cutoff = optimize_duot5(mono_cutoff)
print("Der optimale Duo-Cutoff ist:", duo_cutoff)

Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)


duoT5: 100%|██████████| 97/97 [01:12<00:00,  1.34queries/s]


Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)
Duo Cutoff: 3 , NDCG@10: 0.73
Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)


duoT5: 100%|██████████| 97/97 [04:23<00:00,  2.72s/queries]


Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)
Duo Cutoff: 5 , NDCG@10: 0.733
Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)


duoT5: 100%|██████████| 97/97 [27:07<00:00, 16.77s/queries]


Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)
Duo Cutoff: 10 , NDCG@10: 0.735
Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)


duoT5: 100%|██████████| 97/97 [56:19<00:00, 34.84s/queries]  


Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)
Duo Cutoff: 15 , NDCG@10: 0.754
Der optimale Duo-Cutoff ist: 15


In [10]:
# Pin the tuned cutoffs so the submission cells below can run without repeating
# the optimisation cells. NOTE: these assignments override whatever
# optimize_monot5() / optimize_duot5() returned in the current kernel.
mono_cutoff = 525
duo_cutoff = 15

# Abgabe der Runs
- je Datensatz zwei Runs (MonoT5 und MonoT5+DuoT5)

Run für Standard-Datensatz

In [36]:
from tira.third_party_integrations import persist_and_normalize_run

# monoT5 run on the standard dataset.
# NOTE: `pipeline` and `run` are reassigned by the later submission cells.
pipeline = mono_factory(mono_cutoff, bm25, monoT5, pt_dataset)
run = pipeline(pt_dataset.get_topics('text'))
# Per the recorded output: stores the run under `default_output` and uploads it to TIRA.
persist_and_normalize_run(
    run,
    # Give your approach a short but descriptive name tag.
    system_name='monoT5-BL-suchMaschinen', 
    default_output='../data/run_bl_standard',
    upload_to_tira=pt_dataset,
)

Sqlite3ScorerCache('monoT5.cache', <pyterrier_t5.MonoT5ReRanker object at 0x3581e5720>, group='query', key='docno'): 49127 hit(s), 0 miss(es)
The run file is normalized outside the TIRA sandbox, I will store it at "../data/run_bl_standard".
Done. run file is stored under "../data/run_bl_standard/run.txt.gz".
Run uploaded to TIRA. Claim ownership via: https://www.tira.io/claim-submission/fedf35dc-f3b2-4d1b-b1ad-b4e617eb437f


In [33]:
# duoT5 run on the standard dataset.
# The mono stage is built here instead of reusing the global `pipeline`: every
# submission cell reassigns that name, so depending on it would pair duoT5 with
# whichever dataset's cell happened to run last.
duo_pipeline = duo_factory(
    duo_cutoff,
    mono_factory(mono_cutoff, bm25, monoT5, pt_dataset),
)
run = duo_pipeline(pt_dataset.get_topics('text'))
# Per the recorded output: stores the run under `default_output` and uploads it to TIRA.
persist_and_normalize_run(
    run,
    # Give your approach a short but descriptive name tag.
    system_name='duoT5-suchMaschinen',
    default_output='../data/run_duo_standard',
    upload_to_tira=pt_dataset,
)

Sqlite3ScorerCache('monoT5.cache', <pyterrier_t5.MonoT5ReRanker object at 0x3581e5720>, group='query', key='docno'): 49127 hit(s), 0 miss(es)


duoT5: 100%|██████████| 97/97 [3:06:10<00:00, 115.16s/queries]  


Sqlite3ScorerCache('monoT5.cache', <pyterrier_t5.MonoT5ReRanker object at 0x3581e5720>, group='query', key='docno'): 49127 hit(s), 0 miss(es)
The run file is normalized outside the TIRA sandbox, I will store it at "../data/run_duo_standard".
Done. run file is stored under "../data/run_duo_standard/run.txt.gz".
Run uploaded to TIRA. Claim ownership via: https://www.tira.io/claim-submission/553c3bf5-99fc-429d-b6b3-7426fc34b965


Run für neuen Trainingsdatensatz

In [None]:
# monoT5 run on the new training dataset, using its own index and cache.
# NOTE: `pipeline` and `run` are reassigned by the other submission cells.
pipeline = mono_factory(mono_cutoff, bm25_new, monoT5_new, pt_dataset_new)
run = pipeline(pt_dataset_new.get_topics('text'))
# Stores the run under `default_output`; presumably also uploads it to TIRA, as
# the sibling cells' recorded outputs show — this cell's own output does not confirm it.
persist_and_normalize_run(
    run,
    # Give your approach a short but descriptive name tag.
    system_name='monoT5-BL-suchMaschinen', 
    default_output='../data/run_bl_new',
    upload_to_tira=pt_dataset_new,
)

Sqlite3ScorerCache('monoT5_new.cache', <pyterrier_t5.MonoT5ReRanker object at 0x3581e5720>, group='query', key='docno'): 45150 hit(s), 0 miss(es)


"persist_and_normalize_run(\n    run,\n    # Give your approach a short but descriptive name tag.\n    system_name='monoT5-BL-suchMaschinen', \n    default_output='../data/run_bl_new',\n    upload_to_tira=pt_dataset_new,\n)"

In [38]:
# duoT5 run on the new training dataset.
# The mono stage is built here instead of reusing the global `pipeline`: every
# submission cell reassigns that name, so depending on it would pair duoT5 with
# whichever dataset's cell happened to run last.
duo_pipeline = duo_factory(
    duo_cutoff,
    mono_factory(mono_cutoff, bm25_new, monoT5_new, pt_dataset_new),
)
run = duo_pipeline(pt_dataset_new.get_topics('text'))
# Per the recorded output: stores the run under `default_output` and uploads it to TIRA.
persist_and_normalize_run(
    run,
    # Give your approach a short but descriptive name tag.
    system_name='duoT5-suchMaschinen',
    default_output='../data/run_duo_new',
    upload_to_tira=pt_dataset_new,
)

Sqlite3ScorerCache('monoT5_new.cache', <pyterrier_t5.MonoT5ReRanker object at 0x3581e5720>, group='query', key='docno'): 45150 hit(s), 0 miss(es)


duoT5: 100%|██████████| 86/86 [2:57:43<00:00, 123.99s/queries]  


Sqlite3ScorerCache('monoT5_new.cache', <pyterrier_t5.MonoT5ReRanker object at 0x3581e5720>, group='query', key='docno'): 45150 hit(s), 0 miss(es)
The run file is normalized outside the TIRA sandbox, I will store it at "../data/run_duo_new".
Done. run file is stored under "../data/run_duo_new/run.txt.gz".
Run uploaded to TIRA. Claim ownership via: https://www.tira.io/claim-submission/8df732f5-2a9a-4c3e-9ef1-5d9cb6939cd9


Run für Testdatensatz

In [41]:
# monoT5 run on the test dataset, using its own index and cache.
# NOTE: `pipeline` and `run` are reassigned by the other submission cells.
pipeline = mono_factory(mono_cutoff, bm25_test, monoT5_test, pt_dataset_test)
run = pipeline(pt_dataset_test.get_topics('text'))
# Per the recorded output: stores the run under `default_output` and uploads it to TIRA.
persist_and_normalize_run(
    run,
    # Give your approach a short but descriptive name tag.
    system_name='monoT5-BL-suchMaschinen', 
    default_output='../data/run_bl_test',
    upload_to_tira=pt_dataset_test,
)

Sqlite3ScorerCache('monoT5_test.cache', <pyterrier_t5.MonoT5ReRanker object at 0x35e32f3d0>, group='query', key='docno'): 24048 hit(s), 0 miss(es)
The run file is normalized outside the TIRA sandbox, I will store it at "../data/run_bl_test".
Done. run file is stored under "../data/run_bl_test/run.txt.gz".
Run uploaded to TIRA. Claim ownership via: https://www.tira.io/claim-submission/ac8c268a-68a7-4703-a592-d71b739d6d01


In [42]:
# duoT5 run on the test dataset.
# The mono stage is built here instead of reusing the global `pipeline`: every
# submission cell reassigns that name, so depending on it would pair duoT5 with
# whichever dataset's cell happened to run last.
duo_pipeline = duo_factory(
    duo_cutoff,
    mono_factory(mono_cutoff, bm25_test, monoT5_test, pt_dataset_test),
)
run = duo_pipeline(pt_dataset_test.get_topics('text'))
# Per the recorded output: stores the run under `default_output` and uploads it to TIRA.
persist_and_normalize_run(
    run,
    # Give your approach a short but descriptive name tag.
    system_name='duoT5-suchMaschinen',
    default_output='../data/run_duo_test',
    upload_to_tira=pt_dataset_test,
)

Sqlite3ScorerCache('monoT5_test.cache', <pyterrier_t5.MonoT5ReRanker object at 0x35e32f3d0>, group='query', key='docno'): 24048 hit(s), 0 miss(es)


duoT5: 100%|██████████| 46/46 [1:11:01<00:00, 92.63s/queries] 


Sqlite3ScorerCache('monoT5_test.cache', <pyterrier_t5.MonoT5ReRanker object at 0x35e32f3d0>, group='query', key='docno'): 24048 hit(s), 0 miss(es)
The run file is normalized outside the TIRA sandbox, I will store it at "../data/run_duo_test".
Done. run file is stored under "../data/run_duo_test/run.txt.gz".
Run uploaded to TIRA. Claim ownership via: https://www.tira.io/claim-submission/75ed84ca-690a-40d9-96da-dc1f930fc8ae
