# Einrichtung
- Installieren der Packages
- Initialisieren von PyTerrier
- Laden der Datensätze

In [None]:
!pip3 install --upgrade tira ir-datasets python-terrier 
!pip3 install --upgrade pyterrier-caching pyterrier_t5
# !pip3 install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git

In [1]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client
import pyterrier as pt

# Start PyTerrier's Java backend only when it is not already running, so the
# cell can be re-executed without initialising it a second time.
if not pt.java.started():
    pt.java.init()

# TIRA helper — presumably verifies/initialises PyTerrier for the TIRA
# environment; confirm against the tira documentation.
ensure_pyterrier_is_loaded()
# TIRA REST client. NOTE(review): not referenced again in the visible cells.
tira = Client()

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


In [2]:
from pyterrier import get_dataset

# ir_datasets identifiers of the three collections used in this notebook, in
# the order: deep-learning training set, RAG training set, IR-lab test set.
_DATASET_IDS = (
    'irds:ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training',
    'irds:ir-lab-wise-2024/subsampled-ms-marco-rag-20250105-training',
    'irds:ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test',
)

# Resolve each identifier to a PyTerrier dataset object (same order as above).
pt_dataset, pt_dataset_new, pt_dataset_test = map(get_dataset, _DATASET_IDS)

# Indizes erstellen
- für alle drei Datensätze wird ein Index erstellt

In [3]:
from pyterrier import IterDictIndexer

# Index the deep-learning training corpus into ../data/index.
#   meta      -> metadata fields kept per document with their reserved length
#                (docno: 50, text: 4096)
#   overwrite -> replace an index that already exists at that location
indexer = IterDictIndexer("../data/index", meta={'docno': 50, 'text': 4096}, overwrite=True)
corpus_docs = pt_dataset.get_corpus_iter()
index = indexer.index(corpus_docs)

ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:  38%|███▊      | 25752/68261 [00:04<00:06, 6770.22it/s]



ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents: 100%|██████████| 68261/68261 [00:08<00:00, 7873.27it/s] 


14:18:57.029 [ForkJoinPool-1-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents


In [4]:
# Index the RAG training corpus into ../data/index_new.
#   meta      -> metadata fields kept per document with their reserved length
#                (docno: 50, text: 4096)
#   overwrite -> replace an index that already exists at that location
indexer_new = IterDictIndexer("../data/index_new", meta={'docno': 50, 'text': 4096}, overwrite=True)
corpus_docs_new = pt_dataset_new.get_corpus_iter()
index_new = indexer_new.index(corpus_docs_new)

ir-lab-wise-2024/subsampled-ms-marco-rag-20250105-training documents: 100%|██████████| 113227/113227 [00:31<00:00, 3557.55it/s]


In [5]:
# Index the IR-lab test corpus into ../data/index_test.
#   meta      -> metadata fields kept per document with their reserved length
#                (docno: 50, text: 4096)
#   overwrite -> replace an index that already exists at that location
indexer_test = IterDictIndexer("../data/index_test", meta={'docno': 50, 'text': 4096}, overwrite=True)
corpus_docs_test = pt_dataset_test.get_corpus_iter()
index_test = indexer_test.index(corpus_docs_test)

ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test documents: 100%|██████████| 125112/125112 [00:43<00:00, 2903.12it/s]


# Pipelines definieren
- BM25 mit den jeweiligen Indizes
- MonoT5 in gecachter Variante
- DuoT5 in Kombination mit MonoT5 (gecacht)

In [6]:
from pyterrier import BatchRetrieve

# BM25 first-stage retrievers, one per index (training, RAG training, test).
# `pt.terrier.Retriever` is the current name of the Terrier retriever; the
# `BatchRetrieve` alias used before is deprecated and emits a warning on use.
bm25 = pt.terrier.Retriever(index, wmodel="BM25")
bm25_new = pt.terrier.Retriever(index_new, wmodel="BM25")
bm25_test = pt.terrier.Retriever(index_test, wmodel="BM25")

  bm25 = BatchRetrieve(index, wmodel="BM25")
  bm25_new = BatchRetrieve(index_new, wmodel="BM25")
  bm25_test = BatchRetrieve(index_test, wmodel="BM25")


In [30]:
from pyterrier_t5 import MonoT5ReRanker, DuoT5ReRanker
from pyterrier_caching import SparseScorerCache
# Un-cached monoT5 re-ranker with default settings; it is wrapped by the
# SparseScorerCache instances created further down in this cell.
_monoT5 = MonoT5ReRanker()
# duoT5 re-ranker with default settings; used directly (no cache wrapper).
duoT5 = DuoT5ReRanker()

def mono_factory(cutoff, bm25, monoT5, dataset=None):
    """Build a BM25 -> monoT5 re-ranking pipeline.

    The top ``cutoff`` results of ``bm25`` get their document text attached and
    are re-scored by ``monoT5``; the result is then concatenated (``^``) with
    the full ``bm25`` ranking.

    Args:
        cutoff: Number of first-stage results passed on to monoT5.
        bm25: First-stage retriever.
        monoT5: monoT5 scorer (optionally wrapped in a cache).
        dataset: Dataset the document text is fetched from. It has to be the
            collection that ``bm25`` was indexed on. Defaults to the
            module-level ``pt_dataset`` (the previous hard-coded behaviour).

    Returns:
        The composed PyTerrier pipeline.
    """
    if dataset is None:
        # Backward-compatible default: the training collection.
        dataset = pt_dataset
    return (bm25 % cutoff >> pt.text.get_text(dataset, "text") >> monoT5) ^ (bm25)

def duo_pipeline(cutoff, mono_pipeline):
    """Put a duoT5 stage on top of an existing mono pipeline.

    The top ``cutoff`` results of ``mono_pipeline`` are re-ranked by the
    module-level ``duoT5`` transformer; the outcome is concatenated (``^``)
    with the complete ``mono_pipeline`` ranking.
    """
    duo_stage = mono_pipeline % cutoff >> duoT5
    return duo_stage ^ mono_pipeline

# One on-disk score cache per collection, all wrapping the same monoT5 model.
monoT5, monoT5_new, monoT5_test = (
    SparseScorerCache(cache_path, _monoT5, verbose=True)
    for cache_path in ('monoT5_fix.cache', 'monoT5_new_fix.cache', 'monoT5_test_fix.cache')
)

# Default pipelines on the training collection: monoT5 re-scores every BM25
# result (no cutoff); duoT5 then re-ranks the top 5 of that ranking.
pipeline_mono_t5 = (bm25 >> pt.text.get_text(pt_dataset, "text") >> monoT5) ^ (bm25)
pipeline_duo_t5 = duo_pipeline(5, pipeline_mono_t5)

# Testen auf dem Standard-Datensatz

In [33]:
from pyterrier import Experiment
# Run experiment
# Evaluate the un-tuned BM25+monoT5 pipeline on the training topics.
# To include duoT5 as well, append `pipeline_duo_t5` to the systems list and
# "BM25+monoT5+duoT5" to `names`.
Experiment(
    [pipeline_mono_t5],
    pt_dataset.get_topics('text'),
    pt_dataset.get_qrels(),
    eval_metrics=["ndcg_cut_10", "P_10"],
    names=["BM25+monoT5"],
    verbose=False,
)

monoT5: 100%|██████████| 22380/22380 [2:26:28<00:00,  2.55batches/s]  


Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 1550 hit(s), 89517 miss(es)


Unnamed: 0,name,ndcg_cut_10,P_10
0,BM25+monoT5,0.727375,0.787629


# Optimieren von MonoT5 auf dem Standard-Datensatz

In [None]:
def optimize_monot5(cutoffs=range(25, 1000, 25)):
    """Grid-search the number of BM25 results that monoT5 re-ranks.

    For every candidate cutoff a BM25 -> monoT5 pipeline is evaluated on the
    training dataset (``pt_dataset``) and its nDCG@10 is printed.

    Args:
        cutoffs: Iterable of candidate cutoffs. Defaults to 25, 50, ..., 975.

    Returns:
        The cutoff with the highest nDCG@10. On ties the earliest candidate
        wins; ``0`` is returned if no candidate scores above 0.
    """
    # Topics and qrels do not depend on the cutoff, so load them only once.
    topics = pt_dataset.get_topics('text')
    qrels = pt_dataset.get_qrels()

    max_ndcg = 0
    best_cutoff = 0
    for mono_cutoff in cutoffs:
        pipeline = (bm25 % mono_cutoff >> pt.text.get_text(pt_dataset, "text") >> monoT5) ^ (bm25)
        # NOTE(review): baseline=0 with a single system presumably only adds
        # significance columns; kept so the reported numbers stay unchanged.
        exp = Experiment(
            [pipeline],
            topics=topics,
            qrels=qrels,
            eval_metrics=['ndcg_cut_10'],
            names=["monoT5"],
            round=3,
            baseline=0,
        )
        ndcg = exp['ndcg_cut_10'][0]
        # Strict comparison: the first cutoff reaching the maximum is kept.
        if ndcg > max_ndcg:
            max_ndcg = ndcg
            best_cutoff = mono_cutoff
        print('Mono Cutoff:', mono_cutoff, ', NDCG@10:', ndcg)
    return best_cutoff

# Run the monoT5 cutoff grid search and report the winner
# (the printed message reads "The optimal mono cutoff is:").
mono_cutoff = optimize_monot5()
print("Der optimale Mono-Cutoff ist:", mono_cutoff)

Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 2405 hit(s), 0 miss(es)
Mono Cutoff: 25 , NDCG@10: 0.622
Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 4783 hit(s), 0 miss(es)
Mono Cutoff: 50 , NDCG@10: 0.662
Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 7158 hit(s), 0 miss(es)
Mono Cutoff: 75 , NDCG@10: 0.68
Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 9533 hit(s), 0 miss(es)
Mono Cutoff: 100 , NDCG@10: 0.684
Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 11908 hit(s), 0 miss(es)
Mono Cutoff: 125 , NDCG@10: 0.684
Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', k

# Optimierung von DuoT5
- es wird der beste ermittelte MonoT5-Cutoff (525) genutzt; damit werden verschiedene DuoT5-Cutoffs (3, 5, 10, 15) getestet
- der beste DuoT5-Cutoff wird gewählt

In [None]:
def optimize_duot5(mono_cutoff=525, duo_cutoffs=(3, 5, 10, 15)):
    """Grid-search the number of monoT5 results that duoT5 re-ranks.

    A BM25 -> monoT5 pipeline with the given ``mono_cutoff`` is built once;
    for every candidate duo cutoff a duoT5 stage is put on top of it, the
    result is evaluated on the training dataset (``pt_dataset``) and its
    nDCG@10 is printed.

    Args:
        mono_cutoff: Number of BM25 results re-ranked by monoT5.
        duo_cutoffs: Iterable of candidate duoT5 cutoffs.

    Returns:
        The duo cutoff with the highest nDCG@10. On ties the earliest
        candidate wins; ``0`` is returned if no candidate scores above 0.
    """
    # None of these depend on the duo cutoff, so create them once up front.
    topics = pt_dataset.get_topics('text')
    qrels = pt_dataset.get_qrels()
    mono_stage = (bm25 % mono_cutoff >> pt.text.get_text(pt_dataset, "text") >> monoT5) ^ (bm25)

    max_ndcg = 0
    best_cutoff = 0
    for duo_cutoff in duo_cutoffs:
        pipeline = (mono_stage % duo_cutoff >> duoT5) ^ mono_stage
        # NOTE(review): baseline=0 with a single system presumably only adds
        # significance columns; kept so the reported numbers stay unchanged.
        exp = Experiment(
            [pipeline],
            topics=topics,
            qrels=qrels,
            eval_metrics=['ndcg_cut_10'],
            names=["duoT5"],
            round=3,
            baseline=0,
        )
        ndcg = exp['ndcg_cut_10'][0]
        # Strict comparison: the first cutoff reaching the maximum is kept.
        if ndcg > max_ndcg:
            max_ndcg = ndcg
            best_cutoff = duo_cutoff
        print('Duo Cutoff:', duo_cutoff, ', NDCG@10:', ndcg)
    return best_cutoff

# Run the duoT5 cutoff grid search on top of the chosen mono cutoff and report
# the winner (the printed message reads "The optimal duo cutoff is:").
duo_cutoff = optimize_duot5(mono_cutoff)
print("Der optimale Duo-Cutoff ist:", duo_cutoff)

Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)


duoT5: 100%|██████████| 97/97 [01:12<00:00,  1.34queries/s]


Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)
Duo Cutoff: 3 , NDCG@10: 0.73
Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)


duoT5: 100%|██████████| 97/97 [04:23<00:00,  2.72s/queries]


Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)
Duo Cutoff: 5 , NDCG@10: 0.733
Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x15047e7d0>, group='query', key='docno'): 49127 hit(s), 0 miss(es)


duoT5:  31%|███       | 30/97 [08:35<17:23, 15.58s/queries]

In [None]:
# Final cutoffs. They are fixed here so the submission cells do not require
# re-running the (slow) grid searches; this overrides the values the search
# cells assigned to the same names.
mono_cutoff = 525
duo_cutoff = 15

# Tuned BM25 -> monoT5 pipeline for the training collection.
pipeline_mono_optimized = (bm25 % mono_cutoff >> pt.text.get_text(pt_dataset, "text") >> monoT5) ^ (bm25)
# duoT5 must sit on top of the *tuned* mono pipeline. Building it on
# `pipeline_mono_t5` (the default pipeline without a cutoff) would silently
# ignore `mono_cutoff`.
pipeline_duo_optimized = (pipeline_mono_optimized % duo_cutoff >> duoT5) ^ pipeline_mono_optimized


# Abgabe der Runs
- je Datensatz zwei Runs (MonoT5 und MonoT5+DuoT5)

In [None]:
# Every collection has its own index, document store and monoT5 cache, so the
# topics of a collection must go through a pipeline built from *its own*
# components. Sending the RAG/test topics through `pipeline_mono_optimized`
# would search the training index and return training documents.
run_baseline = pipeline_mono_optimized(pt_dataset.get_topics('text'))

pipeline_mono_optimized_new = (
    bm25_new % mono_cutoff >> pt.text.get_text(pt_dataset_new, "text") >> monoT5_new
) ^ bm25_new
run_baseline_new = pipeline_mono_optimized_new(pt_dataset_new.get_topics('text'))

pipeline_mono_optimized_test = (
    bm25_test % mono_cutoff >> pt.text.get_text(pt_dataset_test, "text") >> monoT5_test
) ^ bm25_test
run_baseline_test = pipeline_mono_optimized_test(pt_dataset_test.get_topics('text'))

In [None]:
def _duo_pipeline_for(retriever, dataset, mono_scorer):
    """Build the tuned BM25 -> monoT5 -> duoT5 pipeline for one collection.

    ``retriever``, ``dataset`` and ``mono_scorer`` must all belong to the same
    collection; the cutoffs come from the module-level ``mono_cutoff`` and
    ``duo_cutoff``.
    """
    mono_stage = (retriever % mono_cutoff >> pt.text.get_text(dataset, "text") >> mono_scorer) ^ retriever
    return (mono_stage % duo_cutoff >> duoT5) ^ mono_stage


# The training topics use the pipeline defined above; the RAG and test topics
# need pipelines built on their own index, document store and monoT5 cache,
# otherwise they would be answered from the training index.
run_duo = pipeline_duo_optimized(pt_dataset.get_topics('text'))

pipeline_duo_optimized_new = _duo_pipeline_for(bm25_new, pt_dataset_new, monoT5_new)
run_duo_new = pipeline_duo_optimized_new(pt_dataset_new.get_topics('text'))

pipeline_duo_optimized_test = _duo_pipeline_for(bm25_test, pt_dataset_test, monoT5_test)
run_duo_test = pipeline_duo_optimized_test(pt_dataset_test.get_topics('text'))

In [None]:
from tira.third_party_integrations import persist_and_normalize_run
# Persist and upload the monoT5 baseline run of each collection under the same
# system name, in the order training -> RAG training -> test.
# NOTE(review): all three calls share one `default_output` directory — verify
# that a later run does not overwrite an earlier one on disk.
for run, dataset in (
    (run_baseline, pt_dataset),
    (run_baseline_new, pt_dataset_new),
    (run_baseline_test, pt_dataset_test),
):
    persist_and_normalize_run(
        run,
        # Short but descriptive name tag of the approach.
        system_name='monoT5-BL-suchMaschinen',
        default_output='../data/runs',
        upload_to_tira=dataset,
    )

In [None]:
# Persist and upload the monoT5+duoT5 run of each collection under the same
# system name, in the order training -> RAG training -> test.
# NOTE(review): all three calls share one `default_output` directory — verify
# that a later run does not overwrite an earlier one on disk.
for run, dataset in (
    (run_duo, pt_dataset),
    (run_duo_new, pt_dataset_new),
    (run_duo_test, pt_dataset_test),
):
    persist_and_normalize_run(
        run,
        # Short but descriptive name tag of the approach.
        system_name='duoT5-suchMaschinen',
        default_output='../data/runs',
        upload_to_tira=dataset,
    )