# Einrichtung
- Installieren der Packages
- Initialisieren von PyTerrier
- Laden der Datensätze

In [None]:
!pip3 install --upgrade tira ir-datasets python-terrier 
!pip3 install --upgrade pyterrier-caching pyterrier_t5
# !pip3 install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git

In [1]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client
import pyterrier as pt

# Start PyTerrier's Java backend only if this kernel has not started it yet,
# so the cell can be re-run safely.
if not pt.java.started():
    pt.java.init()

# TIRA integration hook; presumably verifies/prepares PyTerrier for the TIRA
# environment — TODO confirm against the tira documentation.
ensure_pyterrier_is_loaded()
# TIRA REST client instance kept for later use in the notebook.
tira = Client()

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


In [2]:
from pyterrier import get_dataset

# Resolve the three ir_datasets-backed collections in one pass:
# deep-learning training set, RAG training set and the IR-lab test set.
pt_dataset, pt_dataset_new, pt_dataset_test = (
    get_dataset(irds_id)
    for irds_id in (
        'irds:ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training',
        'irds:ir-lab-wise-2024/subsampled-ms-marco-rag-20250105-training',
        'irds:ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test',
    )
)

# Indizes erstellen
- für alle drei Datensätze wird ein Index erstellt

In [3]:
from pyterrier import IterDictIndexer

# Index the corpus of `pt_dataset` from scratch.
indexer = IterDictIndexer(
    "../data/index",                    # on-disk location of this index
    overwrite=True,                     # replace an index that already exists there
    meta={'docno': 50, 'text': 4096},   # per-document metadata fields and their length limits
)
index = indexer.index(pt_dataset.get_corpus_iter())

ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:  38%|███▊      | 25752/68261 [00:04<00:06, 6770.22it/s]



ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents: 100%|██████████| 68261/68261 [00:08<00:00, 7873.27it/s] 


14:18:57.029 [ForkJoinPool-1-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents


In [4]:
# Index the corpus of `pt_dataset_new` from scratch.
indexer_new = IterDictIndexer(
    "../data/index_new",                # on-disk location of this index
    overwrite=True,                     # replace an index that already exists there
    meta={'docno': 50, 'text': 4096},   # per-document metadata fields and their length limits
)
index_new = indexer_new.index(pt_dataset_new.get_corpus_iter())

ir-lab-wise-2024/subsampled-ms-marco-rag-20250105-training documents: 100%|██████████| 113227/113227 [00:31<00:00, 3557.55it/s]


In [5]:
# Index the corpus of `pt_dataset_test` from scratch.
indexer_test = IterDictIndexer(
    "../data/index_test",               # on-disk location of this index
    overwrite=True,                     # replace an index that already exists there
    meta={'docno': 50, 'text': 4096},   # per-document metadata fields and their length limits
)
index_test = indexer_test.index(pt_dataset_test.get_corpus_iter())

ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test documents: 100%|██████████| 125112/125112 [00:43<00:00, 2903.12it/s]


# Pipelines definieren
- BM25 mit den jeweiligen Indizes
- MonoT5 in gecachter Variante
- DuoT5 in Kombination mit MonoT5 (gecacht)

In [6]:
from pyterrier import BatchRetrieve

# One BM25 retriever per index. `pt.terrier.Retriever` replaces the deprecated
# `BatchRetrieve` alias (which emits a deprecation warning on use) and accepts
# the same arguments.
bm25 = pt.terrier.Retriever(index, wmodel="BM25")
bm25_new = pt.terrier.Retriever(index_new, wmodel="BM25")
bm25_test = pt.terrier.Retriever(index_test, wmodel="BM25")

  bm25 = BatchRetrieve(index, wmodel="BM25")
  bm25_new = BatchRetrieve(index_new, wmodel="BM25")
  bm25_test = BatchRetrieve(index_test, wmodel="BM25")


In [24]:
from pyterrier_t5 import MonoT5ReRanker, DuoT5ReRanker
from pyterrier_caching import SparseScorerCache
# monoT5 is wrapped in a score cache right away so repeated (query, docno)
# pairs are read from 'monoT5_fix.cache' instead of being re-scored.
monoT5 = SparseScorerCache('monoT5_fix.cache', MonoT5ReRanker(), verbose=True)
duoT5 = DuoT5ReRanker()

# Stage 1: rerank the BM25 top 100 with monoT5 (document text is attached first),
# then append the remaining BM25 results via the `^` operator.
pipeline_mono_t5 = (
    ((bm25 % 100) >> pt.text.get_text(pt_dataset, "text") >> monoT5)
    ^ bm25
)

# Stage 2: rerank the top 5 of stage 1 with duoT5, then append the rest of stage 1.
pipeline_duo_t5 = (
    ((pipeline_mono_t5 % 5) >> duoT5)
    ^ pipeline_mono_t5
)

# Testen auf dem Standard-Datensatz

In [26]:
from pyterrier import Experiment
# Smoke test: evaluate both reranking pipelines on the first topic only.
Experiment(
    [pipeline_mono_t5, pipeline_duo_t5],
    pt_dataset.get_topics('text').head(1),
    pt_dataset.get_qrels(),
    ["ndcg_cut_10", "P_10"],
    names=["BM25+monoT5", "BM25+monoT5+duoT5"],
    verbose=True,
)

pt.Experiment:   0%|          | 0/2 [00:00<?, ?system/s]

Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x150498d90>, group='query', key='docno'): 100 hit(s), 0 miss(es)
Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x150498d90>, group='query', key='docno'): 100 hit(s), 0 miss(es)


duoT5: 100%|██████████| 1/1 [00:04<00:00,  4.48s/queries]
pt.Experiment: 100%|██████████| 2/2 [00:04<00:00,  2.39s/system]

Sqlite3ScorerCache('monoT5_fix.cache', <pyterrier_t5.MonoT5ReRanker object at 0x150498d90>, group='query', key='docno'): 100 hit(s), 0 miss(es)





Unnamed: 0,name,ndcg_cut_10,P_10
0,BM25+monoT5,0.627356,0.4
1,BM25+monoT5+duoT5,0.748086,0.4


# Optimieren von MonoT5 auf dem Standard-Datensatz

In [None]:
def optimize_monot5(first_x_topics=None, mono_cutoffs=range(25, 5000, 25), duo_cutoffs=range(5, 6)):
    """Sweep the BM25 cutoff fed into monoT5 and compare monoT5 with monoT5+duoT5.

    For every combination of cutoffs, BM25 results are reranked by the cached
    monoT5 scorer, the top ``duo_cutoff`` of those are reranked by the cached
    duoT5 scorer, and both runs are evaluated against the qrels of ``pt_dataset``.

    Args:
        first_x_topics: Number of leading topics to use. ``None`` (default)
            uses every topic of ``pt_dataset``.
        mono_cutoffs: Iterable of BM25 cutoffs, i.e. how many BM25 results per
            query are handed to monoT5.
        duo_cutoffs: Iterable of cutoffs for how many monoT5 results per query
            are handed to duoT5.

    Returns:
        A list with one dict per (mono_cutoff, duo_cutoff) pair containing the
        cutoffs, nDCG@5 and nDCG@10 of both runs and the nDCG@10 p-value of
        monoT5+duoT5 measured against the monoT5 baseline.
    """
    mono_cached = SparseScorerCache(
        'monoT5.cache', pt.text.get_text(pt_dataset, "text") >> monoT5, verbose=True
    )
    # NOTE(review): SparseScorerCache stores one score per (query, docno). duoT5 is a
    # pairwise scorer, so its score for a document presumably depends on the other
    # candidates; cached values may be stale once the top-k set changes — confirm.
    duo_cached = SparseScorerCache(
        'duoT5.cache', pt.text.get_text(pt_dataset, "text") >> duoT5, verbose=True
    )

    # Topics and qrels do not change during the sweep, so load them once.
    topics = pt_dataset.get_topics('text')
    if first_x_topics is not None:
        topics = topics.head(first_x_topics)
    qrels = pt_dataset.get_qrels()

    records = []
    for mono_cutoff in mono_cutoffs:
        # BM25 on the selected topics, truncated to `mono_cutoff` documents per query.
        bm25_results = (bm25 % mono_cutoff).transform(topics)
        mono_results = mono_cached.transform(bm25_results)

        for duo_cutoff in duo_cutoffs:
            # Only the monoT5 top `duo_cutoff` documents are passed on to duoT5.
            # NOTE(review): the duoT5 run therefore holds at most `duo_cutoff`
            # documents per query, while the monoT5 run holds `mono_cutoff`.
            mono_top = (mono_cached % duo_cutoff).transform(bm25_results)
            duo_results = duo_cached.transform(mono_top)

            # Evaluate on exactly the topics the runs were produced for, so that
            # queries without results do not distort the averages.
            exp = Experiment(
                [mono_results, duo_results],
                topics=topics,
                qrels=qrels,
                eval_metrics=['ndcg_cut_5', 'ndcg_cut_10', "mrt"],
                names=["monoT5", "monoT5+duoT5"],
                round=3,
                baseline=0,
            )
            mono_ndcg = exp['ndcg_cut_10'].iloc[0]
            duo_ndcg = exp['ndcg_cut_10'].iloc[1]
            p_value = exp['ndcg_cut_10 p-value'].iloc[1]
            records.append({
                'mono_cutoff': mono_cutoff,
                'duo_cutoff': duo_cutoff,
                'mono_ndcg_5': exp['ndcg_cut_5'].iloc[0],
                'duo_ndcg_5': exp['ndcg_cut_5'].iloc[1],
                'mono_ndcg': mono_ndcg,
                'duo_ndcg': duo_ndcg,
                'p_value': p_value,
            })
            print('Mono Cutoff:', mono_cutoff, ', Duo Cutoff:', duo_cutoff, ', NDCG@10 MonoT5:', mono_ndcg, ', NDCG@10 DuoT5:', duo_ndcg, ', p-value:', p_value)
    return records

experiment = optimize_monot5()

# Abgabe der Runs
- je Datensatz zwei Runs (MonoT5 und MonoT5+DuoT5)

In [None]:
# Plain BM25 runs over all topics of each dataset, each using the retriever
# built on that dataset's own index.
run_baseline = bm25.transform(pt_dataset.get_topics('text'))
run_baseline_new = bm25_new.transform(pt_dataset_new.get_topics('text'))
run_baseline_test = bm25_test.transform(pt_dataset_test.get_topics('text'))