# install deps

In [None]:
import os

# Detect if we are in the TIRA sandbox
# Install the required dependencies if we are not in the sandbox.
if 'TIRA_DATASET_ID' not in os.environ:
    !pip3 install python-terrier tira==0.0.88 ir_datasets
else:
    print('We are in the TIRA sandbox.')


# import libs

In [None]:
# Import the required libraries
print('importing libraries...')
# TIRA helpers: one ensures PyTerrier is loaded, the other persists a run to disk.
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
# Called before the project-local imports below — presumably because those
# modules depend on PyTerrier being loaded; confirm.
ensure_pyterrier_is_loaded()

# Project-local helper modules (expected next to this notebook).
from load_dataset import load_dataset
from create_index import create_index
from create_model import create_model
# NOTE(review): not used by any cell visible in this notebook — confirm it is still needed.
from generate_custom_stopwords_improved import generate_custom_stopwords_improved
print('Done. Libraries imported.')

In [None]:
import glob
import os
import shutil
from create_index import create_index
from load_dataset import load_dataset
from datetime import datetime
import json

# For every merged stopword list: build an index that uses the list, create a
# retrieval model on top of it, run the training queries and persist the run
# together with a small config.txt describing the experiment.
folder_path = './stopwordlists/merged/'
pattern = 'merged_*.txt'
# Sorted so the experiments are processed in a reproducible order.
merged_stopword_files = sorted(glob.glob(folder_path + pattern))
training_dataset = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
# Load the dataset once and take the queries from that result (previously the
# dataset was loaded twice and the first result was discarded).
load_dataset_result = load_dataset(training_dataset)
queries = load_dataset_result['queries']
output_dir = 'runs/training/improved-stopwords'

for improved_stopwords in merged_stopword_files:
    # The run is named after the stopword file itself, e.g. 'merged_3.txt'.
    run_name = os.path.basename(improved_stopwords)
    config = {'stopwords': improved_stopwords, 'stemmer': None}
    print("RUN FOR STOPWORDS:", run_name)

    # The documents are deliberately re-loaded for every index: they may be a
    # one-shot iterator that is exhausted after indexing — TODO confirm; if they
    # are re-iterable, load_dataset_result['documents'] can be reused instead.
    improved_index = create_index(load_dataset(training_dataset)['documents'], config)
    print("index created")

    improved_model = create_model(improved_index)
    print("model created")

    run = improved_model(queries)

    run_output_dir = output_dir + '/' + run_name

    # Start from an empty output directory (pure-Python equivalent of
    # `rm -Rf` followed by `mkdir -p`, robust against unusual characters in paths).
    shutil.rmtree(run_output_dir, ignore_errors=True)
    os.makedirs(run_output_dir, exist_ok=True)

    persist_and_normalize_run(run, run_name, run_output_dir)

    # Count the entries of the stopword list. Previously the length of the file
    # *path* string was reported. Assumes one stopword per non-empty line — TODO confirm.
    with open(improved_stopwords, encoding='utf-8', errors='replace') as stopword_file:
        stopword_count = sum(1 for line in stopword_file if line.strip())

    # Opening with mode 'w' creates the file, so no separate `touch` is needed.
    with open(f"{run_output_dir}/config.txt", 'w') as output_file:
        output_file.write(f"{json.dumps(config)} -- stopwords length = {stopword_count}")

In [2]:
import glob

# Collect the persisted run.txt of every improved-stopword experiment, print
# the paths and show how many were found.
folder_path = 'runs/training/improved-stopwords/'
pattern = '**/run.txt'
search_expression = ''.join((folder_path, pattern))
files = glob.glob(search_expression)

print(files)
len(files)

['runs/training/improved-stopwords/merged_17711.txt/run.txt', 'runs/training/improved-stopwords/merged_3.txt/run.txt', 'runs/training/improved-stopwords/merged_610.txt/run.txt', 'runs/training/improved-stopwords/merged_987.txt/run.txt', 'runs/training/improved-stopwords/merged_8.txt/run.txt', 'runs/training/improved-stopwords/merged_1597.txt/run.txt', 'runs/training/improved-stopwords/merged_233.txt/run.txt', 'runs/training/improved-stopwords/merged_89.txt/run.txt', 'runs/training/improved-stopwords/merged_10946.txt/run.txt', 'runs/training/improved-stopwords/merged_13.txt/run.txt', 'runs/training/improved-stopwords/merged_2584.txt/run.txt', 'runs/training/improved-stopwords/merged_2.txt/run.txt', 'runs/training/improved-stopwords/merged_21.txt/run.txt', 'runs/training/improved-stopwords/merged_377.txt/run.txt', 'runs/training/improved-stopwords/merged_34.txt/run.txt', 'runs/training/improved-stopwords/merged_55.txt/run.txt', 'runs/training/improved-stopwords/merged_6765.txt/run.txt', 

19

# Compare to the English long-text model without a stemmer

In [1]:
from trectools import TrecRun, TrecQrel, TrecEval
from tira.rest_api_client import Client
# Import the module (not `from glob import glob`): rebinding the name `glob` to
# the function breaks the `glob.glob(...)` calls used by the other cells.
import glob
import pandas as pd
# TIRA client used by load_qrels to download the truth data.
tira = Client()

# Run files of all improved-stopword experiments.
folder_path = 'runs/training/improved-stopwords/'
pattern = '**/run.txt'
files = glob.glob(folder_path + pattern)


def load_qrels(dataset):
    """Download the truth data of `dataset` and load its relevance judgments.

    Args:
        dataset: Dataset identifier within the 'ir-lab-jena-leipzig-wise-2023' task.

    Returns:
        A TrecQrel built from the 'qrels.txt' file inside the downloaded directory.
    """
    truth_directory = tira.download_dataset(
        'ir-lab-jena-leipzig-wise-2023', dataset, truth_dataset=True
    )
    qrels_path = truth_directory + '/qrels.txt'
    return TrecQrel(qrels_path)

def evaluate_run(qrels, runFile):
    """Evaluate a single run file against the given relevance judgments.

    The former `print(trec_eval.evaluate_all())` was removed: it computed the
    complete metric set, yet the recorded cell output shows that printing the
    result only emitted "Data file not set yet".

    Args:
        qrels: Relevance judgments handed to TrecEval (e.g. from load_qrels).
        runFile: Path of the run file to load with TrecRun.

    Returns:
        A dict with the run id and the selected effectiveness measures.
    """
    run = TrecRun(runFile)
    trec_eval = TrecEval(run, qrels)

    return {
        'run': run.get_runid(),
        'nDCG@10': trec_eval.get_ndcg(depth=10),
        'nDCG@10 (unjudgedRemoved)': trec_eval.get_ndcg(depth=10, removeUnjudged=True),
        # Note: MAP is computed with a cut-off depth of 10.
        'MAP': trec_eval.get_map(depth=10),
        'MRR': trec_eval.get_reciprocal_rank(),
        'P@10': trec_eval.get_precision(depth=10),
        # Precision at the library's default depth.
        'P': trec_eval.get_precision()
    }

def test_model(runFile):
    training_qrels = load_qrels('training-20231104-training')

    print("Overall performance:\n")
    print(evaluate_run(training_qrels, runFile))
    print("\n")

# Evaluate every improved-stopword run plus the reference run and rank them by
# nDCG@10. All runs are judged against the same qrels, so they are loaded once
# instead of being downloaded and parsed again for every run file.
training_qrels = load_qrels('training-20231104-training')

result = []

for file in files:
    print("file: ", file)
    result.append(evaluate_run(training_qrels, file))
    print("-----------------DONE-----------------")

print("Reference:")
reference_file = 'runs/standard_stopwords/run.txt'
result.append(evaluate_run(training_qrels, reference_file))

# One row per run; the sorted frame is the cell's displayed output.
df = pd.DataFrame(result)
df.sort_values('nDCG@10', ascending=False)


No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
file:  runs/training/improved-stopwords/merged_17711.txt/run.txt


  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()


Data file not set yet


  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()


-----------------DONE-----------------
file:  runs/training/improved-stopwords/merged_3.txt/run.txt


KeyboardInterrupt: 

In [3]:
def evaluate_run(qrels, runFile):
    """Evaluate a single run file against the given relevance judgments.

    A fragment that computed precision/nDCG at several depths, MAP, GMAP,
    bpref, R-prec and MRR through `self` was removed: it was indented
    inconsistently (IndentationError), `self` is undefined in this function,
    and none of its results were used. The `print(trec_eval.evaluate_all())`
    was removed as well, because the recorded cell output shows it only
    emitted "Data file not set yet".

    Args:
        qrels: Relevance judgments handed to TrecEval (e.g. from load_qrels).
        runFile: Path of the run file to load with TrecRun.

    Returns:
        A dict with the run id and the selected effectiveness measures.
    """
    run = TrecRun(runFile)
    trec_eval = TrecEval(run, qrels)

    return {
        'run': run.get_runid(),
        'nDCG@10': trec_eval.get_ndcg(depth=10),
        'nDCG@10 (unjudgedRemoved)': trec_eval.get_ndcg(depth=10, removeUnjudged=True),
        # Note: MAP is computed with a cut-off depth of 10.
        'MAP': trec_eval.get_map(depth=10),
        'MRR': trec_eval.get_reciprocal_rank(),
        'P@10': trec_eval.get_precision(depth=10),
        # Precision at the library's default depth.
        'P': trec_eval.get_precision()
    }


# Spot-check of the function on a single run; the returned dict is the cell output.
# NOTE(review): `file` is not defined in this cell — it appears to be the loop
# variable left over from the evaluation loop of an earlier cell; confirm.
evaluate_run(load_qrels('training-20231104-training'), file)

  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()


Data file not set yet


  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()


{'run': 'merged_17711.txt',
 'nDCG@10': 0.17477708457067012,
 'nDCG@10 (unjudgedRemoved)': 0.5296970226402502,
 'MAP': 0.11660141778037744,
 'MRR': 0.26498239884607894,
 'P@10': 0.09150521609538004,
 'P': 0.0032742175856929957}

In [4]:

# Imported here so the cell works even after another cell has rebound the
# name `glob` to the function via `from glob import glob`.
import glob

# List the merged stopword files that are available for the experiments.
folder_path = './stopwordlists/merged/'
pattern = 'merged_*.txt'
merged_stopword_files = glob.glob(folder_path + pattern)

# Previously printed the undefined (truncated) name `merged_`, raising a NameError.
print(merged_stopword_files)