In [13]:
import os

# Detect if we are in the TIRA sandbox
# Install the required dependencies if we are not in the sandbox.
if 'TIRA_DATASET_ID' not in os.environ:
    !pip3 install python-terrier tira==0.0.88 ir_datasets
else:
    print('We are in the TIRA sandbox.')


[0m

In [19]:
# Import the required libraries
print('importing libraries...')
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
ensure_pyterrier_is_loaded()

from load_dataset import load_dataset
from create_index import create_index
from create_model import create_model
from generate_custom_stopwords import generate_custom_stopwords
print('Done. Libraries imported.')

importing libraries...
Due to execution in TIRA, I have patched ir_datasets to always return the single input dataset mounted to the sandbox.
Done. Libraries imported.


In [3]:
import uuid
from datetime import datetime
training_dataset = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'

# load data
load_dataset_result = load_dataset(training_dataset)
documents, queries = load_dataset_result['documents'], load_dataset_result['queries']
print("data load")

# generate stopwords
# generate_custom_stopwords(documents)
# print("generated stopwords")

# create index
stopwords = './stopwordlists/custom_stopwords.txt'
if not os.path.exists(stopwords):
        raise ValueError('Could not find stopwords file at %s' % stopwords)

print('I will use a custom stopwords list at %s' % stopwords)
index = create_index(load_dataset(training_dataset)['documents'], 
                     {'stopwords': stopwords, 'stemmer': None})
print("index created")

# create model
model = create_model(index)
print("model created")

# run model
run_name = "stopwords-from-non-relevant-files-without-stemmer"
system = f"{run_name}-{datetime.now().strftime('%d-%m-%Y-%H-%M')}"
run = model(queries)

output_dir = 'runs/training'

run_output_dir = output_dir + '/' + system

!rm -Rf {run_output_dir}
!mkdir -p {run_output_dir}

persist_and_normalize_run(run, system, run_output_dir)

No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
Load ir_dataset "ir-lab-jena-leipzig-wise-2023/training-20231104-training" from tira.
data load
I will use a custom stopwords list at ./stopwordlists/custom_stopwords.txt
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
Load ir_dataset "ir-lab-jena-leipzig-wise-2023/training-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
index created
model created
The run file is normalized outside the TIRA sandbox, I will store it at "runs/training/custom-stopwords-without-stemmer-14-12-2023-15-37".
Done. run file is stored under "runs/training/custom-stopwords-without-stemmer-14-12-2023-15-37/run.txt".


# Todos
- play with inverted index and look at results = index(type=IndexingType.SINGLEPASS)

# solved 
- combine static stopword list with custom stopwords 
    - could be done as test but would not be really fitting to our research question
- apply stopwords to queries
    - Answer Maik: in PyTerrier, the stopword configuration is part of the index, so if you build the index using a certain stopwordlist, it should automatically apply it to the queries.


- bigger size of stopwords
- other method of stopwords (idf or normalized)
- do evaluation for validation data

# Performance original model
- nDCG@10: 0.1773804215063869
- nDCG@10 (unjudgedRemoved): 0.5367157810929261
- MAP: 0.1186555257944954
- MRR': 0.2628142657723083

In [4]:
from trectools import TrecRun, TrecQrel, TrecEval
from tira.rest_api_client import Client
from glob import glob
import pandas as pd
tira = Client()


def load_qrels(dataset):
    return TrecQrel(tira.download_dataset('ir-lab-jena-leipzig-wise-2023', dataset, truth_dataset=True) + '/qrels.txt')

def evaluate_run(qrels, runFile):
    run = TrecRun(runFile)
    trec_eval = TrecEval(run, qrels)

    return {
        'run': run.get_runid(),
        'nDCG@10': trec_eval.get_ndcg(depth=10),
        'nDCG@10 (unjudgedRemoved)': trec_eval.get_ndcg(depth=10, removeUnjudged=True),
        'MAP': trec_eval.get_map(depth=10),
        'MRR': trec_eval.get_reciprocal_rank(),
        'P@10': trec_eval.get_precision(depth=10),
        'P': trec_eval.get_precision()
    }

def test_model(runFile):
    training_qrels = load_qrels('training-20231104-training')

    print("Overall performance:\n")
    print(evaluate_run(training_qrels, runFile))
    print("\n")

test_model(f"{run_output_dir}/run.txt")

No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
Overall performance:



  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()


{'run': 'custom-stopwords-without-stemmer-14-12-2023-15-37', 'nDCG@10': 0.17477708457067012, 'nDCG@10 (unjudgedRemoved)': 0.5296970226402502, 'MAP': 0.11660141778037744, 'MRR': 0.26498239884607894, 'P@10': 0.09150521609538004, 'P': 0.0032742175856929957}




# Argument

- perfect stopword list
    - contains all words that do not contribute meaning
    - does not contain words that contribute meaning

- imperfect stopword list
    - contains words that contribute meaning 
        - means reduced retrieval effectiveness bec. informative word omitted from retrieval process
    - does not contain enough words that do not contribute meaning 
        - means reduced retrieval effectiveness bec. documents that are less specific to the query would be retrieved

- standart stopword list
    - probably does not contain words that contribute meaning as its very basic
    - potentially contains not enough words that do not contribute meaning

- custom stopword list
    - enhancing the standart stopword list with words that are not contributing meaning and are specific to our dataset 
    - these words are the most common words found in retrived relevant documents 
        - because these files are relevant and terms that frequently occur in these documents potentially do not contribute much meaning

If we imagine a perfect stopword list, such a list would contain all terms from a document collection 
    that do not contribute informative meaning to a document
and no terms from a document collection 
    that do contribute informative meaning to a document
On the other hand an imperfect stopword list would contain either 
    words that contribute informative meaning (I)
or  not all words that do not contribute informative meaning (II)
or  a mixture of both.
All of these cases would result in reduced retrieval effectiveness as 
    a large amount of documents that are less specific to a given query would be retrieved (II) 
or  documents that are specific to a given query would not be retrieved (I)

As we currently use a standard stopword list from <tbd> such a list is unlikely to contain words that contribute meaning in our document collection as it simply contains the most frequent words of english language.
However this standard stopword list may potentially contain too little stopwords as our document collection might have a specific vocabulary unlike classic english language.

Thats why we conjecture that 
    enhancing a standard stopword list with a custom stopword list, would improve the retrieval effectiveness of our retrieval model.

We derive this custom stopword list by searching for the most common terms in the relevant retrieved documents from a retrieval run with the standard stopword list. 

# Algorithm
1. do retrieval with standard stopword list
2. examine all relevant retrieved documents `RD`
    - make a list of all terms in `RD` and assign each term how often they occur in the `RD`
    - choose a threshold which number of occurences mark stopwords (see other paper for method)
    - create a list with all terms over threshold
3. merge custom stopword list with standart stopword list
4. compare performance of standart, custom and merged stopword list

# Results:
- no real changes in performance when trying to just apply other stopwords

In [3]:
from trectools import TrecRun, TrecQrel, TrecEval
from tira.rest_api_client import Client
from glob import glob
import pandas as pd
tira = Client()

def load_qrels(dataset):
    return TrecQrel(tira.download_dataset('ir-lab-jena-leipzig-wise-2023', dataset, truth_dataset=True) + '/qrels.txt')

training_qrels = load_qrels('training-20231104-training')
runFile = "runs/training/simple-stopwords-without-stemmer-14-12-2023-15-27/run.txt"

run = TrecRun(runFile)
trec_eval = TrecEval(run, training_qrels)

relevant_retrieved_documents = trec_eval.get_relevant_retrieved_documents()

index_from_relevant_retrieved = create_index(relevant_retrieved_documents, {'stopwords': None, 'stemmer': None})

lexicon = index_from_relevant_retrieved.getLexicon()
term_frequencies = [(term, le.getFrequency()) for term, le in lexicon]
sorted_term_frequencies = sorted(term_frequencies, key=lambda x: x[1], reverse=True)

# write result to file
file_path = './stopwordlists/stopwords_from_relevant_retrieved.txt'

##### examine the correct threshold
##### choose the threshold: max difference between frequency of two terms F(r) and F(r+1)
stopword_list_length = 100

with open(file_path, 'w') as file:
        file.write("")

with open(file_path, 'a') as file:
        for term, le in sorted_term_frequencies[:stopword_list_length]:
            string_to_append = f"{term}\n"
            file.write(string_to_append)   

No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.


TypeError: 'numpy.int64' object is not iterable

In [36]:
print(training_qrels.qrels_data)
# run.get_top_documents('airport', 10)

from tira.third_party_integrations import ir_datasets
import pyterrier as pt

training_dataset = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'

queries = pt.io.read_topics(ir_datasets.topics_file(training_dataset), format='trecxml')

dataset = ir_datasets.load(training_dataset)
topics = dataset.queries_iter()

documents = dataset.docs_iter()

first_topic = list(topics)[1]

print(first_topic)

run.get_top_documents(first_topic.query_id, 10)
print(queries)

           query q0            docid  rel
0      q06223196  0  doc062200112743    0
1      q06223196  0  doc062200205250    0
2      q06223196  0  doc062200101983    0
3      q06223196  0  doc062200204465    1
4      q06223196  0  doc062200115614    0
...          ... ..              ...  ...
9651  q062225197  0  doc062200205276    0
9652  q062225197  0  doc062200107121    1
9653  q062225197  0  doc062200204419    0
9654  q062225197  0  doc062200103774    0
9655  q062225197  0  doc062200110087    0

[9656 rows x 4 columns]
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
Load ir_dataset "ir-lab-jena-leipzig-wise-2023/training-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
TirexQuery(query_id='q062228', text='airport', title='airport', query='airport', desc

In [63]:
qrels = training_qrels.qrels_data.values.tolist()

queryIdIndex = 0
docIdIndex = 2
relevanceIndex = 3


# Filter relevant documents
relevant_documents = [item for item in qrels if item[relevanceIndex] == 1]

# Filter relevant retrieved documents
relevant_retrieved_documents = []
for item in relevant_documents:
    query_id, doc_id = item[queryIdIndex], item[docIdIndex]
    is_document_retrieved_in_top_10 = doc_id in run.get_top_documents(query_id, 10)

    if is_document_retrieved_in_top_10:
        relevant_retrieved_documents.append(doc_id)

<class 'pandas.core.frame.DataFrame'>


In [65]:
relevant_retrieved_documents

['doc062200205493',
 'doc062200111596',
 'doc062200112337',
 'doc062200203368',
 'doc062200210498',
 'doc062200107753',
 'doc062200108533',
 'doc062200206242',
 'doc062200108726',
 'doc062200209842',
 'doc062200103530',
 'doc062200205548',
 'doc062200108031',
 'doc062200105417',
 'doc062200107961',
 'doc062200116046',
 'doc062200110345',
 'doc062200100832',
 'doc062200108970',
 'doc062200201425',
 'doc062200203247',
 'doc062200200456',
 'doc062200108525',
 'doc062200201814',
 'doc062200110554',
 'doc062200111415',
 'doc062200108501',
 'doc062200104801',
 'doc062200116719',
 'doc062200106598',
 'doc062200202169',
 'doc062200209516',
 'doc062200200582',
 'doc062200110056',
 'doc062200202832',
 'doc062200115516',
 'doc062200205284',
 'doc062200116610',
 'doc062200103553',
 'doc062200101811',
 'doc062200102981',
 'doc062200203725',
 'doc062200107924',
 'doc062200202820',
 'doc062200204217',
 'doc062200208243',
 'doc062200204893',
 'doc062200111481',
 'doc062200205577',
 'doc062200102169',


In [60]:
print(relevant_documents[1])

'doc062200205493' in run.get_top_documents('q06223196', 10)

['q06223196', '0', 'doc062200205493', 1]


True

In [None]:
# choose threshold

sorted_term_frequencies = sorted(term_frequencies, key=lambda x: x[1], reverse=True)
highest_diff = {'amount': 0, 'position': None}

# typescript algorithm translated
    
sorted_terms = [
    ['car', 3],
    ['will', 4]
]

highest_diff = {'amount': 0, 'position': 0}

for i in range(len(sorted_terms)):
    current_diff = sorted_terms[i][1] - sorted_terms[i + 1][1] if i != len(sorted_terms) - 1 else 0
    highest_diff = {'amount': current_diff, 'position': i} if current_diff > highest_diff['amount'] else highest_diff

print(highest_diff)