# Information Retreival  

Created by : FrancoisHUP
Last update : 18 dec 2023

We are doing indexation on TREC AP 88-90 documents and querying the system with 150 requests.
Then we check the MAP score. 

0. Utils
1. Config env
2. Data loading
3. Tokenization
4. Indexing
5. Search
6. Evaluation 

It create an index for each tokenisation methode.  

## Config env

TODO config section where we install lucene and other requirements 

In [None]:
import lucene
lucene.initVM()

## Data preparation

Read collection data 

In [1]:
import glob
import gzip
import re

doc_pattern = re.compile(r'<DOC>(.*?)</DOC>', re.DOTALL)
docno_pattern = re.compile(r'<DOCNO>\s*(.*?)\s*</DOCNO>')
head_pattern = re.compile(r'<HEAD>\s*(.*?)\s*</HEAD>')
text_pattern = re.compile(r'<TEXT>\s*(.*?)\s*</TEXT>', re.DOTALL)

def get_documents() :
    """
    return a dictionary of documents with key = doc_id and value = {'title': title, 'text': text}. 
    output example :
        documents["AP880212-0001"] = 
        {'title': 'Reports Former Saigon Officials Released from Re-education Camp', 
        'text': "More than 150 former officers of the\noverthrown ..."}
    """
    documents_metadata = {}
    # Get a list of all .gz files in the "Ap" directory
    file_list = glob.glob('TREC AP 88-90/TREC AP 88-90/collection de documents/AP/*.gz') # start from ../src

    # Loop over the list of files
    for filename in file_list:
        
        # Open the .gz file
        with gzip.open(filename, 'rt', encoding='latin1') as file:  # 'rt' mode for text reading
            # Read the contents of the file
            content = file.read()
            for doc in doc_pattern.finditer(content):
                doc_content = doc.group(1)

                # Extracting individual elements
                doc_id = docno_pattern.search(doc_content).group(1)
                head = head_pattern.search(doc_content)
                text = text_pattern.search(doc_content)
                
                documents_metadata[doc_id] = {
                    'title': head.group(1) if head else 'Default Title',
                    'text': text.group(1) if text else 'Default text'
                }  

    return documents_metadata

documents = get_documents()
print(documents["AP880212-0001"]) 

{'title': 'Reports Former Saigon Officials Released from Re-education Camp', 'text': "More than 150 former officers of the\noverthrown South Vietnamese government have been released from a\nre-education camp after 13 years of detention, the official Vietnam\nNews Agency reported Saturday.\n   The report from Hanoi, monitored in Bangkok, did not give\nspecific figures, but said those freed Friday included an\nex-Cabinet minister, a deputy minister, 10 generals, 115\nfield-grade officers and 25 chaplains.\n   It quoted Col. Luu Van Ham, director of the Nam Ha camp south of\nHanoi, as saying all 700 former South Vietnamese officials who had\nbeen held at the camp now have been released.\n   They were among 1,014 South Vietnamese who were to be released\nfrom re-education camps under an amnesty announced by the Communist\ngovernment to mark Tet, the lunar new year that begins Feb. 17.\n   The Vietnam News Agency report said many foreign journalists and\na delegation from the Australia-Viet

Read requests

In [2]:
import glob
import gzip
import re

topic_pattern = re.compile(r'<top>(.*?)</top>', re.DOTALL)
# Regular expressions for individual elements
num_pattern = re.compile(r'<num>\s*Number:\s*(\d+)')
title_pattern = re.compile(r'<title>\s*Topic:\s*(.*?)\s*\n')
desc_pattern = re.compile(r'<desc>\s*Description:\s*(.*?)\s*<narr>', re.DOTALL)

def get_requests() :
    """
    return a dictionary of requests with key = request_id and value = {'title': title, 'desc': desc}.
    output example :
        requests["001"] = {'title': 'Antitrust Cases Pending', 'desc': 'Document discusses a pending antitrust case.'}
    """
    requests_metadata = {}
    
    # Get a list of all topics files in the "Topics-requetes" directory
    file_list = glob.glob('TREC AP 88-90/TREC AP 88-90/Topics-requetes/*') 
    # Loop over the list of files
    for filename in file_list:

        # Open the .gz file
        with open(filename, 'r') as file:
            # Read the content of the file
            topic_requests_string = file.read()
            for topic in topic_pattern.finditer(topic_requests_string):
                topic_content = topic.group(1)

                # Extracting individual elements
                num = num_pattern.search(topic_content)
                title = title_pattern.search(topic_content)
                desc = desc_pattern.search(topic_content)
                
                if(num) :
                    requests_metadata[num.group(1)] = {
                        'title': title.group(1) if title else None,
                        'desc': desc.group(1).strip() if desc else None
                    }
        
    return requests_metadata 
requests = get_requests() 
print("Total number of request : ", len(requests))
print(requests["001"])

Total number of request :  150
{'title': 'Antitrust Cases Pending', 'desc': 'Document discusses a pending antitrust case.'}


In [4]:
def build_request(req_lenght, requests) :
  built_requests = {}
  for request_id,request_data in requests.items() :  
    request_string = request_data['title']
    if req_lenght=="long" : 
      request_string += " " + request_data['desc']
    built_requests[request_id] = request_string
  return built_requests
    
# Build requests 
short_requests = build_request('short', requests)       
long_requests = build_request('long', requests)   
print("Short request : ", short_requests["001"])    
print("Long request : ", long_requests["001"])    

Short request :  Antitrust Cases Pending
Long request :  Antitrust Cases Pending Document discusses a pending antitrust case.


## Tokenize documents

In [38]:
# Import lucen and init VM. This should be done only once.
import lucene
lucene.initVM()

ValueError: JVM is already running and updating its classpath failed. Call initVM() instead just once but with a classpath keyword argument set to the module.CLASSPATH strings of all the JCC extension modules to be imported by this process

In [8]:
# Download a stopword list from nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk_stop_words = stopwords.words('english')
print(len(nltk_stop_words))
print(nltk_stop_words)

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

[nltk_data] Downloading package stopwords to /home/frank/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [243]:
import java
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.apache.lucene.analysis.core import StopFilter
from org.apache.lucene.analysis import CharArraySet
from org.apache.lucene.analysis.en import PorterStemFilter
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    words = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos=wordnet.VERB) for word in words]
    return lemmas #' '.join(lemmas)

def tokenize(text, preprocess_method):
    tokens = []
    if preprocess_method == "lemmatization":
        tokens = lemmatize(text)
    else : 
        analyzer = StandardAnalyzer()
        stream = analyzer.tokenStream(None, text)
        # Stemming 
        if preprocess_method == "stemming":
            stream = PorterStemFilter(stream)

        # Stop words
        stop_words_list = java.util.ArrayList()
        for word in nltk_stop_words: #["and", "is", "the", "this"]
            stop_words_list.add(word)
        stop_words = CharArraySet(stop_words_list, True)
        stream = StopFilter(stream, stop_words)

        term = stream.getAttribute(CharTermAttribute.class_)
        stream.reset()

        while stream.incrementToken():
            tokens.append(term.toString())
        stream.end()
        analyzer.close()

    return tokens

# Example of tokenization
test_document = "This is a sample document."
tokens = tokenize(test_document, "stemming")
print(tokens)

['thi', 'sampl', 'document']


### Base tokenisation 

In [53]:
# 3m18s
preprocess_base_docs = {}
for doc_id, doc_data in documents.items():
    preprocess_base_docs[doc_id] = {}
    preprocess_base_docs[doc_id]['title'] = doc_data['title']
    preprocess_base_docs[doc_id]['tokens'] = tokenize(doc_data['title'] + doc_data['text'], "basic")
print(preprocess_base_docs["AP880212-0001"])

{'title': 'Reports Former Saigon Officials Released from Re-education Camp', 'tokens': ['reports', 'former', 'saigon', 'officials', 'released', 'education', 'campmore', '150', 'former', 'officers', 'overthrown', 'south', 'vietnamese', 'government', 'released', 'education', 'camp', '13', 'years', 'detention', 'official', 'vietnam', 'news', 'agency', 'reported', 'saturday', 'report', 'hanoi', 'monitored', 'bangkok', 'give', 'specific', 'figures', 'said', 'freed', 'friday', 'included', 'ex', 'cabinet', 'minister', 'deputy', 'minister', '10', 'generals', '115', 'field', 'grade', 'officers', '25', 'chaplains', 'quoted', 'col', 'luu', 'van', 'ham', 'director', 'nam', 'ha', 'camp', 'south', 'hanoi', 'saying', '700', 'former', 'south', 'vietnamese', 'officials', 'held', 'camp', 'released', 'among', '1,014', 'south', 'vietnamese', 'released', 'education', 'camps', 'amnesty', 'announced', 'communist', 'government', 'mark', 'tet', 'lunar', 'new', 'year', 'begins', 'feb', '17', 'vietnam', 'news', 

### Lemmatization tokenisation 

In [64]:
# 14m20s
preprocess_lemme_docs = {}
for doc_id, doc_data in documents.items():
    preprocess_lemme_docs[doc_id] = {}
    preprocess_lemme_docs[doc_id]['title'] = doc_data['title']
    preprocess_lemme_docs[doc_id]['tokens'] = tokenize(doc_data['title'] + doc_data['text'], "lemmatization")
print(preprocess_lemme_docs["AP880212-0001"])

{'title': 'Reports Former Saigon Officials Released from Re-education Camp', 'tokens': ['Reports', 'Former', 'Saigon', 'Officials', 'Released', 'from', 'Re-education', 'CampMore', 'than', '150', 'former', 'officer', 'of', 'the', 'overthrow', 'South', 'Vietnamese', 'government', 'have', 'be', 'release', 'from', 'a', 're-education', 'camp', 'after', '13', 'years', 'of', 'detention', ',', 'the', 'official', 'Vietnam', 'News', 'Agency', 'report', 'Saturday', '.', 'The', 'report', 'from', 'Hanoi', ',', 'monitor', 'in', 'Bangkok', ',', 'do', 'not', 'give', 'specific', 'figure', ',', 'but', 'say', 'those', 'free', 'Friday', 'include', 'an', 'ex-Cabinet', 'minister', ',', 'a', 'deputy', 'minister', ',', '10', 'general', ',', '115', 'field-grade', 'officer', 'and', '25', 'chaplains', '.', 'It', 'quote', 'Col.', 'Luu', 'Van', 'Ham', ',', 'director', 'of', 'the', 'Nam', 'Ha', 'camp', 'south', 'of', 'Hanoi', ',', 'as', 'say', 'all', '700', 'former', 'South', 'Vietnamese', 'officials', 'who', 'have

### Stemming tokenisation 

In [65]:
#6m9s
preprocess_stemme_docs = {}
for doc_id, doc_data in documents.items():
    preprocess_stemme_docs[doc_id] = {}
    preprocess_stemme_docs[doc_id]['title'] = doc_data['title']
    preprocess_stemme_docs[doc_id]['tokens'] = tokenize(doc_data['title'] + doc_data['text'], "stemming")
print(preprocess_stemme_docs["AP880212-0001"])

{'title': 'Reports Former Saigon Officials Released from Re-education Camp', 'tokens': ['report', 'former', 'saigon', 'offici', 'releas', 'educ', 'campmor', '150', 'former', 'offic', 'overthrown', 'south', 'vietnames', 'govern', 'releas', 'educ', 'camp', '13', 'year', 'detent', 'offici', 'vietnam', 'new', 'agenc', 'report', 'saturdai', 'report', 'hanoi', 'monitor', 'bangkok', 'give', 'specif', 'figur', 'said', 'freed', 'fridai', 'includ', 'ex', 'cabinet', 'minist', 'deputi', 'minist', '10', 'gener', '115', 'field', 'grade', 'offic', '25', 'chaplain', 'quot', 'col', 'luu', 'van', 'ham', 'director', 'nam', 'ha', 'camp', 'south', 'hanoi', 'sai', '700', 'former', 'south', 'vietnames', 'offici', 'held', 'camp', 'releas', 'among', '1,014', 'south', 'vietnames', 'releas', 'educ', 'camp', 'amnesti', 'announc', 'communist', 'govern', 'mark', 'tet', 'lunar', 'new', 'year', 'begin', 'feb', '17', 'vietnam', 'new', 'agenc', 'report', 'said', 'mani', 'foreign', 'journalist', 'deleg', 'australia', 'v

### Dump variable into disk

When tokenizing text, it create alot of memory which is store into the ram. 
At some point you want to clear the ram. Here is a way to dump the objects into a file.    

In [None]:
import pickle

# open the file in write-binary mode and dump the variable into it
with open("objects/preprocess_base_docs.pkl", "wb") as f:
    pickle.dump(preprocess_base_docs, f)

with open("objects/preprocess_lemme_docs.pkl", "wb") as f:
    pickle.dump(preprocess_lemme_docs, f)

with open("objects/preprocess_stemme_docs.pkl", "wb") as f:
    pickle.dump(preprocess_stemme_docs, f)

Here is a way to retreive the variables object 

In [None]:
import pickle

# specify the file you want to load the variable from
file = "objects/preprocess_base_docs.pkl"
# open the file in read-binary mode and load the variable from it
# with open(file, "rb") as f:
#     preprocess_base_docs = pickle.load(f)

# file = "objects/preprocess_lemme_docs.pkl"
# with open(file, "rb") as f:
#     preprocess_lemme_docs = pickle.load(f)

file = "objects/preprocess_stemme_docs.pkl"
with open(file, "rb") as f:
    preprocess_stemme_docs = pickle.load(f)

## Indexing

### Clean existing index 
Delete existing index. If the index is not deleted, it will add more document to allready existing index.  

In [5]:
import os
import shutil

# delete the directory if it exists
# if os.path.exists("index_base"):
#     shutil.rmtree("index_base")
# if os.path.exists("index_lemme"):
#     shutil.rmtree("index_lemme")
# if os.path.exists("index_stemme"):
#     shutil.rmtree("index_stemme")

### Create index

In [5]:
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.store import MMapDirectory
from org.apache.lucene.document import Document, Field, FieldType

# Create an on-disk index using MMapDirectory
def add_doc(w, doc_id, title, content):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    doc = Document()
    doc.add(Field("doc_id", doc_id, t1))
    doc.add(Field("title", title, t1))
    doc.add(Field("contents", content, t2))
    w.addDocument(doc)

def index(preprocess_documents, index_path) :
    index = MMapDirectory(index_path)
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(index, config)

    try : 
        for doc_id, preprocess_document in preprocess_documents.items(): 
            # Add some documents
            tokens = ''.join(preprocess_document["tokens"])
            add_doc(writer, doc_id, preprocess_document["title"], tokens)
        # Commit and close the writer
    except Exception as e:  
        print("Exception when indexing document : ", doc_id)
        print(e)
    writer.close()

### Index base token

In [15]:
from java.nio.file import Paths
index(preprocess_base_docs, Paths.get("index_basic"))

: 

### Index lemme token

In [6]:
from java.nio.file import Paths
index(preprocess_lemme_docs, Paths.get("index_lemmatization"))

: 

### Index stemme token

In [6]:
from java.nio.file import Paths
index(preprocess_stemme_docs, Paths.get("index_stemming"))

: 

## Search

### Check the number of documents register in the index

In [176]:
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.store import FSDirectory
from java.nio.file import Paths

# specify the path to the index directory
index_dir = "./index_basic/"

# create a directory object for the index directory
dir = FSDirectory.open(Paths.get(index_dir))

# open an index reader
reader = DirectoryReader.open(dir)

# get the number of documents in the index
num_docs = reader.numDocs()

print(f"The index contains {num_docs} documents.")

# close the index reader
reader.close()

The index contains 242918 documents.


### Single search

In [252]:
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import ClassicSimilarity, BM25Similarity, LMJelinekMercerSimilarity, BooleanSimilarity
from org.apache.lucene.index import  DirectoryReader
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.queryparser.classic import QueryParser

### SEARCHING ### 
def set_similirity_weight_schemat(searcher, weight_schemat) :
    if weight_schemat == "classic" : 
        searcher.setSimilarity(ClassicSimilarity())
    if weight_schemat == "bm25" : 
        searcher.setSimilarity(BM25Similarity(1.2,0.75))
    if weight_schemat == "LM" : 
        searcher.setSimilarity(LMJelinekMercerSimilarity(0.7))
    if weight_schemat == "boolean" : 
        searcher.setSimilarity(BooleanSimilarity())

def search(request, preprocess_method, weight_schemat):
    path = "index_" + preprocess_method + "/"
    directory = FSDirectory.open(File(path).toPath())
    analyzer = StandardAnalyzer()
    searcher = IndexSearcher(DirectoryReader.open(directory))
    set_similirity_weight_schemat(searcher, weight_schemat)
    preprocess_request = " ".join(tokenize(request, preprocess_method))

    query_string = preprocess_request.replace("/", "").replace("?", "").replace("`", "").replace("(", "").replace(")", "")
    
    try :
        query = QueryParser("contents", analyzer).parse(query_string)
        # print("QUERY : ", query.toString())
    except Exception:
        print("Error when parse '", query_string, "'" )

    # TODO replace 100000 with 1000
    scoreDocs = searcher.search(query, 100000).scoreDocs

    # Stats and explanation
    # print("%s total matching documents." % len(scoreDocs))
    # for scoreDoc in scoreDocs:
    #     doc = searcher.doc(scoreDoc.doc)
    #     print(doc.get("doc_id"), doc.get("title"), scoreDoc.score)
    #     explanation = searcher.explain(query, scoreDoc.doc)
    #     print(explanation)

    return scoreDocs


Try a single request 

In [256]:
request = long_requests["002"]
results=search(request, "lemmatization", "bm25") # lemmatization, stemming, basic | classic, bm25, LM, boolean
print("Request : ", request)
print("Number of results : ",len(results))

Request :  Acquisitions Document discusses a currently proposed acquisition involving a U.S.
company and a foreign company.
Number of results :  23424


### Create Run Trec File 

In [262]:
def get_searcher(preprocess_method) :
    directory = FSDirectory.open(File("index_" + preprocess_method + "/").toPath())
    searcher = IndexSearcher(DirectoryReader.open(directory))
    return searcher

def write_run_file(results, file_path):
    with open(file_path, 'w') as f:
        for result in results:
            f.write(" ".join(map(str, result)) + "\n")

# Run the search for each request
def get_results(requests, dir_path, tokenization_method, weight_schemat, length) :
    searcher = get_searcher(tokenization_method)
    for request_id, request in requests.items():
        # Run variables 
        run = []
        trec_run = tokenization_method + "_" + length + "_" + weight_schemat + "_" + request_id + ".txt"
        
        # Run the search
        results = search(request, tokenization_method, weight_schemat)
        for i, scoreDoc in enumerate(results):
            doc = searcher.doc(scoreDoc.doc)
            # QueryId, Q0, DocId, Rank, Score, RunId 
            run.append((request_id, "Q0", doc.get("doc_id"), i+1, scoreDoc.score, trec_run))
        
        # Write the run file
        trec_run_path = dir_path + trec_run
        write_run_file(run, trec_run_path)

Create run files

This will create a run file for each request for each tokenization methode and for each similarity metric.

In [264]:
#6m22s
res_dir_path = "TREC AP 88-90/TREC AP 88-90/trec_run/"
for length in ["short", "long"] :
    for tokenization_method in ["basic", "lemmatization", "stemming"] :
        for weight_schemat in ["classic", "bm25", "LM"] :
            request = short_requests if length == "short" else long_requests
            get_results(request, res_dir_path, tokenization_method , weight_schemat, length)
            # print number of files in the directory
            print("Number of file created  for " + tokenization_method + ", " + weight_schemat + " and " + length + " : ",  len(glob.glob(tokenization_method + "_" + length + "_" + weight_schemat + "_*.txt")))
            print("File created paths like :", tokenization_method + "_" + length + "_" + weight_schemat + "_001.txt")

Number of file created  for basic, classic and short :  0
File created paths like : basic_short_classic_001.txt
Number of file created  for basic, bm25 and short :  0
File created paths like : basic_short_bm25_001.txt
Number of file created  for basic, LM and short :  0
File created paths like : basic_short_LM_001.txt
Number of file created  for lemmatization, classic and short :  0
File created paths like : lemmatization_short_classic_001.txt
Number of file created  for lemmatization, bm25 and short :  0
File created paths like : lemmatization_short_bm25_001.txt
Number of file created  for lemmatization, LM and short :  0
File created paths like : lemmatization_short_LM_001.txt
Number of file created  for stemming, classic and short :  0
File created paths like : stemming_short_classic_001.txt
Number of file created  for stemming, bm25 and short :  0
File created paths like : stemming_short_bm25_001.txt
Number of file created  for stemming, LM and short :  0
File created paths like : 

## Evaluation 

### Get qrel stats

In [172]:
# check for each file of qrel the number of relevant documents
# list of file names
# files = [f'TREC AP 88-90/TREC AP 88-90/trec_qrels/qrel_0{i}.txt' for i in range(1, 150)] 
files = [f'TREC AP 88-90/TREC AP 88-90/trec_qrels/qrel_00{i}.txt' for i in range(1, 2)] 

sum = 0
min = 100000
min_id = ""
max = 0
max_id = ""
relevant_docids = []
for file in files:
    with open(file, 'r') as f:
        # initialize the count of relevant documents
        relevant_count = 0

        for line in f:
            # split the line into columns
            columns = line.split()

            # get the last column
            last_column = columns[-1]

            # if the last column is '1', increment the count of relevant documents
            if last_column == '1':
                relevant_docids.append(columns[2])
                relevant_count += 1
                sum+=1

        if min > relevant_count :
            min = relevant_count
            min_id = file
        if max < relevant_count :
            max = relevant_count
            max_id = file
        print(f'The file {file} contains {relevant_count} relevant documents.')

print("Total number of relevant documents : ", sum)
print("Mean of relevant documents per file : ", sum/len(files) if len(files) > 0 else 1)
print("Min of relevant documents per file : ", min, " in ", min_id)
print("Max of relevant documents per file : ", max, " in ", max_id)

The file TREC AP 88-90/TREC AP 88-90/trec_qrels/qrel_001.txt contains 111 relevant documents.
Total number of relevant documents :  111
Mean of relevant documents per file :  111.0
Min of relevant documents per file :  111  in  TREC AP 88-90/TREC AP 88-90/trec_qrels/qrel_001.txt
Max of relevant documents per file :  111  in  TREC AP 88-90/TREC AP 88-90/trec_qrels/qrel_001.txt


### Run trec eval

In [279]:
from trectools import TrecEval, TrecQrel, TrecRun

def trec_eval(qrel_path, run_path):
    run = TrecRun(run_path)
    qrel= TrecQrel(qrel_path)
    te = TrecEval(run, qrel)

    result = {}
    result["num_ret"] = te.get_retrieved_documents(per_query=False)
    result["num_rel"] = te.get_relevant_documents(per_query=False)
    result["num_rel_ret"] = te.get_relevant_retrieved_documents(per_query=False)
    result["map"] = te.get_map(depth=100, per_query=False, trec_eval=True) 
    for v in [5, 10, 15, 20, 30, 100, 200, 500, 1000]:
        result[f"P@{v}"] = te.get_precision(depth=v, per_query=False, trec_eval=True)

    return result

def format_res(overall_result) :
  map_values = [inner_dict['map'] for inner_dict in overall_result.values()]
  mean_map = sum(map_values) / len(map_values)
  P_10 = [inner_dict['P@10'] for inner_dict in overall_result.values()]
  mean_P_10 = sum(P_10) / len(P_10)
  # 5, 10, 15, 20, 30, 100, 200, 500, 1000
  return { 
    "MMAP" : mean_map, 
    "P@10" :mean_P_10,
    "num_ret": overall_result["1"]["num_ret"], 
    "num_rel": overall_result["1"]["num_rel"],
    "num_rel_ret" : overall_result["1"]["num_rel_ret"],
  }

In [280]:
import glob
qrel_paths = glob.glob('TREC AP 88-90/TREC AP 88-90/trec_qrels/*') 

res_dir_path = "TREC AP 88-90/TREC AP 88-90/trec_run/"
MMAP = 0
count = 0
for length in ["short", "long"] :
    for tokenization_method in ["basic", "lemmatization", "stemming"] :
        for weight_schemat in ["classic", "bm25", "LM"] :
            run_paths = glob.glob(res_dir_path + tokenization_method + "_" + length + "_" + weight_schemat + "_*.txt")
            for i in range(0, len(qrel_paths)) :
                # print("qrel_path : ", qrel_paths[i])
                # print("run_path : ", run_paths[i])
                results = trec_eval(qrel_paths[i], run_paths[i])
                MMAP += results["map"]
                count+=1
                if results["map"] > 0.01 : 
                    print(run_paths[i])
                    print(results)
                if i == 149 :
                    break
print("MMAP : ", MMAP/count)


# overall_result = trec_eval(qrel_paths, treq_run_paths)
# result_formated=format_res(overall_result)
# write_result({"result_formated" : result_formated,"overall_result" : overall_result},"results/" + test.title + ".txt")

{'num_ret': 3, 'num_rel': 23, 'num_rel_ret': 1, 'map': 0.014492753623188404, 'P@5': 0.2, 'P@10': 0.1, 'P@15': 0.06666666666666667, 'P@20': 0.05, 'P@30': 0.03333333333333333, 'P@100': 0.01, 'P@200': 0.005, 'P@500': 0.002, 'P@1000': 0.001}
{'num_ret': 154, 'num_rel': 38, 'num_rel_ret': 20, 'map': 0.11511419529454726, 'P@5': 0.2, 'P@10': 0.2, 'P@15': 0.3333333333333333, 'P@20': 0.3, 'P@30': 0.23333333333333334, 'P@100': 0.18, 'P@200': 0.1, 'P@500': 0.04, 'P@1000': 0.02}
{'num_ret': 17, 'num_rel': 110, 'num_rel_ret': 9, 'map': 0.058568956296229026, 'P@5': 0.6, 'P@10': 0.5, 'P@15': 0.6, 'P@20': 0.45, 'P@30': 0.3, 'P@100': 0.09, 'P@200': 0.045, 'P@500': 0.018, 'P@1000': 0.009}
{'num_ret': 17, 'num_rel': 200, 'num_rel_ret': 10, 'map': 0.03819642857142857, 'P@5': 0.6, 'P@10': 0.7, 'P@15': 0.6, 'P@20': 0.5, 'P@30': 0.3333333333333333, 'P@100': 0.1, 'P@200': 0.05, 'P@500': 0.02, 'P@1000': 0.01}
{'num_ret': 538, 'num_rel': 390, 'num_rel_ret': 58, 'map': 0.012548748052542243, 'P@5': 0.4, 'P@10': 0

KeyboardInterrupt: 

: 

In [278]:
print(count)

2700


# Trashed code 

In [153]:
# list of file names
files = ['TREC AP 88-90/TREC AP 88-90/jugements de pertinence/qrels.1-50.AP8890.txt', 
         'TREC AP 88-90/TREC AP 88-90/jugements de pertinence/qrels.51-100.AP8890.txt',
         'TREC AP 88-90/TREC AP 88-90/jugements de pertinence/qrels.101-150.AP8890.txt']

for file in files:
    with open(file, 'r') as f:
        for line in f:
            # split the line into columns
            columns = line.split()

            # get the first column and format it as a three-digit number
            first_column = format(int(columns[0]), '03d')

            # open the output file and write the line to it
            with open(f'qrel_{first_column}.txt', 'a') as out_file:
                out_file.write(line)