In [3]:
import numpy as np
from xml.etree import ElementTree as ET
import re
import os
from lxml import etree
from collections import defaultdict

In [86]:
documents_en_filepath = "./A1/documents_en" #Change to your desired folder path of English xml documents
documents_cz_filepath = "./A1/documents_cs" #Change to your desired folder path of Czech xml documents

In [87]:
topics_train_en_filepath = "./A1/topics-train_en.xml" #Change to your desired filepath of English topics train file
topics_train_cz_filepath = "./A1/topics-train_cs.xml" #Change to your desired filepath of Czech topics train file

In [88]:
qrels_train_en_filepath = "./A1/qrels-train_en.txt" #Change to your desired filepath of English queries, docID and relativeness file
qrels_train_cz_filepath = "./A1/qrels-train_cs.txt" #Change to your desired filepath of Czech queries, docID and relativeness file

In [117]:
queries_answers_en_filename = "queries_answers_en.txt" #Change to desired name of output file for English queries answers
queries_answers_cz_filename = "queries_answers_cs.txt" #Change to desired name of output file for Czech queries answers

In [89]:
def tokenize(text):
    return [word for word in re.split(r'\W+', text) if word]

def tokenize1(text):
    return {word for word in re.split(r'\W+', text) if word}

In [90]:
def create_inverted_index_from_files(files_path):
    inverted_index = defaultdict(set)
    files2process = [file for file in os.listdir(files_path) if file.endswith('.xml')]
    doc_ids = set()
    for file_name in files2process:
        xml_file_path = os.path.join(files_path, file_name)
        parser = etree.XMLParser(recover=True)
        tree = etree.parse(xml_file_path, parser)
        root = tree.getroot()
        
        for doc in root.findall('.//DOC'):
            doc_id = doc.find('.//DOCID').text.strip() if doc.find('.//DOCID') is not None else None
            if doc_id:
                doc_ids.add(doc_id)
                    
                tokens = set()
                if(files_path.endswith('en')):
                    for tag in ['HD', 'LD', 'TE']:
                        for elem in doc.findall(f'.//{tag}'):
                            if elem.text:
                                tokens.update(tokenize1(elem.text.strip()))
                elif(files_path.endswith('cs')):
                    for tag in ['TITLE', 'TEXT', 'HEADING']:
                        for elem in doc.findall(f'.//{tag}'):
                            if elem.text:
                                tokens.update(tokenize1(elem.text.strip()))
                
                for token in tokens:
                    inverted_index[token].add(doc_id)

    return dict(inverted_index), doc_ids

In [91]:
inverted_index_en, doc_ids_en = create_inverted_index_from_files(documents_en_filepath)
inverted_index_cz, doc_ids_cz = create_inverted_index_from_files(documents_cz_filepath)

In [92]:
def intersection(set1, set2):
    return set1 & set2

def union(set1, set2):
    return set1 | set2

def complement(universe_set, subset):
    return universe_set - subset

In [93]:
def get_queries(fpath):
    xml_file_path = fpath
    tree = etree.parse(xml_file_path)
    root = tree.getroot()
    queries_dict = defaultdict(set)
    for top in root.findall('.//top'):
        num = top.find('.//num').text.strip()
        query = top.find('.//query').text.strip()
        queries_dict[num] = query

    return queries_dict

In [94]:
xml_file_path_en = topics_train_en_filepath
xml_file_path_cz = topics_train_cz_filepath

In [95]:
queries_dict_en = get_queries(xml_file_path_en)
queries_dict_cz = get_queries(xml_file_path_cz)

In [96]:
queries_dict_en

defaultdict(set,
            {'10.2452/401-AH': 'inflation AND Euro AND NOT football',
             '10.2452/402-AH': 'renewable AND energy AND sources',
             '10.2452/403-AH': '(movie OR role) AND (policeman OR cop)',
             '10.2452/404-AH': 'nato AND summit AND security',
             '10.2452/405-AH': 'children AND asthma',
             '10.2452/406-AH': 'animated AND cartoons',
             '10.2452/407-AH': '(australian OR australian) AND prime AND minister AND 2002',
             '10.2452/408-AH': 'human AND (cloning OR clone)',
             '10.2452/409-AH': 'Bali AND car AND bombing AND 2002',
             '10.2452/410-AH': '>North AND Korea AND nuclear AND weapons AND 1994',
             '10.2452/411-AH': 'oscar AND best AND director AND 2002',
             '10.2452/412-AH': 'politician AND book',
             '10.2452/413-AH': 'diabetes AND risk AND reduction',
             '10.2452/414-AH': '(beer AND festival) OR Oktoberfest',
             '10.2452/415-AH': '

In [97]:
queries_dict_cz

defaultdict(set,
            {'10.2452/401-AH': 'inflace AND Euro AND NOT fotbal',
             '10.2452/402-AH': 'obnovitelný AND zdroj',
             '10.2452/403-AH': '(policista OR policie) AND (film OR role)',
             '10.2452/404-AH': 'summit AND nato AND opatření',
             '10.2452/405-AH': 'astma AND děti AND příčina',
             '10.2452/406-AH': 'animovaný AND film AND ocenění AND festival',
             '10.2452/407-AH': '(australský OR Austrálie) AND premiér AND 2002',
             '10.2452/408-AH': '(lidé OR člověk) AND klonování',
             '10.2452/409-AH': '(bomba OR terorismus) AND Bali AND 2002',
             '10.2452/410-AH': 'severní AND korea AND dohoda AND jaderný AND 1984',
             '10.2452/411-AH': 'Oskar AND cena AND film AND režisér',
             '10.2452/412-AH': '(kniha OR bigrafie OR prospekt) AND politika',
             '10.2452/413-AH': 'snižení AND riziko AND cukrovka',
             '10.2452/414-AH': '(pivní AND festival) OR Oktoberf

In [98]:
class Node:
    def __init__(self, value=None, left=None, right=None):
        self.value = value
        self.left = left
        self.right = right

In [99]:
def parse_query(query, inverted_index, universal_set):
    tokens = re.findall(r'\(|\)|\w+|\S', query)
    return evaluate_expression(tokens, inverted_index, universal_set)

In [100]:
def evaluate_expression(tokens, inverted_index, universal_set):
    if not tokens:
        return None

    stack = []
    while tokens:
        token = tokens.pop(0)
        if token == '(':
            sub_expr = evaluate_expression(tokens, inverted_index, universal_set)
            stack.append(sub_expr)
        elif token == ')':
            break
        elif token.upper() == 'NOT':
            next_token = evaluate_expression(tokens, inverted_index, universal_set)
            result = evaluate_operator(token.upper(), next_token, None, inverted_index, universal_set)
            stack.append(result)
        elif token.upper() in ['AND', 'OR']:
            right = evaluate_expression(tokens, inverted_index, universal_set)
            left = stack.pop() if stack else None
            result = evaluate_operator(token.upper(), left, right, inverted_index, universal_set)
            stack.append(result)
        else:
            if inverted_index.get(token) == None:
                stack.append(set())
            else:
                stack.append(inverted_index.get(token))

    if len(stack) > 1:
        result = stack.pop(0)
        while stack:
            right = stack.pop(0)
            result = result & right
        return result
    else:
        return stack[0]

In [101]:
def evaluate_operator(operator, left, right, inverted_index, universal_set):
    if operator == 'AND':
        return left & right if right is not None else left
    elif operator == 'OR':
        return left | right if right is not None else left
    elif operator == 'NOT':
        return universal_set - left
    return set()

In [102]:
def print_tree(node, level=0):
    if node is not None:
        print_tree(node.right, level + 1)
        print(' ' * 4 * level + '->', node.value)
        print_tree(node.left, level + 1)

In [103]:
def get_relevent_doc_ids(fpath, query_id):
    relevent_doc_ids = set()
    with open(fpath, 'r', encoding='utf-8') as file:
        for line in file:
            line1 = line.strip().split()
            if line1[0] == query_id and line1[3] == "1":
                relevent_doc_ids.add(line1[2])

    return relevent_doc_ids

In [104]:
def get_grund_truth_doc_ids(fpath, query_id):
    relevent_doc_ids = defaultdict(set)
    with open(fpath, 'r', encoding='utf-8') as file:
        for line in file:
            line1 = line.strip().split()
            if line1[0] == query_id:
                if line1[3] == "1":
                    relevent_doc_ids["1"].add(line1[2])
                if line1[3] == "0":
                    relevent_doc_ids["0"].add(line1[2])

    return relevent_doc_ids

In [105]:
def get_groundTruth_queryAns(train_path, inverted_index, queries_dict, doc_ids):
    queries_answers = defaultdict(set)
    ground_truth = defaultdict(dict)
    for query_id, query in queries_dict.items():
        result = parse_query(query, inverted_index, doc_ids)
        ground_truth[query_id] = get_grund_truth_doc_ids(train_path, query_id)
        queries_answers[query_id] = result
    return ground_truth, queries_answers

In [106]:
ground_truth_en, queries_answers_en = get_groundTruth_queryAns(qrels_train_en_filepath, inverted_index_en, queries_dict_en, doc_ids_en)
ground_truth_cz, queries_answers_cz = get_groundTruth_queryAns(qrels_train_cz_filepath, inverted_index_cz, queries_dict_cz, doc_ids_cz)

In [107]:
def calculate_precision_recall(ground_truth, queries_answers):
    results = {}
    
    for query_id, answers in queries_answers.items():
        true_positives = len(answers & ground_truth[query_id]["1"])
        false_positives = len(answers - ground_truth[query_id]["1"])
        false_negatives = len(ground_truth[query_id]["1"] - answers)
        
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        
        results[query_id] = {"precision": precision, "recall": recall}
        
    return results

In [108]:
results_en = calculate_precision_recall(ground_truth_en, queries_answers_en)
results_cz = calculate_precision_recall(ground_truth_cz, queries_answers_cz)

In [109]:
def calculate_average_precision_recall(results):
    total_precision = 0
    total_recall = 0
    num_queries = len(results)

    for query_id in results:
        total_precision += results[query_id]['precision']
        total_recall += results[query_id]['recall']

    average_precision = total_precision / num_queries if num_queries > 0 else 0
    average_recall = total_recall / num_queries if num_queries > 0 else 0

    return average_precision, average_recall


In [110]:
average_precision_en, average_recall_en = calculate_average_precision_recall(results_en)
average_precision_cz, average_recall_cz = calculate_average_precision_recall(results_cz)

In [111]:
print(f"Average Precision ENG: {average_precision_en}")
print(f"Average Recall ENG: {average_recall_en}")
print(f"Average Precision CZE: {average_precision_cz}")
print(f"Average Recall CZE: {average_recall_cz}")

Average Precision ENG: 0.1698188391235194
Average Recall ENG: 0.3136040268294562
Average Precision CZE: 0.11657206716623145
Average Recall CZE: 0.10944759141567652


In [115]:
def write_query_doc_pairs_to_file(queries_answers, filename):
    with open(filename, 'w') as file:
        for query_id, docs in queries_answers.items():
            for doc_id in docs:
                file.write(f"{query_id} {doc_id}\n")

In [119]:
write_query_doc_pairs_to_file(queries_answers_en, queries_answers_en_filename)
write_query_doc_pairs_to_file(queries_answers_cz, queries_answers_cz_filename)