# Lab 03 - Ranked Retrieval

## Tokenization

In [2]:
import math
import re
import xml.etree.ElementTree as ET
from collections import Counter

# Per-document token counts, keyed by document id
doc_lengths: dict = {}

# Running average document length, maintained while documents are indexed
avg_doc_length: float = 0

# Inverted index: maps each term to its postings list of
# (doc_id, term frequency) tuples
inverted_index: dict = {}

# Split a document into lowercase word tokens
def tokenize(text):
    """Return the lowercase word tokens of *text* in order of appearance.

    A token is a maximal run of regex word characters; anything else acts
    as a separator and is dropped.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(lowered)


## Inverted Index

In [None]:
# Function to build the inverted index
def build_inverted_index(doc_id, text):
    """Add one document to the module-level index structures.

    Tokenizes ``text``, records the token count in ``doc_lengths``, refreshes
    ``avg_doc_length`` and appends one ``(doc_id, term frequency)`` posting
    to ``inverted_index`` for every distinct term in the document.

    Args:
        doc_id: Identifier of the document. Ids do not need to be
            consecutive or start at 1.
        text: Raw document text, passed to ``tokenize``.

    Note:
        Indexing the same ``doc_id`` twice replaces its recorded length but
        appends a second set of postings, so index each document once.
    """
    global avg_doc_length

    # One pass over the tokens yields every term frequency, instead of a
    # full tokens.count() scan per distinct term.
    term_counts = Counter(tokenize(text))
    doc_length = sum(term_counts.values())

    # Average over the number of documents actually indexed. Dividing by
    # doc_id is only correct when ids happen to be exactly 1, 2, 3, ...
    total_length = avg_doc_length * len(doc_lengths) - doc_lengths.get(doc_id, 0)
    doc_lengths[doc_id] = doc_length
    avg_doc_length = (total_length + doc_length) / len(doc_lengths)

    # Append this document's posting to each of its terms
    for term, frequency in term_counts.items():
        inverted_index.setdefault(term, []).append((doc_id, frequency))


In [1]:

# Score one document against a tokenized query with length-normalized tf.idf
def tf_idf_similarity(query, doc_id):
    """Return the tf.idf similarity between a tokenized query and a document.

    Each distinct query term that occurs in the document contributes
    ``(tf_query * tf_doc / normalization) * idf``, where
    ``normalization = tf_doc + 2 * doc_length / avg_doc_length`` and
    ``idf = ln(N / df)``; N is the number of indexed documents and df is the
    length of the term's postings list. Terms missing from the index or
    from the document contribute nothing.

    Reads the module-level ``inverted_index``, ``doc_lengths`` and
    ``avg_doc_length``.
    """
    score = 0

    for term in set(query):
        if term not in inverted_index:
            continue
        postings = inverted_index[term]

        # Frequency from the first posting for this document, or None when
        # the document does not contain the term
        tf_doc = next(
            (posting[1] for posting in postings if posting[0] == doc_id),
            None,
        )
        if tf_doc is None:
            continue

        tf_query = query.count(term)
        idf = math.log(len(doc_lengths) / len(postings))
        length_ratio = 2 * doc_lengths[doc_id] / avg_doc_length
        normalization = tf_doc + length_ratio

        weighted_tf = tf_query * tf_doc / normalization
        score += weighted_tf * idf

    return score

# Rank every indexed document for each query
def run_queries(queries):
    """Score all indexed documents against each query and rank them.

    Args:
        queries: Mapping of query id to raw query text.

    Returns:
        A flat list of ``(query_id, doc_id, score)`` tuples. Queries appear
        in the mapping's iteration order. Within a query, documents are in
        descending score order, and every document in ``doc_lengths`` is
        included, even those scoring 0.
    """
    ranked = []

    for query_id, query_text in queries.items():
        terms = tokenize(query_text)

        scored = [
            (doc_id, tf_idf_similarity(terms, doc_id))
            for doc_id in doc_lengths
        ]
        # sorted() is stable, so equally scored documents keep index order
        by_score = sorted(scored, key=lambda pair: pair[1], reverse=True)

        for doc_id, score in by_score:
            ranked.append((query_id, doc_id, score))

    return ranked

# Save ranked results as comma-separated lines
def write_results(results, filename):
    """Write each result tuple to ``filename`` as one comma-separated line.

    Every field is converted with ``str``. The file is created, or
    overwritten if it already exists.
    """
    # Lazy generator: rows are formatted only while the file is open
    lines = (','.join(str(field) for field in row) + '\n' for row in results)
    with open(filename, 'w') as out:
        out.writelines(lines)

# Function to parse XML file and build the inverted index
def parse_and_build_inverted_index(file_path):
    """Index every ``DOC`` element found directly under the XML root.

    For each ``DOC`` child, the ``DOCNO`` text is converted to an ``int`` id
    and the ``TEXT`` content is handed to ``build_inverted_index``.

    Args:
        file_path: Path (or file object) of the XML collection to parse.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        TypeError: If a document has no ``DOCNO`` element.
        ValueError: If a ``DOCNO`` is not an integer.
    """
    root = ET.parse(file_path).getroot()

    for doc in root.findall('DOC'):
        doc_id = int(doc.findtext('DOCNO'))
        # findtext() returns None when the element is missing; index such a
        # document as empty instead of crashing inside the tokenizer.
        doc_text = doc.findtext('TEXT', default='')
        build_inverted_index(doc_id, doc_text)

# Parse XML file and build the inverted index
parse_and_build_inverted_index('trec.sample.xml')

# Read queries from file: one query per line, "<query id> <query text>"
queries = {}
with open('queries.lab3.txt', 'r') as query_file:
    for line in query_file:
        # split() with no separator tolerates tabs and repeated spaces and
        # yields nothing for a blank line (e.g. a trailing newline at EOF),
        # which would otherwise break the two-field unpacking.
        fields = line.split(maxsplit=1)
        if not fields:
            continue
        query_id = fields[0]
        # A line holding only an id is kept as a query with empty text
        query_text = fields[1].strip() if len(fields) > 1 else ''
        queries[int(query_id)] = query_text

# Run queries for different configurations
results_no_stemming_stopping = run_queries(queries)
write_results(results_no_stemming_stopping, 'tfidf.results')

# Implement stemming and stopping as needed

# Write a lab report summarizing your implementation, challenges faced, and results


# END