In [7]:
import xml.etree.ElementTree as ET

def parse_xml(xml_string):
    # Parse the XML string
    root = ET.fromstring(xml_string)

    # Extract information from the XML elements
    document_id = root.findtext('document_id')
    profile = root.findtext('profile')
    date = root.findtext('date')
    headline = root.findtext('headline')
    text = root.findtext('text')

    # You can print or process the extracted information as needed
    print("Document ID:", document_id)
    print("Profile:", profile)
    print("Date:", date)
    print("Headline:", headline)
    print("Text:", text)

# Example XML string
sample_xml = """
<document>
    <document_id>1</document_id>
    <profile>Profile 1</profile>
    <date>2023-10-12</date>
    <headline>Sample Headline</headline>
    <text>This is the sample text of the document.</text>
</document>

"""

# Call the parse_xml function with the sample XML string
parse_xml(sample_xml)


Document ID: 1
Profile: Profile 1
Date: 2023-10-12
Headline: Sample Headline
Text: This is the sample text of the document.


## Parsing XML Documents

In [46]:
import xml.etree.ElementTree as ET

def parse_xml_file(file):
    # Parse the XML file
    tree = ET.parse(file)
    root = tree.getroot()

    # Iterate through each <DOC> element in the XML file
    for doc in root.findall('DOC'):
        # Extract information from the XML elements
        docno = doc.findtext('DOCNO')
        text = doc.findtext('Text')

        # You can print or process the extracted information as needed
        print("DOCNO:", docno)
        print("Text:", text)
        print("\n")


# Open the XML file
file_path = 'sample.xml'

with open(file_path, 'r') as file:
    # Call the parse_xml_file function with the file object
    parse_xml_file(file)


DOCNO: 1
Text: 
		He likes to wink, he likes to drink
	


DOCNO: 2
Text: 
		He likes to drink, and drink, and drink
	


DOCNO: 3
Text: 
		The thing he likes to drink is ink
	


DOCNO: 4
Text: 
		The ink he likes to drink is pink
	


DOCNO: 5
Text: 
		He likes to wink, and drink pink ink
	




## Tokenization and Preprocessing

In [50]:
import xml.etree.ElementTree as ET
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def tokenize_and_preprocess(text, use_stopping=True, use_stemming=True):
    # Tokenize the text
    tokens = word_tokenize(text.lower())  # Convert to lowercase for consistency

    # Remove stopwords if stopping is enabled
    if use_stopping:
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]

    # Apply stemming if stemming is enabled
    if use_stemming:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    return tokens

def parse_xml_file(file, use_stopping=True, use_stemming=True):
    try:
        root = ET.fromstring(file.read())

        for document in root.findall('DOC'):
            headline = document.findtext('Text')
            text = document.findtext('Text')

            # Tokenize and preprocess the headline and text
            headline_tokens = tokenize_and_preprocess(headline, use_stopping, use_stemming)
            text_tokens = tokenize_and_preprocess(text, use_stopping, use_stemming)

            # You can print or process the tokens as needed
            print("Headline Tokens:", headline_tokens)
            print("Text Tokens:", text_tokens)
            print("\n")

    except Exception as e:
        print(f"Error parsing XML content: {e}")


# Open the XML file
file_path = 'sample.xml'

with open(file_path, 'r') as file:
    # Call the parse_xml_file function with the file object
    parse_xml_file(file)


Headline Tokens: ['like', 'wink', ',', 'like', 'drink']
Text Tokens: ['like', 'wink', ',', 'like', 'drink']


Headline Tokens: ['like', 'drink', ',', 'drink', ',', 'drink']
Text Tokens: ['like', 'drink', ',', 'drink', ',', 'drink']


Headline Tokens: ['thing', 'like', 'drink', 'ink']
Text Tokens: ['thing', 'like', 'drink', 'ink']


Headline Tokens: ['ink', 'like', 'drink', 'pink']
Text Tokens: ['ink', 'like', 'drink', 'pink']


Headline Tokens: ['like', 'wink', ',', 'drink', 'pink', 'ink']
Text Tokens: ['like', 'wink', ',', 'drink', 'pink', 'ink']




## Inverted Index

In [54]:
import xml.etree.ElementTree as ET
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import defaultdict

def tokenize_and_preprocess(text, use_stopping=True, use_stemming=True):
    tokens = word_tokenize(text.lower())
    
    if use_stopping:
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]

    if use_stemming:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    return tokens

def build_inverted_index(file, use_stopping=True, use_stemming=True):
    inverted_index = defaultdict(list)

    try:
        tree = ET.fromstring(file.read())
        root = tree

        for doc in root.findall('DOC'):
            document_id = doc.findtext('DOCNO')
            headline = doc.findtext('Text')
            text = doc.findtext('Text')

            # Tokenize and preprocess the headline and text
            headline_tokens = tokenize_and_preprocess(headline, use_stopping, use_stemming)
            text_tokens = tokenize_and_preprocess(text, use_stopping, use_stemming)

            # Update the inverted index with the document_id and positions
            for position, token in enumerate(headline_tokens + text_tokens):
                inverted_index[token].append((document_id, position))

    except Exception as e:
        print(f"Error building inverted index: {e}")

    return inverted_index


# Specify the correct path to your sample.xml file
file_path = "sample.xml"

with open(file_path, 'r') as file:
    # Build the inverted index
    inverted_index = build_inverted_index(file)


# Print the inverted index for visualization
for term, postings in inverted_index.items():
    print(f"{term}: {postings}")


like: [('1', 0), ('1', 3), ('1', 5), ('1', 8), ('2', 0), ('2', 6), ('3', 1), ('3', 5), ('4', 1), ('4', 5), ('5', 0), ('5', 6)]
wink: [('1', 1), ('1', 6), ('5', 1), ('5', 7)]
,: [('1', 2), ('1', 7), ('2', 2), ('2', 4), ('2', 8), ('2', 10), ('5', 2), ('5', 8)]
drink: [('1', 4), ('1', 9), ('2', 1), ('2', 3), ('2', 5), ('2', 7), ('2', 9), ('2', 11), ('3', 2), ('3', 6), ('4', 2), ('4', 6), ('5', 3), ('5', 9)]
thing: [('3', 0), ('3', 4)]
ink: [('3', 3), ('3', 7), ('4', 0), ('4', 4), ('5', 5), ('5', 11)]
pink: [('4', 3), ('4', 7), ('5', 4), ('5', 10)]


## Output Inverted Index

In [56]:
def output_inverted_index(inverted_index, output_file_path):
    try:
        with open(output_file_path, 'w') as output_file:
            for term, postings in sorted(inverted_index.items()):
                posting_strings = [f"{doc_id}:{pos}" for doc_id, pos in postings]
                output_line = f"{term}: {', '.join(posting_strings)}\n"
                output_file.write(output_line)

        print(f"Inverted index has been written to {output_file_path}")

    except Exception as e:
        print(f"Error writing inverted index to file: {e}")


# Specify the output file path
output_file_path = "trec.index.txt"


# Output the inverted index to a text file
output_inverted_index(inverted_index, output_file_path)

Inverted index has been written to trec.index.txt


## Loading Inverted Index

In [60]:
def load_inverted_index(input_file_path):
    inverted_index = defaultdict(list)

    try:
        with open(input_file_path, 'r') as input_file:
            for line in input_file:
                term, postings_str = line.strip().split(': ')
                postings = [tuple(map(int, posting.split(':'))) for posting in postings_str.split(', ')]
                inverted_index[term] = postings

        print(f"Inverted index has been loaded from {input_file_path}")

    except Exception as e:
        print(f"Error loading inverted index from file: {e}")

    return inverted_index


# Specify the input file path
input_file_path = "trec.index.txt"


# Load the inverted index from the text file
loaded_inverted_index = load_inverted_index(input_file_path)


# Print Inverted Index
print(inverted_index)

Inverted index has been loaded from trec.index.txt
defaultdict(<class 'list'>, {'like': [('1', 0), ('1', 3), ('1', 5), ('1', 8), ('2', 0), ('2', 6), ('3', 1), ('3', 5), ('4', 1), ('4', 5), ('5', 0), ('5', 6)], 'wink': [('1', 1), ('1', 6), ('5', 1), ('5', 7)], ',': [('1', 2), ('1', 7), ('2', 2), ('2', 4), ('2', 8), ('2', 10), ('5', 2), ('5', 8)], 'drink': [('1', 4), ('1', 9), ('2', 1), ('2', 3), ('2', 5), ('2', 7), ('2', 9), ('2', 11), ('3', 2), ('3', 6), ('4', 2), ('4', 6), ('5', 3), ('5', 9)], 'thing': [('3', 0), ('3', 4)], 'ink': [('3', 3), ('3', 7), ('4', 0), ('4', 4), ('5', 5), ('5', 11)], 'pink': [('4', 3), ('4', 7), ('5', 4), ('5', 10)]})


## Word Overlap Retrieval

In [63]:
def word_overlap_similarity(query_tokens, document_tokens):
    intersection = set(query_tokens) & set(document_tokens)
    union = set(query_tokens) | set(document_tokens)

    similarity = len(intersection) / len(union) if len(union) > 0 else 0
    return similarity

def word_overlap_retrieval(query, inverted_index):
    query_tokens = tokenize_and_preprocess(query)

    # Calculate similarity for each document in the inverted index
    document_similarities = {}
    for term in query_tokens:
        if term in inverted_index:
            for document_id, _ in inverted_index[term]:
                if document_id not in document_similarities:
                    document_similarities[document_id] = 0
                document_similarities[document_id] += 1

    # Normalize the similarity scores by the length of the query
    for document_id in document_similarities:
        document_similarities[document_id] /= len(query_tokens)

    # Sort documents by similarity score in descending order
    ranked_documents = sorted(document_similarities.items(), key=lambda x: x[1], reverse=True)

    return ranked_documents


# Example query
query = "wink"


# Perform word overlap retrieval
ranked_documents = word_overlap_retrieval(query, loaded_inverted_index)


# Print the ranked documents
print("Ranked Documents:")
for document_id, similarity in ranked_documents:
    print(f"Document ID: {document_id}, Similarity: {similarity}")


Ranked Documents:
Document ID: 1, Similarity: 2.0
Document ID: 5, Similarity: 2.0


## Boolean Search

In [73]:
def boolean_search(query, inverted_index):
    # Tokenize and preprocess the query
    query_tokens = tokenize_and_preprocess(query)

    # Handle AND, OR, and NOT operations
    result_documents = set()

    i = 0
    while i < len(query_tokens):
        term = query_tokens[i]

        if term == "AND":
            i += 1
            next_term = query_tokens[i]
            if next_term in inverted_index:
                result_documents.intersection_update(set(doc_id for doc_id, _ in inverted_index[next_term]))
            else:
                result_documents.clear()

        elif term == "OR":
            i += 1
            next_term = query_tokens[i]
            if next_term in inverted_index:
                result_documents.update(set(doc_id for doc_id, _ in inverted_index[next_term]))

        elif term == "NOT":
            i += 1
            next_term = query_tokens[i]
            if next_term in inverted_index:
                result_documents.difference_update(set(doc_id for doc_id, _ in inverted_index[next_term]))

        else:
            if term in inverted_index:
                result_documents.update(set(doc_id for doc_id, _ in inverted_index[term]))
            else:
                result_documents.clear()

        i += 1

    # Convert the result to a list for better display
    result_documents = list(result_documents)

    return result_documents


# Example Boolean query
boolean_query = "drink AND wink OR think NOT like"


# Perform Boolean search
boolean_result = boolean_search(boolean_query, loaded_inverted_index)


# Print the result documents
print("Result Documents:", boolean_result)


Result Documents: [1, 2, 3, 4, 5]


## Comparison and Analysis

In [74]:
def compare_and_analyze(query, inverted_index, use_stopping=True, use_stemming=True):
    # Word Overlap Retrieval
    print("Word Overlap Retrieval:")
    word_overlap_result = word_overlap_retrieval(query, inverted_index)
    for document_id, similarity in word_overlap_result:
        print(f"Document ID: {document_id}, Similarity: {similarity}")
    print("\n")

    # Boolean Search
    print("Boolean Search:")
    boolean_result = boolean_search(query, inverted_index)
    print("Result Documents:", boolean_result)
    print("\n")

    # Analysis
    print("Analysis:")
    print(f"Use Stopping: {use_stopping}, Use Stemming: {use_stemming}")
    print(f"Word Overlap Result Documents: {len(word_overlap_result)}")
    print(f"Boolean Search Result Documents: {len(boolean_result)}")
    print("\n")

# Example query
query_to_compare = "information retrieval"

# Perform comparison and analysis
compare_and_analyze(query_to_compare, loaded_inverted_index)


Word Overlap Retrieval:


Boolean Search:
Result Documents: []


Analysis:
Use Stopping: True, Use Stemming: True
Word Overlap Result Documents: 0
Boolean Search Result Documents: 0




# END