### Q1. **Data** *Preprocessing*

In [None]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import string
import random

In [None]:
def download_nltk_resources():

    nltk.download('punkt')
    nltk.download('stopwords')

def get_data_path():

    data_path = "/content/drive/MyDrive/text_files"
    return data_path

In [None]:
def preprocess(input_text, display_steps=True):
    # Step 1: Remove HTML tags
    soup_obj = BeautifulSoup(input_text, 'html.parser')
    text = soup_obj.get_text()
    if display_steps:
        print(f"Step 1: Remove HTML tags\n{text}\n")

    # Step 2: Lowercase the text
    text = text.lower()
    if display_steps:
        print(f"Step 2: Lowercase the text\n{text}\n")

    # Step 3: Tokenization
    tokens = word_tokenize(text)
    if display_steps:
        print(f"Step 3: Tokenization\n{tokens}\n")

    # Step 4: Remove stopwords
    stop_words_set = set(stopwords.words('english'))
    stop_words_tokens = [token for token in tokens if token.lower() not in stop_words_set]
    if display_steps:
        print(f"Step 4: Remove stopwords\n{stop_words_tokens}\n")

    # Step 5: Remove punctuations
    translator = str.maketrans('', '', string.punctuation)
    tokens_without_punctuation = [token.translate(translator) for token in stop_words_tokens]
    if display_steps:
        print(f"Step 5: Remove punctuations\n{tokens_without_punctuation}\n")


    filtered_tokens_without_blank_space = [token for token in tokens_without_punctuation if token.strip()]
    preprocessed_text = ' '.join(filtered_tokens_without_blank_space)
    if display_steps:
        print(f"Step 6: Remove blank space token\n{preprocessed_text}\n")

    return preprocessed_text

In [48]:
download_nltk_resources()
data_path = get_data_path()


preprocessed_data_path = os.path.join(data_path, "preprocessed_files")
os.makedirs(preprocessed_data_path, exist_ok=True)

text_files = [filename for filename in os.listdir(data_path) if filename.endswith('.txt')]
random_sample_files = random.sample(text_files, 5)

for filename in text_files:
    file_path = os.path.join(data_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        original_text = file.read()

    if filename in random_sample_files:
        print(f"\nSample - File: {filename}\n")
        preprocess(original_text)
    else:
        preprocessed_text = preprocess(original_text, display_steps=False)

    new_filename = "preprocessed_" + filename
    new_file_path = os.path.join(preprocessed_data_path, new_filename)
    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        new_file.write(preprocessed_text)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  soup_obj = BeautifulSoup(input_text, 'html.parser')



Sample - File: file109.txt

Step 1: Remove HTML tags
Odyssey 8-Space Rack is great. I put casters on mine to move it around when I need to.

Step 2: Lowercase the text
odyssey 8-space rack is great. i put casters on mine to move it around when i need to.

Step 3: Tokenization
['odyssey', '8-space', 'rack', 'is', 'great', '.', 'i', 'put', 'casters', 'on', 'mine', 'to', 'move', 'it', 'around', 'when', 'i', 'need', 'to', '.']

Step 4: Remove stopwords
['odyssey', '8-space', 'rack', 'great', '.', 'put', 'casters', 'mine', 'move', 'around', 'need', '.']

Step 5: Remove punctuations
['odyssey', '8space', 'rack', 'great', '', 'put', 'casters', 'mine', 'move', 'around', 'need', '']

Step 6: Remove blank space token
odyssey 8space rack great put casters mine move around need


Sample - File: file458.txt

Step 1: Remove HTML tags
I have nothing bad to say about this stand. It works and it gets the job done. I use it at home,  and I have placed it in a corner of the room with 3 electric guitars,

## **Q2. Unigram Inverted Index and Phrase Queries**



In [None]:
import os
import re
import pickle
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import string

In [None]:
def download_nltk_resources():
    nltk.download('punkt')
    nltk.download('stopwords')

def get_data_path():
    return "/content/drive/MyDrive/Dataset3"

def clean_html(text):
    soup_obj = BeautifulSoup(text, 'html.parser')
    return soup_obj.get_text()

def preprocess_text(text):
    text = clean_html(text)
    text = text.lower()
    tokens = re.split(r'[;,\s]+', text)
    stop_words_set = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words_set and token not in string.punctuation]
    return tokens

def create_inverted_index(data_path):
    inverted_index = {}
    for filename in os.listdir(data_path):
        file_path = os.path.join(data_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            tokens = preprocess_text(content)
            for token in set(tokens):
                inverted_index.setdefault(token, []).append(filename)
    return inverted_index

In [None]:
def save_inverted_index_as_text(inverted_index, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        for term, postings in inverted_index.items():
            file.write(f"{term}: {', '.join(postings)}\n")

def load_inverted_index(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

def delete_pickles():
    try:
        os.remove('unigramIndex')
        os.remove('DocumentID')
        print("Pickles deleted successfully.")
    except FileNotFoundError:
        print("Pickles do not exist.")

In [None]:
def apply_and(result_set, next_file_set):
    return result_set.intersection(next_file_set)

def apply_or(result_set, next_file_set):
    return result_set.union(next_file_set)

def apply_and_not(result_set, next_file_set):
    return result_set.difference(next_file_set)

def apply_or_not(result_set, next_file_set, inverted_values_set):
    return result_set.union(inverted_values_set.difference(next_file_set))

def apply_operation(result_set, next_file_set, operation):
    operations_dict = {
        'AND': apply_and,
        'OR': apply_or,
        'AND NOT': apply_and_not,
        'OR NOT': apply_or_not
    }
    return operations_dict[operation](result_set, next_file_set)

In [52]:
def process_query(query, inverted_index, ops):
    input_seq = preprocess_text(query)
    file_sets = [set(inverted_index.get(term, [])) for term in input_seq]

    result_set = file_sets[0]
    for index, op in enumerate(ops.split(',')):
        next_file_set = file_sets[index + 1] if index + 1 < len(file_sets) else set()
        result_set = apply_operation(result_set, next_file_set, op)

    return result_set

if __name__ == "__main__":
    download_nltk_resources()
    data_path = get_data_path()

    inverted_index = create_inverted_index(data_path)

    with open('inverted_index.pkl', 'wb') as f:
        pickle.dump(inverted_index, f)

    print("Inverted index created and saved.")

    inverted_values_set = set().union(*inverted_index.values())
    inverted_index = load_inverted_index('inverted_index.pkl')

    N = int(input("Enter the number of queries: "))
    queries = []
    query_count = 0
    while query_count < N:
        query_input = input("Enter query: ")
        operator_input = input("Enter operators (comma-separated): ")
        queries.append((query_input, operator_input))
        query_count += 1

    for i, (query, ops) in enumerate(queries, start=1):
        preprocessed_query = preprocess_text(query)
        result_set = process_query(query, inverted_index, ops)
        if result_set:
            print(f"Query {i}: {query}")
            print("Preprocessed Query:", preprocessed_query)
            print("Number of documents retrieved:", len(result_set))
            print("Names of the documents retrieved:", ', '.join(result_set))
        else:
            print(f"No documents retrieved for query {i}.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Inverted index created and saved.
Enter the number of queries: 1
Enter query: rock band;
Enter operators (comma-separated): OR
Query 1: rock band;
Preprocessed Query: ['rock', 'band']
Number of documents retrieved: 35
Names of the documents retrieved: preprocessed_file154.txt, preprocessed_file184.txt, preprocessed_file411.txt, preprocessed_file727.txt, preprocessed_file324.txt, preprocessed_file276.txt, preprocessed_file979.txt, preprocessed_file712.txt, preprocessed_file961.txt, preprocessed_file883.txt, preprocessed_file228.txt, preprocessed_file987.txt, preprocessed_file977.txt, preprocessed_file901.txt, preprocessed_file968.txt, preprocessed_file959.txt, preprocessed_file460.txt, preprocessed_file381.txt, preprocessed_file781.txt, preprocessed_file194.txt, preprocessed_file400.txt, preprocessed_file187.txt, preprocessed_file24.txt, preprocessed_file230.txt, preprocessed_file29.txt, preprocessed_file55.txt, preprocessed_file342.txt, preprocessed_file844.txt, preprocessed_file850.tx


### Q3. Positional Index and Phrase Queries

In [None]:
import os
import pickle
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from collections import defaultdict
import string

In [None]:
def download_nltk_resources():
    nltk.download('punkt')
    nltk.download('stopwords')

def get_data_path():
    return "/content/drive/MyDrive/Dataset3"

In [None]:
def preprocess_text(text):
    soup_obj = BeautifulSoup(text, 'html.parser')
    text = soup_obj.get_text().lower()
    tokens = word_tokenize(text)
    stop_words_set = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words_set - set(string.punctuation)]
    return ' '.join(tokens)

def preprocess_query(query):
    query = query.lower()
    tokens = word_tokenize(query)
    stop_words_set = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words_set]
    tokens = [token for token in tokens if token not in string.punctuation]
    tokens = [token for token in tokens if token.strip()]
    return tokens

In [None]:
def build_positional_index(documents, fnms):
    positional_index = {token: {filename: [] for filename in fnms} for token in set(word_tokenize(' '.join(documents)))}
    for doc_id, document in enumerate(documents):
        tokens = word_tokenize(document)
        for position, token in enumerate(tokens, start=1):
            positional_index[token][fnms[doc_id]].append(position)
    return positional_index

In [None]:
def save_positional_index(positional_index,fnm):
    with open(fnm,'wb') as file:
        pickle.dump(positional_index, file)

def load_positional_index(fnm):
    with open(fnm,'rb') as file:
        return pickle.load(file)

In [None]:
def retrieve_documents_for_query(query_tokens, positional_index, fnms):
    retrieved_documents = set()
    for doc_filename in fnms:
        doc_positions = [positional_index.get(token, {}).get(doc_filename, []) for token in query_tokens]
        if all(doc_pos for doc_pos in doc_positions):
            for pos in doc_positions[0]:
                if all(pos + i in doc_pos for i, doc_pos in enumerate(doc_positions[1:], start=1)):
                    retrieved_documents.add(doc_filename)
    return retrieved_documents

In [17]:
def main():
    download_nltk_resources()
    data_path = get_data_path()
    tfdf = [filename for filename in os.listdir(data_path) if filename.endswith('.txt')]
    fnms = []

    preprocessed_documents = []
    i = 0
    while i < len(tfdf):
        fnm = tfdf[i]
        file_path = os.path.join(data_path, fnm)
        fnms.append(fnm)
        with open(file_path, 'r', encoding='utf-8') as file:
            preprocessed_text = file.read()
            preprocessed_documents.append(preprocess_text(preprocessed_text))
        i += 1

    positional_index = build_positional_index(preprocessed_documents, fnms)
    save_positional_index(positional_index, "positional_index.pkl")

    loaded_positional_index = load_positional_index("positional_index.pkl")

    NoQuery= int(input("Enter number of queries: "))
    queries= [input("Enter query: ") for _ in range(NoQuery)]

    for i, query in enumerate(queries,start=1):

        query_tokens = preprocess_query(query)
        retrieved_documents = retrieve_documents_for_query(query_tokens, loaded_positional_index, tfdf)
        print(f"Number of documents retrieved for query {i} using positional index: {len(retrieved_documents)}")
        print(f"Names of documents retrieved for query {i} using positional index: {', '.join(retrieved_documents)}")
        print()

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter number of queries: 3
Enter query: Rock;,baNd
Enter query: eyes
Enter query: bob
Number of documents retrieved for query 1 using positional index: 2
Names of documents retrieved for query 1 using positional index: preprocessed_file154.txt, preprocessed_file24.txt

Number of documents retrieved for query 2 using positional index: 5
Names of documents retrieved for query 2 using positional index: preprocessed_file923.txt, preprocessed_file847.txt, preprocessed_file626.txt, preprocessed_file213.txt, preprocessed_file758.txt

Number of documents retrieved for query 3 using positional index: 0
Names of documents retrieved for query 3 using positional index: 

