In [2]:
import os
import nltk
import string
import pickle

# Fetch the NLTK resources this notebook relies on. 'stopwords' backs the
# nltk.corpus.stopwords lookup in preprocess_text; 'punkt' is presumably the
# tokenizer model needed by nltk.word_tokenize (confirm against the installed
# NLTK version). Each call is a no-op once the package is already up to date.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def preprocess_text(text):
    """Normalize raw text into a list of index terms.

    The text is lower-cased, split with ``nltk.word_tokenize`` and then
    filtered: punctuation tokens, English stop words and whitespace-only
    tokens are dropped. Token order is preserved, so a term's position in
    the returned list is its position in the filtered token stream.

    Args:
        text: The raw string to normalize.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    candidates = nltk.word_tokenize(text.lower())
    english_stop_words = set(nltk.corpus.stopwords.words('english'))

    kept = []
    for candidate in candidates:
        # NOTE: string.punctuation is a str, so this is a *substring* test.
        # Single punctuation characters match, as do runs that appear
        # contiguously in string.punctuation; other multi-character
        # punctuation tokens do not match and are kept.
        if candidate in string.punctuation:
            continue
        if candidate in english_stop_words:
            continue
        if candidate.strip() == '':
            continue
        kept.append(candidate)
    return kept

def create_positional_index(dataset_path):
    """Build a positional inverted index over the ``.txt`` files in a directory.

    Only entries directly inside ``dataset_path`` whose name ends with
    ``.txt`` are read (as UTF-8); sub-directories are not traversed. Each
    file is run through ``preprocess_text`` and every resulting token is
    recorded together with its 0-based offset in that token list.

    Args:
        dataset_path: Directory containing the text files to index.

    Returns:
        dict: ``{term: {filename: [position, ...]}}`` where positions are in
        increasing order for each (term, filename) pair.
    """
    positional_index = {}
    text_files = (name for name in os.listdir(dataset_path) if name.endswith(".txt"))

    for filename in text_files:
        # Read the whole file first so the handle is released before tokenizing.
        with open(os.path.join(dataset_path, filename), 'r', encoding='utf-8') as handle:
            contents = handle.read()

        for position, term in enumerate(preprocess_text(contents)):
            # Create the per-term and per-file containers on first sight.
            positional_index.setdefault(term, {}).setdefault(filename, []).append(position)

    return positional_index

def save_positional_index(positional_index, output_file):
    """Serialize the index to disk with pickle.

    Args:
        positional_index: The index object to persist.
        output_file: Destination path; an existing file is overwritten.

    Returns:
        None
    """
    with open(output_file, 'wb') as sink:
        pickle.dump(positional_index, sink)

def load_positional_index(pickle_file):
    """Read a pickled index back from disk.

    Args:
        pickle_file: Path of a file previously written with pickle.

    Returns:
        The unpickled object stored in ``pickle_file``.
    """
    # SECURITY: pickle.load can execute arbitrary code; only open files that
    # this notebook (or another trusted source) produced.
    with open(pickle_file, 'rb') as source:
        return pickle.load(source)

# Input directory and output file for the index. Both paths are relative, so
# they resolve against the kernel's current working directory. The directory
# name suggests the documents were preprocessed beforehand (not verifiable
# from this cell).
dataset_path = "text_files_preprocessed"
output_file = "positional_index.pickle"

# Build the {term: {filename: [positions]}} index from every .txt file.
positional_index = create_positional_index(dataset_path)

# Persist the index so later cells/sessions can reuse it without re-indexing.
save_positional_index(positional_index, output_file)

# Read the file straight back into a separate variable.
positional_index_loaded = load_positional_index(output_file)


In [19]:
def preprocess_input(input_sequence):
    """Apply ``preprocess_text`` to every query string.

    Args:
        input_sequence: Iterable of raw query strings.

    Returns:
        list[list[str]]: One token list per query, in input order.
    """
    return [preprocess_text(raw_query) for raw_query in input_sequence]


def evaluate_phrase_query(positional_index, query):
    """Return the documents in which the query terms occur as one contiguous phrase.

    A document matches when there is a single start position ``p`` such that
    the i-th query term (0-based) occurs at position ``p + i`` for every term.
    The candidate start positions are narrowed term by term, so all terms
    must line up against the *same* start position.

    Args:
        positional_index: Mapping ``{term: {doc_id: [position, ...]}}``.
            It is only read, never modified.
        query: Sequence of already-preprocessed terms, in phrase order.

    Returns:
        list: Matching document ids, ordered as they appear in the first
        term's postings. An empty list is returned when the query is empty,
        when a term is missing from the index (a message is printed), when
        the terms share no document (a message is printed), or when no
        document contains the terms at consecutive positions.
    """
    # doc_id -> set of positions at which the phrase matched so far begins.
    # None means "no term processed yet"; an empty dict is never kept, which
    # avoids confusing "no candidates left" with "not started".
    phrase_starts = None

    for offset, word in enumerate(query):
        try:
            postings = positional_index[word]
        except KeyError:
            print("No documents found for the term:", word)
            return []

        if phrase_starts is None:
            # First term: every occurrence is a potential phrase start.
            phrase_starts = {
                doc_id: set(positions) for doc_id, positions in postings.items()
            }
            continue

        shared_docs = [doc_id for doc_id in phrase_starts if doc_id in postings]
        if not shared_docs:
            print("No common documents found for the terms:", query)
            return []

        narrowed = {}
        for doc_id in shared_docs:
            term_positions = set(postings[doc_id])
            # Keep only the starts for which this term sits `offset` slots later.
            surviving = {
                start for start in phrase_starts[doc_id]
                if start + offset in term_positions
            }
            if surviving:
                narrowed[doc_id] = surviving

        if not narrowed:
            # No document can match any more; later terms cannot revive it.
            return []
        phrase_starts = narrowed

    return list(phrase_starts) if phrase_starts else []

if __name__ == "__main__":
    # Interactive driver: read N phrase queries from stdin, normalize them the
    # same way the documents were normalized, and report the matching files.
    positional_index = load_positional_index("positional_index.pickle")

    N = int(input("Enter the number of queries: "))
    queries = [input().strip() for _ in range(N)]

    preprocessed_queries = preprocess_input(queries)

    for idx, query_terms in enumerate(preprocessed_queries):
        # Echo the normalized terms so the user sees what is actually searched.
        print(query_terms)
        result = evaluate_phrase_query(positional_index, query_terms)
        print(f"Number of documents retrieved for query {idx+1} using positional index: {len(result)}")
        print(f"Names of documents retrieved for query {idx+1} using positional index: " + ", ".join(f"{doc}" for doc in result))

['great', 'guitar']
Number of documents retrieved for query 1 using positional index: 5
Names of documents retrieved for query 1 using positional index: file2.txt, file277.txt, file271.txt, file235.txt, file131.txt
['ring', 'finger']
Number of documents retrieved for query 2 using positional index: 2
Names of documents retrieved for query 2 using positional index: file994.txt, file70.txt
