In [None]:
import os
import nltk
import string
import pickle

nltk.download('punkt')
nltk.download('stopwords')

In [9]:
def preprocess_text(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in string.punctuation and token not in stop_words]
    tokens = [token for token in tokens if token.strip() != '']
    return tokens

def create_positional_index(dataset_path):
    positional_index = {}
    for filename in os.listdir(dataset_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(dataset_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                tokens = preprocess_text(text)
                for position, term in enumerate(tokens):
                    if term not in positional_index:
                        positional_index[term] = {}
                    if filename not in positional_index[term]:
                        positional_index[term][filename] = []
                    positional_index[term][filename].append(position)
    return positional_index

def save_positional_index(positional_index, output_file):
    with open(output_file, 'wb') as file:
        pickle.dump(positional_index, file)

def load_positional_index(pickle_file):
    with open(pickle_file, 'rb') as file:
        positional_index = pickle.load(file)
    return positional_index

dataset_path = "text_files_preprocessed"
output_file = "positional_index.pickle"

positional_index = create_positional_index(dataset_path)

save_positional_index(positional_index, output_file)

positional_index_loaded = load_positional_index(output_file)


In [23]:
def preprocess_input(input_sequence):
    preprocessed_sequence = []
    for query in input_sequence:
        preprocessed_query = preprocess_text(query)
        preprocessed_sequence.append(preprocessed_query)
    return preprocessed_sequence

def evaluate_phrase_query(positional_index, query):
    query_terms = query
    documents_matching = []
    for term_idx, term in enumerate(query_terms):
        if term not in positional_index:
            return []
        term_postings = positional_index[term]
        if term_idx == 0:
            documents_matching = list(term_postings.keys())
            print(documents_matching)
        else:
            documents_matching = [doc for doc in documents_matching if doc in term_postings.keys()]
        
        # for document in documents_matching:
        #     for position in term_postings[document]:
        #         if position + term_idx + 1 in term_postings[document]:
        #             documents_matching.append(document)
        #             break
    return list(set(documents_matching))

if __name__ == "__main__":
    positional_index = load_positional_index("positional_index.pickle")

    N = int(input("Enter the number of queries: "))
    queries = []
    for _ in range(N):
        query = input().strip()
        queries.append(query)

    preprocessed_queries = preprocess_input(queries)

    for idx, query in enumerate(queries):
        print(preprocessed_queries[idx])
        result = evaluate_phrase_query(positional_index, preprocessed_queries[idx])
        print(f"Number of documents retrieved for query {idx+1} using positional index: {len(result)}")
        print(f"Names of documents retrieved for query {idx+1} using positional index: " + ", ".join([f"{doc}" for doc in result]))

['good', 'tension']
['file1.txt', 'file103.txt', 'file106.txt', 'file110.txt', 'file111.txt', 'file115.txt', 'file118.txt', 'file13.txt', 'file137.txt', 'file141.txt', 'file143.txt', 'file154.txt', 'file155.txt', 'file157.txt', 'file159.txt', 'file16.txt', 'file160.txt', 'file162.txt', 'file163.txt', 'file164.txt', 'file166.txt', 'file172.txt', 'file174.txt', 'file175.txt', 'file176.txt', 'file179.txt', 'file18.txt', 'file189.txt', 'file19.txt', 'file2.txt', 'file204.txt', 'file207.txt', 'file210.txt', 'file217.txt', 'file220.txt', 'file234.txt', 'file235.txt', 'file240.txt', 'file245.txt', 'file252.txt', 'file254.txt', 'file265.txt', 'file274.txt', 'file277.txt', 'file28.txt', 'file282.txt', 'file288.txt', 'file29.txt', 'file292.txt', 'file293.txt', 'file299.txt', 'file30.txt', 'file304.txt', 'file305.txt', 'file311.txt', 'file316.txt', 'file321.txt', 'file325.txt', 'file332.txt', 'file338.txt', 'file342.txt', 'file347.txt', 'file354.txt', 'file355.txt', 'file358.txt', 'file362.txt', 

['good', 'tension']
No documents found for the term: good
Number of documents retrieved for query 1 using positional index: 0
Names of documents retrieved for query 1 using positional index: 
