In [4]:
import os
import nltk
import string

# Fetch the NLTK resources this notebook relies on. 'punkt' is presumably the
# tokenizer model behind nltk.word_tokenize (confirm against the NLTK docs);
# 'stopwords' is not used by this cell's preprocess_text.
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    """Lower-case ``text``, tokenize it with NLTK and drop punctuation tokens.

    Args:
        text: Raw document or query text.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    kept = []
    for word in nltk.word_tokenize(text.lower()):
        # `in string.punctuation` is a substring test on the punctuation
        # string, so single punctuation characters are rejected here.
        if word in string.punctuation:
            continue
        # Discard tokens that are empty or whitespace only.
        if word.strip() == '':
            continue
        kept.append(word)
    return kept

def create_inverted_index(dataset_path):
    """Build a term -> list-of-file-names inverted index from a directory.

    Every file directly inside ``dataset_path`` whose name ends with ".txt"
    is read as UTF-8, tokenized with ``preprocess_text`` and recorded once
    in the posting list of each distinct term it contains.

    Args:
        dataset_path: Directory holding the ``.txt`` documents.

    Returns:
        dict[str, list[str]]: Maps each term to the file names containing
        it. File names appear in sorted order and at most once per term.
    """
    inverted_index = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the posting
    # lists (and the files written from them) reproducible between runs.
    for filename in sorted(os.listdir(dataset_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(dataset_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # dict.fromkeys collapses repeated tokens while keeping first-seen
        # order, so each file is appended once per term without scanning the
        # posting list for duplicates on every token.
        for term in dict.fromkeys(preprocess_text(text)):
            inverted_index.setdefault(term, []).append(filename)
    return inverted_index

def save_inverted_index(inverted_index, output_file):
    """Write the index as UTF-8 text, one ``term:doc1,doc2,...`` line per term.

    Args:
        inverted_index: Mapping of term -> iterable of document names (str).
        output_file: Path of the text file to create or overwrite.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(
            "".join((term, ":", ",".join(postings), "\n"))
            for term, postings in inverted_index.items()
        )

# Directory scanned for .txt documents, and the text file the index is written to.
dataset_path = "text_files_preprocessed"
output_file = "inverted_index.txt"

# Build the index once; the pickling cell below reuses this notebook-level name.
inverted_index = create_inverted_index(dataset_path)

# Human-readable dump: one "term:doc1,doc2,..." line per term.
save_inverted_index(inverted_index, output_file)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
import pickle

def save_inverted_index_as_pickle(inverted_index, output_file):
    """Serialize ``inverted_index`` into ``output_file`` using pickle.

    Args:
        inverted_index: The object to store (here the term -> documents dict).
        output_file: Path of the binary file to create or overwrite.
    """
    with open(output_file, 'wb') as sink:
        pickle.Pickler(sink).dump(inverted_index)

# Persist the index built in the previous cell; the query cells reload it from this file.
save_inverted_index_as_pickle(inverted_index, 'inverted_index.pickle')

In [6]:
import pickle

def load_inverted_index(pickle_file):
    """Read and return the object pickled in ``pickle_file``.

    Args:
        pickle_file: Path to a file written with ``pickle.dump``.

    Returns:
        The unpickled object (here the term -> documents dict).
    """
    # NOTE: unpickling can execute arbitrary code; only load index files that
    # this notebook produced itself.
    with open(pickle_file, 'rb') as handle:
        return pickle.load(handle)

def and_operation(inverted_index, term1, term2):
    """Return the documents that contain both ``term1`` and ``term2``.

    Args:
        inverted_index: Mapping of term -> iterable of document names.
        term1: First query term.
        term2: Second query term.

    Returns:
        list: Documents present in both posting lists (unordered); empty
        when either term is not indexed.
    """
    if term1 in inverted_index and term2 in inverted_index:
        first_docs = set(inverted_index[term1])
        second_docs = set(inverted_index[term2])
        return list(first_docs & second_docs)
    return []

def or_operation(inverted_index, term1, term2):
    """Return the documents that contain ``term1``, ``term2`` or both.

    Args:
        inverted_index: Mapping of term -> iterable of document names.
        term1: First query term.
        term2: Second query term.

    Returns:
        list: Union of the two posting lists (unordered); a term that is
        not indexed contributes nothing.
    """
    merged = set()
    for term in (term1, term2):
        if term in inverted_index:
            merged.update(inverted_index[term])
    return list(merged)

def and_not_operation(inverted_index, term1, term2):
    """Return the documents that contain ``term1`` but not ``term2``.

    Args:
        inverted_index: Mapping of term -> iterable of document names.
        term1: Term that must be present.
        term2: Term that must be absent.

    Returns:
        list: Documents in ``term1``'s postings that are not in ``term2``'s
        (unordered). Empty when ``term1`` is not indexed.
    """
    if term1 not in inverted_index:
        return []
    # A term2 that is not indexed excludes nothing, so every term1 document
    # qualifies; it must not collapse the result to an empty list.
    excluded = set(inverted_index.get(term2, []))
    return list(set(inverted_index[term1]) - excluded)

def or_not_operation(inverted_index, term1, term2, all_documents=None):
    """Return the documents matching ``term1 OR NOT term2``.

    A document qualifies when it contains ``term1`` or when it does not
    contain ``term2``. Subtracting ``term2``'s postings from ``term1``'s
    would answer ``term1 AND NOT term2`` instead.

    Args:
        inverted_index: Mapping of term -> iterable of document names.
        term1: Term whose documents are always included.
        term2: Term whose absence also qualifies a document.
        all_documents: Optional iterable naming every document in the
            collection. When omitted, the universe is taken to be every
            document that appears in any posting list of the index.

    Returns:
        list: The qualifying documents (unordered).
    """
    if all_documents is None:
        # Without an explicit universe, NOT can only range over the
        # documents the index knows about.
        universe = set()
        for postings in inverted_index.values():
            universe.update(postings)
    else:
        universe = set(all_documents)
    with_term1 = set(inverted_index.get(term1, []))
    without_term2 = universe - set(inverted_index.get(term2, []))
    return list(with_term1 | without_term2)

if __name__ == "__main__":
    # Reload the index persisted by the pickling cell.
    inverted_index = load_inverted_index("inverted_index.pickle")

    term1 = "great"
    term2 = "stability"

    # Each label is paired with the two-term operation it demonstrates; the
    # tuple order fixes the print order.
    demo_queries = (
        ("T1 AND T2:", and_operation),
        ("T1 OR T2:", or_operation),
        ("T1 AND NOT T2:", and_not_operation),
        ("T1 OR NOT T2:", or_not_operation),
    )
    for label, operation in demo_queries:
        print(label, operation(inverted_index, term1, term2))

T1 AND T2: ['file382.txt', 'file1.txt', 'file115.txt']
T1 OR T2: ['file62.txt', 'file999.txt', 'file152.txt', 'file433.txt', 'file939.txt', 'file553.txt', 'file810.txt', 'file78.txt', 'file657.txt', 'file696.txt', 'file759.txt', 'file251.txt', 'file439.txt', 'file854.txt', 'file861.txt', 'file934.txt', 'file835.txt', 'file593.txt', 'file273.txt', 'file806.txt', 'file4.txt', 'file573.txt', 'file96.txt', 'file28.txt', 'file148.txt', 'file476.txt', 'file811.txt', 'file467.txt', 'file26.txt', 'file99.txt', 'file699.txt', 'file91.txt', 'file428.txt', 'file879.txt', 'file457.txt', 'file183.txt', 'file330.txt', 'file466.txt', 'file13.txt', 'file857.txt', 'file491.txt', 'file238.txt', 'file435.txt', 'file684.txt', 'file64.txt', 'file740.txt', 'file546.txt', 'file101.txt', 'file516.txt', 'file618.txt', 'file465.txt', 'file444.txt', 'file518.txt', 'file639.txt', 'file108.txt', 'file539.txt', 'file781.txt', 'file708.txt', 'file704.txt', 'file611.txt', 'file956.txt', 'file988.txt', 'file882.txt', 

In [7]:
import os
import nltk
import string

# Re-run of the NLTK resource downloads. 'stopwords' is read by the
# stop-word-aware preprocess_text defined in a later cell.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
def get_all_documents(directory):
    """Return the set of ``.txt`` file names found directly inside ``directory``.

    Args:
        directory: Path of the folder to list (not searched recursively).

    Returns:
        set[str]: Bare file names (no directory part) ending in ".txt".
    """
    return {entry for entry in os.listdir(directory) if entry.endswith(".txt")}

# Snapshot of every .txt file name in the corpus directory. evaluate_query
# reads the notebook-level `all_documents` as the universe for NOT operators.
directory = "text_files_preprocessed"
all_documents = get_all_documents(directory)

In [9]:
def not_operation(inverted_index, documents, term):
    """Return the members of ``documents`` that do not contain ``term``.

    Args:
        inverted_index: Mapping of term -> iterable of document names.
        documents: Iterable of candidate document names.
        term: Term whose documents are removed.

    Returns:
        list: ``documents`` minus ``term``'s postings (unordered,
        de-duplicated). All of ``documents`` when ``term`` is not indexed.
    """
    excluded = set(inverted_index[term]) if term in inverted_index else set()
    return list(set(documents) - excluded)

def preprocess_text(text):
    """Tokenize lower-cased ``text`` and drop punctuation, English stop words and blanks.

    Args:
        text: Raw query or document text.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    raw_tokens = nltk.word_tokenize(text.lower())
    stop_words = set(nltk.corpus.stopwords.words('english'))
    kept = []
    for token in raw_tokens:
        # `in string.punctuation` is a substring test on the punctuation string.
        if token in string.punctuation or token in stop_words:
            continue
        if token.strip() != '':
            kept.append(token)
    return kept

def load_inverted_index(pickle_file):
    """Return the object stored in the pickle file at ``pickle_file``.

    Args:
        pickle_file: Path to a binary file produced by ``pickle.dump``.

    Returns:
        The unpickled object (here the term -> documents dict).
    """
    # NOTE: pickle.load can run arbitrary code from the file; only load
    # pickles produced by this notebook.
    with open(pickle_file, 'rb') as source:
        loaded_index = pickle.load(source)
        return loaded_index

def and_operation(inverted_index, documents, term):
    """Keep only the members of ``documents`` that also contain ``term``.

    Args:
        inverted_index: Mapping of term -> iterable of document names.
        documents: Iterable of document names accumulated so far.
        term: Term that must also be present.

    Returns:
        list: Intersection of ``documents`` with ``term``'s postings
        (unordered); empty when ``term`` is not indexed.
    """
    if term in inverted_index:
        return list(set(documents) & set(inverted_index[term]))
    return []

def or_operation(inverted_index, documents, term):
    """Return ``documents`` extended with every document containing ``term``.

    Args:
        inverted_index: Mapping of term -> iterable of document names.
        documents: Iterable of document names accumulated so far. It is
            never modified.
        term: Term whose documents are added.

    Returns:
        list: Union of ``documents`` and ``term``'s postings (unordered,
        de-duplicated).
    """
    # Work on a copy: calling .update() on the argument would change the
    # caller's set in place and fails outright when a list is passed.
    merged = set(documents)
    if term in inverted_index:
        merged.update(inverted_index[term])
    return list(merged)

def and_not_operation(inverted_index, documents, term):
    """Return the members of ``documents`` that do not contain ``term``.

    Args:
        inverted_index: Mapping of term -> iterable of document names.
        documents: Iterable of document names accumulated so far.
        term: Term whose documents are removed.

    Returns:
        list: A new list holding ``documents`` minus ``term``'s postings
        (unordered, de-duplicated).
    """
    # Always build a fresh list. Handing back `documents` itself when the
    # term is not indexed made the return type depend on the input (a set
    # could come back) and aliased the caller's object.
    return list(set(documents) - set(inverted_index.get(term, [])))

def or_not_operation(inverted_index, documents, term, all_documents=None):
    """Return ``documents`` plus every document that does not contain ``term``.

    This is ``documents OR NOT term``. Removing ``term``'s postings from
    ``documents`` would answer ``documents AND NOT term`` instead.

    Args:
        inverted_index: Mapping of term -> iterable of document names.
        documents: Iterable of document names accumulated so far.
        term: Term whose absence also qualifies a document.
        all_documents: Optional iterable naming every document in the
            collection. When omitted, the universe is ``documents`` plus
            every document that appears in any posting list of the index.

    Returns:
        list: The qualifying documents (unordered, de-duplicated).
    """
    current = set(documents)
    if all_documents is None:
        # Without an explicit universe, NOT can only range over the
        # documents that are known from the arguments.
        universe = set(current)
        for postings in inverted_index.values():
            universe.update(postings)
    else:
        universe = set(all_documents)
    without_term = universe - set(inverted_index.get(term, []))
    return list(current | without_term)


def evaluate_query(inverted_index, query, operations, universe=None):
    """Evaluate a boolean query strictly left to right.

    The query ``"t0 t1 t2"`` with operations ``["AND", "OR NOT"]`` is
    evaluated as ``((t0 AND t1) OR NOT t2)``. The running result starts as
    the postings of the first term, and each operator then combines it with
    the next term. A query with one term and no operations returns that
    term's postings.

    Args:
        inverted_index: Mapping of term -> iterable of document names.
        query: Whitespace-separated query terms.
        operations: Sequence of operators, one fewer than the number of
            terms. Each is "AND", "OR", "AND NOT" or "OR NOT".
        universe: Optional iterable naming every document in the collection,
            used to complement a term for "OR NOT". When omitted, the
            notebook-level ``all_documents`` is used.

    Returns:
        list: Names of the matching documents (unordered).

    Raises:
        ValueError: If the number of terms is not ``len(operations) + 1``,
            or an operator is not one of the four supported strings.
    """
    query_terms = query.split()
    if len(query_terms) != len(operations) + 1:
        raise ValueError("Invalid query format")

    def postings_of(term):
        # A term that is not indexed behaves as an empty posting list.
        return set(inverted_index.get(term, []))

    def complement_of(term):
        # Resolved lazily so queries without "OR NOT" never need the
        # document universe.
        every_document = all_documents if universe is None else universe
        return set(every_document) - postings_of(term)

    result = postings_of(query_terms[0])
    for operator, term in zip(operations, query_terms[1:]):
        if operator == "AND":
            result &= postings_of(term)
        elif operator == "OR":
            result |= postings_of(term)
        elif operator == "AND NOT":
            # Plain difference: an unindexed term removes nothing, so the
            # running result is kept rather than being emptied.
            result -= postings_of(term)
        elif operator == "OR NOT":
            result |= complement_of(term)
        else:
            raise ValueError("Invalid operator: " + operator)

    return list(result)



def preprocess_input(input_sequence):
    """Normalize each raw query into a space-joined string of its tokens.

    Args:
        input_sequence: Iterable of raw query strings.

    Returns:
        list[str]: One entry per input query, in order, holding the tokens
        produced by ``preprocess_text`` joined with single spaces.
    """
    return [" ".join(preprocess_text(raw_query)) for raw_query in input_sequence]


if __name__ == "__main__":
    # Interactive driver. It reads N, then for each query one line of query
    # text followed by one line of comma-separated operators.
    inverted_index = load_inverted_index("inverted_index.pickle")

    N = int(input("Enter the number of queries: "))
    queries = []
    for _ in range(N):
        query = input().strip()
        # Split on bare commas and trim each piece, so "AND,OR" and
        # "AND , OR" parse the same as "AND, OR". Empty pieces are dropped.
        operations = [op.strip() for op in input().split(",") if op.strip()]
        queries.append((query, operations))

    preprocessed_queries = preprocess_input([query for query, _ in queries])

    for idx, (query, operations) in enumerate(queries):
        # Evaluate before building the display string. A query whose tokens
        # no longer line up with its operators (e.g. after stop-word removal)
        # then raises evaluate_query's ValueError rather than an IndexError
        # from indexing an empty token list.
        result = evaluate_query(inverted_index, preprocessed_queries[idx], operations)

        # Interleave terms and operators: "t0 op0 t1 op1 t2".
        tokens = preprocessed_queries[idx].split()
        display_parts = [tokens[0]]
        for op, token in zip(operations, tokens[1:]):
            display_parts.extend((op, token))
        query_with_ops = " ".join(display_parts)

        print(f"Query {idx+1}: {query_with_ops}")
        print(f"Number of documents retrieved for query {idx+1}: {len(result)}")
        print(f"Names of the documents retrieved for query {idx+1}: " + ", ".join([f"{i}" for i in result]))

Query 1: highly AND recommend
Number of documents retrieved for query 1: 32
Names of the documents retrieved for query 1: file750.txt, file838.txt, file621.txt, file213.txt, file18.txt, file457.txt, file627.txt, file208.txt, file400.txt, file487.txt, file997.txt, file268.txt, file138.txt, file716.txt, file156.txt, file897.txt, file501.txt, file576.txt, file618.txt, file831.txt, file149.txt, file543.txt, file541.txt, file639.txt, file291.txt, file461.txt, file527.txt, file228.txt, file242.txt, file699.txt, file186.txt, file872.txt
Query 2: highly OR recommend
Number of documents retrieved for query 2: 96
Names of the documents retrieved for query 2: file438.txt, file621.txt, file213.txt, file863.txt, file626.txt, file592.txt, file400.txt, file257.txt, file590.txt, file199.txt, file661.txt, file851.txt, file19.txt, file576.txt, file839.txt, file985.txt, file916.txt, file84.txt, file228.txt, file211.txt, file861.txt, file130.txt, file748.txt, file838.txt, file301.txt, file18.txt, file382.

In [None]:
import os
import nltk
import string
import pickle

# Fetch the NLTK resources. This cell's preprocess_text does no stop-word
# filtering, so only 'punkt' appears to be needed here (confirm before
# removing the 'stopwords' download).
nltk.download('punkt')
nltk.download('stopwords')

# Function to preprocess text
def preprocess_text(text):
    """Lower-case ``text``, tokenize it with NLTK and drop punctuation tokens.

    Stop words are kept in this variant.

    Args:
        text: Raw query text.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    def is_kept(token):
        # `in string.punctuation` is a substring test on the punctuation
        # string; blank tokens are rejected as well.
        return token not in string.punctuation and token.strip() != ''

    return list(filter(is_kept, nltk.word_tokenize(text.lower())))

# Function to load positional index from file
def load_positional_index(pickle_file):
    """Read and return the positional index pickled in ``pickle_file``.

    Args:
        pickle_file: Path to a binary file produced by ``pickle.dump``.

    Returns:
        The unpickled object.
    """
    # NOTE: pickle.load can run arbitrary code from the file; only load
    # pickles you created yourself.
    with open(pickle_file, 'rb') as source:
        return pickle.load(source)

# Function to retrieve documents for phrase query using positional index
def retrieve_documents_for_phrase_query(positional_index, phrase_query):
    """Return the documents in which the query terms occur as a contiguous phrase.

    Args:
        positional_index: Mapping of term -> {document: positions}, where
            positions is an iterable of token offsets of the term in that
            document (assumed to be integers; confirm against the code
            that builds the index).
        phrase_query: Either the raw query text, which is tokenized with
            ``preprocess_text``, or an already-tokenized sequence of terms,
            which is used as given.

    Returns:
        list: Documents containing the terms at consecutive positions, in
        query order (unordered). Empty when the query has no terms or any
        term is missing from the index.
    """
    # Accept pre-tokenized input too, so a caller that has already run
    # preprocess_text does not end up calling .lower() on a list.
    if isinstance(phrase_query, str):
        terms = preprocess_text(phrase_query)
    else:
        terms = list(phrase_query)
    if len(terms) == 0:
        return []

    # A phrase cannot occur anywhere if one of its terms was never indexed.
    # Skipping such a term would fail later with a KeyError when its
    # positions are looked up.
    if any(term not in positional_index for term in terms):
        return []

    # Candidate documents must contain every term of the phrase.
    candidates = set(positional_index[terms[0]].keys())
    for term in terms[1:]:
        candidates &= set(positional_index[term].keys())

    matches = []
    for document in candidates:
        first_positions = positional_index[terms[0]][document]
        # Sets give constant-time membership tests for the following terms.
        following = [set(positional_index[term][document]) for term in terms[1:]]
        # The phrase starts at `start` when the k-th following term sits
        # exactly k positions after it.
        if any(
            all(start + offset in positions
                for offset, positions in enumerate(following, start=1))
            for start in first_positions
        ):
            matches.append(document)

    return matches

# Function to check if a sequence of positions is consecutive
def check_sequence(positions, index):
    """Tell whether the phrase starts at the ``index``-th occurrence of its first term.

    Args:
        positions: One position collection per phrase term, in phrase
            order, all taken from the same document. ``positions[0]`` must
            support indexing.
        index: Which occurrence of the first term (an index into
            ``positions[0]``) to try as the start of the phrase.

    Returns:
        bool: True when the k-th term occurs exactly k positions after the
        chosen start, for every later term. True for a one-term phrase.
    """
    # `index` selects the starting occurrence and is not itself an offset.
    # Each later term must sit at start + its rank in the phrase.
    start = positions[0][index]
    return all(start + offset in positions[offset] for offset in range(1, len(positions)))

# Example usage
if __name__ == "__main__":
    # Load positional index
    positional_index = load_positional_index("positional_index.pickle")

    # Input: the number of queries, then one phrase query per line
    N = int(input("Enter the number of queries: "))
    queries = []
    for _ in range(N):
        query = input().strip()
        queries.append(query)

    # Retrieve documents for each query using positional index.
    # The raw text is passed on purpose: retrieve_documents_for_phrase_query
    # tokenizes its argument itself, and handing it an already-tokenized list
    # made preprocess_text call .lower() on a list.
    for idx, query in enumerate(queries):
        result = retrieve_documents_for_phrase_query(positional_index, query)
        print(f"Number of documents retrieved for query {idx+1} using positional index: {len(result)}")
        print(f"Names of documents retrieved for query {idx+1} using positional index: " + ", ".join([f"file{i}.txt" for i in result]))
