In [2]:
def create_bigrams(words):
    """
    Given a list of words, returns a list of bigrams
    where each bigram is a pair of consecutive words.
    """
    return [words[i] + ' ' + words[i+1] for i in range(len(words)-1)]

In [3]:
def create_bigram_inverted_index(documents):
    """
    Given a list of documents, returns a bigram inverted index
    where the keys are bigrams and the values are sets of
    document IDs or pointers that contain that bigram.
    """
    inverted_index = {}
    
    for doc_id, document in enumerate(documents):
        words = document.split()
        bigrams = create_bigrams(words)
        
        for bigram in bigrams:
            if bigram not in inverted_index:
                inverted_index[bigram] = set()
            inverted_index[bigram].add(doc_id)
            
    return inverted_index
    

In [4]:
import os

# current directory
curr_dir = os.getcwd()

# dataset path

path = curr_dir+'\Dataset'
print(path)   

# all files
files = os.listdir(path)
print(files)

C:\Users\Rohit Kesarwani\python_files\Ashutosh_assign\Dataset
['cranfield0001', 'cranfield0002', 'cranfield0003', 'cranfield0004', 'cranfield0005', 'cranfield0006', 'cranfield0007', 'cranfield0008', 'cranfield0009', 'cranfield0010', 'cranfield0011', 'cranfield0012', 'cranfield0013', 'cranfield0014', 'cranfield0015', 'cranfield0016', 'cranfield0017', 'cranfield0018', 'cranfield0019', 'cranfield0020', 'cranfield0021', 'cranfield0022', 'cranfield0023', 'cranfield0024', 'cranfield0025', 'cranfield0026', 'cranfield0027', 'cranfield0028', 'cranfield0029', 'cranfield0030', 'cranfield0031', 'cranfield0032', 'cranfield0033', 'cranfield0034', 'cranfield0035', 'cranfield0036', 'cranfield0037', 'cranfield0038', 'cranfield0039', 'cranfield0040', 'cranfield0041', 'cranfield0042', 'cranfield0043', 'cranfield0044', 'cranfield0045', 'cranfield0046', 'cranfield0047', 'cranfield0048', 'cranfield0049', 'cranfield0050', 'cranfield0051', 'cranfield0052', 'cranfield0053', 'cranfield0054', 'cranfield0055', 'c

In [5]:
import os
import re
import pickle

curr_dir = os.getcwd()

# dataset path

directory = curr_dir+'\Dataset'
print(curr_dir)
# directory = "/content/drive/MyDrive/data/CSE508_Winter2023_Dataset/CSE508_Winter2023_Dataset"
documents = []
doc_list = os.listdir(directory)
doc_list.sort(key=lambda f: int(re.sub('\D', '', f)))
for filename in doc_list:
    file_path = os.path.join(directory, filename)
    with open(file_path, "r") as f:
        documents.append(f.read())
        # print(documents)
# print(documents)
inverted_index = create_bigram_inverted_index(documents)
print(inverted_index)

# Save inverted index to disk
bigram_pkl = "bigram_index_file.pkl"
with open(bigram_pkl, "wb") as f:
    pickle.dump(inverted_index, f)

C:\Users\Rohit Kesarwani\python_files\Ashutosh_assign


In [6]:
def create_positional_index(documents):
    """
    Given a list of documents, returns a positional index
    where the keys are terms and the values are dictionaries
    that map document IDs to lists of positions where the term
    appears in the document.
    """
    index = {}
    
    for doc_id, document in enumerate(documents):
        words = document.split()
        
        for position, word in enumerate(words):
            if word not in index:
                index[word] = {}
            if doc_id not in index[word]:
                index[word][doc_id] = []
            index[word][doc_id].append(position)
            
    return index

In [7]:
import os
import re
import pickle

curr_dir = os.getcwd()
directory = curr_dir+'\Dataset'

# directory = "/content/drive/MyDrive/data/CSE508_Winter2023_Dataset/CSE508_Winter2023_Dataset"
documents = []
doc_list = os.listdir(directory)
doc_list.sort(key=lambda f: int(re.sub('\D', '', f)))
for filename in doc_list:
    file_path = os.path.join(directory, filename)
    with open(file_path, "r") as f:
        documents.append(f.read())
        
positional_index = create_positional_index(documents)
print(positional_index)




In [8]:
# Save Positional index to disk
positional_pkl = "positional_index_file.pkl"

with open(positional_pkl, "wb") as f:
    pickle.dump(positional_index, f)

In [9]:
import os
import pickle
from collections import defaultdict
import re
from nltk.stem import WordNetLemmatizer
import pandas as pd
from nltk.corpus import stopwords
import string 
import warnings
warnings.filterwarnings('ignore')

In [10]:
def preprocess(text):
    lem = WordNetLemmatizer()
    text = text.lower()
    text = re.sub('[^A-Z a-z ]+', ' ', text)
    text = text.split()
    tokens = []
    for j in text :
        tokens.append(j)
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    tokens = [token for token in tokens if token.strip()]
    tokens = [lem.lemmatize(token) for token in tokens]
    tokens = ' '.join(tokens)
    return tokens

In [11]:
import os
import re
curr_dir = os.getcwd()

path = curr_dir+'\Dataset'
print(path)   

files = os.listdir(path)
print(files)

C:\Users\Rohit Kesarwani\python_files\Ashutosh_assign\Dataset
['cranfield0001', 'cranfield0002', 'cranfield0003', 'cranfield0004', 'cranfield0005', 'cranfield0006', 'cranfield0007', 'cranfield0008', 'cranfield0009', 'cranfield0010', 'cranfield0011', 'cranfield0012', 'cranfield0013', 'cranfield0014', 'cranfield0015', 'cranfield0016', 'cranfield0017', 'cranfield0018', 'cranfield0019', 'cranfield0020', 'cranfield0021', 'cranfield0022', 'cranfield0023', 'cranfield0024', 'cranfield0025', 'cranfield0026', 'cranfield0027', 'cranfield0028', 'cranfield0029', 'cranfield0030', 'cranfield0031', 'cranfield0032', 'cranfield0033', 'cranfield0034', 'cranfield0035', 'cranfield0036', 'cranfield0037', 'cranfield0038', 'cranfield0039', 'cranfield0040', 'cranfield0041', 'cranfield0042', 'cranfield0043', 'cranfield0044', 'cranfield0045', 'cranfield0046', 'cranfield0047', 'cranfield0048', 'cranfield0049', 'cranfield0050', 'cranfield0051', 'cranfield0052', 'cranfield0053', 'cranfield0054', 'cranfield0055', 'c

In [13]:
query_number = int(input())

for query in range(query_number):
    query = input("Enter query")
    processed_query = preprocess(query)
    print(processed_query)
    query_bigram = create_bigram_inverted_index([processed_query])
#     print(query_bigram)

    number_of_documents = 0
    for i in query_bigram.keys():
        if i in inverted_index.keys():
            number_of_documents += len(inverted_index[i])
            documentIds = list(inverted_index[i])
            
            for file in documentIds:
                print(files[file])
    print("Number of documents: ",number_of_documents)

1
Enter queryheating rate may occur
heating rate may occur
cranfield0164
cranfield0005
cranfield0522
cranfield0717
cranfield1104
cranfield0982
cranfield0635
cranfield1344
cranfield0005
cranfield0962
cranfield0005
cranfield1071
cranfield1393
cranfield0626
cranfield0822
Number of documents:  15


In [14]:
query_number = int(input())

for query in range(query_number):
    query = input("Enter query")
    processed_query = preprocess(query)
    print(processed_query)
    query_positional = create_positional_index([processed_query])
#     print(query_positional)

    number_of_documents = 0
    for i in query_positional.keys():
        if i in positional_index.keys():
            number_of_documents += len(positional_index[i])
            documentIDs = []
            documentIDs_new = list(positional_index[i])
            for ids in documentIDs_new:
                if ids not in documentIDs:
                    documentIDs.append(ids)
#             print(documentIDs)    
            for file in documentIDs:
                print(files[file])
    print("Number of documents: ",number_of_documents)

2
Enter queryheating rate may occur
heating rate may occur
cranfield0005
cranfield0029
cranfield0051
cranfield0066
cranfield0077
cranfield0102
cranfield0123
cranfield0135
cranfield0142
cranfield0158
cranfield0163
cranfield0164
cranfield0294
cranfield0342
cranfield0378
cranfield0395
cranfield0406
cranfield0438
cranfield0481
cranfield0500
cranfield0509
cranfield0522
cranfield0546
cranfield0553
cranfield0554
cranfield0579
cranfield0606
cranfield0620
cranfield0621
cranfield0635
cranfield0707
cranfield0711
cranfield0715
cranfield0717
cranfield0746
cranfield0767
cranfield0859
cranfield0860
cranfield0865
cranfield0877
cranfield0886
cranfield0966
cranfield0968
cranfield0978
cranfield0981
cranfield0982
cranfield1027
cranfield1097
cranfield1098
cranfield1104
cranfield1106
cranfield1147
cranfield1177
cranfield1178
cranfield1219
cranfield1250
cranfield1305
cranfield1344
cranfield1346
cranfield1354
cranfield1361
cranfield1379
cranfield0005
cranfield0024
cranfield0059
cranfield0061
cranfield0088
cra

In [15]:
import sys
query_number = int(input())

for query_n in range(query_number):
    query = input("Enter query: ")
    len_query = len(query.split(" "))
    if len_query>5:
        print("query exceeds length 5")
        sys.exit()
        
#     print(len_query)
    query = ' '.join(query)
    processed_query = preprocess(query)
#     print(processed_query)
    query_bigram = create_bigram_inverted_index([processed_query])
#     print(query_bigram)

    print("Name of documents retrieved from query", query_n+1 ,"using bigram inverted index:", end=" ")
    number_of_documents = 0
    for i in query_bigram.keys():
        if i in inverted_index.keys():
            number_of_documents += len(inverted_index[i])
            documentIds = list(inverted_index[i])
            
            for file in documentIds:
                print(files[file], end=",")
    print()
    print("Number of documents retrieved from query", query_n+1 ,"using bigram inverted index:", number_of_documents)

    
    query_positional = create_positional_index([processed_query])
#     print(query_positional)

    print("Name of documents retrieved from query", query_n+1 ,"using positional index:", end=" ")
    number_of_documents = 0
    documentIDs = []
    for i in query_positional.keys():
        if i in positional_index.keys():
            number_of_documents += len(positional_index[i])
            documentIDs_new = list(positional_index[i])
            for ids in documentIDs_new:
                if ids not in documentIDs:
                    documentIDs.append(ids)
#             print(documentIDs)    
    for file in documentIDs:
        print(files[file],end=",")
    print()
    print("Number of documents retrieved from query", query_n+1 ,"using positional index:",number_of_documents)


2
Enter query: heating rate may occur
Name of documents retrieved from query 1 using bigram inverted index: cranfield0868,cranfield0522,cranfield1327,cranfield0654,
Number of documents retrieved from query 1 using bigram inverted index: 4
Name of documents retrieved from query 1 using positional index: cranfield0008,cranfield0148,cranfield0156,cranfield0174,cranfield0259,cranfield0371,cranfield0377,cranfield0493,cranfield0506,cranfield0548,cranfield0606,cranfield0616,cranfield0620,cranfield0622,cranfield0667,cranfield0731,cranfield0739,cranfield0811,cranfield0827,cranfield1056,cranfield1061,cranfield1324,cranfield1365,cranfield0014,cranfield0018,cranfield0021,cranfield0085,cranfield0099,cranfield0131,cranfield0136,cranfield0206,cranfield0213,cranfield0229,cranfield0239,cranfield0252,cranfield0257,cranfield0260,cranfield0262,cranfield0263,cranfield0300,cranfield0315,cranfield0332,cranfield0410,cranfield0414,cranfield0415,cranfield0428,cranfield0429,cranfield0437,cranfield0443,cranfield0