### Now I will ask user to input queries and then I would find the relevant documents from the indexes

In [1]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def isValid(query):
    #there can be maximum of three words in a boolean query so there would be max of 5 words including operators(AND,OR,NOT)
    query = query.lower().strip() #remove any extra spaces
    if not query:
        print("Query cannot be empty")
        return False

    words = query.split()
    if len(words)>5:
        print("Only 3 words allowed at Max")
        return False
    
    for word in words:
        if not word.isalpha():
            print("Only alphabets allowed")
            return False

    operators = {'and','or','not'}
    for i in range(len(words)):
        if i%2==1:  #operators would be at odd positions
            if words[i] not in operators:
                print("Invalid Operators or invalid format")
                return False
        elif i%2==0:
            if words[i] in operators:
                print("Operator at wrong position")
                return False
    terms = []
    ops = []
    for word in words:
        if word in operators:
            ops.append(word)
        else:
            terms.append(ps.stem(word))

    return terms,ops,True

Query has been validated now I will use intersect algorithm(taught in the course) to retrieve the correct postings. I will show two pointer approach for and operation and then I will utilize set functions to show how other queries are processed

In [2]:
def findDocs(terms,ops,invertedIndex):
    all_docs = set()
    for doc_list in invertedIndex.values():
        all_docs.update(doc_list)

    if len(terms) == 1:
        if terms[0] in invertedIndex:
            return invertedIndex[terms[0]]
        else:
            print(f"Term: {terms[0]} not found in documents")
            return []
    elif len(terms) == 2:
        l1 = []
        l2 = []
        if terms[0] in invertedIndex:
            l1 = invertedIndex[terms[0]]
        else:
            print(f"Term: {terms[0]} not found in documents")
            return []
        if terms[1] in invertedIndex:
            l2 = invertedIndex[terms[1]]
        else:
            print(f"Term: {terms[1]} not found in documents")
            return []
        if len(ops) == 1:
            if ops[0] == 'and':
                ans = []
                i,j = 0,0 #Two pointer approach
                while i<len(l1) and j<len(l2):
                    if l1[i] == l2[j]:
                        ans.append(l1[i])
                        i+=1
                        j+=1
                    elif l1[i]<l2[j]:
                        i+=1
                    else:
                        j+=1
                return ans
            #Could have implemented or and not as well with two pointer but code would have been longer so i used set operations
            if ops[0] == 'or':
                return list(set(l1+l2))  #union
            if ops[0] == 'not':
                return list(set(l1)-set(l2))
    elif len(terms) == 3:
        l1 = []
        l2 = []
        l3 = []
        if terms[0] in invertedIndex:
            l1 = invertedIndex[terms[0]]
        else:
            print(f"Term: {terms[0]} not found in documents")
            return []
        if terms[1] in invertedIndex:
            l2 = invertedIndex[terms[1]]
        else:
            print(f"Term: {terms[1]} not found in documents")
            return []
        if terms[2] in invertedIndex:
            l3 = invertedIndex[terms[2]]
        else:
            print(f"Term: {terms[2]} not found in documents")
        #Logical rule = (term1 op term2) op term 3
        if ops[0] == 'and' and ops[1] == 'and':
            return list(set(l1) & set(l2) & set(l3))

        if ops[0] == 'and' and ops[1] == 'or':
            return list((set(l1) & set(l2)) | set(l3))

        if ops[0] == 'or' and ops[1] == 'and':
            return list((set(l1) | set(l2)) & set(l3))

        if ops[0] == 'or' and ops[1] == 'or':
            return list(set(l1) | set(l2) | set(l3))

        if ops[0] == 'and' and ops[1] == 'not':
            return list(set(l1) & (set(l2) - set(l3)))

        if ops[0] == 'or' and ops[1] == 'not':
            return list((set(l1) | set(l2)) - set(l3))
        
        if ops[0] == 'not' and ops[1] == 'and':
            return list((all_docs - set(l1)) & set(l3))
        
        if ops[0] == 'not' and ops[1] == 'or':
            return list((all_docs - set(l1)) | set(l3))
        
        if ops[0] == 'not' and ops[1] == 'not':
            return list(all_docs - set(l1) - set(l2))
    return []
    


### Now implementing mechanism for achieving proximity query finding

In [3]:
#Query Validation
from nltk.stem.porter import PorterStemmer

def isValidProximity(query):
    query = query.lower().strip() #remove any extra spaces
    if not query:
        print("Query cannot be empty")
        return False

    words = query.split()
    if len(words)!=2:
        print("enter only two words")
        return False
    
    for word in words:
        if not word.isalpha():
            print("Only alphabets allowed")
            return False

    terms = []
    ps = PorterStemmer()
    for word in words:
        terms.append(ps.stem(word))

    return tuple(terms),True

In [4]:
def proximitySearch(terms,k,positionalIndex):

    term1,term2 = terms
    if term1 not in positionalIndex or term2 not in positionalIndex:
        print(f"Either term: {term1} or term: {term2} not found in documents")
        return []
    
    ans = set()
    for doc in positionalIndex[term1].keys() & positionalIndex[term2].keys():  #finds common docs
        pos1 = sorted(positionalIndex[term1][doc])
        pos2 = sorted(positionalIndex[term2][doc])

        i,j = 0,0
        while i<len(pos1) and j<len(pos2):
            if abs(pos1[i]-pos2[j]) <= k:
                ans.add(doc)
                i+=1
            elif pos1[i]<pos2[j]:
                i+=1
            else:
                j+=1
    return list(ans)



### Loading Indexes

In [5]:
invertedIndex = {}
with open('22K4036_invertedIndex.txt','r') as f:
    for line in f:
        term,docIDs = line.strip().split(' -> ')
        invertedIndex[term] = eval(docIDs)


positionalIndex = {}
with open('22K4036_positionalindex.txt','r') as f:
    for line in f:
        term,docIDs = line.strip().split(' -> ')
        positionalIndex[term] = eval(docIDs) 

### Checking my models against the Gold Query Set

In [6]:
for i in range(10):
    query = input("Enter a boolean Query: ")
    terms,ops,res = isValid(query)
    print(f"The query terms are: {terms}\n")
    if res:
        ans = findDocs(terms,ops,invertedIndex)
        print(ans)
        print("------------------------------------------------------\n")
    else:
        print("Invalid Query\n")


The query terms are: ['imag', 'restor']

[359, 375]
------------------------------------------------------

The query terms are: ['deep', 'learn']

[23, 24, 174, 175, 176, 177, 213, 245, 247, 250, 254, 258, 267, 272, 273, 278, 279, 281, 325, 333, 345, 346, 347, 348, 352, 357, 358, 360, 362, 371, 373, 374, 375, 380, 381, 382, 396, 397, 401, 404, 405, 415, 421, 432, 444]
------------------------------------------------------

The query terms are: ['autoencod']

[187, 273, 279, 325, 333, 405]
------------------------------------------------------

The query terms are: ['tempor', 'deep', 'learn']

[358, 373, 405, 279]
------------------------------------------------------

The query terms are: ['time', 'seri']

[40, 54, 110, 111, 112, 113, 158, 163, 173, 180, 181, 202, 220, 237, 238, 239, 240, 258, 277, 283, 295, 305, 350, 405, 421, 437, 438, 445]
------------------------------------------------------

The query terms are: ['time', 'seri', 'classif']

[40, 237, 283, 445]
------------------

In [7]:
for i in range(3):
    query = input("Enter a proximity Query: ")
    terms,res = isValidProximity(query)
    print(f"The query terms are: {terms}\n")
    k = int(input("Enter value of K: "))
    if res:
        ans = proximitySearch(terms,k,positionalIndex)
        print(ans)
        print("------------------------------------------------------\n")
    else:
        print("Invalid Query\n")

The query terms are: ('neural', 'inform')

[]
------------------------------------------------------

The query terms are: ('neural', 'inform')

[26]
------------------------------------------------------

The query terms are: ('featur', 'track')

[212, 13]
------------------------------------------------------

