# A Boolean Retrieval System

In [1]:
from functools import total_ordering, reduce  # not essential but reduces the code we have to write
import csv     # for csv files
import re      # for regular expressions
import pickle  # to save the index
import time
import os.path
import copy

## Postings

A `Posting` object is simply the docID of a document. It has a method `get_from_corpus` that given the corpus retrieves the document corresponding to that docID. Then it has some comparison methods to check if two docID are equal, one greater than the other, etc.

In [2]:
@total_ordering   # takes a class where we have defined at least the methods `eq` and `gt`/`lt` and defines in a consistent way all the other methods (otherwise we should implement them all by hand)
class Posting:
    
    def __init__(self, docID):
        """ Class constructor.
        """
        self._docID = docID
        
    def get_from_corpus(self, corpus):  # return from the corpus the doc corresponding to that docID. In the list you only save the docID, not the all document
        """ Returns the document corresponding to that docID from the corpus.
        """
        return corpus[self._docID]
    
    def __eq__(self, other: 'Posting'):  # euqality comparator
        """ Performs the comparison between this posting and another one.
        Since the ordering of the postings is only given by their docID,
        they are equal when their docIDs are equal.
        """
        return self._docID == other._docID
    
    def __gt__(self, other: 'Posting'):  # greather than comparator
        """ As in the case of __eq__, the ordering of postings is given
        by the ordering of their docIDs.
        """
        return self._docID > other._docID
    
    def __repr__(self):       # for debagging purposes to print the class
        """ String representation of the class.
        """
        return str(self._docID)

## Posting Lists

A `PostingList` object is a list of `Posting`s. You can construct an empty `PostingList` with `__init__`, or construct and initialize a `PostingList` directly with one docID with `from_docID`, or you can create a `PostingList` object with an already existing list using `from_posting_list`. Then you can merge two posting list with `merge` (the one in input will be added at the end of the one on which the mehod `merge` is called, without any checking on the total ordering of the list), you can intersect them with `intersection` or you can unify them with `union`. With `get_from_corpus` we can retrieve the documents corresponding to the docID stored in this `PostingList`.

In [3]:
class PostingList:

    _postings: list
    
    def __init__(self):
        """ Class constructor.
        """
        self._postings = []    # it has as an attribute a list of posting
        
    @classmethod     # to define another constructor. It will return another PostingList like a constructor
    def from_docID(cls, docID):
        """ A posting list can be constructed starting from a single docID.
        """
        plist = cls()
        plist._postings = [(Posting(docID))]
        return plist
    
    @classmethod
    def from_posting_list(cls, postingList):
        """ A posting list can also be constructed by using another posting list.
        """
        plist = cls()
        plist._postings = postingList   # we use it as the postins of this PostingList
        return plist
    
    def merge(self, other: 'PostingList'):  # we have to merge postinglists
        """ Merges the other posting list to this one in a desctructive
        way, i.e., modifying the current posting list. This method assumes
        that all the docIDs of the second list are higher than the ones
        in this list. It assumes the two posting lists to be ordered
        and non-empty. Under those assumptions duplicate docIDs are
        discarded.
        """
        i = 0
        last = self._postings[-1]   # the self element of the current postinglist
        while (i < len(other._postings) and last == other._postings[i]):  # we can have the same docID multiple times and when e merge them we don't want them multiple times
            i += 1
        self._postings += other._postings[i:]
        
    def intersection(self, other: 'PostingList'):
        """ Returns a new posting list resulting from the intersection
        of this one and the one passed as argument.
        """
        intersection = []
        i = 0
        j = 0
        while (i < len(self._postings) and j < len(other._postings)):  # until we reach the end of a posting list
            if (self._postings[i] == other._postings[j]):
                intersection.append(self._postings[i])
                i += 1
                j += 1
            elif (self._postings[i] < other._postings[j]):
                i += 1
            else:
                j += 1
        return PostingList.from_posting_list(intersection)
    
    def union(self, other: 'PostingList'):
        """ Returns a new posting list resulting from the union of this
        one and the one passed as argument.
        """
        union = []
        i = 0
        j = 0
        while (i < len(self._postings) and j < len(other._postings)):
            if (self._postings[i] == other._postings[j]):
                union.append(self._postings[i])
                i += 1
                j += 1
            elif (self._postings[i] < other._postings[j]):
                union.append(self._postings[i])   # because i is the smallest one
                i += 1
            else:
                union.append(other._postings[j]) 
                j += 1
        for k in range(i, len(self._postings)):  # we have to append the remaining elements of the non emptied list
            union.append(self._postings[k])
        for k in range(j, len(other._postings)):
            union.append(other._postings[k])
        return PostingList.from_posting_list(union)

    def difference(self, other: 'PostingList'):
      difference = []

      return PostingList.from_posting_list(difference)
    
    def get_from_corpus(self, corpus):   # used when we have a posting list that is the result of a query, but I don't want the docID, I want the docs!
        return list(map(lambda x: x.get_from_corpus(corpus), self._postings))  # I return a list of documents
    
    def __getitem__(self, key):
        return self._postings[key]
    
    def __len__(self):
        return len(self._postings)
    
    def __repr__(self):
        return ", ".join(map(str, self._postings))

## Terms

A `Term` object contains both the word itself and the `PostingList` with all the docIDs of the documents in which the word is contained. The `merge` function merges the `PostingList`s of two equal `Term`s. Then we have some comparison methods to check if two `Term`s are equal or one is greater then the other, etc.

In [4]:
class ImpossibleMergeError(Exception):
    pass

@total_ordering  # to have all the ordering methods defined automatically
class Term:

    posting_list: PostingList
    
    def __init__(self, term, docID):   # we create a term with a DocID, we sort them and we merge the equal terms
        self.term = term
        self.posting_list = PostingList.from_docID(docID)
        
    def merge(self, other: 'Term'):   # when we merge two terms
        """ Merges (destructively) this term and the corresponding posting list
        with another equal term and its corrsponding posting list.
        """
        if (self.term == other.term): # cannot merge posting lists with different terms!
            self.posting_list.merge(other.posting_list)  # merge the current posting list with the one of the other
        else: 
            raise ImpossibleMergeError # (some kind of error) error of impossible merge
            
    def __eq__(self, other: 'Term'):
        return self.term == other.term
    
    def __gt__(self, other: 'Term'):
        return self.term > other.term
    
    def __repr__(self):
        return self.term + ": " + repr(self.posting_list)

## Inverted Index

In [5]:
# We have to do some step of tokenization and normalization

def normalize(text):
    """ A simple funzion to normalize a text.
    It removes everything that is not a word, a space or an hyphen
    and downcases all the text.
    """
    no_punctuation = re.sub(r'[^\w^\s^-]', '', text)  # the text that matches a certain pattern will be substittuted with the second expression. ^\w → not something alphanumeric, ^\s → not some space, ^- → not a dash, replace it with '', the empty string
    downcase = no_punctuation.lower()  # put everything to lower case
    return downcase

def tokenize(movie: 'MovieDescription'):
    """ Returns a list, which is a posting list, from a movie
    description of all tokens present in the description.
    """
    text = normalize(movie.description)
    return list(text.split())

Function to print a progress bar, taken from [here](https://stackoverflow.com/questions/3160699/python-progress-bar).

In [6]:
import time, sys

def update_progress(progress):
    """ Displays or updates a console progress bar.
    Accepts a float between 0 and 1. Any int will be converted to a float.
    A value under 0 represents a 'halt'.
    A value at 1 or bigger represents 100%
    """
    barLength = 40 # Modify this to change the length of the progress bar
    status = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
        status = "error: progress var must be float\r\n"
    if progress < 0:
        progress = 0
        status = "Halt...\r\n"
    if progress >= 1:
        progress = 1
        status = "Done!\r\n"
    block = int(round(barLength*progress))
    text = "\r[{0}] {1}% {2}".format( "#"*block + "."*(barLength-block), round(progress*100, 2), status)
    sys.stdout.write(text)
    sys.stdout.flush()

An `InvertedIndex` object contains a dictionary with as keys the words and as values the `Term` associated to that word, which, we reall, contains the `PostingList` associated to the word.

#### Data structure

Python dictionaries aren’t always what you need: the most important case is where you want to store a very large mapping. When a Python dictionary is accessed, the whole dictionary has to be unpickled and brought into memory.

BTrees are a balanced tree data structure that behave like a binary tree but distribute keys throughout a number of tree nodes and each node has between $a$ and $b$ children. The nodes are stored in sorted order. Nodes are then only unpickled and brought into memory as they’re accessed, so the entire tree doesn’t have to occupy memory (unless you really are touching every single key).

In [7]:
from typing import List

class InvertedIndex:

    _dictionary: List
    complete_plist: PostingList
    
    def __init__(self):
        self._dictionary = []
        self.complete_plist = PostingList() # PostingList of all the documents
        
    @classmethod  # instead of having this method associated to a specific instance/object of the class InvertedIndex we write InvertedIndex.from_corpus(). Because you can have only one __init__ method, so you use @classmethod to have multiple constructors. It's like a static method in Java
    def from_corpus(cls, corpus: list):
        # Here we "cheat" by using python dictionaries
        intermediate_dict = {}   # we cheat a little bit and use a Python dictionary → we should create a big list, sort it and merge everything
        print("Processing the corpus to create the index...")
        for docID, document in enumerate(corpus): # NB: corpus: collection (list) of objects of type MovieDescription
            if docID == 0:
                plist = PostingList.from_docID(docID)
            else:
                plist.merge(PostingList.from_docID(docID)) # I update the PostingList of all the docs
            tokens = tokenize(document) # document is a MovieDescription object
            for token in tokens:
                term = Term(token, docID)
                try:
                    intermediate_dict[token].merge(term)  # I merge the two posting lists → Term.merge() which calls PostingList.merge()
                except KeyError:
                    intermediate_dict[token] = term # for when the term is not present in the dict
            # To observe the progressing of our indexing
            update_progress(docID/len(corpus))
        
        idx = cls()  # we call the constructor of the class = InvertedIndex
        idx._dictionary = sorted(intermediate_dict.values())  # list of all the sorted terms
        idx.complete_plist = plist
        return idx
    
    def __getitem__(self, key): # indexing the inverted index using as keys the terms
        for term in self._dictionary:  # we could do a binary search
            if term.term == key:
                return term.posting_list  # quering the index with a  word returns the PostingList associated to that word
        raise KeyError(f"The term '{key}' is not present in the index.") # the key is not present!
        
    def __repr__(self):
        return "A dictionary with " + str(len(self._dictionary)) + " terms"

## Reading the Corpus

A `MovieDescription` object has a title and a description.  We have some comparison methods to check if two `MovieDescription`s are equal or one is greater then the other, etc. The function `hash` computes the hash of a `MovieDescription` using the hash of its title and its description.

We have implemented the comparison methods to make `MovieDescription` a sortable object (so we can iterate on it), and the `hash` method to make it hashable (so we can put it in a `set`).

In [8]:
@total_ordering
class MovieDescription:  # container for all the info we have about the movie
    
    def __init__(self, title: str, description: str):
        self.title = title
        self.description = description
        
    def __eq__(self, other: 'MovieDescription'):
        return self.title == other.title
    
    def __gt__(self, other: 'MovieDescription'):
        return self.title > other.title

    def __hash__(self):
      return hash((self.title, self.description))
        
    def __repr__(self):
        return self.title  # + "\n" + self.description + "\n"

In [9]:
def read_movie_descriptions():
    filename = 'data/plot_summaries.txt'   # not very portable but done for the sake of simplicity
    movie_names_file = 'data/movie.metadata.tsv'
    with open(movie_names_file, 'r') as csv_file:
        movie_names = csv.reader(csv_file, delimiter = '\t')   # we define the csv reader
        names_table = {}   # Python dictionary with all the names of the films: key = movieID, value = movie title
        for name in movie_names:
            names_table[name[0]] = name[2] # the first element is the ID, the third elemnt is the title
    # Now we have all the associations between ID and title, we miss the move description

    with open(filename, 'r') as csv_file:
        descriptions = csv.reader(csv_file, delimiter = '\t')
        corpus = []   # collection (list) of objects of type MovieDescription
        for desc in descriptions:
            try:      # at least in this dataset there are some errors so some descriptions have not a matching ID
                movie = MovieDescription(names_table[desc[0]], desc[1]) # the first element is the ID, the second the description
                corpus.append(movie)
            except KeyError:  # in case we don't find the title associated to that ID
                # We ignore the descriptions for which we cannot find a title
                pass
        return corpus

## Edit distance

By computing the edit distance we can find the set of words that are the closest to a misspelled word. However, computing the edit distance on the entire dictionary can be too expensive. We can use some heuristics to limit the number of words, like looking only at words with the same initial letter (hopefully this has not been misspelled).

In [10]:
def edit_distance(u, v, print = False):
    """ Computes the edit (or Levenshtein) distance between two words u and v.
    """
    nrows = len(u) + 1
    ncols = len(v) + 1
    M = [[0] * ncols for i in range(0, nrows)]  # matrix all filled with zeros
    for i in range(0, nrows):  # we fill the first row, the trivial one
        M[i][0] = i
    for j in range(0, ncols):  # we fill the first col, the trivial one
        M[0][j] = j
    for i in range(1, nrows):
        for j in range(1, ncols):
            candidates = [M[i-1][j] + 1, M[i][j-1] + 1]
            if (u[i-1] == v[j-1]):
                candidates.append(M[i-1][j-1])
            else:
                candidates.append(M[i-1][j-1] + 1)
            M[i][j] = min(candidates)
            # To print the distance matrix
            if print:
                print(M[i][j], end="\t")
        if print:
          print()
    return M[-1][-1]  # Bottom right element of M (-1 means the last element)

def find_nearest(word, dictionary, keep_first=False):
    if keep_first:
        # If keep_first is true then we only search across the words in the dictionary starting with the same letter
        dictionary = [w for w in dictionary if w[0] == word[0]]
    # Remove comment to see the reduction in the size of the dictionary when keeping fixed the first letter
    #print(len(dictionary))
    # Apply f(x) = edit_distance(word, x) to all words in the dictionary
    distances = map(lambda x: edit_distance(word, x), dictionary)
    # Produce all the pairs (distance, term) usng zip and find one with the minimal distance.
    return min(zip(distances, dictionary))[1]

## IR System

An `IRsystem` object contains the entire corpus and the `InvertedIndex`.

In [11]:
class IRsystem:

    _corpus: list
    _index: InvertedIndex
    
    def __init__(self, corpus: list, index: 'InvertedIndex'):
        self._corpus = corpus
        self._index = index
        
    @classmethod
    def from_corpus(cls, corpus: list): # generate the entire inverted index calling the constructor
        index = InvertedIndex.from_corpus(corpus)
        return cls(corpus, index)  # retrun the constructor when we have yet the index

    def get_from_corpus(self, plist):
        return plist.get_from_corpus(self._corpus)

    def spelling_correction(self, norm_words: List[str]):
        postings = []
        for w in norm_words:
            try:
                res = self._index[w]
            except KeyError:
                dictionary = [t.term for t in self._index._dictionary]
                sub = find_nearest(w, dictionary, keep_first=True)
                print("{} not found. Did you mean {}?".format(w, sub))
                res = self._index[sub]
            postings.append(res)
        print()
        return postings

    def answer_and_query(self, words: List[str], spellingCorrection = False):
        """ AND-query, if `spellingCorrection` is `True` with spelling correction
        """
        norm_words = map(normalize, words)  # Normalize all the words. IMPORTANT!!! If the user uses upper-case we will not have ANY match! We have to perform the same normalization of the docs in the corpus on the query!
        if not spellingCorrection:
            postings = map(lambda w: self._index[w], norm_words) # get the posting list for each word → list of posting lists
        else:
            postings = self.spelling_correction(norm_words)
        plist = reduce(lambda x, y: x.intersection(y), postings)  # apply the function to the two items of the list, then apply it to the result with the third, then the result with the fourt term and so on until the end of the list
        return self.get_from_corpus(plist)

    def answer_or_query(self, words: List[str], spellingCorrection = False):
        """ OR-query, if `spellingCorrection` is `True` with spelling correction
        """
        norm_words = map(normalize, words)
        if not spellingCorrection:
            postings = map(lambda w: self._index[w], norm_words)
        else:
            postings = self.spelling_correction(norm_words)
        plist = reduce(lambda x, y: x.union(y), postings)
        return self.get_from_corpus(plist)

    def answer_not_query(self, words: List[str], spellingCorrection = False):
        """ NOT-query
        """
        norm_words = map(normalize, words)
        if not spellingCorrection:
            postings = map(lambda w: self._index[w], norm_words)
        else:
            postings = self.spelling_correction(norm_words)
        words_plist = reduce(lambda x, y: x.union(y), postings)
        print(len(words_plist), type(words_plist._postings))
        plist = copy.deepcopy(self._index.complete_plist)
        print(len(plist))
        for i in words_plist:
            if i in plist:
                plist._postings.remove(i)
        print(len(plist))
        return self.get_from_corpus(plist)

    def answer_query(self, op: str, words = None, word = None, posting = None):
        if words:
            norm_words = map(normalize, words)
            postings = map(lambda w: self._index[w], norm_words)
            if op == 'AND':
                plist = reduce(lambda x, y: x.intersection(y), postings)
            elif op == 'OR':
                plist = reduce(lambda x, y: x.union(y), postings)
            elif op == 'NOT':
                pass # plist.remove() !!!!!!!!!!!!!!!!!!!!
        else:
            norm_word = normalize(word)
            word_posting = self._index[norm_word]
            if op == 'AND':
                plist = word_posting.intersection(posting)
            elif op == 'OR':
                plist = word_posting.union(posting)
            elif op == 'NOT':
                pass #postings.remove(word) # !!!!!!!!!!!!!!!!!!!!
        return plist

In [12]:
plist = PostingList.from_docID(0)
plist.merge(PostingList.from_docID(7))
plist.merge(PostingList.from_docID(10))
plist2 = PostingList.from_docID(7)
plist2.merge(PostingList.from_docID(8))

print(f"plist: {plist}")
print(f"plist2: {plist2}")

for i in plist2._postings:
  if i in plist:
    plist._postings.remove(i)

print(f"plist: {plist}")

plist: 0, 7, 10
plist2: 7, 8
plist: 0, 10


### Queries

In [13]:
def and_query(ir: IRsystem, text: str, noprint=True):
    words = text.split()
    answer = ir.answer_and_query(words)  # list of documents
    if not noprint:
        for movie in answer:
            print(movie)
    return answer
        
def or_query(ir: IRsystem, text: str, noprint=True):
    words = text.split()
    answer = ir.answer_or_query(words)
    if not noprint:
        for movie in answer:
            print(movie)
    return answer

def not_query(ir: IRsystem, text: str, noprint=True):
    words = text.split()
    answer = ir.answer_not_query(words)
    if not noprint:
        for movie in answer:
            print(movie)
    return answer

def query(ir: IRsystem, text: str, noprint=True):
    """ This query can answer to any type of query, also complex ones. Use 'AND', 'OR' and 'NOT'
    and parenthesis to specify how to combine the words in the query.
    E.g. text = "(yoda AND darth) OR Gandalf NOT love"
    """
    text.replace('(', '( ', text.count('(')).replace(')', ' )', text.count(')')) # add a space after '(' and before ')' so to split them into separate tokens
    words = text.split()
    for i, w in enumerate(words):
        if w == "(":
            pass
        elif w == 'AND':
            if i == 1:
                print(f"i = {i}, words_ {[words[i-1], words[i+1]]}")
                partial_answer = ir.answer_query(op = w, words = [words[i-1], words[i+1]])
            else:
                partial_answer = ir.answer_query(op = w, word = words[i+1], posting = partial_answer)
        elif w == 'OR':
            if i == 1:
                print(f"i = {i}, words_ {[words[i-1], words[i+1]]}")
                partial_answer = ir.answer_query(op = w, words = [words[i-1], words[i+1]])
            else:
                partial_answer = ir.answer_query(op = w, word = words[i+1], posting = partial_answer)
        elif w == 'NOT':
            pass # !!!!!!!!!!!!!!!!!!!!
    answer = ir.get_from_corpus(partial_answer)
    if not noprint:
        for movie in answer:
            print(movie)
    return answer

### Queries with spelling correction

In [14]:
def and_query_sc(ir: IRsystem, text: str, noprint=True):
    words = text.split()
    answer = ir.answer_and_query(words, spellingCorrection=True)
    if not noprint:
        for movie in answer:
            print(movie)
    return answer

def or_query_sc(ir: IRsystem, text: str, noprint=True):
    words = text.split()
    answer = ir.answer_or_query(words, spellingCorrection=True)
    if not noprint:
        for movie in answer:
            print(movie)
    return answer

## Test queries

### Initialization

In [15]:
corpus = read_movie_descriptions()
len(corpus)

42204

#### Saving / loading the index

We will save the index using `Pickle`. `Pickle` is used for serializing and de-serializing Python object structures, also called marshalling or flattening. Serialization refers to the process of converting an object in memory to a byte stream that can be stored on disk or sent over a network. Later on, this character stream can then be retrieved and de-serialized back to a Python object.

In [16]:
updated = True

filename = "index.pickle"

# If the index is saved and it is updated I load it, otherwise I create it and save it
if os.path.isfile(filename) and updated:
    print ("Index file exists. Loading the index...")
    # load the index
    tic = time.time()
    infile = open(filename, 'rb')
    idx = pickle.load(infile)
    infile.close()
    toc = time.time()
    print("Index loaded.")
    print(f"Time: {round(toc-tic, 3)}s")
else:
    print ("Index file does not exist.")
    tic = time.time()
    idx = InvertedIndex.from_corpus(corpus)
    toc = time.time()
    print(f"\n\nTime: {round(toc-tic, 3)}s")
    # save the index
    outfile = open(filename, 'wb')
    pickle.dump(idx, outfile)
    outfile.close()

Index file exists. Loading the index...
Index loaded.
Time: 12.507s


In [17]:
print(idx)

A dictionary with 194757 terms


In [18]:
i=3907  # the term "a"
print(type(idx._dictionary[i]))
print(idx._dictionary[i])
print(idx._dictionary[i].term)
print(f"len: {len(idx._dictionary[i].posting_list)}")

<class '__main__.Term'>
a: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 117, 118, 119, 120, 121, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 2

In [19]:
ir = IRsystem(corpus, idx)

### AND queries

In [20]:
fg_and_query = and_query(ir, "frodo Gandalf", noprint=False)

The Lord of the Rings: The Fellowship of the Ring
The Lord of the Rings
The Hunt for Gollum
The Return of the King
Date Movie
The Lord of the Rings: The Two Towers
The Lord of the Rings: The Return of the King


In [21]:
yld_and_query = and_query(ir, "yoda Luke darth", noprint=False)

Star Wars Episode V: The Empire Strikes Back
Something, Something, Something Dark Side
Return of the Ewok
Star Wars Episode III: Revenge of the Sith
Star Wars Episode VI: Return of the Jedi
It's a Trap!


In [22]:
try:
    and_query(ir, "thig")
except KeyError:
    print(sys.exc_info()[1])

"The term 'thig' is not present in the index."


In [23]:
frodo_query = and_query(ir, "frodo")
frodo_set = set(frodo_query)

gandalf_query = and_query(ir, "Gandalf")
gandalf_set = set(gandalf_query)

fg_and_set = frodo_set.intersection(gandalf_set)

assert set(fg_and_query) == fg_and_set

In [24]:
yoda_query = and_query(ir, "yoda")
yoda_set = set(yoda_query)

luke_query = and_query(ir, "Luke")
luke_set = set(luke_query)

darth_query = and_query(ir, "darth")
darth_set = set(darth_query)

yld_and_set = yoda_set.intersection(luke_set).intersection(darth_set)

assert set(yld_and_query) == yld_and_set

### AND queries with spelling correction

In [25]:
mispelled_and_query = and_query_sc(ir, "yioda lukke darhth", noprint=False)

yioda not found. Did you mean yoda?
lukke not found. Did you mean luke?
darhth not found. Did you mean darth?

Star Wars Episode V: The Empire Strikes Back
Something, Something, Something Dark Side
Return of the Ewok
Star Wars Episode III: Revenge of the Sith
Star Wars Episode VI: Return of the Jedi
It's a Trap!


In [26]:
assert yld_and_query == mispelled_and_query

### OR queries

In [27]:
fy_or_query = or_query(ir, "frodo yoda", noprint=False)

Star Wars Episode V: The Empire Strikes Back
Star Wars Episode II: Attack of the Clones
George Lucas in Love
The Lord of the Rings: The Fellowship of the Ring
The Lord of the Rings
Something, Something, Something Dark Side
The Hunt for Gollum
The Return of the King
Return of the Ewok
Aliens in the Wild, Wild West
Star Wars Episode III: Revenge of the Sith
Star Wars Episode VI: Return of the Jedi
Star Wars: The Clone Wars
Date Movie
Gulliver's Travels
Lego Star Wars: The Quest for R2-D2
The Lord of the Rings: The Two Towers
It's a Trap!
The Lord of the Rings: The Return of the King
LEGO Star Wars: Revenge of the Brick


In [28]:
# frodo_query.extend(yoda_query)  # then print 'frodo_query' !!!!
fy_or_set = set(frodo_query + yoda_query)

assert set(fy_or_query) == fy_or_set

In [29]:
fyg_or_query = or_query(ir, "frodo yoda gandalf", noprint=False)

Star Wars Episode V: The Empire Strikes Back
Imaginationland Episode II
Star Wars Episode II: Attack of the Clones
George Lucas in Love
The Lord of the Rings: The Fellowship of the Ring
The Lord of the Rings
Something, Something, Something Dark Side
The Hunt for Gollum
The Return of the King
Return of the Ewok
Aliens in the Wild, Wild West
Star Wars Episode III: Revenge of the Sith
Star Wars Episode VI: Return of the Jedi
Star Wars: The Clone Wars
Date Movie
Gulliver's Travels
Lego Star Wars: The Quest for R2-D2
The Lord of the Rings: The Two Towers
It's a Trap!
The Lord of the Rings: The Return of the King
LEGO Star Wars: Revenge of the Brick


In [30]:
fyg_or_set = set(frodo_query + yoda_query + gandalf_query)

assert set(fyg_or_query) == fyg_or_set

In [31]:
love_query = and_query(ir, "love")
fyl_or_query = or_query(ir, "frodo yoda love")
fyl_or_set = set(frodo_query + yoda_query + love_query)

assert set(fyl_or_query) == fyl_or_set

### OR queries with spelling correction

In [32]:
mispelled_or_query = or_query_sc(ir, "frodoo yioda ganalf", noprint=False)

frodoo not found. Did you mean frodo?
yioda not found. Did you mean yoda?
ganalf not found. Did you mean gandalf?

Star Wars Episode V: The Empire Strikes Back
Imaginationland Episode II
Star Wars Episode II: Attack of the Clones
George Lucas in Love
The Lord of the Rings: The Fellowship of the Ring
The Lord of the Rings
Something, Something, Something Dark Side
The Hunt for Gollum
The Return of the King
Return of the Ewok
Aliens in the Wild, Wild West
Star Wars Episode III: Revenge of the Sith
Star Wars Episode VI: Return of the Jedi
Star Wars: The Clone Wars
Date Movie
Gulliver's Travels
Lego Star Wars: The Quest for R2-D2
The Lord of the Rings: The Two Towers
It's a Trap!
The Lord of the Rings: The Return of the King
LEGO Star Wars: Revenge of the Brick


In [33]:
assert fyg_or_query == mispelled_or_query

### NOT queries

In [34]:
a_not_query = not_query(ir, "a", noprint=False)

40646 <class 'list'>
42204
1558
The Dancing Fool
Wait Until Spring, Bandini
The Black Corsair
Pardon My Terror
Anything Else
Bowling Balls
Tradition is a Temple
Neberte nám princeznú
Beck – Pensionat Pärlan
Demobbed
The Spell of the Circus
Dar alanda kısa paslaşmalar
Bhagavad Gita
Farewell to the Duman River
Antigua vida mía
Naked Massacre
Union Pacific
The Light
Jack London
Jung
Titicut Follies
Combat Academy
The Green Hornet
Karishma Kudrat Ka
Mandela
Down the Road Again
Mulligans
Wiggle Time
The Horse Thief
The Notebooks of Memory
The Breaking of Bumbo
Parakrami
Odd Squad
A Sense of Entitlement
Anthem to Beauty
Jet Pink
The Brothers
The Game of Their Lives
Dobrynya Nikitich and Zmey Gorynych
Dog Days
Noah's Arc: Jumping the Broom
Sphodanam
Anandam
Eliza Fraser
Doctors' Wives
Spanish Movie
Aspiring Home Tutor: Soiled Pure Whiteness
Lasileuka
Ibo, o sangue do silêncio
The Plex
The Tsunami and the Cherry Blossom
Captain Spanky's Show Boat
Ricky Rapper
Two Small Bodies
Superdad
El Vient

In [35]:
a = 3907  # position of the term "a" in the index
documents = list(range(len(corpus)))
print(type(documents[0]))
for i in idx._dictionary[a].posting_list._postings:
    documents.remove(i._docID)
print(len(documents))
print(documents)

<class 'int'>
1558
[43, 73, 116, 122, 123, 137, 144, 162, 173, 184, 198, 280, 355, 373, 402, 405, 488, 513, 565, 598, 619, 644, 673, 688, 733, 742, 743, 752, 821, 847, 849, 921, 933, 955, 988, 992, 996, 1003, 1038, 1043, 1063, 1069, 1097, 1101, 1114, 1135, 1145, 1184, 1241, 1251, 1285, 1308, 1313, 1337, 1341, 1394, 1396, 1412, 1433, 1438, 1472, 1493, 1526, 1537, 1540, 1575, 1606, 1618, 1646, 1715, 1757, 1896, 1939, 2011, 2015, 2047, 2051, 2113, 2144, 2152, 2171, 2178, 2185, 2187, 2203, 2262, 2273, 2291, 2323, 2329, 2392, 2405, 2425, 2539, 2546, 2604, 2641, 2648, 2657, 2721, 2732, 2760, 2773, 2788, 2807, 2815, 2857, 2893, 2903, 2906, 3005, 3032, 3067, 3070, 3081, 3083, 3104, 3108, 3152, 3180, 3190, 3208, 3215, 3248, 3282, 3293, 3321, 3377, 3396, 3487, 3576, 3602, 3613, 3625, 3661, 3682, 3704, 3718, 3725, 3764, 3778, 3787, 3791, 3804, 3885, 3895, 3926, 3967, 3972, 4006, 4063, 4078, 4102, 4108, 4118, 4121, 4202, 4213, 4254, 4284, 4293, 4318, 4323, 4377, 4380, 4387, 4413, 4434, 4482, 4491,

In [36]:
documents = copy.deepcopy(corpus)
print(len(documents))
docs_delete = []
for i in idx._dictionary[a].posting_list._postings:
    docs_delete.append(documents[i._docID])

for i in docs_delete: # otherwise if you remove elements while you scan they shift and you remove the wrong ones!
    documents.remove(i)

#documents = list(set(documents))
print(f"len(documents): {len(documents)}, len(a_not_query): {len(a_not_query)}")
print(f"len(set(documents)): {len(set(documents))}, len(set(a_not_query)): {len(set(a_not_query))}")

assert len(a_not_query) == len(corpus) - len(idx._dictionary[a].posting_list)
# assert set(a_not_query) == set(documents) # Don't know why, but it fails
assert sorted(list(set(a_not_query))) == sorted(list(set(documents)))

42204
len(documents): 1558, len(a_not_query): 1558
len(set(documents)): 1557, len(set(a_not_query)): 1557


In [69]:
jj = [element for element in corpus if corpus.count(element) > 1]
print(jj)

print("corpus")
wv = []
rua = []
for i, c in enumerate(corpus):
    if c.title == "The Warrens of Virginia":
        wv.append(i)
        print("Found film 1")
    if c.title == "Robbery Under Arms":
        rua.append(i)
        print("Found film 2")
print(wv, rua)

corpus
Found film 2
Found film 1
Found film 2
Found film 1
Found film 2
[22000, 38445] [6076, 34064, 39314]


In [72]:
#contains_duplicates = any(documents.count(element) > 1 for element in documents)
j = [element for element in documents if documents.count(element) > 1]
print(j)

wv = []
rua = []
for i, d in enumerate(documents):
    if d.title == "The Warrens of Virginia":
        wv.append(i)
        print("Found film 1")
    if d.title == "Robbery Under Arms":
        rua.append(i)
        print("Found film 2")
print(wv, rua)

import collections
contains_duplicates = [item for item, count in collections.Counter(documents).items() if count > 1]
print(contains_duplicates)

seen = set()
duplicate = []
uniq = []
for x in documents:
    if x not in seen:
        uniq.append(x)
        seen.add(x)
    else:
        duplicate.append(x)
    if x.title == "The Warrens of Virginia":
        print("Found film 1")
    if x.title == "Robbery Under Arms":
        print("Found film 2")
print(duplicate)

[The Warrens of Virginia, Robbery Under Arms, The Warrens of Virginia, Robbery Under Arms]
Found film 1
Found film 2
Found film 1
Found film 2
[768, 1401] [1226, 1444]
[The Warrens of Virginia]
Found film 1
Found film 2
Found film 1
Found film 2
[The Warrens of Virginia]


In [71]:
collections.Counter(a_not_query) == collections.Counter(documents)

False

In [49]:
j_query = [element for element in a_not_query if a_not_query.count(element) > 1]
print(j_query)

contains_duplicates_query = [item for item, count in collections.Counter(a_not_query).items() if count > 1]
print(contains_duplicates_query)

seen_query = set()
uniq_query = [x for x in documents if x not in seen_query and not seen_query.add(x)] 
print(uniq)

[Robbery Under Arms, The Warrens of Virginia, The Warrens of Virginia, Robbery Under Arms]
[The Warrens of Virginia]


In [46]:
print(len(a_not_query), len(documents))
print(f"a_not_query == documents? {a_not_query == documents}")

set_query = set(a_not_query) 
set_docs = set(documents)
print(len(set_query), len(set_docs))
print(f"set_query == set_docs? {set_query == set_docs}")

u = set_query.union(set_docs)
print("u:", len(u), u)
i = set_query.intersection(set_docs)
print("i:", len(i), i)

nn = u.difference(i)
print("nn:", len(nn), nn)

d = set_docs.difference(set_query)
print("d:", len(d), d)
e = set_query.difference(set_docs)
print("e:", len(e), e)
print("d == e?", d == e)

e2 = sorted(list(e))
d2 = sorted(list(d))
print("sort(d) == sort(e)?", d2 == e2)

g = d.symmetric_difference(e)
print("g:", len(g), g)
print("nn == g?", nn == g)
p = sorted(list(g))
print("p=sort(g):", len(p), p)
print("sort(nn) == p?", sorted(list(nn)) == p)

f = set_query.symmetric_difference(set_docs)
print("f:", len(f), f)
o = sorted(list(f))
print("o:", len(o), o)
print("o[0] == o[1]?", o[0] == o[1])
print("set(o):", len(set(o)), set(o))
t = corpus.index(o[0])
v = corpus.index(o[1])
print(t, corpus[t], v, corpus[v])
print(corpus[t] == corpus[v])

print(set(f))

print("p == o?", p == o)

1558 1558
a_not_query == documents? False
1557 1557
set_query == set_docs? False
u: 1619 {Spooky Buddies, Bury Me Dead, The Thirst, My Dear Muthachan, Sinhasan, Jibon Thekey Neya, The Light, Flight to Mars, Spanish Movie, Napoleon Blown-Aparte, Jack London, Dossier K., Ashanti, Two Weeks, El Dorado, Pardon My Terror, Sahhas, The Jigsaw Man, Main Madhuri Dixit Banna Chahti Hoon, Socialism, Naked Massacre, The Christmas That Almost Wasn't, Titicut Follies, Café Cantante, Jihne Mera Dil Luteya, Madame Aema, Wedlock Deadlock, Holly Hobbie and Friends: Christmas Wishes, Tube, Musty Musketeers, Grand National Night, Dust, Вчера, Three Brothers, The Story of Two Women, Gangster Wars, Hum Se Badkar Kaun, Pretty Smart, Pauran, The Gentle Sex, Seduced By Madness, Jaal, The Fox of Glenarvon, Sara, Purple Heart, Dilwaala, A Woman's Vengeance, On with the Show, Flying G-Men, One Night, Achtung! - Auto-Diebe!, Murder C.O.D., Sh'Chur, Paul Goodman Changed My Life, Sayahnam, Little Orbit the Astrodog 

In [40]:
print(corpus[21].title)
print(corpus[21].description)
print()
print(corpus[43].title)
print(corpus[43].description)

Red's Dream
Set in a lonely city on a rainy night, the film takes place in a bicycle shop  that is closed for the night. In the corner of the shop sleeps Red, a red unicycle who languishes in the "clearance corner", waiting to be purchased. As the camera zooms on him, the sound of rain falling turns into a drumroll, and we go into the dream-sequence. In his dream, Red is being ridden by a circus clown  as part of a juggling act. The clown enters the ring, accompanied by a fanfare, expecting a huge applause, but instead receives only a few scattered claps from different parts of the  audience. Nevertheless, Lumpy starts juggling three balls whilst riding Red, occasionally dropping them as he does. However, Red slides out from underneath Lumpy  and spikes the balls back to him with his bike pedals. The confused clown ponders this for only a second before continuing on with his act. At this point, Red is forced to catch another ball which Lumpy unintentionally throws across the ring. Lump

In [41]:
lm_not_query = not_query(ir, "love mother", noprint=False)

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
The Foxy Hunter
Samsaram adhu minsaram
Ellis in Glamourland
Bellyful
Hideous!
Fulltime Killer
Tarantula
Little Big Man
Ghost Sweepers
King of the Coral Sea
The Last Hurrah
G-Force
Bedroom Eyes
Lapland Odyssey
Lonelyhearts
Happy Gilmore
28 Days Later
In Search of Gregory
Ultraman Mebius & Ultraman Brothers
Fort Saganne
Tugboat Granny
I, Robot
Foxy by Proxy
Naval Commandos
My Friend Irma Goes West
Eegah
Escape to Athena
Shaolin Rescuers
Sliver
Bloodsport 3
The Thing Called Love
Wide Awake
Fat Pizza: The Movie
Vertical Limit
The Class of Nuke 'Em High
Heeron Ka Chor
Every Day Except Christmas
Hotel de Lux
Daughter of the Tong
A Fig Leaf for Eve
Daddy's Gone A-Hunting
Public Hero No. 1
Thriller: A Cruel Picture
Distance
Class of 1999 II: The Substitute
Catch as Cats Can
Path of Destruction
Underworld: Awakening
The Vigilante Fighting Hero of the West
Brothel
There Will Be No Leave Today
Liberty & Bash
Please Give
American Madn

In [42]:
corpus_set = set(corpus)
love_set = set(love_query)
l_not_set = corpus_set.intersection(love_set)

assert set(lm_not_query) == lm_not_set

NameError: ignored

### Compex queries

In [None]:
yAdOg_query = query(ir, "yoda AND darth OR Gandalf", noprint=False)

In [None]:
yd_and_set = yoda_set.intersection(darth_set)
yAdOg_set = yd_and_set.union(gandalf_set)

assert set(yAdOg_query) == yAdOg_set

In [None]:
yOdAg_query = query(ir, "yoda OR darth AND Gandalf", noprint=False)

In [None]:
yd_and_set = yoda_set.union(darth_set)
yOdAg_set = yd_and_set.intersection(gandalf_set)

assert set(yOdAg_query) == yOdAg_set

In [None]:
yOdOgAl_query = query(ir, "yoda OR darth OR Gandalf AND love", noprint=False)

In [None]:
ydg_and_set = yoda_set.union(darth_set).union(gandalf_set)
yOdOgAl_set = ydg_and_set.intersection(love_set)

assert set(yOdOgAl_query) == yOdOgAl_set

In [None]:
#yAdOgNl_query = query(ir, "yoda AND darth OR Gandalf NOT love", noprint=False)

In [None]:
yAdOgNl_set = yAdOg_set.difference(love_set)

#assert set(yAdOgNl_query) == yAdOgNl_set

In [None]:
#query(ir, "(yoda AND darth) OR Gandalf NOT love", noprint=False)