# A Simple Boolean Retrieval System

In [1]:
from functools import total_ordering, reduce  # not essential but reduces the code we have to write
import csv   # for csv files
import re    # for regular expressions

## Postings

In [2]:
@total_ordering   # derives the remaining comparison methods (<, <=, >=) from the `__eq__` and `__gt__` defined below
class Posting:
    """ A single entry of a posting list: a reference to one document.

    Only the docID is stored, never the document itself. Postings are
    compared, ordered and hashed purely by their docID.
    """

    def __init__(self, docID):
        """ Class constructor.

        docID is the key used to look the document up in the corpus.
        """
        self._docID = docID

    def get_from_corpus(self, corpus):
        """ Returns the document corresponding to this docID from the corpus.

        The corpus is indexed with the docID (`corpus[docID]`).
        """
        return corpus[self._docID]

    def __eq__(self, other):
        """ Performs the comparison between this posting and another one.
        Since the ordering of the postings is only given by their docID,
        they are equal when their docIDs are equal.

        Returns NotImplemented for objects that are not postings, so that
        `posting == 3` evaluates to False instead of raising.
        """
        if not isinstance(other, Posting):
            return NotImplemented
        return self._docID == other._docID

    def __gt__(self, other):
        """ As in the case of __eq__, the ordering of postings is given
        by the ordering of their docIDs.

        Returns NotImplemented for objects that are not postings, which
        makes Python raise the usual TypeError for unorderable types.
        """
        if not isinstance(other, Posting):
            return NotImplemented
        return self._docID > other._docID

    def __hash__(self):
        """ Hash consistent with __eq__: equal postings hash equally.

        Defining __eq__ alone would make the class unhashable.
        """
        return hash(self._docID)

    def __repr__(self):
        """ String representation of the class (the docID), for debugging.
        """
        return str(self._docID)

## Posting Lists

In [3]:
class PostingList:
    """ An ordered list of postings, with the set operations needed to
    answer boolean queries (intersection and union).
    """

    def __init__(self):
        """ Class constructor: creates an empty posting list.
        """
        self._postings = []    # the postings, kept ordered by the methods below

    @classmethod     # alternative constructor
    def from_docID(cls, docID):
        """ A posting list can be constructed starting from a single docID.
        """
        plist = cls()
        plist._postings = [Posting(docID)]
        return plist

    @classmethod
    def from_posting_list(cls, postingList):
        """ A posting list can also be constructed by using another
        sequence of postings.

        The postings are copied into a new list, so a later destructive
        `merge` on the result does not modify the sequence passed in.
        """
        plist = cls()
        plist._postings = list(postingList)
        return plist

    def merge(self, other):
        """ Merges the other posting list to this one in a destructive
        way, i.e., modifying the current posting list. This method assumes
        that all the docIDs of the second list are not lower than the ones
        in this list, and that both posting lists are ordered. Under those
        assumptions the leading postings of `other` that are equal to the
        last posting of this list are discarded as duplicates.

        If this list is empty there is nothing to compare against and
        all the postings of `other` are appended.
        """
        if not self._postings:
            self._postings.extend(other._postings)
            return
        last = self._postings[-1]   # the last element of the current posting list
        i = 0
        # The same docID may open the other list: skip it so that it is not stored twice.
        while i < len(other._postings) and last == other._postings[i]:
            i += 1
        self._postings += other._postings[i:]

    def intersection(self, other):
        """ Returns a new posting list resulting from the intersection
        of this one and the one passed as argument.

        Both lists are assumed to be ordered; they are scanned once in
        parallel.
        """
        mine, theirs = self._postings, other._postings
        intersection = []
        i = 0
        j = 0
        while i < len(mine) and j < len(theirs):  # until we reach the end of one posting list
            if mine[i] == theirs[j]:
                intersection.append(mine[i])
                i += 1
                j += 1
            elif mine[i] < theirs[j]:
                i += 1
            else:
                j += 1
        return PostingList.from_posting_list(intersection)

    def union(self, other):
        """ Returns a new posting list resulting from the union of this
        one and the one passed as argument.

        Both lists are assumed to be ordered; a posting present in both
        lists appears only once in the result.
        """
        mine, theirs = self._postings, other._postings
        union = []
        i = 0
        j = 0
        while i < len(mine) and j < len(theirs):
            if mine[i] == theirs[j]:
                union.append(mine[i])
                i += 1
                j += 1
            elif mine[i] < theirs[j]:
                union.append(mine[i])   # mine[i] is the smaller one
                i += 1
            else:
                union.append(theirs[j])
                j += 1
        # At most one of the two lists still has elements left: append them.
        union.extend(mine[i:])
        union.extend(theirs[j:])
        return PostingList.from_posting_list(union)

    def get_from_corpus(self, corpus):
        """ Returns the list of documents of the corpus referenced by the
        postings of this list (documents, not docIDs).
        """
        return [posting.get_from_corpus(corpus) for posting in self._postings]

    def __repr__(self):
        """ String representation: the postings separated by ", ".
        """
        return ", ".join(map(str, self._postings))
    
PostingList.from_docID(10)   # creates a PostingList holding a single posting with docID 10

10

## Terms

In [4]:
class ImpossibleMergeError(Exception):
    """ Raised when trying to merge two Term objects with different terms.
    """
    pass

@total_ordering  # to have all the ordering methods defined automatically
class Term:
    """ A dictionary entry: a term together with its posting list.

    Terms are compared, ordered and hashed by their `term` attribute only;
    the posting list plays no role in comparisons.
    """

    def __init__(self, term, docID):
        """ Creates a term occurring in a single document.

        The posting list initially contains only `docID`; occurrences in
        other documents are added with `merge`.
        """
        self.term = term
        self.posting_list = PostingList.from_docID(docID)

    def merge(self, other):
        """ Merges (destructively) this term and the corresponding posting list
        with another equal term and its corresponding posting list.

        Raises ImpossibleMergeError when the two terms are different.
        """
        if self.term != other.term:  # cannot merge posting lists with different terms!
            raise ImpossibleMergeError(
                "cannot merge term {!r} into term {!r}".format(other.term, self.term)
            )
        self.posting_list.merge(other.posting_list)

    def __eq__(self, other):
        """ Two terms are equal when their `term` attributes are equal.

        Returns NotImplemented for objects that are not Term instances.
        """
        if not isinstance(other, Term):
            return NotImplemented
        return self.term == other.term

    def __gt__(self, other):
        """ Terms are ordered by their `term` attribute.

        Returns NotImplemented for objects that are not Term instances.
        """
        if not isinstance(other, Term):
            return NotImplemented
        return self.term > other.term

    def __hash__(self):
        """ Hash consistent with __eq__ (defining __eq__ alone would make
        the class unhashable).
        """
        return hash(self.term)

    def __repr__(self):
        """ String representation: "<term>: <posting list>".
        """
        return self.term + ": " + repr(self.posting_list)

# Two entries for the same term, each occurring in a single document;
# merging y into x appends y's posting to x's posting list.
x = Term("cat", 3)
y = Term("cat", 6)
x.merge(y)
print(x)

cat: 3, 6


## Inverted Index

In [5]:
# We have to do some step of tokenization and normalization

def normalize(text):
    """ A simple function to normalize a text.
    It removes everything that is not a word character, a whitespace
    or a hyphen and downcases all the text.

    The same normalization must be applied to documents and to queries,
    otherwise the terms will not match.
    """
    # Inside a character class only the leading ^ negates; any further ^ would be a
    # literal caret that is kept in the text, so the class lists \w, \s and - only once.
    no_punctuation = re.sub(r'[^\w\s-]', '', text)
    return no_punctuation.lower()

def tokenize(movie):
    """ Returns the list of tokens of a movie description.

    The description (`movie.description`) is normalized first and then
    split on whitespace; tokens are returned in order of appearance,
    duplicates included.
    """
    return normalize(movie.description).split()

class InvertedIndex:
    """ An inverted index: a list of Term objects, kept sorted by term,
    each carrying the posting list of the documents containing it.
    """

    def __init__(self):
        """ Creates an empty index.
        """
        self._dictionary = []  # list of Term objects; lookups rely on it being sorted by term

    @classmethod  # alternative constructor, called as InvertedIndex.from_corpus(corpus)
    def from_corpus(cls, corpus):
        """ Builds the index from a corpus.

        The position of a document in the corpus is used as its docID.
        Every document is tokenized with `tokenize`; a progress line is
        printed every 1000 documents.
        """
        # We cheat a little bit and use a Python dictionary keyed by token instead of
        # creating one big list of (term, docID) pairs, sorting it and merging the duplicates.
        intermediate_dict = {}
        for docID, document in enumerate(corpus):
            for token in tokenize(document):
                term = Term(token, docID)
                existing = intermediate_dict.get(token)
                if existing is None:
                    intermediate_dict[token] = term  # first occurrence of this term
                else:
                    # Explicit lookup instead of try/except KeyError: an error raised
                    # inside merge() must not be mistaken for a missing term.
                    existing.merge(term)
            if docID % 1000 == 0:  # to see the progress of the indexing
                print("ID: " + str(docID))

        idx = cls()
        idx._dictionary = sorted(intermediate_dict.values())  # list of all the terms, sorted
        return idx

    def __getitem__(self, key):
        """ Returns the posting list of the term `key`.

        Uses a binary search, which relies on the dictionary being sorted
        by term (as built by `from_corpus`).

        Raises KeyError(key) when the term is not in the index.
        """
        entries = self._dictionary
        lo, hi = 0, len(entries)
        try:
            # Find the leftmost entry whose term is not smaller than key.
            while lo < hi:
                mid = (lo + hi) // 2
                if entries[mid].term < key:
                    lo = mid + 1
                else:
                    hi = mid
            if lo < len(entries) and entries[lo].term == key:
                return entries[lo].posting_list
        except TypeError:
            # A key that cannot be compared with the terms cannot be in the index.
            pass
        raise KeyError(key)  # the key is not present!

    def __repr__(self):
        """ Short summary with the number of terms in the index.
        """
        return "A dictionary with " + str(len(self._dictionary)) + " terms"

## Reading the Corpus

This is the only part that depends on the specific corpus, because of its format: we have to read the movie metadata to obtain each title and then associate the title with the description of the movie.

The rest is independent of the specific corpus, which is how an IR system should be: except for a few essential parts, everything else should be as independent as possible of the specific corpus.

In [6]:
class MovieDescription:
    """ Plain record pairing a movie title with its plot description. """

    def __init__(self, title, description):
        """ Stores the two values as given, without any processing. """
        self.title, self.description = title, description

    def __repr__(self):
        """ A movie is displayed by its title alone. """
        shown = self.title
        return shown

    
def read_movie_descriptions(filename='data/MovieSummaries/plot_summaries.txt',
                            movie_names_file='data/MovieSummaries/movie.metadata.tsv'):
    """ Reads the corpus and returns it as a list of MovieDescription objects.

    filename: tab-separated file whose first column is the movie ID and
        whose second column is the description.
    movie_names_file: tab-separated metadata file whose first column is the
        movie ID and whose third column is the title.

    Both default to the original relative paths (not very portable, but kept
    for the sake of simplicity). Descriptions whose ID has no title in the
    metadata are skipped, and so are rows with too few columns.
    """
    # Files are read as UTF-8 instead of the platform default encoding (titles contain
    # non-ASCII characters); newline='' is what the csv module asks for on files it reads.
    names_table = {}   # key = movie ID, value = movie title
    with open(movie_names_file, 'r', encoding='utf-8', newline='') as csv_file:
        for name in csv.reader(csv_file, delimiter='\t'):
            if len(name) > 2:  # blank or truncated rows carry no title
                names_table[name[0]] = name[2]  # the first element is the ID, the third element is the title

    # Now we have all the associations between ID and title; we still miss the descriptions.
    corpus = []   # collection (list) of objects of type MovieDescription
    with open(filename, 'r', encoding='utf-8', newline='') as csv_file:
        for desc in csv.reader(csv_file, delimiter='\t'):
            if len(desc) < 2:  # blank or truncated rows carry no description
                continue
            try:      # at least in this dataset some descriptions have no matching ID
                title = names_table[desc[0]]
            except KeyError:  # no title associated with that ID: skip the description
                continue
            corpus.append(MovieDescription(title, desc[1]))  # the first element is the ID, the second the description
    return corpus

## Putting it all together

In [7]:
# Now we have to get a query from the user ...

class IRsystem:
    """ A boolean retrieval system: a corpus together with its inverted index.
    """

    def __init__(self, corpus, index):
        """ Builds the system from a corpus and an already computed index.
        """
        self._corpus = corpus
        self._index = index

    @classmethod
    def from_corpus(cls, corpus):
        """ Builds the system from the corpus alone, generating the entire
        inverted index first.
        """
        index = InvertedIndex.from_corpus(corpus)
        return cls(corpus, index)

    def answer_query(self, words):
        """ Answers an AND query: returns the list of the documents that
        contain all the given words (e.g. ['cat', 'batman']).

        The words are normalized exactly like the documents of the corpus;
        this is essential, otherwise e.g. an upper-case query word would
        never match. An empty query, or a query containing a word that is
        not in the index, returns an empty list.
        """
        norm_words = [normalize(word) for word in words]
        if not norm_words:
            return []  # nothing to intersect (reduce would fail on an empty sequence)
        try:
            postings = [self._index[word] for word in norm_words]  # one posting list per word
        except KeyError:
            return []  # a word occurring in no document makes the whole AND empty
        # Intersect the first two lists, then the result with the third one, and so on.
        plist = reduce(lambda x, y: x.intersection(y), postings)
        return plist.get_from_corpus(self._corpus)  # return the documents, not the docIDs
    
def query(ir, text):
    """ Runs a whitespace-separated query against the IR system `ir`
    and prints every document of the answer on its own line.
    """
    for hit in ir.answer_query(text.split()):
        print(hit)

## Examples

In [8]:
# Read the corpus; the returned list is not stored, the notebook just displays it (one title per movie).
read_movie_descriptions()

[Taxi Blues,
 The Hunger Games,
 Narasimham,
 The Lemon Drop Kid,
 A Cry in the Dark,
 End Game,
 Dark Water,
 Sing,
 Meet John Doe,
 Destination Meatball,
 Husband for Hire,
 Up and Down,
 Ghost In The Noonday Sun,
 Exodus,
 House Party 2,
 Forest of the Damned 2,
 Charlie Chan's Secret,
 The Biggest Fan,
 Ashes to Ashes,
 Green Dragon,
 The Rats of Tobruk,
 Red's Dream,
 A la salida nos vemos,
 Class of '61,
 Come Back, Africa,
 Nee Sneham,
 Bhagwan Dada,
 A Merry Mixup,
 Kehtaa Hai Dil Baar Baar,
 Mr. & Mrs. '55,
 Samson and Delilah,
 Getting Even,
 Love, Mary,
 The Deep End of the Ocean,
 Pieces,
 River of No Return,
 Amici miei,
 Yankee Doodle Daffy,
 The Good Life,
 Mickey's Big Game Hunt,
 Red Cliff,
 The Good, the Bad, the Weird,
 Eastern Promises,
 The Dancing Fool,
 To Fili Tis Zois,
 No Entry,
 Tahaan,
 Men In The City,
 Bikini Beach,
 The Storm Riders,
 Against Her Will: An Incident in Baltimore,
 Milk and Honey,
 La Cité de la peur,
 Cheap Kisses,
 The Saddest Music in the

In [9]:
# Posting only defines __eq__ and __gt__; `<` and `<=` are the ones derived by @total_ordering.
x = Posting(1)
y = Posting(2)
print(x < y)
print(x <= y)

True
True


In [10]:
normalize("e.g., this is A TeSt") # punctuation is removed and everything is converted to lowercase

'eg this is a test'

In [11]:
# The description (not the title) is normalized and split into tokens.
tokenize(MovieDescription("Title", "This is a movie Description."))

['this', 'is', 'a', 'movie', 'description']

In [12]:
# Keep the corpus in memory: a list of MovieDescription objects.
corpus = read_movie_descriptions()

In [13]:
# Build the inverted index; a progress line is printed every 1000 documents.
idx = InvertedIndex.from_corpus(corpus)

ID: 0
ID: 1000
ID: 2000
ID: 3000
ID: 4000
ID: 5000
ID: 6000
ID: 7000
ID: 8000
ID: 9000
ID: 10000
ID: 11000
ID: 12000
ID: 13000
ID: 14000
ID: 15000
ID: 16000
ID: 17000
ID: 18000
ID: 19000
ID: 20000
ID: 21000
ID: 22000
ID: 23000
ID: 24000
ID: 25000
ID: 26000
ID: 27000
ID: 28000
ID: 29000
ID: 30000
ID: 31000
ID: 32000
ID: 33000
ID: 34000
ID: 35000
ID: 36000
ID: 37000
ID: 38000
ID: 39000
ID: 40000
ID: 41000
ID: 42000


In [14]:
# The string representation of the index reports the number of terms it contains.
print(idx)

A dictionary with 194757 terms


In [15]:
idx['batman']  # the posting list of 'batman': the docIDs of all the descriptions containing it

334, 2990, 3463, 3519, 3545, 5510, 6854, 7105, 7358, 8467, 9503, 10360, 10727, 10933, 12458, 12492, 12967, 13095, 14199, 17366, 18875, 19381, 19675, 20598, 20808, 21070, 22147, 24393, 24484, 24658, 25866, 30601, 31272, 31508, 33213, 33638, 35356, 35980, 37238, 37389, 38152, 39092, 40499, 40596, 40821

In [16]:
# Build the IR system reusing the index computed above.
ir = IRsystem(corpus, idx)

In [17]:
ir = IRsystem.from_corpus(corpus) # alternative: build the system from the corpus alone, which re-indexes everything

ID: 0
ID: 1000
ID: 2000
ID: 3000
ID: 4000
ID: 5000
ID: 6000
ID: 7000
ID: 8000
ID: 9000
ID: 10000
ID: 11000
ID: 12000
ID: 13000
ID: 14000
ID: 15000
ID: 16000
ID: 17000
ID: 18000
ID: 19000
ID: 20000
ID: 21000
ID: 22000
ID: 23000
ID: 24000
ID: 25000
ID: 26000
ID: 27000
ID: 28000
ID: 29000
ID: 30000
ID: 31000
ID: 32000
ID: 33000
ID: 34000
ID: 35000
ID: 36000
ID: 37000
ID: 38000
ID: 39000
ID: 40000
ID: 41000
ID: 42000


In [18]:
# AND query; the words are normalized, so the capital letter in "Gandalf" does not matter.
query(ir, "frodo Gandalf")

The Lord of the Rings: The Fellowship of the Ring
The Lord of the Rings
The Hunt for Gollum
The Return of the King
Date Movie
The Lord of the Rings: The Two Towers
The Lord of the Rings: The Return of the King


In [19]:
# AND query with three words: only the movies whose description contains all of them are printed.
query(ir, "yoda luke darth")

Star Wars Episode V: The Empire Strikes Back
Something, Something, Something Dark Side
Return of the Ewok
Star Wars Episode III: Revenge of the Sith
Star Wars Episode VI: Return of the Jedi
It's a Trap!
