In [37]:
import re

In [2]:
with open("sample.txt", "r") as input:
    input_ = input.read().split("\n\n")   #\n\n denotes there is a blank line in between paragraphs.

In [3]:
input_

['Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce tristique tellus dolor, porta dapibus lacus feugiat eu. Mauris non sem neque. Curabitur et neque diam. Sed elementum vitae lacus sit amet sodales. Quisque metus dui, sodales ac mollis quis, facilisis quis sapien. Curabitur sed placerat lorem. Pellentesque scelerisque ultricies elementum. Maecenas arcu massa, fringilla a odio non, ullamcorper aliquet lectus. Morbi eu odio eget enim blandit scelerisque. Aenean vitae nibh blandit velit ultrices consequat. Nam rutrum arcu magna, a auctor erat eleifend at. Nullam consequat ante ante, quis ultrices est lobortis congue. Etiam nec laoreet velit.',
 'Nullam nec odio urna. Ut sed felis congue, imperdiet enim quis, tempus metus. Suspendisse lobortis eu ligula sit amet pellentesque. Curabitur pulvinar ullamcorper mattis. Maecenas neque sapien, congue vel orci vitae, porta cursus magna. Donec felis velit, venenatis sit amet volutpat nec, tristique eu massa. Cras tincidunt interdum lor

In [57]:
class Appearance:
    """
    Represents the appearance of a term in a given document, along with the
    frequency of appearances in the same one.
    """
    def __init__(self, docId, frequency):
        self.docId = docId
        self.frequency = frequency
        
    def __repr__(self):
        """
        String representation of the Appearance object
        """
        return str(self.__dict__)

In [100]:
class Database:
    """
    In memory database representing the already indexed documents.
    """
    def __init__(self):
        self.db = dict()
        
    def __repr__(self):
        """
        String representation of the Database object
        """
        return str(self.__dict__)
    
    def get(self, id):
        return self.db.get(id, None)
    
    def add(self, document):
        """
        Adds a document to the DB.
        """
        return self.db.update({document['id']: document})
    
    def remove(self, document):
        """
        Removes document from DB.
        """
        return self.db.pop(document['id'], None)
    
    def clear(self):
        self.db.clear()
        return None

In [101]:
class InvertedIndex:
    """
    Inverted Index class.
    """
    def __init__(self, db):
        self.index = dict()
        self.db = db
        
    def __repr__(self):
        """
        String representation of the Database object
        """
        return str(self.index)
        
    def index_document(self, document):
        """
        Process a given document, save it to the DB and update the index.
        """
        
        # Remove punctuation from the text.
        clean_text = re.sub(r'[^\w\s]','', document['text'])
        clean_text = clean_text.lower()
        terms = clean_text.split(' ')
        
        appearances_dict = dict()
        # Dictionary with each term and the frequency it appears in the text.
        for term in terms:
            term_frequency = appearances_dict[term].frequency if term in appearances_dict else 0
            appearances_dict[term] = Appearance(document['id'], term_frequency + 1)
            
        # Update the inverted index
        update_dict = { key: [appearance]
                       if key not in self.index
                       else self.index[key] + [appearance]
                       for (key, appearance) in appearances_dict.items() }
        self.index.update(update_dict)
        # Add the document into the database
        self.db.add(document)
        return document
    
    def clear_index(self, document):
#         print(type(self.db.db['1']))
        self.db.remove(document)
        
    
    def lookup_query(self, query):
        """
        Returns the dictionary of terms with their correspondent Appearances. 
        This is a very naive search since it will just split the terms and show
        the documents where they appear.
        """
        query = query.lower()
        if query in self.index:
            print('aye')
#             print(self.index[query])
            return(self.index[query])
        else:
            print('naah')
            return []
#         return { term: self.index[term] for term in query.split(' ') if term in self.index }

In [102]:
db = Database()
index = InvertedIndex(db)
for idx,doc in enumerate(input_):
    dict_ = {
        'id': str(idx+1),
        'text': doc
    }
    index.index_document(dict_)

In [41]:
res_ = index.lookup_query('Maecenas')

aye


In [67]:
res = sorted(res_, key=lambda k: k.frequency, reverse=True)

for doc in res:
    print(db.get(doc.docId)['text'])
    print()

Sed volutpat sapien vitae mauris posuere, non scelerisque massa maximus. Phasellus nec eros neque. Nam blandit velit a mi semper, id blandit arcu consectetur. Donec sit amet est nec eros aliquam vehicula at sed est. Ut iaculis iaculis lectus in ullamcorper. Quisque enim sapien, condimentum semper sodales eu, malesuada id erat. Donec rhoncus molestie leo, at blandit neque posuere vitae. Curabitur varius libero vitae sem consectetur interdum. Vivamus suscipit erat ac vestibulum vestibulum. Maecenas Maecenas Maecenas Maecenas urna nisi, semper at ornare eget, efficitur pharetra tortor.

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce tristique tellus dolor, porta dapibus lacus feugiat eu. Mauris non sem neque. Curabitur et neque diam. Sed elementum vitae lacus sit amet sodales. Quisque metus dui, sodales ac mollis quis, facilisis quis sapien. Curabitur sed placerat lorem. Pellentesque scelerisque ultricies elementum. Maecenas arcu massa, fringilla a odio non, ullamcorper al

In [37]:
def highlight_term(id, term, text):
    replaced_text = text.replace(term, "\033[1;32;40m {term} \033[0;0m".format(term=term))
    return "--- document {id}: {replaced}".format(id=id, replaced=replaced_text)

def main():
    db = Database()
    index = InvertedIndex(db)
    document1 = {
        'id': '1',
        'text': 'The big sharks of Belgium drink beer.'
    }
    document2 = {
        'id': '2',
        'text': 'Belgium has great beer. They drink beer all the time.'
    }
    index.index_document(document1)
    index.index_document(document2)
    
    
    search_term = input("Enter term(s) to search: ")
    result = index.lookup_query(search_term)
    
    for term in result.keys():
        for appearance in result[term]:
            # Belgium: { docId: 1, frequency: 1}
            document = db.get(appearance.docId)
            print(highlight_term(appearance.docId, term, document['text']))
        print("-----------------------------")

In [21]:
main()

TypeError: '_io.TextIOWrapper' object is not callable

In [105]:
db.clear()

In [94]:
db.remove

In [106]:
db

{'db': {}}