In [None]:
!pip install nltk

In [None]:
import json
import nltk
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer

In [None]:
class docItem:
    '''Plain record describing a single touristic site.

    Attributes:
        key: identifier of the site.
        name: name of the site.
        address: address of the site.
        review: review text of the site (taken from ``review_text``).
    '''
    def __init__(self, key, name, address, review_text):
        # Store the identifying fields together, then the review text.
        self.key, self.name, self.address = key, name, address
        self.review = review_text

In [None]:
class reverseIndex:
    '''Inverted index: maps each normalized token to the keys of the documents containing it.

    Text is normalized the same way when indexing and when searching:
    tokenize, lowercase, drop stopwords, then (optionally) stem.

    Args:
        tokenizer: callable taking a string and returning an iterable of tokens.
        stemmer: optional object exposing ``stem(word)``; skipped when falsy.
        stopwords: optional iterable of lowercase words to ignore.

    Attributes:
        index: ``defaultdict(list)`` of token -> list of document keys
            (no duplicates, in the order the documents were added).
        documents: dict of document key -> document object given to ``add``.
    '''
    def __init__(self, tokenizer, stemmer=None, stopwords=None):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.index = defaultdict(list)
        self.documents = {}
        self.stopwords = set(stopwords) if stopwords else set()

    def _normalize(self, text):
        '''Yield the normalized tokens of ``text`` (lowercased, stopwords removed, stemmed).'''
        for token in self.tokenizer(text):
            token = token.lower()
            # Stopwords are matched on the lowercased, un-stemmed form.
            if token in self.stopwords:
                continue
            if self.stemmer:
                token = self.stemmer.stem(token)
            yield token

    def search(self, query):
        '''Return the documents that contain at least one term of ``query``.

        Each document appears once. Results are ordered by the first query term
        that matched them, then by the order in which documents were indexed.
        Terms that are not in the index simply match nothing; an empty or
        all-stopword query returns an empty list.
        '''
        # dict.fromkeys de-duplicates while keeping first-seen order.
        query_terms = dict.fromkeys(self._normalize(query))

        matched_keys = {}
        for term in query_terms:
            # .get with a default: an unknown term must not raise, and must not
            # create an empty posting list in the defaultdict as a side effect.
            for doc_key in self.index.get(term, ()):
                matched_keys.setdefault(doc_key, None)

        return [self.documents[doc_key] for doc_key in matched_keys]

    def add(self, document):
        '''Index one document.

        ``document`` must expose ``key`` (hashable identifier) and ``review``
        (the text to index). The document is stored under its key and the key is
        appended to the posting list of every distinct normalized token of the text.
        '''
        doc_key = document.key
        doc_text = document.review

        # Register the document once, up front, instead of once per token.
        self.documents[doc_key] = document

        # Visit each distinct token once; repeated words in the same text would
        # only trigger redundant membership scans of the posting list.
        for token in dict.fromkeys(self._normalize(doc_text)):
            postings = self.index[token]
            # Still guard against the same document being added twice.
            if doc_key not in postings:
                postings.append(doc_key)


In [None]:
class jsonParser:
    '''Parse the review documents of a JSON file into docItems, one at a time.

    The file is expected to hold a JSON array of place records. Each record is
    read through the keys ``place_id``, ``name``, ``formatted_address`` and,
    optionally, ``reviews`` (a list of dicts with ``text`` and optional
    ``language``).

    Typical use: call ``parse()`` once, then loop on ``has_more_item()`` /
    ``get_next_item()``.
    '''
    def __init__(self, filename):
        self.filename = filename
        self.doc_list = []   # raw records loaded by parse()
        self.doc_idx = 0     # position of the next record to hand out

    def parse(self):
        '''Load the JSON file into memory and restart iteration from the first record.'''
        with open(self.filename, 'rb') as f:
            self.doc_list = json.load(f)
        # Rewind, so that calling parse() again does not leave a stale cursor.
        self.doc_idx = 0

    def has_more_item(self):
        '''Return True while there are records not yet returned by get_next_item().'''
        return self.doc_idx < len(self.doc_list)

    def get_next_item(self):
        '''Return the next record as a docItem, or None (after printing an error)
        when nothing was parsed or every record has already been consumed.

        The review text is the concatenation of all reviews that are either
        untagged or tagged as English, each followed by a single space.
        '''
        if len(self.doc_list) == 0:
            print("Error: need to parse before getitem")
            return None
        if not self.has_more_item():
            print("Error: index has exceeded maximum item numbers")
            return None

        item_raw = self.doc_list[self.doc_idx]
        item_key = item_raw['place_id']
        item_name = item_raw['name']
        item_address = item_raw['formatted_address']

        text_parts = []
        # A place without any review has no 'reviews' entry (or a null one);
        # treat it as an empty list rather than aborting the whole run.
        for item_rev in item_raw.get('reviews') or []:
            language = item_rev.get("language")
            # Reviews with no language tag are kept; tagged ones must be English.
            if language is not None and language.lower() != "en":
                continue
            text_parts.append(item_rev["text"] + " ")
            # I guess we could add some filter here for the "time" attribute
        item_text = "".join(text_parts)

        self.doc_idx += 1
        return docItem(item_key, item_name, item_address, item_text)
    

In [None]:
# Test Step 1: build the reverse indexing object.

# NLTK raises LookupError when a resource has not been downloaded yet; catch
# only that, so unrelated failures are not silently retried.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# The tokenizer models are checked independently of the stopword corpus:
# having one installed says nothing about the other.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed version.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

revindex = reverseIndex(word_tokenize,
    EnglishStemmer(),
    stop_words)

In [None]:
# Test Step 2: create a JSON parser on top of one JSON file and load its content.
json_parser = jsonParser(filename="PlacesResults.json")
json_parser.parse()

In [None]:
# Test Step 3: create the reverse index for the JSON file.
# get_next_item() returns None when it could not produce an item, so those
# results are skipped. Identity comparison is the reliable way to test for None.
while json_parser.has_more_item():
    doc_item = json_parser.get_next_item()
    if doc_item is not None:
        revindex.add(doc_item)

In [None]:
# Test Step 4: run a sample query and show at most `max_shown` matches.
query_str = "japanese food delicious"
query_doc_list = revindex.search(query_str)
max_shown = 10
# Slicing caps the output without a hand-maintained counter, and correctly
# shows nothing when max_shown is 0.
for query_doc_item in query_doc_list[:max_shown]:
    print("=" * 40)
    print("name: " + query_doc_item.name)
    print("address: " + query_doc_item.address)
    print("review: " + query_doc_item.review)
    print("")
