In [1]:
# %pip installs into the environment of the running kernel, whereas !pip runs
# whichever pip executable happens to be first on PATH.
%pip install nltk



In [2]:
import json
import nltk
import math
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer

In [3]:
class DocumentItem:
    """This class defines the structure of one touristic site.

    The constructor arguments are stored as ``key``, ``name``, ``address``
    and ``review``. ``doc_len`` starts at 1 and is replaced by the real
    length once ``set_doclen`` is called.
    """

    def __init__(self, key, name, address, review_text):
        """Store the site's fields; ``review_text`` is kept as ``self.review``."""
        self.key = key
        self.name = name
        self.address = address
        self.review = review_text
        # Placeholder until set_doclen() stores the counted length.
        self.doc_len = 1

    def set_doclen(self, doc_len):
        """Set the document length.

        The length is counted in words, with stop words removed.
        """
        self.doc_len = doc_len

class DocumentCollection:
    """This class defines the structure that serves for all documents.

    It tracks ``doc_num`` (how many documents were added) and ``avg_dl``
    (the running average of their lengths).
    """

    def __init__(self):
        """Start with no documents and an average length of 0."""
        self.doc_num = 0
        self.avg_dl = 0

    def add_doc(self, doc_len):
        """Register one document length.

        Increments the document count and folds ``doc_len`` into the running
        average, so the individual lengths never need to be stored.
        """
        self.doc_num += 1
        # doc_num is at least 1 here, so the division is always safe.
        self.avg_dl = (self.avg_dl * (self.doc_num - 1) + doc_len)/self.doc_num


In [4]:
class InvertedIndex:
    """Inverted index data structure that also keeps document lengths up to date.

    ``index`` maps each normalised term to a ``{doc_key: term_frequency}``
    dict, ``documents`` maps ``doc_key`` to the indexed document object, and
    ``docCollection`` tracks the document count and average document length
    used by BM25.
    """

    def __init__(self, tokenizer, stemmer=None, stopwords=None):
        """Create an empty index.

        Args:
            tokenizer: Callable turning a string into an iterable of tokens.
            stemmer: Optional object with a ``stem(token)`` method.
            stopwords: Optional iterable of words to ignore; tokens are
                lower-cased before being compared against it.
        """
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.index = defaultdict(dict)
        self.documents = dict()
        self.docCollection = DocumentCollection()
        self.stopwords = set(stopwords) if stopwords else set()

    def _terms(self, text):
        """Yield the index terms of ``text``: lower-cased, stop words dropped, then stemmed."""
        for token in self.tokenizer(text):
            token = token.lower()
            if token in self.stopwords:
                continue
            yield self.stemmer.stem(token) if self.stemmer else token

    def bm25(self, query, document, k1=1.2, b=0.75):
        """Calculate the BM25 score of one query against a certain document.

        Args:
            query: Iterable of already-normalised query terms (tokenised,
                lower-cased and stemmed the same way as the index).
            document: Indexed document; its ``key`` and ``doc_len`` are read.
            k1: Term-frequency saturation parameter.
            b: Document-length normalisation parameter.

        Returns:
            The summed per-term score (0 when no query term occurs in the document).
        """
        doc_num = self.docCollection.doc_num
        # avg_dl is 0 only while every indexed document is empty; fall back to
        # 1 so the length normalisation does not divide by zero.
        avg_dl = self.docCollection.avg_dl or 1
        length_norm = k1 * (1 - b + b * document.doc_len/avg_dl)

        score = 0
        for word in query:
            # .get() instead of [] so that scoring an unknown term does not
            # insert an empty postings dict into the defaultdict.
            postings = self.index.get(word, {})
            nq = len(postings)
            idf = math.log2((doc_num - nq + 0.5)/(nq + 0.5))
            fqd = postings.get(document.key, 0)
            score += idf * fqd * (k1 + 1)/(fqd + length_norm)
        return score

    def search(self, query, maxshown=10):
        """Tokenize the query, collect every matching document and rank by BM25.

        Args:
            query: Raw query string.
            maxshown: Maximum number of results to return.

        Returns:
            A list of ``[document, bm25_score]`` pairs, best score first.
            Terms that are not in the index simply match nothing.
        """
        # First, normalise the query exactly like indexed text; duplicates collapse.
        query_word_set = set(self._terms(query))

        # Second, take the union of the postings of every unique term. A
        # missing term yields an empty mapping rather than None.
        doc_key_set = set()
        for word in query_word_set:
            doc_key_set.update(self.index.get(word, {}))

        # Retrieve the documents and score them.
        doc_list = [
            [self.documents[key], self.bm25(query_word_set, self.documents[key])]
            for key in doc_key_set
        ]

        # Sort by BM25 score, highest first; slicing copes with short lists.
        doc_list.sort(key=lambda pair: pair[1], reverse=True)
        return doc_list[:maxshown]

    def add(self, document):
        """Index one document item.

        Reads ``document.key`` and ``document.review``, updates the term
        frequencies, stores the document, records its length on the document
        itself (``set_doclen``) and in the collection statistics.

        Note: adding a document whose key is already indexed accumulates its
        term counts and increments the document count again.
        """
        doc_key = document.key
        doc_len = 0

        for token in self._terms(document.review):
            # Length counts every kept token (stop words already removed).
            doc_len += 1
            postings = self.index[token]
            postings[doc_key] = postings.get(doc_key, 0) + 1

        self.documents[doc_key] = document
        document.set_doclen(doc_len)
        self.docCollection.add_doc(doc_len)

In [5]:
"""parse the review documents into documentItems"""
class JsonParser:
    """Parse the review documents of a JSON file into DocumentItems.

    Call ``parse()`` once, then loop with ``has_more_item()`` /
    ``get_next_item()`` to receive one ``DocumentItem`` per place record.
    """

    def __init__(self, filename):
        """Remember the file to read; nothing is loaded until ``parse()``."""
        self.filename = filename
        self.doc_list = []
        self.doc_idx = 0

    def parse(self):
        """Load the whole JSON file into ``doc_list``."""
        with open(self.filename, 'rb') as f:
            self.doc_list = json.load(f)

    def has_more_item(self):
        """Return True while ``get_next_item()`` still has records to hand out."""
        return self.doc_idx < len(self.doc_list)

    @staticmethod
    def _english_review_text(reviews):
        """Concatenate review texts, each followed by a space, skipping non-English ones."""
        parts = []
        for item_rev in reviews:
            # Reviews without a "language" field are kept.
            if "language" in item_rev and item_rev["language"].lower() != "en":
                continue
            text = item_rev.get("text")
            if text is None:
                # A review without text has nothing to index.
                continue
            parts.append(text + " ")
            # I guess we could add some filter here for the "time" attribute
        return "".join(parts)

    def get_next_item(self):
        """Build a ``DocumentItem`` from the next record and advance the cursor.

        Returns:
            The next ``DocumentItem``, or ``None`` (after printing an error)
            when nothing has been parsed yet or all records were consumed.

        Raises:
            KeyError: if the record lacks ``place_id`` or ``name``.

        A record without ``reviews`` produces an item with empty review text,
        and a missing ``formatted_address`` becomes an empty string, so one
        sparse record does not abort the whole run.
        """
        if len(self.doc_list) == 0:
            print("Error: need to parse before getitem")
            return
        if self.doc_idx >= len(self.doc_list):
            print("Error: index has exceeded maximum item numbers")
            return

        item_raw = self.doc_list[self.doc_idx]
        item_key = item_raw['place_id']
        item_name = item_raw['name']
        item_address = item_raw.get('formatted_address', "")
        # "or []" also covers an explicit null in the JSON.
        item_text = self._english_review_text(item_raw.get('reviews') or [])

        self.doc_idx += 1
        return DocumentItem(item_key, item_name, item_address, item_text)

In [6]:
# Test step 1: build the reverse indexing object.
#
# Each NLTK resource is fetched only when it is actually missing. NLTK raises
# LookupError for a resource that is not installed; catching just that (rather
# than a bare except) lets unrelated failures surface.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# The tokenizer data is checked on its own: it can be absent even when the
# stop word corpus is already installed.
try:
    word_tokenize("tokenizer probe")
except LookupError:
    nltk.download('punkt')
    # Recent NLTK releases load the tokenizer from 'punkt_tab' instead.
    nltk.download('punkt_tab')

invindex = InvertedIndex(word_tokenize, EnglishStemmer(), stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
# Test step 2: create a JSON parser on top of one JSON file and load it.
json_parser = JsonParser("PlacesResults.json")
json_parser.parse()

In [12]:
# Test step 3: create the reverse index for the parsed JSON file.
while json_parser.has_more_item():
    doc_item = json_parser.get_next_item()
    # get_next_item() returns None after printing an error; None is a
    # singleton, so test identity rather than equality.
    if doc_item is not None:
        invindex.add(doc_item)

In [15]:
# Test step 4: run a sample query and print the ranked hits.
query_str = "child friendly restaurant"
max_shown = 8
query_doc_list = invindex.search(query_str, max_shown)

banner = "=" * 20
show_counter = 0
for query_item in query_doc_list:
    # Each hit is a [document, bm25_score] pair.
    query_doc_item, bm25_score = query_item
    report_lines = (
        f"{banner} Result {show_counter} {banner}",
        f"BM25 score: {bm25_score}",
        f"name: {query_doc_item.name}",
        f"address: {query_doc_item.address}",
        f"review: {query_doc_item.review}",
        "",  # blank separator line after every hit
    )
    print("\n".join(report_lines))
    show_counter += 1
 

BM25 score: 15.763133230965739
name: Lunchbox
address: 1612 Forest Ave, Staten Island, NY 10302, USA
review: This place is overrated and over priced!!! The sandwich big boy cheeseburger is misleading. When u go in a store & see a sandwich named big boy & the price....1 will assume this will be a nice size sandwich. NOT!!!! It's a small little cheeseburger suitable for a child! A box full of curly fries & 2 onion rings. .... FYI..I KNOW UR A TOP RATED RESTAURANT but this is my opinion. This place is AMAZING! Everyone who works there is so accommodating and friendly. The food is absolutely DELICIOUS! There's so much on this menu and I can't wait to go back to try something else. They also deliver across the entire island which isn't common got most restaurants. If you're thinking about trying it, just go! You will not regret it! (Get the tater tot tower!) LOVE having an outdoor lunch during gorgeous weather days with My Husband and Lunchbox is a perfect spot!  Their Fish and Chips is exc