In [4]:
# Exploring word embedding vectors using Word2Vec 
  
# import necessary modules 
import json
import nltk
import math
import warnings
import gensim
import logging

from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import EnglishStemmer
from gensim.models import Word2Vec 
warnings.filterwarnings(action = 'ignore') 
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
#  Define Classes
class DocumentItem:
    """This class defines the structure of one touristic site.

    Attributes:
        key: identifier of the site.
        name: display name of the site.
        address: address of the site.
        review: review text attached to the site.
        doc_len: document length; starts at 1 until set_doclen() is called.
    """

    def __init__(self, key, name, address, review_text):
        """Store the site's fields and give doc_len its initial value of 1."""
        self.key = key
        self.name = name
        self.address = address
        self.review = review_text
        # Placeholder length; callers overwrite it through set_doclen().
        self.doc_len = 1

    def set_doclen(self, doc_len):
        """Set the document length.

        Note this length is counted in words, with stop words removed; the
        caller is responsible for computing it that way.
        """
        self.doc_len = doc_len

class DocumentCollection:
    """This class defines the structure that serves for all documents.

    Attributes:
        doc_num: number of documents added so far.
        avg_dl: running average of the lengths passed to add_doc().
    """

    def __init__(self):
        """Start with an empty collection: no documents, average length 0."""
        self.doc_num = 0
        self.avg_dl = 0

    def add_doc(self, doc_len):
        """Register one document length.

        Increments the document count and updates the running average
        length accordingly.
        """
        self.doc_num += 1
        # doc_num is at least 1 here, so the division is always safe.
        # Incremental mean: scale the old average back to a total, add the
        # new length, and divide by the new count.
        self.avg_dl = (self.avg_dl * (self.doc_num - 1) + doc_len) / self.doc_num

In [None]:
# Placeholder for an input data file path. It is left empty and is not
# referenced by the cells visible here (the parser below is given a literal
# file name instead).
data_file = ""

In [8]:
#parsing JSON File

class JsonParser:
    """Parse the review documents into DocumentItems.

    Usage: call parse() once to load the JSON file, then loop with
    has_more_item() / get_next_item() to walk the records in order.

    Each record is expected to carry the keys 'place_id', 'name' and
    'formatted_address'; 'reviews' is treated as optional.
    """

    def __init__(self, filename):
        """Remember the file to read; nothing is loaded until parse()."""
        self.filename = filename
        self.doc_list = []
        self.doc_idx = 0

    def parse(self):
        """Load the whole JSON document from self.filename into doc_list."""
        with open(self.filename, 'rb') as f:
            self.doc_list = json.load(f)
        # A freshly loaded list must be read from its first record, even if
        # a previous load had already been partially consumed.
        self.doc_idx = 0

    def has_more_item(self):
        """Return True while get_next_item() still has a record to return."""
        return self.doc_idx < len(self.doc_list)

    def get_next_item(self):
        """Return the next record as a DocumentItem and advance the cursor.

        The review text is the concatenation of every English review of the
        record, each followed by a single space. A review counts as English
        when it has no "language" field or when that field equals "en"
        (case-insensitive). Records without a 'reviews' entry yield an empty
        review text, and reviews without a "text" field are skipped.

        Returns:
            A DocumentItem, or None (after printing an error message) when
            nothing has been parsed yet or every record was already returned.

        Raises:
            KeyError: if the record lacks 'place_id', 'name' or
                'formatted_address'. The cursor is not advanced in that case.
        """
        if len(self.doc_list) == 0:
            print("Error: need to parse before getitem")
            return
        if self.doc_idx >= len(self.doc_list):
            print("Error: index has exceeded maximum item numbers")
            return

        item_raw = self.doc_list[self.doc_idx]
        item_key = item_raw['place_id']
        item_name = item_raw['name']
        item_address = item_raw['formatted_address']

        review_texts = []
        for item_rev in item_raw.get('reviews', []):
            language = item_rev.get("language")
            if language is not None and language.lower() != "en":
                continue
            text = item_rev.get("text")
            if text is None:
                continue
            review_texts.append(text)
            # TODO: a filter on the review's "time" attribute could be added here.

        # Every review is followed by one space, including the last one.
        item_text = "".join(text + " " for text in review_texts)

        self.doc_idx += 1
        return DocumentItem(item_key, item_name, item_address, item_text)

''' Test Step 2, create json parser on top of one json file'''
# "PlacesResults.json" is a relative path, so it is resolved against the
# kernel's current working directory.
json_parser = JsonParser("PlacesResults.json")
# Reads the entire file into json_parser.doc_list; records are then pulled
# one at a time with has_more_item() / get_next_item().
json_parser.parse()

In [None]:
# Build the Word2Vec training corpus from the parsed reviews.
# `data` is a list of sentences, each sentence a list of lower-cased tokens.

# Rewind the parser so that re-running this cell rebuilds the same corpus
# instead of finding the parser already exhausted.
json_parser.doc_idx = 0

data = []
while json_parser.has_more_item():
    doc_item = json_parser.get_next_item()

    # Replaces escape character with space
    review_text = doc_item.review.replace("\n", " ")

    # iterate through each sentence of the review text
    for sentence in sent_tokenize(review_text):
        # tokenize the sentence into words
        data.append([token.lower() for token in word_tokenize(sentence)])

if not data:
    # Fail with a clear message rather than an error from inside gensim.
    raise ValueError("No review sentences found: the training corpus is empty")

# gensim 4.0 renamed the `size` keyword to `vector_size`; use whichever name
# the installed version understands.
dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

# Create CBOW model
model1 = gensim.models.Word2Vec(data, min_count=1, window=5, **{dim_kwarg: 100})

# Disabled example, kept for reference. The query words must exist in the
# trained vocabulary, so replace them before re-enabling.
#
# # Print results
# print("Cosine similarity between 'alice' " +
#                "and 'wonderland' - CBOW : ",
#     model1.wv.similarity('alice', 'wonderland'))
#
# print("Cosine similarity between 'alice' " +
#                  "and 'machines' - CBOW : ",
#       model1.wv.similarity('alice', 'machines'))
#
# # Create Skip Gram model
# model2 = gensim.models.Word2Vec(data, min_count=1, window=5, sg=1,
#                                 **{dim_kwarg: 100})
#
# # Print results
# print("Cosine similarity between 'alice' " +
#           "and 'wonderland' - Skip Gram : ",
#     model2.wv.similarity('alice', 'wonderland'))
#
# print("Cosine similarity between 'alice' " +
#             "and 'machines' - Skip Gram : ",
#       model2.wv.similarity('alice', 'machines'))
