In [1]:
from collections import defaultdict
from collections import Counter
import json
import math
import string
import time
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
from IPython.core.display import HTML
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
import re

In [2]:
with open("finalData.json", "r") as f:
    data = json.load(f)

In [3]:
def build_vectorizer():
    """Returns a TfidfVectorizer object with certain preprocessing properties.
    
    Params: {max_n_terms: Integer,
             max_prop_docs: Float,
             min_n_docs: Integer}
    Returns: TfidfVectorizer
    """
    return TfidfVectorizer(stop_words = 'english')

In [5]:
word_splitter = re.compile(r"""
    (\w+)
    """, re.VERBOSE)

def getwords(sent):
    return [w.lower() 
            for w in word_splitter.findall(sent)]

In [6]:
tfidf_vec = build_vectorizer()
stemmer=PorterStemmer()
reviews = []
for city, city_dic in data.items():
    for restaurant, restaurant_dic in city_dic.items():
        for review in restaurant_dic['reviews']:
            all_words = getwords(review['text'])
            stem_text = [stemmer.stem(t.lower()) for t in all_words]
            reviews.append(" ".join(stem_text))

In [7]:
print(len(reviews))

105802


In [8]:
tfidf_mat = tfidf_vec.fit_transform(reviews).toarray()

In [9]:
print(tfidf_mat[0])

[0. 0. 0. ... 0. 0. 0.]


In [10]:
print(tfidf_vec.get_feature_names())

['00', '000', '0000000', '00000000001', '00000000334229067', '000003', '00025', '0004489a91bbfbea77dd', '000th', '000x', '001', '005', '007', '00_', '00a', '00after', '00am', '00ea', '00ish', '00noon', '00p', '00per', '00pm', '00pn', '01', '0100', '0107', '01126373001', '0122', '013', '014', '015', '018', '01803', '01864', '01867', '01923', '01am', '01pm', '02', '02026', '02045', '02062', '02108', '02115', '02116', '02119', '02121', '02122', '02128', '02129', '02132', '02134', '02135', '02143', '02150', '02155', '02174', '02176', '02458', '02464', '02472', '02474', '02482', '0278', '02pm', '03', '034', '03479175', '036', '039', '03am', '03pm', '04', '041', '042', '043', '04a', '04pm', '04xg', '05', '051', '052', '0520', '053', '054', '055', '056', '057', '05am', '05p', '05pm', '06', '060219', '064ntwg', '067', '06_le_ic', '06am', '06may2018', '06pm', '07', '070219', '0707', '0716', '07am', '07jul', '07pm', '08', '0800', '0820', '0830', '0840', '0882', '08am', '08pm', '09', '0905', '091

In [11]:
print(len(tfidf_vec.get_feature_names()))

60940


In [12]:
def build_movie_sims_cos(num_movies, input_doc_mat):
    """Returns a matrix of size num_movies x num_movies where for (i,j), entry [i,j]
       should be the cosine similarity between the movie with index i and the movie with index j
        
    Note: All movies are trivially perfectly similar to themselves, so the diagonals of the output matrix should be 1.
    
    Params: {num_movies: Integer,
             input_doc_mat: Numpy Array,
             movie_index_to_name: Dict,
             movie_name_to_index: Dict,
             input_get_sim_method: Function}
    Returns: Numpy Array 
    """
    cos_sim = np.zeros((num_movies, num_movies))
    norms = np.linalg.norm(input_doc_mat, axis=1)
    for i in range(num_movies):
        for j in range(num_movies):
            cos_sim[i][j] = np.dot(input_doc_mat[i], input_doc_mat[j])/(norms[i] * norms[j])
    return cos_sim

In [None]:
sims_cos = build_movie_sims_cos(len(reviews), tfidf_mat)