In [2]:
import pickle
#from utils import * 
import pandas as pd
import numpy as np
import json
import nltk
#nltk.download()

#import seaborn as sns
#import matplotlib.pyplot as plt

In [4]:
def json_df(datapass):
    '''
    Load the json file and parse the file to pandas dataframe format
    
    Input:
        datapass(str) : directory to the json file
    Output:
        df(dataframe) : pandas dataframe object
    '''
    
    data = [] 
    with open(datapass, 'r', encoding='utf-8') as data_file: 
        for f in data_file:
            data.append(json.loads(f))
    df = pd.DataFrame(data)
    return df

#business and review json files --> dataframes

DATAPASS1 = 'C:\\Users\\cdchang\\Downloads\\yelp_dataset\\dataset\\business.json'
DATAPASS2 = 'C:\\Users\\cdchang\\Downloads\\yelp_dataset\\dataset\\review.json'

#business = json_df(DATAPASS1)
reviews = json_df(DATAPASS2)

In [5]:
def display_topics(model, feature_names, n_top_words):
    '''
    display topic with n_top_words in a decsending order of weight
    
    Input:
        model : directory for vectorizer
        feature_names : list of vocabulary 
        n_top_words(int) : number of words to display for each topic

    '''
    for topic_index, topic in enumerate(model.components_):
        print("Topic %d:" % topic_index)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words:-1]]))

def load_topic_model(vectorizer_file_name, topic_model_file_name):
    with open(vectorizer_file_name, "rb") as f:
        vectorizer = pickle.load(f)
    with open(topic_model_file_name, "rb") as f:
        topic_model = pickle.load(f)
    return vectorizer, topic_model

#pizza

path1 = 'C:\\Users\\cdchang\\Desktop\\yelp-topic-model\\model\\pizza_tfidf_vectorizer.pkl'
path2 = 'C:\\Users\\cdchang\\Desktop\\yelp-topic-model\\model\\pizza_nmf.pkl'
path3 = 'C:\\Users\\cdchang\\Desktop\\yelp-topic-model\\model\\pizza_count_vectorizer.pkl'
path4 = 'C:\\Users\\cdchang\\Desktop\\yelp-topic-model\\model\\pizza_lda.pkl'
vec, tm = load_topic_model(path1, path2)
vec2, tm2 = load_topic_model(path3, path4)

In [6]:
tm.components_

array([[  5.35318219e-02,   0.00000000e+00,   4.09802948e-02, ...,
          7.96897006e-02,   3.98538679e-02,   0.00000000e+00],
       [  7.86209423e-02,   1.88202333e-01,   3.27705625e-02, ...,
          2.97546372e-02,   9.73259392e-02,   0.00000000e+00],
       [  3.54345514e-01,   2.62719244e+00,   1.12437870e-01, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  4.13455229e-02,   0.00000000e+00,   4.27813136e-04, ...,
          0.00000000e+00,   6.59642074e-02,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          3.22001617e-02,   1.51870707e-01,   0.00000000e+00],
       [  1.37329447e-02,   0.00000000e+00,   7.26685010e-02, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [7]:
#k index embedding methods

def get_k_index(sentence, vec, tm):
    '''
    Given a sentence (not the entire review), generate a distribution of topic 
    by adding the weight of topic per words.
    I initialy tried with the product of topic weights but ended up getting 0s.
    
    Also produce the probability vector by scaling the vector so that the sum will be 1.
    
    Input:
        sentence(str) : a sentence to be studied
        vec : vectorizer object 
        tm : topic model object 
    Output:
        k_index (list) : the vector of length k (k = number of topic). 
        prob (list) : k_index but scaled so that the sum of inputs becomes 1 
    '''
    analyzer = vec.build_analyzer()
    topic_words = vec.get_feature_names()
    tm_mat = tm.components_
    k_index = np.array([0.]*tm_mat.shape[0])
    for word in analyzer(sentence):
        if word in topic_words:
            k_index += tm_mat[:, topic_words.index(word)]
    if sum(k_index) == 0:
        prob = np.array([0.]*tm_mat.shape[0])
    else:
        prob = k_index/(sum(k_index))
    return k_index, prob


def get_avg_k_index_for_doc(doc, vec, tm):
    sent_text = nltk.sent_tokenize(doc)
    if (int(len(sent_text)) == 0):
        return []
    else:
        tm_mat = tm.components_
        k_index = np.array([0.]*tm_mat.shape[0])
        for s in sent_text:
            k, p = get_k_index(s, vec, tm)
            k_index += p
        return k_index/int(len(sent_text))

#use this 
def business_id_retrieval(cat, business):
    '''
    Input:
        cat(str) : category
        business(dataframe) : the business data
    Output:
        id_list(set) : business ids of a particular category
    '''
    id_list = set()
    id_list = []
    idx = 0
    for row in business.values:
        categories = row[3]
        if cat in categories:
            id_list.append(row[2])
            
    return id_list

#business_id_retrieval('Pizza', business)

#get_avg_k_index_for_doc("uYHaNptLzDLoV_JZ_MuzUA",vec, tm)

In [35]:
#ultimately: find subset of pizza / chinese restaurants 
#set-up methods

#for a single business_id
def examine_reviews(b_id, reviews):
    '''
    concatenate a particular business's review text with the corresponding dates for time series analysis
    
    Input: specific business id, review json file
    Output: concatenated review text & date columns for a particular business id
    '''
    subset = reviews.loc[reviews['business_id'] == b_id]
    date = subset.loc[:, 'date']
    text = subset.loc[:, 'text']
    return pd.concat([date, text], axis=1, join='inner')

#trying with single business id / not tested
def k_index_emb_single(b_id, reviews):
    
    '''
    input: business id, reviews json
    output: weight vectors for each topic, for every review from a particular business, concatenated (sorted by time)
    '''
    
    subset = examine_reviews(b_id, reviews)
    s = subset.sort_values(by="date") #sorting by timestamp
    text_only = s.loc[:, 'text']
    count= 0
    weight_vectors = []
    for t in text_only:
        if (count==0):
            weight_vectors.append(get_avg_k_index_for_doc(t,vec,tm))
        else:
            newcol = get_avg_k_index_for_doc(t,vec,tm)
            weight_vectors = np.vstack([weight_vectors, newcol])
        count = count+1    
    return weight_vectors

def examine_reviews2(cat, business, reviews):
    '''
    concatenate a category's review text with the corresponding dates for time series analysis
    Input: specific category, review json file
    Output: concatenated review text & date columns for a particular category (not business specific) 
    '''
    id_list = business_id_retrieval(cat, business)
    return reviews.loc[reviews['business_id'].isin(id_list)]

#sort by timestamp
subset = examine_reviews("HRFJlSAP_EBU_MpPPmpUDQ", reviews)
s = subset.sort_values(by="date") #sorting by timestamp

weight_vectors = k_index_emb_single('HRFJlSAP_EBU_MpPPmpUDQ', reviews)
topic_points = list()

for wv in weight_vectors:
    topic_points.append(weight_vectors[wv,0])
plt.plot(topic_points,s.loc[:,'date'], 'k-', lw=2)

IndexError: arrays used as indices must be of integer (or boolean) type

In [None]:
#find trend
import matplotlib as plt
def plot_per_topic(b_id, reviews, topic_num):
    weight_vectors = k_index_emb_single(b_id, reviews)
    text_only = s.loc[:, 'text']