In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import nltk

import matplotlib.pyplot as plt
%matplotlib inline

from importlib import reload

from ReviewFeatureExtractor import ExtractBoW

### Hyperparameters

In [2]:
# Vocabulary size handed to the bag-of-words extractor.
VOC_SIZE = 100000
# Only reviews dated strictly after this timestamp are kept when loading.
REVIEW_CUTOFF_DATE = pd.Timestamp(year=2015, month=12, day=31)

### All Words

In [3]:
# Shared extractor used by every aggregation cell below; its vocabulary size
# comes from the VOC_SIZE hyperparameter.
e = ExtractBoW(voc_size=VOC_SIZE)

# Load the word vectors from disk.
# NOTE(review): the file name suggests 300-dimensional wiki-news vectors — confirm.
e.load_word_vecs('data/wiki-news-300d-1M.vec')

### Need to iterate through each business and person

In [4]:
# Businesses and users are loaded as-is from their pickles.
df_bus = pd.read_pickle('results/rest_univ_filter.p')  # businesses
df_users = pd.read_pickle('results/tor_users.p')  # users

# Reviews: coerce the date column to datetime64[ns], then keep only the rows
# dated strictly after REVIEW_CUTOFF_DATE.
df_reviews = pd.read_pickle('results/tor_reviews.p')  # reviews
df_reviews['date'] = df_reviews['date'].astype('datetime64[ns]')
is_after_cutoff = df_reviews['date'] > REVIEW_CUTOFF_DATE
df_reviews = df_reviews[is_after_cutoff]

### What is the distribution of review dates?

In [None]:
# Histogram (40 bins) of the dates of the reviews kept after the cutoff filter.
review_dates = df_reviews['date'].values
fig, ax = plt.subplots()
ax.hist(review_dates, bins=40)
plt.show()

In [None]:
# Report (rows, columns) for businesses, users and reviews, in that order.
for frame in (df_bus, df_users, df_reviews):
    print(frame.shape)

In [None]:
# Display the review columns available to the aggregation cells below.
df_reviews.columns

### Businesses

In [None]:
# Per-business aggregates: review count, mean stars, vote totals and one
# embedding vector per word type (all / noun / adjective / verb constants of `e`).
res_dict = {}

# Index review row positions by business once, instead of re-scanning every
# review for every business inside the loop.
bus_review_positions = df_reviews.groupby('business_id', sort=False).indices
no_review_positions = np.empty(0, dtype=np.intp)
n_businesses = len(df_bus.index)

# tqdm wraps the index itself (which has a length) rather than the enumerate()
# generator, so the progress bar can display a total.
for i, bus_id in enumerate(tqdm(df_bus.index)):
    # Positional lookup keeps the rows in their original relative order, so the
    # selection matches what a boolean mask on 'business_id' would return.
    reviews_df = df_reviews.iloc[bus_review_positions.get(bus_id, no_review_positions)]
    tot_reviews = reviews_df.shape[0]

    avg_stars = reviews_df['stars'].mean()
    tot_cool = reviews_df['cool'].sum()
    tot_funny = reviews_df['funny'].sum()
    tot_useful = reviews_df['useful'].sum()

    texts = reviews_df['text'].values

    if tot_reviews > 0:
        vec_all = e.extract(texts)
        vec_n = e.extract(texts, word_type=e.WORD_TYPE_NOUN)
        vec_j = e.extract(texts, word_type=e.WORD_TYPE_ADJ)
        vec_v = e.extract(texts, word_type=e.WORD_TYPE_VERB)
    else:
        # No reviews: use four independent zero vectors of the embedding width.
        emb_dim = e.loaded_embeddings.shape[1]
        vec_all, vec_n, vec_j, vec_v = (np.zeros(emb_dim) for _ in range(4))

    res_dict[bus_id] = {
        'tot_reviews': tot_reviews,
        'avg_stars': avg_stars,
        'tot_cool': tot_cool,
        'tot_funny': tot_funny,
        'tot_useful': tot_useful,
        'vec_all': vec_all,
        'vec_n': vec_n,
        'vec_j': vec_j,
        'vec_v': vec_v,
    }

    if i % 500 == 0:
        print("processed %s/%s businesses" % (i, n_businesses))

# Transpose so that each business id becomes a row and each metric a column.
res_df = pd.DataFrame(res_dict).T

In [None]:
# Persist the current res_df (the per-business results when cells run in order).
res_df.to_pickle('results/bus_review_log_extracts.p')

### Users

In [None]:
# Display the review columns again before the per-user aggregation.
df_reviews.columns

In [None]:
# Per-user aggregates: review count, mean stars, vote totals and one
# embedding vector per word type (all / noun / adjective / verb constants of `e`).
res_dict = {}

# Index review row positions by user once, instead of re-scanning every
# review for every user inside the loop.
user_review_positions = df_reviews.groupby('user_id', sort=False).indices
no_review_positions = np.empty(0, dtype=np.intp)
n_users = len(df_users.index)

# tqdm wraps the index itself (which has a length) rather than the enumerate()
# generator, so the progress bar can display a total.
for i, u_id in enumerate(tqdm(df_users.index)):
    # Positional lookup keeps the rows in their original relative order, so the
    # selection matches what a boolean mask on 'user_id' would return.
    reviews_df = df_reviews.iloc[user_review_positions.get(u_id, no_review_positions)]
    tot_reviews = reviews_df.shape[0]

    avg_stars = reviews_df['stars'].mean()
    tot_cool = reviews_df['cool'].sum()
    tot_funny = reviews_df['funny'].sum()
    tot_useful = reviews_df['useful'].sum()

    texts = reviews_df['text'].values

    if tot_reviews > 0:
        vec_all = e.extract(texts)
        vec_n = e.extract(texts, word_type=e.WORD_TYPE_NOUN)
        vec_j = e.extract(texts, word_type=e.WORD_TYPE_ADJ)
        vec_v = e.extract(texts, word_type=e.WORD_TYPE_VERB)
    else:
        # No reviews: use four independent zero vectors of the embedding width.
        emb_dim = e.loaded_embeddings.shape[1]
        vec_all, vec_n, vec_j, vec_v = (np.zeros(emb_dim) for _ in range(4))

    res_dict[u_id] = {
        'tot_reviews': tot_reviews,
        'avg_stars': avg_stars,
        'tot_cool': tot_cool,
        'tot_funny': tot_funny,
        'tot_useful': tot_useful,
        'vec_all': vec_all,
        'vec_n': vec_n,
        'vec_j': vec_j,
        'vec_v': vec_v,
    }

    if i % 1000 == 0:
        print("processed %s/%s users" % (i, n_users))

# Transpose so that each user id becomes a row and each metric a column,
# then persist the per-user results.
res_df = pd.DataFrame(res_dict).T
res_df.to_pickle('results/user_review_log_extracts.p')

In [None]:
# Preview the first rows of res_df (the per-user results when cells run in order).
res_df.head()

In [None]:
# Ascending review counts, restricted to rows with at least one review.
has_reviews = res_df['tot_reviews'] > 0
tot_rev_distr = res_df.loc[has_reviews, 'tot_reviews'].sort_values().values

In [None]:
# Display the sorted review-count array.
tot_rev_distr

In [None]:
# Line plot of the sorted review counts (x = rank, y = number of reviews).
fig, ax = plt.subplots()
ax.plot(tot_rev_distr)
plt.show()

### On the review level, take cosine similarity

In [5]:
def cos_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Parameters
    ----------
    a, b : numpy.ndarray
        Vectors of equal length.

    Returns
    -------
    float
        ``a.dot(b) / (||a|| * ||b||)``, or ``np.nan`` when either vector has
        zero norm (the similarity is undefined there).
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the 0/0 case explicitly instead of relying on NumPy's
    # divide-by-zero RuntimeWarning to produce the NaN.
    if denom == 0:
        return np.nan
    return a.dot(b) / denom

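#### Loop through the reviews; for each one, compute the cosine similarity between the user's vectors (built from that user's reviews of other businesses) and the business's vectors (built from other users' reviews of that business)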

In [6]:
# Copy of df_reviews with a fresh 0..n-1 index (the previous index becomes a
# column), so the loop below can address rows with dfr_ri.loc[i].
dfr_ri = df_reviews.reset_index()

In [7]:
import logging

In [8]:
# Send DEBUG-and-above records to review_extract.log, each prefixed with a timestamp.
logging.basicConfig(
    filename='review_extract.log',
    format='%(asctime)s %(message)s',
    level=logging.DEBUG,
)

In [9]:
import pickle

In [None]:
# Load the checkpoint of {review_id: {cos_sim_*: value}} written by earlier runs.
# NOTE(review): pickle.load can execute arbitrary code; only load a checkpoint
# that this notebook itself wrote.
try:
    with open('results/cur_cos_sim.p', 'rb') as checkpoint_file:
        res_dict = pickle.load(checkpoint_file)
except FileNotFoundError:
    # First run: no checkpoint exists yet, so start from an empty result set.
    res_dict = {}
starting_i = len(res_dict)
save_every = 1000


def _extract_word_type_vecs(texts):
    """Return vectors for ``texts`` in the order: all words, WORD_TYPE_NOUN,
    WORD_TYPE_ADJ, WORD_TYPE_VERB."""
    return [
        e.extract(texts),
        e.extract(texts, word_type=e.WORD_TYPE_NOUN),
        e.extract(texts, word_type=e.WORD_TYPE_ADJ),
        e.extract(texts, word_type=e.WORD_TYPE_VERB),
    ]


def _save_checkpoint(results):
    """Write ``results`` to the checkpoint pickle and close the file."""
    with open('results/cur_cos_sim.p', 'wb') as checkpoint_file:
        pickle.dump(results, checkpoint_file)


n_rows = dfr_ri.shape[0]

# Index review row positions by user and by business once, so each iteration
# only touches the relevant rows instead of scanning the whole frame twice.
user_positions = df_reviews.groupby('user_id', sort=False).indices
bus_positions = df_reviews.groupby('business_id', sort=False).indices
no_positions = np.empty(0, dtype=np.intp)

# Resuming from starting_i is intentionally left disabled (as before): every
# row is recomputed and overwrites any value loaded from the checkpoint.
#for i in tqdm(range(starting_i, dfr_ri.shape[0])):
for i in tqdm(range(0, n_rows)):
    row = dfr_ri.loc[i]  # single row lookup instead of three
    u_id = row['user_id']
    bus_id = row['business_id']
    review_id = row['review_id']

    # user reviews ex business: this user's reviews of every other business
    user_rows = df_reviews.iloc[user_positions.get(u_id, no_positions)]
    reviews_df = user_rows[user_rows['business_id'] != bus_id]
    tot_reviews = reviews_df.shape[0]
    texts = reviews_df['text'].values
    cur_user_vecs = _extract_word_type_vecs(texts) if tot_reviews > 0 else []

    # business: this business's reviews written by every other user
    bus_rows = df_reviews.iloc[bus_positions.get(bus_id, no_positions)]
    b_reviews_df = bus_rows[bus_rows['user_id'] != u_id]
    b_tot_reviews = b_reviews_df.shape[0]
    b_texts = b_reviews_df['text'].values
    cur_bus_vecs = _extract_word_type_vecs(b_texts) if b_tot_reviews > 0 else []

    # The similarity is defined only when both sides have reviews and both
    # vectors are non-zero; otherwise record NaN. The short-circuit keeps the
    # vector lists from being indexed when they are empty.
    cur_cos_sim = {}
    for j in range(0, 4):
        if (tot_reviews > 0 and b_tot_reviews > 0
                and np.linalg.norm(cur_user_vecs[j]) > 0
                and np.linalg.norm(cur_bus_vecs[j]) > 0):
            cur_cos_sim[j] = cos_sim(cur_user_vecs[j], cur_bus_vecs[j])
        else:
            cur_cos_sim[j] = np.nan  # np.NaN alias was removed in NumPy 2.0

    res_dict[review_id] = {
        'cos_sim_all': cur_cos_sim[0],
        'cos_sim_n': cur_cos_sim[1],
        'cos_sim_j': cur_cos_sim[2],
        'cos_sim_v': cur_cos_sim[3],
    }

    if (i + 1) % save_every == 0:
        _save_checkpoint(res_dict)
        logging.info("processed row %s/%s, progress saved ..." % (i, n_rows))

# Persist the rows processed since the last periodic checkpoint; without this
# the tail of the run (fewer than save_every rows) would never reach disk.
_save_checkpoint(res_dict)
logging.info("processed all %s rows, final results saved" % n_rows)
    

HBox(children=(IntProgress(value=0, max=196768), HTML(value='')))

In [None]:
# Build a frame from the dict-of-dicts and transpose it, so that each review_id
# becomes a row and each cos_sim_* key a column; then preview the first rows.
df_final = pd.DataFrame(res_dict).T
df_final.head()

In [None]:
# Scratch arithmetic (evaluates to 128).
# NOTE(review): presumably a run-time estimate in minutes — confirm or delete.
2*60+8