In [489]:
import random
## kMedoids
def assign_points_to_clusters(medoids, distances):
    distances_to_medoids = distances[:,medoids]
    clusters = medoids[np.argmin(distances_to_medoids, axis=1)]     # horizontal, return minimum value's index
    clusters[medoids] = medoids
    return clusters

def compute_new_medoid(cluster, distances):
    mask = np.ones(distances.shape)
    mask[np.ix_(cluster,cluster)] = 0.
    cluster_distances = np.ma.masked_array(data=distances, mask=mask, fill_value=10e9)
    costs = cluster_distances.sum(axis=1)
    return costs.argmin(axis=0, fill_value=10e9)
def cluster(distances, k):
    m = distances.shape[0] # number of points

    # Pick k random medoids.
    curr_medoids = np.array([-1]*k)
    while not len(np.unique(curr_medoids)) == k:
        curr_medoids = np.array([random.randint(0, m - 1) for _ in range(k)])
    old_medoids = np.array([-1]*k) # Doesn't matter what we initialize these to.
    new_medoids = np.array([-1]*k)
   
    # Until the medoids stop updating, do the following:
    while not ((old_medoids == curr_medoids).all()):
        # Assign each point to cluster with closest medoid.
        clusters = []
        clusters = assign_points_to_clusters(curr_medoids, distances)
        # Update cluster medoids to be lowest cost point. 
        for curr_medoid in curr_medoids:
            cluster = np.where(clusters == curr_medoid)[0]
            new_medoids[curr_medoids == curr_medoid] = compute_new_medoid(cluster, distances)
        old_medoids[:] = curr_medoids[:]
        curr_medoids[:] = new_medoids[:]
    return curr_medoids

In [523]:
from sklearn.cluster import DBSCAN
from collections import Counter
import numpy as np

def get_predict(businessid):
    test=res_review["text"][res_review.business_id==businessid]
    l=len(test)
    if l<60:
        minspl=3
        eps=0.25
    elif l<180:
        minspl=5
        eps=0.25
    elif l<500:
        minspl=8
        eps=0.25
    elif l<1000:
        minspl=10
        eps=0.25
    elif l<2000:
        minspl=12
        eps=0.25
    else:
        minspl=15
        eps=0.25
    return prediction(test,eps,minspl)

def prediction(test,eps,minspl):
    ##get data 
    p1=[]
    for s in test:
        p1.append(generate_candidate_phrases(s,stop))
    words=sum(p1,[])        # transfering phrases from 2D array to 1D array
    ##remove uncommon words
    candidates=[]
    for a in words:
        if a in model.wv:
            candidates.append(a)

    ##calculate similarity
    sim=[[0]*len(candidates) for _ in range(len(candidates))]    # create 0-based 2D array
    for i in range(len(candidates)):
        for j in range(i,len(candidates)):
            d=model.wv.similarity(candidates[i],candidates[j]) 
            d = (1 if d > 1 else d)
            d = (abs(d) if d<0 else d)
            sim[i][j] = d
            sim[j][i] = d
    sim=np.array(sim)
    dis = sim.copy()
    dis = 1. - dis
    
    ##DBSCAN
    db = DBSCAN(metric="precomputed",algorithm="brute",eps=eps, min_samples=minspl).fit(dis)
    db_labels = db.labels_
    db_core=db.core_sample_indices_
    db_n_clusters_ = len(set(db_labels)) - (1 if -1 in db_labels else 0)    # set() to merge repeated elements
    
    db_result = get_result(db_n_clusters_, db_core, db_labels, candidates)
    
    ## kMedoids
    Medoids_result = []
    Medoids_points = cluster(dis, db_n_clusters_)
    #[print(candidates[pts])for pts in Medoids_points]
    [Medoids_result.append(candidates[pts]) for pts in Medoids_points]
    
    ##AffinityPropagation
    Aff = AffinityPropagation(affinity='precomputed').fit(dis)
    Aff_core = Aff.cluster_centers_indices_
    Aff_n_clusters = len(set(Aff.labels_))
    Aff_labels = Aff.labels_
  
    Aff_result = get_result(Aff_n_clusters, Aff_core, Aff_labels, candidates)
    
    
    return db_result, Medoids_result, Aff_result,words

    
def get_result(n_clusters_, core, labels, candidates):
    group=[[]*n_clusters_ for _ in range(n_clusters_)]   #create empty 2D array, (size : n_clusters_ * n_clusters_)
    for i in core:
        j=int(labels[i])
        group[j].append(candidates[i])         # getting each cluster's phrases
        #print("core_num: ", i, " ; candidates: ", candidates[i])
    result=[]
    for i in range(len(group)):
        re=group[i]
        if len(set(re))==1:         # set to merge same phrases of one row
            result.append(re[0])
        else:
            cnt = Counter(re)    # get each frequency and return most common one
            a=cnt.most_common(1)
            result.append(a[0][0])
    return result

In [539]:
def sim_eva(result):
    total_sim = 0
    itr_count = 1
    for idx1 in range(len(result)):
        for idx2 in range(idx1+1, len(result)):
            #print(model.wv.similarity(result[idx1],result[idx2]))
            sim = model.wv.similarity(result[idx1],result[idx2])
            sim = (0 if sim<0 else sim)
            itr_count +=1
            total_sim += sim
    #print('average similarity: ', total_sim/itr_count)
    return total_sim/itr_count

In [540]:
def freq_eva(result, words):
    if len(result)==0:   #empty result
        return 0
    total_freq = 0
    words_dic = Counter(words)
    for p in result:
        total_freq += words_dic[p]
    #print("frequency of occurance per phrase: ", total_freq/len(result))
    return total_freq/len(result)

In [546]:
def entire_eva(business_id):
    sim_DB=0
    sim_Medoid=0
    sim_Aff=0
    freq_DB=0
    freq_Meoid=0
    freq_Aff=0
    count = 0
    for ID_Idx in business_id.index[5:10]:    # test five business
        business_ID = business_id[ID_Idx]
        result_DB, result_Medoid, result_Aff, words = get_predict(business_ID)
        
        print("result_DB: ", result_DB)
        print("result_Medoid: ", result_Medoid)
        print("result_Aff: ", result_Aff)
        
        sim_DB += sim_eva(result_DB)
        sim_Medoid += sim_eva(result_Medoid)
        sim_Aff += sim_eva(result_Aff)
        
        freq_DB += freq_eva(result_DB, words)
        freq_Meoid += freq_eva(result_Medoid, words)
        freq_Aff += freq_eva(result_Aff, words)
        count += 1
    return sim_DB/count, freq_DB/count, sim_Medoid/count, freq_Meoid/count, sim_Aff/count, freq_Aff/count

In [548]:
print('dbscan average sim: ',DB_sim_eva)
print('dbscan average occurance frequency: ',DB_freq_eva)
print('kmedoid average sim: ',Medoid_sim_eva)
print('kmedoid average occurance frequency: ',Medoid_freq_eva)
print('AffinityPropagation average sim: ',Aff_sim_eva)
print('AffinityPropagation average occurance frequency: ',Aff_freq_eva)

dbscan average sim:  0.22009123048817666
dbscan average occurance frequency:  4.1066666666666665
kmedoid average sim:  0.07086704601624425
kmedoid average occurance frequency:  2.0466666666666664
AffinityPropagation average sim:  0.03758353989880901
AffinityPropagation average occurance frequency:  1.1733333333333333


In [547]:
%time DB_sim_eva, DB_freq_eva, Medoid_sim_eva, Medoid_freq_eva, Aff_sim_eva, Aff_freq_eva = entire_eva(business_id)

  """


result_DB:  ['veggie_gyro', 'pittsburgh_airport', 'tzatziki_sauce', 'lemon_chicken_soup', 'breakfast_sandwich']
result_Medoid:  ['specialty_sandwiches', 'food_sensitivity', 'quality_meal', 'worst_things', 'veggie_gyro']
result_Aff:  ['awesome_service', 'reduced_price', 'particular_reason', 'minute_wait', 'menu_item', 'didnt_need', 'dont_feel', 'youre_going', 'great_service', 'timely_manner']


  """


result_DB:  ['mexican_food', 'mama_marias', 'best_mexican_restaurant', 'enchiladas_verdes']
result_Medoid:  ['absolutely_appalled', 'legit_mexican_food', 'chunky_salsa', 'good_service']
result_Aff:  ['pollo_mole', 'favorite_dishes', 'good_restaurants', 'massive_meal', 'good_service', 'dont_care', 'peanut_butter_cookie', 'dont_judge', 'grilled_chicken', 'chicken_combo', 'butter_garlic_sauce', 'expensive_places', 'menu_items', 'good_price', 'great_deal', 'dozen_items', 'good_stuff', 'wait_staff', 'free_refill', 'white_meat', 'days_later', 'absolutely_delicious', 'good_things', 'family_restaurant', 'closing_time', 'saturday_night', 'service_wasnt', 'little_slow']


  """


result_DB:  ['coconut_whipped_cream', 'acai_bowl', 'beet_juice']
result_Medoid:  ['obviously_dont_care', 'grain_bowl', 'acai_bowl']
result_Aff:  ['good_experience', 'dont_want', 'havent_seen', 'gluten_free', 'place_popular', 'dont_know', 'recently_opened', 'super_friendly', 'people_serving', 'soooooooo_good', 'quick_bite', 'menu_items', 'hard_pressed', 'youre_going', 'dont_think']


  """


result_DB:  ['vegan_options', 'best_donut', 'dont_know', 'food_took', 'saturday_morning', 'brunch_menu']
result_Medoid:  ['forbidden_fruit', 'brunch_spots', 'vegan_friendly', 'table_minutes_later', 'friends_plate', 'smaller_group']
result_Aff:  ['vegan_options', 'high_quality', 'gluten_free', 'place_feels', 'dance_floor', 'super_nice', 'probably_worth', 'probably_wont', 'cold_water', 'didnt_bother', 'great_service', 'normal_breakfast', 'suggest_getting', 'fantastic_time']
result_DB:  []
result_Medoid:  []
result_Aff:  ['tried_different_appetizers', 'lightly_breaded', 'place_started', 'large_entree', 'honey_mustard', 'taco_place', 'food_quality', 'absolutely_delicious', 'totally_missed']
CPU times: user 14.6 s, sys: 167 ms, total: 14.7 s
Wall time: 14.9 s


  """
