In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec

In [2]:
df = pd.read_pickle("../../Datasets/pruned_data.pkl")

In [3]:
df_grouped = df.groupby("gPlusUserId").agg({"gPlusPlaceId": list, "unixReviewTime": list}).reset_index()

In [4]:
df_grouped["sorted_places_w_timestamp"] = df_grouped.apply(
    lambda x: sorted(list(zip(x["gPlusPlaceId"], x["unixReviewTime"])), 
                     key=lambda item: item[1]), 
    axis=1)
df_grouped["sorted_places"] = df_grouped.apply(lambda x: [i[0] for i in x["sorted_places_w_timestamp"]], axis=1)
df_grouped["size"] = df_grouped.apply(lambda x: len(x["sorted_places"]), axis=1) 
df_grouped = df_grouped[df_grouped["size"] >= 7]

In [5]:
words = df_grouped["sorted_places"].tolist()

In [6]:
del df_grouped

In [7]:
model_i2v = Word2Vec(words,
                     min_count=5,  # Words/items with fewer instances are discarded
                     vector_size=32,  # Model dimensionality
                     window=3,  # Window size
                     sg=1,  # Skip-gram model
                     negative=2,
                     sample=1e-4,
                     epochs=10)

In [8]:
df_exploded = df.explode("categories")
df_grouped2 = df_exploded.groupby("categories").agg({"gPlusPlaceId": list, "unixReviewTime": list}).reset_index()
df_grouped2["sorted_places_w_timestamp"] = df_grouped2.apply(
    lambda x: sorted(list(zip(x["gPlusPlaceId"], x["unixReviewTime"])), 
                     key=lambda item: item[1]), 
    axis=1)
df_grouped2["sorted_places"] = df_grouped2.apply(lambda x: [i[0] for i in x["sorted_places_w_timestamp"]], axis=1)
df_categories_places = df_grouped2[["categories", "sorted_places"]]

In [9]:
category_item_map = {key: list(val) for key, val in zip(
    df_categories_places["categories"].tolist(), df_categories_places["sorted_places"].tolist())}

In [10]:
category_vectors = {}
for key in category_item_map:
    res = np.array([0.] * 32)
    count = 0
    for item in category_item_map[key]:
        if item in model_i2v.wv:
            res += model_i2v.wv[item] / np.linalg.norm(model_i2v.wv[item])
            count += 1
    if count > 0:
        category_vectors[key] = res / count

In [11]:
cosine_distance = lambda x, y: np.sum(x * y)

In [12]:
def get_top_n_similar(item, n=10):
    item_vector = category_vectors[item]
    dist = []
    
    for item2 in category_vectors:
        if item == item2:
            continue
        item2_vector = category_vectors[item2]
    
        dist.append((item2, cosine_distance(item_vector, item2_vector)))
    return sorted(dist, key=lambda x: -x[1])[:n]

In [13]:
category_vectors.keys()

dict_keys(['ATM Location', 'Adult Entertainment Club', 'Adult Entertainment Store', 'Afghani Restaurant', 'African Restaurant', 'Alsace Restaurant', 'Alternative Fuel Station', 'American Restaurant', 'Amphitheater', 'Amusement Center', 'Amusement Park', 'Andalusian Restaurant', 'Antique Furniture Store', 'Antique Store', 'Apartment Building', 'Apartment Complex', 'Apartment Rental Agency', 'Appliance Repair Service', 'Appliance Store', 'Appliances', 'Aquarium', 'Architectural Salvage Store', 'Area', 'Arena', 'Argentinian Restaurant', 'Art Center', 'Art Gallery', 'Art Museum', 'Art Supply Store', 'Arts Organization', 'Asian Fusion Restaurant', 'Asian Grocery Store', 'Asian Restaurant', 'Asturian Restaurant', 'Attraction', 'Audio Visual Equipment...', 'Auditorium', 'Australian Restaurant', 'Austrian Restaurant', 'Authentic Japanese  Restaurant', 'Auto Parts Store', 'Baby Clothing Store', 'Baby Store', 'Bagel Shop', 'Bait Shop', 'Bakery', 'Balloon Store', 'Ballroom', 'Banquet Hall', 'Bar'

In [14]:
get_top_n_similar("Refrigerator Repair Service")

KeyError: 'Refrigerator Repair Service'

In [15]:
get_top_n_similar("Indian Restaurant")

[('Coffee Houses & Cafes', 0.10497498688926032),
 ('Translator', 0.09695366041832937),
 ('Hungarian Restaurant', 0.0919454877851016),
 ('Vitamin & Supplements Store', 0.0892598346117201),
 ('Oyster Supplier', 0.0862577005386741),
 ('Milk Delivery Service', 0.08202923713396043),
 ('Burmese Restaurant', 0.0795552206206824),
 ('Soccer Club', 0.07859934684518947),
 ('Carpet Store', 0.07754458384758417),
 ('Ski Rental Service', 0.07529227548622622)]

In [16]:
get_top_n_similar("Restaurant")

[('Corporate Headquarters', 0.06486869320704843),
 ('Recording Studio', 0.06099483107730305),
 ('Translator', 0.059407667381143565),
 ('Coffee Houses & Cafes', 0.056910979138577594),
 ('Gymnastics Center', 0.05571083571803759),
 ('Key Duplication Service', 0.05400684916402328),
 ('Coral Reef', 0.05399244879223549),
 ('Milk Delivery Service', 0.05311799996767201),
 ('Central American Restaurant', 0.052252953996881984),
 ('Used Book Store', 0.05105084300095431)]

In [17]:
get_top_n_similar("American Restaurant")

[('Central American Restaurant', 0.05381481938222939),
 ('Carpet Store', 0.05094992921229254),
 ('Obanzai Restaurant', 0.05013049168596421),
 ('Coral Reef', 0.04496922362668111),
 ('Mattress Store', 0.043342252042404675),
 ('Ukrainian Restaurant', 0.04211235827508184),
 ('Jewelry Store', 0.04015358070478324),
 ('Bedroom Furniture Store', 0.0389274134741136),
 ('Farm', 0.038154525327837654),
 ('Watch Store', 0.037483688988169755)]

In [18]:
get_top_n_similar("ATM Location")

[('Hoagie Restaurant', 0.49837639016774915),
 ('Business Center', 0.49807418120460384),
 ('Flooring Store', 0.3928751669952993),
 ('Weddings', 0.3573086821604782),
 ('Gym', 0.34157371333742537),
 ('Snowboard Shop', 0.33255843070544366),
 ('Farm Shop', 0.3321675605569769),
 ('Appliances', 0.30288871007970536),
 ('Mobile Phone Repair Shop', 0.30288871007970536),
 ('Tutoring Service', 0.2940834392038135)]