In [6]:
from datetime import datetime
import os
import numpy as np
import pandas as pd
from pyspark.ml.feature import CountVectorizer, IDF, PCA, Word2Vec
from pyspark.ml.linalg import DenseVector, SparseVector, VectorUDT, Vectors
from pyspark.ml.clustering import GaussianMixture, KMeans
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.sql.types import DateType, FloatType, IntegerType, DoubleType
# Value of the SPARK_HOME environment variable, or None when it is not set.
spark_home = os.environ.get('SPARK_HOME', None)

import plotly
# Plotly credentials are taken from the environment so that no secret lives in
# the notebook. Export PLOTLY_USERNAME and PLOTLY_API_KEY before starting the
# kernel. When either one is missing the credentials file is left untouched,
# so credentials already stored on this machine keep working.
# NOTE(review): an API key was previously committed in this cell; it should be
# revoked and regenerated.
_plotly_username = os.environ.get('PLOTLY_USERNAME')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)
# print(plotly.__version__)

import plotly.plotly as py
from plotly.graph_objs import *
import plotly.figure_factory as FF
import requests
# Silence the warnings urllib3 would otherwise emit for HTTP calls made
# through requests (called with no argument, so the library default applies).
requests.packages.urllib3.disable_warnings()


In [2]:
# Read the Yelp business records and spread them over 300 partitions.
business = spark.read.json("/user/hduser1/Yelp/business.json").repartition(300)
print(business.count())  # number of businesses before filtering

# Keep only businesses that have a category list and more than 10 reviews.
has_categories = col("categories").isNotNull()
enough_reviews = col("review_count") > 10
business = business.filter(has_categories).filter(enough_reviews)
print(business.count())  # number of businesses after filtering

business.printSchema()
business.head(5)

144072
63104
root
 |-- address: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- business_id: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- city: string (nullable = true)
 |-- hours: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- is_open: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- name: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- stars: double (nullable = true)
 |-- state: string (nullable = true)
 |-- type: string (nullable = true)



[Row(address=u'7523 Pearl Rd', attributes=[u'Alcohol: beer_and_wine', u"Ambience: {'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}", u'BikeParking: True', u'BusinessAcceptsCreditCards: True', u"BusinessParking: {'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}", u'Caters: True', u'DogsAllowed: False', u'DriveThru: False', u'GoodForKids: True', u"GoodForMeal: {'dessert': True, 'latenight': False, 'lunch': False, 'dinner': False, 'breakfast': False, 'brunch': False}", u'HasTV: True', u'NoiseLevel: average', u'OutdoorSeating: False', u'RestaurantsAttire: casual', u'RestaurantsDelivery: True', u'RestaurantsGoodForGroups: True', u'RestaurantsPriceRange2: 2', u'RestaurantsReservations: True', u'RestaurantsTableService: True', u'RestaurantsTakeOut: True', u'WheelchairAccessible: True', u'WiFi: free'], business_id=u'CS2oAjAiFrJiG2HUtKJUHQ', categories

In [3]:
# Project the business table down to the id and its list of categories;
# the feature-extraction cells below are fitted on this frame.
categories_attributes = business.select("business_id", "categories")

minDF=1: 1191

minDF=10: 896

minDF=100: 473

In [7]:
# Fit Word2Vec on the category lists: 200-dimensional vectors, ignoring
# categories that occur fewer than 10 times, then add one embedding column
# ("word2vec_categories") per business.
word2vec = Word2Vec(
    vectorSize=200,
    minCount=10,
    inputCol="categories",
    outputCol="word2vec_categories",
)
word2vec_model = word2vec.fit(categories_attributes)
categories_word2vec = word2vec_model.transform(categories_attributes)

In [9]:
# Step 1: project the Word2Vec embeddings onto 100 principal components.
pca = PCA(
    k=100,
    inputCol="word2vec_categories",
    outputCol="pca_word2vec",
)
pca_model = pca.fit(categories_word2vec)
pca_word2vec_transformed = pca_model.transform(categories_word2vec)

# Step 2: fit a 10-component Gaussian mixture on the reduced vectors.
gmm = GaussianMixture(
    k=10,
    featuresCol="pca_word2vec",
    predictionCol="cluster_num_pred",
    probabilityCol="pred_probability",
)
gmm_model = gmm.fit(pca_word2vec_transformed)
summary = gmm_model.summary

Exception AttributeError: "'PCA' object has no attribute '_java_obj'" in 

AttributeError: 'PCAModel' object has no attribute 'tranform'

In [11]:
# Cluster the Word2Vec embeddings of the businesses into 100 groups.
kmeans = KMeans(
    k=100,
    featuresCol="word2vec_categories",
    predictionCol="cluster_num_pred",
)
kmeans_model = kmeans.fit(categories_word2vec)
summary = kmeans_model.summary

# Number of businesses assigned to each of the 100 clusters.
print(summary.clusterSizes)
# The centres themselves are available via kmeans_model.clusterCenters().



[767, 861, 301, 915, 195, 1148, 745, 409, 715, 813, 963, 1413, 1344, 866, 470, 802, 360, 555, 1401, 612, 359, 1155, 482, 276, 261, 247, 960, 1118, 1361, 563, 347, 420, 916, 427, 657, 652, 1001, 73, 316, 107, 576, 571, 758, 694, 1804, 570, 661, 774, 630, 354, 369, 273, 469, 364, 790, 657, 354, 424, 199, 1328, 1010, 837, 607, 985, 171, 379, 788, 554, 190, 376, 365, 1210, 1012, 769, 543, 616, 540, 320, 167, 1134, 87, 311, 307, 941, 969, 559, 1071, 538, 45, 527, 564, 427, 864, 276, 931, 598, 515, 244, 281, 504]


In [13]:
# How many of the largest components to report for every cluster centre.
TOP_DIMS_PER_CLUSTER = 3


def top_components(center, k=TOP_DIMS_PER_CLUSTER):
    """Return the ``k`` largest components of a cluster centre.

    Parameters
    ----------
    center : numpy.ndarray
        One cluster centre, as returned by ``clusterCenters()``.
    k : int
        Number of components to keep.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Indices of the ``k`` largest components in descending order of value,
        and the values found at those indices.
    """
    # Sort once and reuse the indices for both halves of the result.
    top_idx = center.argsort()[::-1][:k]
    return top_idx, center[top_idx]


# NOTE(review): if kmeans_model is the model fitted on "word2vec_categories",
# these indices are embedding dimensions, not positions in
# cv_model.vocabulary, so they cannot be mapped back to category names.
top_clusters_cats = [top_components(center) for center in kmeans_model.clusterCenters()]
top_clusters_cats

[(array([191, 184,  58]), array([ 0.348293  ,  0.30422422,  0.30206075])),
 (array([ 42, 128, 131]), array([ 0.47944322,  0.39214722,  0.37816715])),
 (array([ 11,  84, 110]), array([ 0.2963783 ,  0.29253959,  0.26505199])),
 (array([174, 121,  42]), array([ 0.52530858,  0.50611063,  0.49817832])),
 (array([ 81, 115, 168]), array([ 0.39015668,  0.34501581,  0.34499954])),
 (array([174, 121,  42]), array([ 0.52604959,  0.49084959,  0.4892654 ])),
 (array([ 42, 159, 186]), array([ 0.55428631,  0.49919414,  0.46790695])),
 (array([50, 77, 62]), array([ 0.34942274,  0.33545212,  0.29159501])),
 (array([ 63, 178, 123]), array([ 0.74259543,  0.60600426,  0.56419508])),
 (array([119,  52,  91]), array([ 0.45927605,  0.44649998,  0.44099792])),
 (array([109, 110,  21]), array([ 0.32971281,  0.30504707,  0.29429039])),
 (array([163,  81,  91]), array([ 0.53365966,  0.46980108,  0.44237741])),
 (array([105,  81,  30]), array([ 0.44628767,  0.41482998,  0.37823561])),
 (array([ 21, 184, 187]), ar

In [14]:
cv = CountVectorizer(inputCol="categories", outputCol="category_count", minDF=10)
cv_model = cv.fit(categories_attributes)
# pipeline = Pipeline(stages=[cv])
# model = pipeline.fit(categories_attributes.limit(10))

categories_cv = cv_model.transform(categories_attributes)
categories_cv.show()

print(len(cv_model.vocabulary))
sorted(cv_model.vocabulary)

+--------------------+--------------------+--------------------+
|         business_id|          categories|      category_count|
+--------------------+--------------------+--------------------+
|Eg9ZRsV5t64bWkK1N...|[Automotive, Auto...|(711,[12,28],[1.0...|
|iyr2-mlj0oz04ADOO...|[Restaurants, Ame...|(711,[0,6,17,32],...|
|MX5uyDRTey35fiGgR...|[Paint & Sip, Eve...|(711,[3,7,22,109,...|
|GPGeSCvdPT_2La3Xz...|[Garage Door Serv...|(711,[8,341],[1.0...|
|js21hDzpLZNkwWHwD...|[Shipping Centers...|(711,[20,191,207,...|
|tryuF3QW5he0cUmd1...|[Gastropubs, Pubs...|(711,[0,2,4,35,11...|
|_-u3js_j6WHkRZ416...|[Restaurants, Fre...|(711,[0,16,36,81,...|
|SsJAn_JvZC4cbAg_e...|[American (Tradit...|(711,[0,6,19,83],...|
|_yMMBQXcUQ1XtHQVa...|[Medical Centers,...|(711,[9,43,124,14...|
|j7Gj8RjIVFotp3SfV...|    [Food, Bakeries]|(711,[1,31],[1.0,...|
|301_dHmNWl6qxaTLD...|[Shopping, Fashio...|(711,[3,27,77,97,...|
|dEh3Su-0GWDSQZYRl...|[Farmers Market, ...|(711,[1,251],[1.0...|
|RKgDsDhUJhjtmL-6I...|[Fi

[u'ATV Rentals/Tours',
 u'Acai Bowls',
 u'Accessories',
 u'Accountants',
 u'Active Life',
 u'Acupuncture',
 u'Adult',
 u'Adult Education',
 u'Adult Entertainment',
 u'Advertising',
 u'Afghan',
 u'African',
 u'Air Duct Cleaning',
 u'Airlines',
 u'Airport Lounges',
 u'Airport Shuttles',
 u'Airports',
 u'Allergists',
 u'Amateur Sports Teams',
 u'American (New)',
 u'American (Traditional)',
 u'Amusement Parks',
 u'Animal Shelters',
 u'Antiques',
 u'Apartments',
 u'Appliances',
 u'Appliances & Repair',
 u'Appraisal Services',
 u'Aquarium Services',
 u'Aquariums',
 u'Arabian',
 u'Arcades',
 u'Archery',
 u'Argentine',
 u'Art Classes',
 u'Art Galleries',
 u'Art Museums',
 u'Art Schools',
 u'Art Supplies',
 u'Arts & Crafts',
 u'Arts & Entertainment',
 u'Asian Fusion',
 u'Auction Houses',
 u'Auto Customization',
 u'Auto Detailing',
 u'Auto Glass Services',
 u'Auto Insurance',
 u'Auto Loan Providers',
 u'Auto Parts & Supplies',
 u'Auto Repair',
 u'Automotive',
 u'Baby Gear & Furniture',
 u'Bagels

In [15]:
# Print the column names and types of the CountVectorizer output frame.
categories_cv.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_count: vector (nullable = true)



In [None]:
# Step 1: project the category count vectors onto 100 principal components.
pca = PCA(
    k=100,
    inputCol="category_count",
    outputCol="pca_category_count",
)
pca_model = pca.fit(categories_cv)
pca_transformed = pca_model.transform(categories_cv)

# Step 2: fit a 10-component Gaussian mixture on the reduced vectors.
gmm = GaussianMixture(
    k=10,
    featuresCol="pca_category_count",
    predictionCol="cluster_num_pred",
    probabilityCol="pred_probability",
)
gmm_model = gmm.fit(pca_transformed)
summary = gmm_model.summary

In [16]:
# Cluster the raw category count vectors into 100 groups.
kmeans = KMeans(
    k=100,
    featuresCol="category_count",
    predictionCol="cluster_num_pred",
)
kmeans_model = kmeans.fit(categories_cv)
summary = kmeans_model.summary

# Number of businesses assigned to each of the 100 clusters.
print(summary.clusterSizes)
# The centres themselves are available via kmeans_model.clusterCenters().



[1104, 522, 1197, 1600, 804, 572, 2958, 438, 309, 83, 1600, 2702, 615, 1576, 1353, 830, 533, 982, 945, 304, 1644, 448, 1363, 1018, 952, 349, 655, 279, 702, 280, 292, 1190, 978, 1655, 42, 453, 1343, 372, 528, 531, 236, 143, 117, 1455, 1090, 392, 271, 1156, 109, 817, 269, 410, 1116, 361, 973, 184, 642, 330, 431, 140, 640, 494, 221, 439, 174, 530, 511, 1037, 380, 103, 878, 258, 364, 27, 396, 244, 883, 81, 298, 430, 296, 469, 1636, 79, 210, 153, 114, 133, 98, 991, 280, 162, 729, 951, 144, 67, 402, 273, 441, 345]


In [17]:
# A category is reported for a cluster when its component in the cluster
# centre exceeds this value.
CENTER_COMPONENT_THRESHOLD = 0.9


def components_above(center, threshold=CENTER_COMPONENT_THRESHOLD):
    """Return the components of a cluster centre that exceed ``threshold``.

    Parameters
    ----------
    center : numpy.ndarray
        One cluster centre, as returned by ``clusterCenters()``.
    threshold : float
        Strict lower bound a component must exceed to be kept.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Indices of the selected components and the values at those indices.
    """
    # Locate the qualifying positions once and reuse them for the values.
    selected = np.where(center > threshold)[0]
    return selected, center[selected]


top_clusters_cats = [components_above(center) for center in kmeans_model.clusterCenters()]
# print(top_clusters_cats)

# Translate vocabulary indices into category names, one list per cluster,
# and sort the clusters so that similar ones end up next to each other.
sorted(
    [(cv_model.vocabulary[idx], value) for idx, value in zip(indices, values)]
    for (indices, values) in top_clusters_cats
)

[[],
 [(u'Active Life', 1.0)],
 [(u'Active Life', 1.0), (u'Fitness & Instruction', 1.0)],
 [(u'Active Life', 1.0),
  (u'Fitness & Instruction', 1.0),
  (u'Gyms', 0.94436090225563918)],
 [(u'Active Life', 1.0), (u'Parks', 1.0)],
 [(u'Arts & Entertainment', 1.0), (u'Museums', 1.0)],
 [(u'Arts & Entertainment', 1.0), (u'Psychics & Astrologers', 1.0)],
 [(u'Automotive', 0.99999999999999989)],
 [(u'Automotive', 0.99999999999999989), (u'Car Dealers', 0.99999999999999989)],
 [(u'Automotive', 1.0)],
 [(u'Beauty & Spas', 1.0)],
 [(u'Beauty & Spas', 1.0)],
 [(u'Beauty & Spas', 1.0), (u'Education', 1.0), (u'Specialty Schools', 1.0)],
 [(u'Beauty & Spas', 1.0), (u'Hair Removal', 0.99806201550387597)],
 [(u'Beauty & Spas', 1.0), (u'Hair Removal', 1.0)],
 [(u'Beauty & Spas', 1.0), (u'Hair Salons', 1.0)],
 [(u'Beauty & Spas', 1.0),
  (u'Hair Salons', 1.0),
  (u'Hair Stylists', 0.92789968652037613)],
 [(u'Beauty & Spas', 1.0), (u'Massage', 0.95278246205733552)],
 [(u'Education', 1.0)],
 [(u'Event Plan

In [10]:
# One row per (business, category) pair.
exploded_cats = categories_attributes.select(
    col("business_id"),
    explode("categories").alias("category"),
)

# Categories that are attached to at least 10 businesses.
category_counts = exploded_cats.groupBy("category").count()
categories = category_counts.filter(col("count") >= 10).select("category")
print(categories.count())

711


In [None]:

# Pairs of businesses (t1.business_id < t2.business_id) that share at least
# one of the frequent categories collected in `categories`.
#
# The previous formulation joined every business with every other business on
# the id inequality alone and then tested each pair against every frequent
# category with array_contains, which forces Spark to enumerate all pairs of
# businesses. Joining the exploded (business_id, category) rows on category
# equality produces the same set of pairs through an equi-join.

# Keep only the (business, category) rows whose category is frequent.
frequent_exploded_cats = exploded_cats.join(categories, on="category", how="left_semi")

overlapping_cats_pairs = (frequent_exploded_cats.alias("t1")
                         .join(frequent_exploded_cats.alias("t2"),
                               (col("t1.category") == col("t2.category"))
                               & (col("t1.business_id") < col("t2.business_id")))
                         .select(["t1.business_id", "t2.business_id"])
                         # a pair sharing several categories must appear once
                         .distinct()
                        )


overlapping_cats_pairs.head(20)

In [24]:
# Re-weight the category counts by inverse document frequency
# (minDocFreq=10 is passed through to the estimator).
idf = IDF(
    minDocFreq=10,
    inputCol="category_count",
    outputCol="categories_idf",
)
idf_model = idf.fit(categories_cv)
# The fitted weights can be inspected through idf_model.idf.

categories_idf = idf_model.transform(categories_cv)
categories_idf.show()

+--------------------+--------------------+--------------------+--------------------+
|         business_id|          categories|      category_count|      categories_idf|
+--------------------+--------------------+--------------------+--------------------+
|CYIQVaW41UoDDikFl...|[Arts & Entertain...|(896,[20,219],[1....|(896,[20,219],[3....|
|9ezDpuzoOhkDgzfwM...|[Sandwiches, Rest...|(896,[0,15,16],[1...|(896,[0,15,16],[1...|
|bOS7Iy2FOy1Nd3qPk...|[Mobile Phone Acc...|(896,[1,9,118,224...|(896,[1,9,118,224...|
|LEASX4wn1U1TAmwQf...|[Arts & Entertain...|(896,[10,11,19,20...|(896,[10,11,19,20...|
|GSOYOUQa5pY19EP3J...|[Chinese, Restaur...|(896,[0,27],[1.0,...|(896,[0,27],[1.08...|
|TG6m1XuLFgbYRGo-4...|[Towing, Auto Rep...|(896,[1,8,20,22,7...|(896,[1,8,20,22,7...|
|kXLz9O3vHAoxYVN4X...|[Nightlife, Adult...|(896,[3,5,18,30,2...|(896,[3,5,18,30,2...|
|HKFOt7gx-WCPJhTNb...|[Education, Colle...|(896,[48,337],[1....|(896,[48,337],[4....|
|gPuj_VV2Yzj-IhqSV...|[Nightlife, Wine ...|(896,[5,7,1

In [25]:
# Restrict the IDF-weighted frame to businesses tagged 'Restaurants'.
restaurants_idf = categories_idf.filter("array_contains(categories, 'Restaurants')")

restaurants_idf.head(10)

[Row(business_id=u'BzXtEUHu86ju9ttE-X3saw', categories=[u'Italian', u'Food Delivery Services', u'Food', u'Pizza', u'Greek', u'Restaurants'], category_count=SparseVector(896, {0: 1.0, 2: 1.0, 15: 1.0, 21: 1.0, 114: 1.0, 155: 1.0}), categories_idf=SparseVector(896, {0: 1.0868, 2: 1.9145, 15: 3.3137, 21: 3.5525, 114: 5.079, 155: 5.4413})),
 Row(business_id=u'3cu4pGmmMf-ZSL69nP1lKQ', categories=[u'Food', u'Specialty Food', u'Ethnic Food', u'Middle Eastern', u'Restaurants'], category_count=SparseVector(896, {0: 1.0, 2: 1.0, 31: 1.0, 116: 1.0, 140: 1.0}), categories_idf=SparseVector(896, {0: 1.0868, 2: 1.9145, 31: 3.7675, 116: 5.0846, 140: 5.2651})),
 Row(business_id=u'L6_B4Z_29vjBTUPWFWZu3Q', categories=[u'Breakfast & Brunch', u'Restaurants'], category_count=SparseVector(896, {0: 1.0, 29: 1.0}), categories_idf=SparseVector(896, {0: 1.0868, 29: 3.6991})),
 Row(business_id=u'bYvm_7jKKaewN2kFLFF4TA', categories=[u'Sandwiches', u'Restaurants'], category_count=SparseVector(896, {0: 1.0, 16: 1.0}

In [None]:
def dot_prod(v1, v2):
    """Return the inner product of ``v1`` and ``v2`` as a built-in ``float``.

    The product is computed by ``v1.dot(v2)``; the result is passed through
    ``float()`` before it is returned.
    """
    inner = v1.dot(v2)
    return float(inner)

# def join_cond(t1_bid, t1_idf, t2_bid, t2_idf):
    

# Expose dot_prod to Spark SQL. dot_prod returns a Python float (double
# precision), so the column is declared DoubleType; FloatType would round
# every score to 32 bits before the sort below.
dot_prod_udf = udf(dot_prod, DoubleType())
f = dot_prod_udf  # short name kept for any cell that still refers to `f`

# Only the id and the IDF vector are needed to score a pair of restaurants.
restaurant_vectors = restaurants_idf.select(["business_id", "categories_idf"])

# Score every unordered pair of restaurants (t1.business_id < t2.business_id
# keeps one orientation per pair and drops self-pairs) by the dot product of
# their IDF-weighted category vectors, highest score first.
restaurant_network = (restaurant_vectors.alias("t1")
           .join(restaurant_vectors.alias("t2"), col("t1.business_id") < col("t2.business_id"))
           .select([col("t1.business_id"),
                    col("t2.business_id"),
                    dot_prod_udf(col("t1.categories_idf"), col("t2.categories_idf")).alias("dot")])
           .orderBy(["dot"], ascending=[0])
            )

restaurant_network.head(20)

In [15]:
# Cluster the IDF-weighted category vectors into 100 groups.
kmeans_idf = KMeans(k=100, featuresCol="categories_idf", predictionCol="cluster_num_pred_idf")
kmeans_idf_model = kmeans_idf.fit(categories_idf)
summary = kmeans_idf_model.summary

# Number of businesses assigned to each of the 100 clusters.
print(summary.clusterSizes)
# The centres themselves are available via kmeans_idf_model.clusterCenters().

# Components of each centre that exceed the cutoff, as (indices, values).
# NOTE(review): the centres here are built from IDF-scaled vectors, so a
# component above 0.9 does not mean the category occurs in 90% of the
# cluster the way it does for the raw-count centres - confirm that this
# cutoff is the intended one.
IDF_CENTER_COMPONENT_THRESHOLD = 0.9
top_clusters_cats = []
for center in kmeans_idf_model.clusterCenters():
    # Locate the qualifying positions once and reuse them for the values.
    selected = np.where(center > IDF_CENTER_COMPONENT_THRESHOLD)[0]
    top_clusters_cats.append((selected, center[selected]))

# Translate vocabulary indices into category names, one list per cluster.
sorted(
    [(cv_model.vocabulary[idx], value) for idx, value in zip(indices, values)]
    for (indices, values) in top_clusters_cats
)

[2380, 3216, 910, 1104, 1684, 1501, 648, 2512, 608, 1230, 2280, 1186, 1750, 2840, 879, 167, 837, 1522, 1304, 1202, 609, 606, 791, 2436, 1522, 169, 29104, 605, 2279, 828, 161, 2819, 1205, 699, 1309, 900, 676, 428, 554, 278, 1329, 262, 1813, 1246, 249, 3313, 842, 2413, 545, 284, 3149, 1102, 625, 889, 896, 510, 1375, 1021, 1718, 4518, 666, 290, 100, 311, 630, 533, 1003, 1706, 319, 155, 666, 774, 584, 1317, 547, 203, 753, 288, 2571, 905, 907, 858, 757, 269, 656, 5141, 1346, 813, 344, 3200, 258, 869, 279, 332, 2336, 365, 921, 2064, 690, 1184]


[[],
 [(u'Active Life', 3.0625272830502825),
  (u'Fitness & Instruction', 3.78572126233838),
  (u'Gyms', 3.4567221248193247),
  (u'Trainers', 4.2531831333548018)],
 [(u'Active Life', 3.062527283050287),
  (u'Fitness & Instruction', 3.7857212623383876),
  (u'Gyms', 4.4421505053531591)],
 [(u'Active Life', 3.0625272830502932),
  (u'Fitness & Instruction', 1.0714305459448248),
  (u'Parks', 0.99763051176085082)],
 [(u'Active Life', 3.0625272830502981),
  (u'Arts & Entertainment', 1.2638568513133326),
  (u'Fitness & Instruction', 3.7857212623383707),
  (u'Education', 1.5748638414360183),
  (u'Specialty Schools', 1.7959738135481464),
  (u'Performing Arts', 1.646335662411919),
  (u'Dance Studios', 6.1954972244656563),
  (u'Dance Schools', 2.1847260255472771)],
 [(u'Active Life', 3.0625272830502985),
  (u'Parks', 1.7800468223134962),
  (u'Hiking', 6.7400186084690681)],
 [(u'Arts & Entertainment', 3.4158293278738698),
  (u'Performing Arts', 5.1364469986979078)],
 [(u'Automotive', 2.821545859193