In [1]:
from datetime import datetime
import os
import numpy as np
import pandas as pd
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.linalg import DenseVector, SparseVector, VectorUDT, Vectors
from pyspark.ml.clustering import GaussianMixture, KMeans
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.sql.types import DateType, FloatType, IntegerType, DoubleType
spark_home = os.environ.get('SPARK_HOME', None)

import plotly
# Read the plotly credentials from the environment so that no secret is stored
# in the notebook. A missing variable raises KeyError here, which is preferable
# to silently writing an empty credentials file.
# NOTE(review): the API key that used to be hard-coded on this line has been
# exposed to anyone with access to the notebook and should be rotated.
plotly.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)
# print(plotly.__version__)

import plotly.plotly as py
from plotly.graph_objs import *
import plotly.figure_factory as FF
import requests
requests.packages.urllib3.disable_warnings()


In [20]:
# Load the Yelp business records and redistribute them over 300 partitions.
business_raw = spark.read.json("/user/hduser1/Yelp/business.json").repartition(300)
print(business_raw.count())

# Drop the records whose `categories` field is null; everything downstream
# works on this filtered frame.
has_categories = col("categories").isNotNull()
business = business_raw.where(has_categories)
print(business.count())

business.printSchema()
business.head(5)

144072
143747
root
 |-- address: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- business_id: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- city: string (nullable = true)
 |-- hours: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- is_open: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- name: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- stars: double (nullable = true)
 |-- state: string (nullable = true)
 |-- type: string (nullable = true)



[Row(address=u'132 6th Street', attributes=[u'Alcohol: none', u"Ambience: {'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}", u'BikeParking: True', u'BusinessAcceptsCreditCards: True', u"BusinessParking: {'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}", u'Caters: True', u'DogsAllowed: False', u'GoodForKids: True', u"GoodForMeal: {'dessert': False, 'latenight': False, 'lunch': False, 'dinner': False, 'breakfast': True, 'brunch': True}", u'HasTV: False', u'NoiseLevel: quiet', u'OutdoorSeating: True', u'RestaurantsAttire: casual', u'RestaurantsDelivery: False', u'RestaurantsGoodForGroups: True', u'RestaurantsPriceRange2: 2', u'RestaurantsReservations: True', u'RestaurantsTableService: True', u'RestaurantsTakeOut: True', u'WheelchairAccessible: True', u'WiFi: free'], business_id=u'8Ahmk-vBrOn47BRmjDGvMg', categories=[u'Specialty Food', u'Food', u'Breakfast & Bru

In [21]:
# Keep only the identifier and the category list for the clustering steps.
categories_attributes = business.select("business_id", "categories")

In [22]:
# Turn each business's category list into a sparse count vector
# (`category_count`), with the vocabulary restricted by minDF=10.
cv = CountVectorizer(
    minDF=10,
    inputCol="categories",
    outputCol="category_count",
)
cv_model = cv.fit(categories_attributes)

categories_cv = cv_model.transform(categories_attributes)
categories_cv.show()

# Vocabulary size, followed by the category names in alphabetical order.
vocabulary = cv_model.vocabulary
print(len(vocabulary))
sorted(vocabulary)

+--------------------+--------------------+--------------------+
|         business_id|          categories|      category_count|
+--------------------+--------------------+--------------------+
|DYPbHkUtdq8HDecVx...|[Restaurants, Bur...|(896,[0,24],[1.0,...|
|ZeWgkWVOvUeauW9AR...|[Italian, Restaur...|(896,[0,21],[1.0,...|
|-Aj5ICZwvP25_HeXW...|[Skin Care, Beaut...|(896,[3,41,44,49,...|
|HvhlGm6pxelgDCY5a...|[Gyms, Trainers, ...|(896,[11,32,57,78...|
|gnREPCiK7JFk_iYmr...|[Barbers, Beauty ...|(896,[3,73],[1.0,...|
|H-THNj8sxPsNVvvFI...|[Car Rental, Hote...|(896,[19,152],[1....|
|9gLFnkwRiSppOBu3S...|[Diners, American...|(896,[0,26,29,99]...|
|aI3CUcnQbjn1jdcl6...|[Chinese, Restaur...|(896,[0,27],[1.0,...|
|gZn3niZkSVzmovEd5...|[Restaurants, Chi...|(896,[0,1,23,27,1...|
|7gQ321k5ZOcup-kuj...|[Hotels & Travel,...|(896,[10,19,40],[...|
|Z-6QSR2lC-opp9l6H...|[Chinese, Restaur...|(896,[0,27],[1.0,...|
|rhO7J9xmoS_jtZXqD...|[Home Services, H...|(896,[4,106],[1.0...|
|5cBPRkM5NpwbITBDh...|[Nu

[u'& Probates',
 u'ATV Rentals/Tours',
 u'Acai Bowls',
 u'Accessories',
 u'Accountants',
 u'Active Life',
 u'Acupuncture',
 u'Adult',
 u'Adult Education',
 u'Adult Entertainment',
 u'Advertising',
 u'Afghan',
 u'African',
 u'Air Duct Cleaning',
 u'Airlines',
 u'Airport Lounges',
 u'Airport Shuttles',
 u'Airports',
 u'Allergists',
 u'Amateur Sports Teams',
 u'American (New)',
 u'American (Traditional)',
 u'Amusement Parks',
 u'Anesthesiologists',
 u'Animal Shelters',
 u'Antiques',
 u'Apartments',
 u'Appliances',
 u'Appliances & Repair',
 u'Appraisal Services',
 u'Aquarium Services',
 u'Aquariums',
 u'Arabian',
 u'Arcades',
 u'Archery',
 u'Architects',
 u'Argentine',
 u'Art Classes',
 u'Art Galleries',
 u'Art Museums',
 u'Art Schools',
 u'Art Supplies',
 u'Artificial Turf',
 u'Arts & Crafts',
 u'Arts & Entertainment',
 u'Asian Fusion',
 u'Assisted Living Facilities',
 u'Auction Houses',
 u'Audiologist',
 u'Austrian',
 u'Auto Customization',
 u'Auto Detailing',
 u'Auto Glass Services',
 u

minDF=1: 1191

minDF=10: 896

minDF=100: 473

In [23]:
# Print the column layout of the vectorized table to confirm that the
# `category_count` vector column was added by the CountVectorizer model.
categories_cv.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_count: vector (nullable = true)



In [None]:
# Fit a 10-component Gaussian mixture on the category-count vectors.
# The model emits a hard assignment (`cluster_num_pred`) and the
# per-component probabilities (`pred_probability`).
gmm = GaussianMixture(
    featuresCol="category_count",
    predictionCol="cluster_num_pred",
    probabilityCol="pred_probability",
    k=10,
)
gmm_model = gmm.fit(categories_cv)
summary = gmm_model.summary

In [24]:
# Cluster the raw category-count vectors into 100 groups with k-means.
kmeans = KMeans(
    featuresCol="category_count",
    predictionCol="cluster_num_pred",
    k=100,
)
kmeans_model = kmeans.fit(categories_cv)
summary = kmeans_model.summary

# One size per cluster, as reported by the training summary.
print(summary.clusterSizes)



[3273, 2158, 924, 2916, 615, 4779, 9201, 279, 648, 2812, 1666, 2101, 1181, 835, 1153, 1726, 576, 760, 858, 846, 246, 2112, 869, 2936, 1356, 569, 1329, 1023, 912, 2381, 414, 1129, 911, 2233, 1113, 608, 2535, 3584, 1861, 1158, 1695, 1071, 529, 2866, 638, 2109, 1124, 411, 3025, 1315, 780, 1109, 5606, 892, 1044, 160, 1301, 472, 2013, 2503, 1084, 604, 318, 458, 1122, 1866, 364, 1773, 683, 1018, 1102, 3208, 639, 1345, 1450, 777, 1147, 934, 1760, 2149, 855, 955, 1276, 462, 292, 1316, 578, 349, 333, 1065, 662, 484, 507, 472, 3026, 3229, 1547, 170, 2298, 821]


In [29]:
# For every k-means center keep the coordinates above 0.9 as a pair of
# (vocabulary indices, center values). The category counts are 0/1, so a
# center value close to 1 means nearly every member carries that category.
top_clusters_cats = []
for center in kmeans_model.clusterCenters():
    # Evaluate the threshold once and reuse it for both indices and values.
    dominant = np.where(center > 0.9)
    top_clusters_cats.append((dominant[0], center[dominant]))

# Read the vocabulary once instead of once per (cluster, category) pair.
vocab = cv_model.vocabulary

# Translate indices to category names and sort the clusters for display.
sorted(
    [(vocab[idx], weight) for idx, weight in zip(idxs, weights)]
    for idxs, weights in top_clusters_cats
)

[[],
 [],
 [(u'Active Life', 1.0)],
 [(u'Active Life', 1.0), (u'Fitness & Instruction', 1.0)],
 [(u'Active Life', 1.0), (u'Fitness & Instruction', 1.0)],
 [(u'Automotive', 0.99999999999999989)],
 [(u'Automotive', 0.99999999999999989), (u'Auto Repair', 0.99999999999999989)],
 [(u'Automotive', 0.99999999999999989),
  (u'Gas & Service Stations', 0.99999999999999989)],
 [(u'Automotive', 1.0), (u'Auto Detailing', 1.0)],
 [(u'Automotive', 1.0), (u'Car Dealers', 0.9962630792227205)],
 [(u'Beauty & Spas', 1.0)],
 [(u'Beauty & Spas', 1.0), (u'Barbers', 1.0)],
 [(u'Beauty & Spas', 1.0), (u'Hair Removal', 1.0)],
 [(u'Beauty & Spas', 1.0), (u'Hair Salons', 1.0)],
 [(u'Beauty & Spas', 1.0),
  (u'Hair Salons', 1.0),
  (u'Hair Stylists', 0.97101449275362317),
  (u"Men's Hair Salons", 0.95652173913043481)],
 [(u'Beauty & Spas', 1.0), (u'Hair Salons', 1.0), (u'Hair Stylists', 1.0)],
 [(u'Beauty & Spas', 1.0),
  (u'Health & Medical', 0.99770642201834869),
  (u'Massage', 0.93807339449541294)],
 [(u'Beaut

In [7]:
# One row per (business, category) pair.
exploded_cats = categories_attributes.select(
    col("business_id"),
    explode("categories").alias("category"),
)

# Self-join on equal category; the `<` on business_id keeps each unordered
# pair once and excludes a business paired with itself.
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt.
cats_t1 = exploded_cats.alias("t1")
cats_t2 = exploded_cats.alias("t2")
shares_category = (
    (col("t1.business_id") < col("t2.business_id"))
    & (col("t1.category") == col("t2.category"))
)
overlapping_cats_pairs = (
    cats_t1.join(cats_t2, shares_category)
    .select("t1.business_id", "t2.business_id")
    .distinct()
)
overlapping_cats_pairs.head(20)

KeyboardInterrupt: 

In [24]:
# Re-weight the category counts by inverse document frequency, writing the
# result to a new `categories_idf` vector column.
idf = IDF(
    inputCol="category_count",
    outputCol="categories_idf",
    minDocFreq=10,
)
idf_model = idf.fit(categories_cv)

categories_idf = idf_model.transform(categories_cv)
categories_idf.show()

+--------------------+--------------------+--------------------+--------------------+
|         business_id|          categories|      category_count|      categories_idf|
+--------------------+--------------------+--------------------+--------------------+
|CYIQVaW41UoDDikFl...|[Arts & Entertain...|(896,[20,219],[1....|(896,[20,219],[3....|
|9ezDpuzoOhkDgzfwM...|[Sandwiches, Rest...|(896,[0,15,16],[1...|(896,[0,15,16],[1...|
|bOS7Iy2FOy1Nd3qPk...|[Mobile Phone Acc...|(896,[1,9,118,224...|(896,[1,9,118,224...|
|LEASX4wn1U1TAmwQf...|[Arts & Entertain...|(896,[10,11,19,20...|(896,[10,11,19,20...|
|GSOYOUQa5pY19EP3J...|[Chinese, Restaur...|(896,[0,27],[1.0,...|(896,[0,27],[1.08...|
|TG6m1XuLFgbYRGo-4...|[Towing, Auto Rep...|(896,[1,8,20,22,7...|(896,[1,8,20,22,7...|
|kXLz9O3vHAoxYVN4X...|[Nightlife, Adult...|(896,[3,5,18,30,2...|(896,[3,5,18,30,2...|
|HKFOt7gx-WCPJhTNb...|[Education, Colle...|(896,[48,337],[1....|(896,[48,337],[4....|
|gPuj_VV2Yzj-IhqSV...|[Nightlife, Wine ...|(896,[5,7,1

In [25]:
# Narrow the TF-IDF table to businesses whose category list contains
# 'Restaurants'.
is_restaurant = array_contains(col("categories"), "Restaurants")
restaurants_idf = categories_idf.where(is_restaurant)

restaurants_idf.head(10)

[Row(business_id=u'BzXtEUHu86ju9ttE-X3saw', categories=[u'Italian', u'Food Delivery Services', u'Food', u'Pizza', u'Greek', u'Restaurants'], category_count=SparseVector(896, {0: 1.0, 2: 1.0, 15: 1.0, 21: 1.0, 114: 1.0, 155: 1.0}), categories_idf=SparseVector(896, {0: 1.0868, 2: 1.9145, 15: 3.3137, 21: 3.5525, 114: 5.079, 155: 5.4413})),
 Row(business_id=u'3cu4pGmmMf-ZSL69nP1lKQ', categories=[u'Food', u'Specialty Food', u'Ethnic Food', u'Middle Eastern', u'Restaurants'], category_count=SparseVector(896, {0: 1.0, 2: 1.0, 31: 1.0, 116: 1.0, 140: 1.0}), categories_idf=SparseVector(896, {0: 1.0868, 2: 1.9145, 31: 3.7675, 116: 5.0846, 140: 5.2651})),
 Row(business_id=u'L6_B4Z_29vjBTUPWFWZu3Q', categories=[u'Breakfast & Brunch', u'Restaurants'], category_count=SparseVector(896, {0: 1.0, 29: 1.0}), categories_idf=SparseVector(896, {0: 1.0868, 29: 3.6991})),
 Row(business_id=u'bYvm_7jKKaewN2kFLFF4TA', categories=[u'Sandwiches', u'Restaurants'], category_count=SparseVector(896, {0: 1.0, 16: 1.0}

In [None]:
def dot_prod(v1, v2):
    """Return the dot product of two vectors as a plain Python float.

    `v1` must provide a `.dot()` method that accepts `v2`. The result is
    converted with `float()` so that the caller receives a built-in float
    rather than a library-specific scalar type.
    """
    product = v1.dot(v2)
    return float(product)

# def join_cond(t1_bid, t1_idf, t2_bid, t2_idf):
    

# Expose the Python dot product as a Spark UDF returning a float column.
f = udf(dot_prod, FloatType())

# Pair every restaurant with every other restaurant. The `<` on business_id
# keeps each unordered pair once and excludes self-pairs.
restaurant_vectors = restaurants_idf.select("business_id", "categories_idf")
vectors_t1 = restaurant_vectors.alias("t1")
vectors_t2 = restaurant_vectors.alias("t2")
pair_dot = f(col("t1.categories_idf"), col("t2.categories_idf")).alias("dot")

# Score each pair by the dot product of its TF-IDF vectors, highest first.
restaurant_network = (
    vectors_t1.join(vectors_t2, col("t1.business_id") < col("t2.business_id"))
    .select(col("t1.business_id"), col("t2.business_id"), pair_dot)
    .orderBy(col("dot").desc())
)
restaurant_network.head(20)

In [15]:
# Repeat the 100-cluster k-means run, this time on the IDF-weighted vectors.
kmeans_idf = KMeans(
    featuresCol="categories_idf",
    predictionCol="cluster_num_pred_idf",
    k=100,
)
kmeans_idf_model = kmeans_idf.fit(categories_idf)
summary = kmeans_idf_model.summary

# One size per cluster, as reported by the training summary.
print(summary.clusterSizes)

# For every IDF k-means center keep the coordinates above 0.9 as a pair of
# (vocabulary indices, center values).
# NOTE(review): these centers are IDF-weighted (a single category already
# weighs more than 1 in `categories_idf`), so 0.9 is a cutoff on the weighted
# value here, not on the share of members carrying the category. Confirm that
# this is the intended threshold.
top_clusters_cats = []
for center in kmeans_idf_model.clusterCenters():
    # Evaluate the threshold once and reuse it for both indices and values.
    dominant = np.where(center > 0.9)
    top_clusters_cats.append((dominant[0], center[dominant]))

# Read the vocabulary once instead of once per (cluster, category) pair.
vocab = cv_model.vocabulary

# Translate indices to category names and sort the clusters for display.
sorted(
    [(vocab[idx], weight) for idx, weight in zip(idxs, weights)]
    for idxs, weights in top_clusters_cats
)

[2380, 3216, 910, 1104, 1684, 1501, 648, 2512, 608, 1230, 2280, 1186, 1750, 2840, 879, 167, 837, 1522, 1304, 1202, 609, 606, 791, 2436, 1522, 169, 29104, 605, 2279, 828, 161, 2819, 1205, 699, 1309, 900, 676, 428, 554, 278, 1329, 262, 1813, 1246, 249, 3313, 842, 2413, 545, 284, 3149, 1102, 625, 889, 896, 510, 1375, 1021, 1718, 4518, 666, 290, 100, 311, 630, 533, 1003, 1706, 319, 155, 666, 774, 584, 1317, 547, 203, 753, 288, 2571, 905, 907, 858, 757, 269, 656, 5141, 1346, 813, 344, 3200, 258, 869, 279, 332, 2336, 365, 921, 2064, 690, 1184]


[[],
 [(u'Active Life', 3.0625272830502825),
  (u'Fitness & Instruction', 3.78572126233838),
  (u'Gyms', 3.4567221248193247),
  (u'Trainers', 4.2531831333548018)],
 [(u'Active Life', 3.062527283050287),
  (u'Fitness & Instruction', 3.7857212623383876),
  (u'Gyms', 4.4421505053531591)],
 [(u'Active Life', 3.0625272830502932),
  (u'Fitness & Instruction', 1.0714305459448248),
  (u'Parks', 0.99763051176085082)],
 [(u'Active Life', 3.0625272830502981),
  (u'Arts & Entertainment', 1.2638568513133326),
  (u'Fitness & Instruction', 3.7857212623383707),
  (u'Education', 1.5748638414360183),
  (u'Specialty Schools', 1.7959738135481464),
  (u'Performing Arts', 1.646335662411919),
  (u'Dance Studios', 6.1954972244656563),
  (u'Dance Schools', 2.1847260255472771)],
 [(u'Active Life', 3.0625272830502985),
  (u'Parks', 1.7800468223134962),
  (u'Hiking', 6.7400186084690681)],
 [(u'Arts & Entertainment', 3.4158293278738698),
  (u'Performing Arts', 5.1364469986979078)],
 [(u'Automotive', 2.821545859193