In [1]:
from datetime import datetime
import os
import numpy as np
import pandas as pd
from pyspark.ml.feature import CountVectorizer, IDF, Word2Vec, PCA
from pyspark.ml.linalg import DenseVector, SparseVector, VectorUDT, Vectors
from pyspark.ml.clustering import GaussianMixture, KMeans
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.sql.types import DateType, FloatType, IntegerType, DoubleType

# Location of the Spark installation, or None when SPARK_HOME is not set.
spark_home = os.environ.get('SPARK_HOME')

import plotly
# Credentials must never be committed with the notebook: read them from the
# environment instead. When either variable is missing the call is skipped,
# so plotly falls back to whatever credentials file already exists locally.
# NOTE(review): the API key that used to be hardcoded here should be revoked
# and regenerated, since it has been exposed in the notebook history.
_plotly_username = os.environ.get('PLOTLY_USERNAME')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)
# print(plotly.__version__)

import plotly.plotly as py
from plotly.graph_objs import *
import plotly.figure_factory as FF
import requests
# Turn off the warnings emitted by the urllib3 copy bundled with requests.
# NOTE(review): presumably added to hide InsecureRequestWarning noise from
# plotly uploads — confirm it is still needed, since it applies process-wide.
requests.packages.urllib3.disable_warnings()

In [47]:
# Read the Yelp business records from HDFS and redistribute them over
# 300 partitions.
business = (
    spark.read
    .json("/user/hduser1/Yelp/business.json")
    .repartition(300)
)

# Row count first, then the inferred schema.
print(business.count())
# Optional: keep only rows that have categories and count again, e.g.
#   business = business.where(col("categories").isNotNull())
#   print(business.count())
business.printSchema()
# business.head(5)

144072
root
 |-- address: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- business_id: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- city: string (nullable = true)
 |-- hours: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- is_open: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- name: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- stars: double (nullable = true)
 |-- state: string (nullable = true)
 |-- type: string (nullable = true)



In [48]:
# Read the Yelp review records from HDFS and redistribute them over
# 300 partitions, then show the inferred schema.
reviews = (
    spark.read
    .json("/user/hduser1/Yelp/review.json")
    .repartition(300)
)
reviews.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [40]:
def get_business_info(business_id, categories="*"):
    """Print the first row of `business` whose business_id matches.

    Parameters
    ----------
    business_id : str
        Value compared against the ``business_id`` column.
    categories : str or list of str, optional
        Column name(s) passed to ``DataFrame.select``. Despite the name this
        chooses which columns are shown, not the Yelp ``categories`` field.
        Defaults to ``"*"`` (all columns).

    Returns
    -------
    None
        The matching Row is printed; ``None`` is printed when nothing matches,
        because ``head()`` yields None for an empty result.
    """
    # Filter first, project second: the lookup must not depend on
    # business_id still being present after the column selection.
    matching = business.where(col("business_id") == business_id)
    print(matching.select(categories).head())

In [52]:
# Inner join of businesses with their reviews on the shared business_id key.
# Both inputs also carry `stars` and `type` columns, so those two names
# appear twice in the joined frame.
business_reviews = business.join(reviews, on="business_id", how="inner")

In [53]:


def get_business_reviews(business_id):
    """Print the list of review-text Rows collected for one business id."""
    rows_for_business = business_reviews.where(col("business_id") == business_id)
    review_texts = rows_for_business.select("text").collect()
    print(review_texts)

In [10]:
# Show every column of one example business.
get_business_info(business_id="stfzrvR5C9TkcWBof-RgxA")

Row(address=u'Fringe Salon, 109 W Honeysuckle St', attributes=[u'BusinessAcceptsBitcoin: False', u'BusinessAcceptsCreditCards: False', u"BusinessParking: {'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}", u'ByAppointmentOnly: True', u'RestaurantsPriceRange2: 2', u'WheelchairAccessible: False'], business_id=u'stfzrvR5C9TkcWBof-RgxA', categories=[u'Beauty & Spas', u'Skin Care', u'Nail Salons', u'Permanent Makeup', u'Eyelash Service'], city=u'Litchfield Park', hours=None, is_open=0, latitude=33.4945653, longitude=-112.3595479, name=u"Jamie's Vanity", neighborhood=u'', postal_code=u'85340', review_count=3, stars=5.0, state=u'AZ', type=u'business')


In [16]:
# Build an RDD of (first_field, second_field) tuples from the CSV, skipping
# lines that do not contain at least two comma-separated fields.
false_positive = (
    sc.textFile("/user/hduser1/false_positive.csv")
    .map(lambda row: row.split(","))
    .filter(lambda fields: len(fields) > 1)
    .map(lambda fields: (fields[0], fields[1]))
)
# Not collected here; the result stays an RDD.
#   false_positive.collect()

In [20]:
# Bring the RDD to the driver as a pandas DataFrame.
f = (
    false_positive
    .toDF()
    .toPandas()
)
# Take rows 100..199 (positional) of the "_2" column as the sample of ids.
business_ids = f["_2"].iloc[100:200]

In [55]:
# Look at every 10th id of the sample.
for raw_id in business_ids[::10]:
    # Drop the first and last character of the stored value
    # (presumably surrounding quotes from the CSV — verify against the file).
    business_id = str(raw_id[1:-1])
    print(business_id)

    get_business_info(business_id, ["name", "address", "state"])
    # Other views that can be swapped in:
    #   get_business_info(business_id, ["state", "categories", "hours"])
    #   get_business_reviews(business_id)

7zSVeeWX-8uVN5LHSzRQtA
Row(name=u"Southwest Women's Care", address=u'1450 S Dobson Road', state=u'AZ')
VjoY6LuDh6ZfqyHfHZ5sFw
Row(name=u'Chevy Shop', address=u'2960 Westwood Dr, Ste 4', state=u'NV')
eK1uUHDuCs53CuBM4kURxQ
Row(name=u'Cafe Depot', address=u'1490 Boulevard de Maisonneuve Ouest', state=u'QC')
UHvEOtWO-PaJLhR0FbsByQ
Row(name=u'i Nails', address=u'3870 E Flamingo Rd, Ste A5', state=u'NV')
A42_tYr2vKBU5wMdh01nEQ
Row(name=u'Belong Hair Extensions-Carolyn Hart', address=u'8751 W Charleston Blvd', state=u'NV')
fKsrsJV6CMz2AWUUz1FwpQ
Row(name=u'Left Foot Right Foot', address=u'720 Burnhamthorpe Road W, Unit 32A', state=u'ON')
FP54OYs2rfXWa6PosSLxvw
Row(name=u'Bingbings Shaved Ice', address=u'Excalibur Hotel and Casino Pool, 3850 S Las Vegas Blvd', state=u'NV')
BwZEj2VB-2rlPMonDckvog
Row(name=u'Bella Dia Massage', address=u'21448 N 75th Ave, Ste 10', state=u'AZ')
FMtluZEVTYvHN4bIZi734w
Row(name=u"Love'in Noodles", address=u'633 Silver Star Boulevard', state=u'ON')
c-uuAPbSpRGR0vPU