# Exploratory data analysis

A brief exploratory analysis is conducted on the partitioned Parquet files and the cleaned data to learn more about the data, its aggregations, and its quality issues.

In [1]:
# Import libraries
import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    split, col, explode, to_date, unix_timestamp, 
    to_timestamp, year, month, dayofmonth, dayofweek, 
    hour, minute
)

In [2]:
# Set up the Spark environment.
# SparkSession is imported in the imports cell at the top of the notebook.
# The session is named "eda" and the driver is given 8g of memory.
spark = (
    SparkSession
        .builder
        .appName("eda")
        .config("spark.driver.memory", "8g")
        .getOrCreate()
)

## Business

In [8]:
# Read the business parquet output into a Spark DataFrame
business_parquet_path = "data/output/yelp_academic_dataset_business.parquet"
sdf_business = spark.read.parquet(business_parquet_path)

In [10]:
# Print each column's name, type and nullability as a tree
sdf_business.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- address: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- city: string (nullable = true)
 |-- is_open: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- name: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- stars: double (nullable = true)
 |-- state: string (nullable = true)
 |-- attributes_AcceptsInsurance: string (nullable = true)
 |-- attributes_AgesAllowed: string (nullable = true)
 |-- attributes_Alcohol: string (nullable = true)
 |-- attributes_Ambience: string (nullable = true)
 |-- attributes_BYOB: string (nullable = true)
 |-- attributes_BYOBCorkage: string (nullable = true)
 |-- attributes_BestNights: string (nullable = true)
 |-- attributes_BikeParking: string (nullable = true)
 |-- attributes_BusinessAcceptsBitcoin: string (nullable = true)
 |-- attributes_BusinessAcceptsCred

* Number of observations by metropolitan area.

In [11]:
# Register the DataFrame as the temporary view "business" so it can be queried through spark.sql
sdf_business.createOrReplaceTempView("business")

In [18]:
# Count businesses per (cluster, metropolitan_area) pair and show the ten
# largest groups. COUNT(*) counts every row in a group, whereas
# COUNT(metropolitan_area) skips NULLs and would report 0 for a group whose
# metropolitan_area is NULL, hiding exactly the rows a quality check should surface.
spark.sql(
    """
    SELECT 
        cluster,
        metropolitan_area,
        COUNT(*) AS n 
    FROM business
    GROUP BY cluster, metropolitan_area
    ORDER BY n DESC
    """).show(10)

+-------+--------------------+-----+
|cluster|   metropolitan_area|    n|
+-------+--------------------+-----+
|      3|Cambridge, Masach...|36021|
|      2|Portland, Oregon ...|28301|
|      4|       Austin, Texas|24487|
|      5|    Orlando, Florida|21913|
|      6|    Atlanta, Georgia|18094|
|      7|Vancouver, Britis...|17305|
|      1|      Columbus, Ohio|11262|
|      0|    Denver, Colorado| 3201|
|      8|Cambridge, Masach...|    1|
+-------+--------------------+-----+



- Sample of 10 observations in the Cambridge metropolitan area.

In [19]:
# Pull ten businesses assigned to cluster 3 and display a few identifying columns
cluster3_sample = spark.sql(
    """
    SELECT 
        metropolitan_area, 
        categories, 
        business_id, 
        name 
    FROM business
    WHERE cluster = 3
    LIMIT 10
    """
)
cluster3_sample.show()

+--------------------+--------------------+--------------------+--------------------+
|   metropolitan_area|          categories|         business_id|                name|
+--------------------+--------------------+--------------------+--------------------+
|Cambridge, Masach...|Wigs, Hair Extens...|hCABMnKtwo4Y9alQD...|Star Kreations Sa...|
|Cambridge, Masach...|Food, Pizza, Rest...|HPA_qyMEddpAEtFof...| Mr G's Pizza & Subs|
|Cambridge, Masach...|Specialty Schools...|6fT0lYr_UgWSCZs_w...|       Salter School|
|Cambridge, Masach...|Restaurants, Shop...|hcRxdDg7DYryCxCoI...|   Longwood Galleria|
|Cambridge, Masach...|Sandwiches, Food,...|jGennaZUr2MsJyRhi...|     Legal Sea Foods|
|Cambridge, Masach...|Health & Medical,...|XlLPnkHkm0Q7NLQrA...|    Nancy Wilde, LMT|
|Cambridge, Masach...|Playgrounds, Acti...|vOMGjOSKtAKWWW-F_...|Brewer/Burroughs ...|
|Cambridge, Masach...|Creperies, Restau...|iPD8BBvea6YldQZPH...|     Espresso Minute|
|Cambridge, Masach...|Pet Services, Pet...|D2VwAQBBxfm

- Sample of categories.

In [31]:
# Show two untruncated values of the comma-separated categories column
sdf_business.select('categories').show(n=2, truncate=False)

+-------------------------------------------------------------------------------------------------+
|categories                                                                                       |
+-------------------------------------------------------------------------------------------------+
|Wigs, Hair Extensions, Hair Salons, Blow Dry/Out Services, Hair Stylists, Beauty & Spas, Shopping|
|Food, Pizza, Restaurants                                                                         |
+-------------------------------------------------------------------------------------------------+
only showing top 2 rows



## Business with most reviews and average stars by year

In [32]:
# Read the review parquet output into a Spark DataFrame
reviews_parquet_path = "data/output/yelp_academic_dataset_review.parquet"
sdf_reviews = spark.read.parquet(reviews_parquet_path)

- Sample of reviews data.

In [37]:
# Register the DataFrame as the temporary view "reviews" so it can be queried through spark.sql
sdf_reviews.createOrReplaceTempView("reviews")

In [38]:
# Display five rows of the reviews view, all columns
reviews_preview = spark.sql("SELECT * FROM reviews LIMIT 5")
reviews_preview.show()

+--------------------+----+----------+-----+--------------------+-----+--------------------+------+--------------------+-------------------+-----+-------+--------------------+----+
|         business_id|cool|      date|funny|           review_id|stars|                text|useful|             user_id|                 ts|month|cluster|   metropolitan_area|year|
+--------------------+----+----------+-----+--------------------+-----+--------------------+------+--------------------+-------------------+-----+-------+--------------------+----+
|Q2TyugW7F2OzIwUV2...|   0|2019-09-07|    0|1whFwxVBGhmxL67xs...|  1.0|Found their plant...|     3|xVxH9e4-4uwb5eCI2...|2019-09-07 16:59:10|    9|      3|Cambridge, Masach...|2019|
|KCU9Sc7l5qsr_eD7J...|   0|2019-08-23|    0|JpxeDl3r4iKnAAFcr...|  1.0|Do not join this ...|     6|j9wHDTA2nyobUBbXj...|2019-08-23 15:10:18|    8|      3|Cambridge, Masach...|2019|
|8tP7pJ7L08gW5oVf4...|   0|2019-06-10|    0|6cBwa7680EBJia9xo...|  5.0|Green Street Stud...|   

- Sample of aggregated number of reviews and average stars.

In [42]:
# For each (cluster, business_id, year): the number of non-null review_ids
# and the mean star rating. No ordering is applied, so the rows shown are
# an arbitrary sample of the aggregate.
reviews_by_business_year = spark.sql(
    """
    SELECT 
        cluster, business_id, year,
        COUNT(review_id) AS n_reviews,
        AVG(stars) AS avg_stars
    FROM reviews
    GROUP BY cluster, business_id, year
    """
)
reviews_by_business_year.show()

+-------+--------------------+----+---------+------------------+
|cluster|         business_id|year|n_reviews|         avg_stars|
+-------+--------------------+----+---------+------------------+
|      3|Bz0DP2_fqiqVr71TJ...|2019|        8|               5.0|
|      3|69vSJWuH1P5PHIUKe...|2019|        4|               5.0|
|      3|MTMmKZ4vrV2yQWv49...|2019|        8|               2.5|
|      3|15Vv8Qu-s88jdo4bO...|2019|        8|               3.0|
|      3|QEgr1OlOTM2IMOffW...|2019|        3|1.3333333333333333|
|      3|JPUYmvHckHv52FIiQ...|2019|        2|               5.0|
|      3|wg-c2a32AYw0GSAwW...|2019|        2|               4.5|
|      3|ln1WDofTbNGKGBcA6...|2019|        1|               4.0|
|      3|wtCq0SMdbP8ZNaw8S...|2019|        2|               5.0|
|      3|F-D7rT6vK93dYb_cN...|2019|        3|               1.0|
|      3|nsRsHn76FjAnTcTrY...|2019|        2|               3.0|
|      3|SsBcBPDTdij-COHsg...|2019|        5|               4.8|
|      5|wrExYMc1FFDjp2lK

## Business with most checkins

- Sample of checkins.

In [43]:
# Read the checkin parquet output into a Spark DataFrame
checkin_parquet_path = "data/output/yelp_academic_dataset_checkin.parquet"
sdf_checkin = spark.read.parquet(checkin_parquet_path)

In [44]:
# Register the DataFrame as the temporary view "checkin" so it can be queried through spark.sql
sdf_checkin.createOrReplaceTempView("checkin")

In [45]:
# Display five rows of the checkin view, all columns
checkin_preview = spark.sql("SELECT * FROM checkin LIMIT 5")
checkin_preview.show()

+--------------------+----------+--------------------+-------------------+-----+---+---------+----+------+-------+--------------------+----+
|         business_id|      date|               date2|                 ts|month|day|dayofweek|hour|minute|cluster|   metropolitan_area|year|
+--------------------+----------+--------------------+-------------------+-----+---+---------+----+------+-------+--------------------+----+
|K722e1j-5oGqRcKXO...|2015-01-01| 2015-01-01 05:41:57|2015-01-01 05:41:57|    1|  1|        5|   5|    41|      2|Portland, Oregon ...|2015|
|FoIiSjDxJ19XRH5gH...|2015-01-08| 2015-01-08 12:48:30|2015-01-08 12:48:30|    1|  8|        5|  12|    48|      2|Portland, Oregon ...|2015|
|K722e1j-5oGqRcKXO...|2015-01-01| 2015-01-01 06:25:22|2015-01-01 06:25:22|    1|  1|        5|   6|    25|      2|Portland, Oregon ...|2015|
|FoIiSjDxJ19XRH5gH...|2015-04-03| 2015-04-03 12:43:21|2015-04-03 12:43:21|    4|  3|        6|  12|    43|      2|Portland, Oregon ...|2015|
|K722e1j-5oGq

In [48]:
# Tally check-in rows for each (cluster, business_id, year) group and
# preview the result. No ordering is applied.
checkins_by_business_year = spark.sql(
    """
    SELECT 
        cluster,
        business_id,
        year,
        COUNT(*) AS n_checkin
    FROM checkin 
    GROUP BY cluster, business_id, year 
    """
)
checkins_by_business_year.show()

+-------+--------------------+----+---------+
|cluster|         business_id|year|n_checkin|
+-------+--------------------+----+---------+
|      2|KDerPgfAZePKxu-sY...|2015|       93|
|      2|KMu7yG54tGVDh_b7V...|2015|       20|
|      2|J046_FLK5j2cIVx8v...|2015|      108|
|      2|KpXGfV5ux5cYLvMYN...|2015|        4|
|      2|LENZ1Vo2ecXwmFbEc...|2015|       14|
|      2|Eye5yKgBflxK92B9_...|2015|        8|
|      2|HfJwqKZAYtstHUTDz...|2015|        5|
|      2|GSDQWamRrcFiNUgIL...|2015|       35|
|      2|GbmddUOKysuo2C36C...|2015|        3|
|      2|JarPA4IR-RKEEU0TJ...|2015|        6|
|      2|JdJ1pd0XEdxzgQo7d...|2015|        3|
|      2|K3fAPfeShRHlLVWru...|2015|        4|
|      2|LT7e-DuKRDIApNoz5...|2015|        2|
|      2|LcDe4KGatn-aXCwvx...|2015|        1|
|      2|HEhoEVtp2CrpkZ65a...|2015|       25|
|      2|EKai4ZIR75S9hKHXV...|2015|        1|
|      2|Kuo5Pfrj7ngK6y30c...|2014|        2|
|      2|Evg5cP5MpZgHhGOcF...|2014|        7|
|      2|GzuWpn697Cz-VBPCU...|2014

In [51]:
# Number of rows in the check-in aggregate, i.e. how many distinct
# (cluster, business_id, year) groups exist in the checkin view.
checkin_groups_query = """
    SELECT 
        cluster,
        business_id,
        year,
        COUNT(*) AS n_checkin
    FROM checkin 
    GROUP BY cluster, business_id, year 
    """
spark.sql(checkin_groups_query).count()

763398

In [None]:
# Stop the Spark session created at the top of the notebook
spark.stop()