# Data model

In [97]:
import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.functions import split, col, explode, to_date, unix_timestamp, to_timestamp, year, month, dayofmonth, dayofweek, hour, minute

In [98]:
from pyspark.sql import SparkSession
spark = (
    SparkSession
        .builder
        .appName("eda")
        .config("spark.driver.memory", "8g")
        .getOrCreate()
    
)

In [99]:
sdf_business = spark.read.parquet("data/output/yelp_academic_dataset_business.parquet")
sdf_reviews = spark.read.parquet("data/output/yelp_academic_dataset_review.parquet")
sdf_tip = spark.read.parquet("data/output/yelp_academic_dataset_tip.parquet")
sdf_checkin = spark.read.parquet("data/output/yelp_academic_dataset_checkin.parquet")

In [100]:
def fun_preprocessing_summary(sdf):
    return (
        sdf.agg(
            F.count('business_id').alias('num_obs'), 
            F.min(sdf.year).alias('min_year'), 
            F.max(sdf.year).alias('max_year'))
        .show())

In [101]:
sdf_business.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- address: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- city: string (nullable = true)
 |-- is_open: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- name: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- stars: double (nullable = true)
 |-- state: string (nullable = true)
 |-- attributes_AcceptsInsurance: string (nullable = true)
 |-- attributes_AgesAllowed: string (nullable = true)
 |-- attributes_Alcohol: string (nullable = true)
 |-- attributes_Ambience: string (nullable = true)
 |-- attributes_BYOB: string (nullable = true)
 |-- attributes_BYOBCorkage: string (nullable = true)
 |-- attributes_BestNights: string (nullable = true)
 |-- attributes_BikeParking: string (nullable = true)
 |-- attributes_BusinessAcceptsBitcoin: string (nullable = true)
 |-- attributes_BusinessAcceptsCred

## Create number of business checkins table

* `sdf_checkings_by_year`: Number of checkins in 2020.

In [102]:
# Business aggregated checkins by year
sdf_checkings_by_year = (
    sdf_checkin.groupBy(['cluster','business_id','year'])
        .agg(F.count('business_id')
        .alias('num_checkins'))
        .filter(F.col("year") == 2020))
sdf_checkings_by_year.show(10)

+-------+--------------------+----+------------+
|cluster|         business_id|year|num_checkins|
+-------+--------------------+----+------------+
|      5|eJF-qHXo_r1rTaGQ1...|2020|           1|
|      5|fYx8oy_w2o62wOrML...|2020|           4|
|      5|fkH77-PQGtEz34w4T...|2020|          30|
|      5|ge53koknoHRCYPY-2...|2020|          20|
|      5|hx4kz5uRByzPUUxQ1...|2020|          23|
|      5|jtb0k-rGm7jEQBlKR...|2020|           9|
|      5|0IltPflRgUiQj5Psa...|2020|           1|
|      5|0cu6YGlT-t80iH--C...|2020|          56|
|      5|0uBLApPO7Vo91ECDg...|2020|           2|
|      5|2tDN5AwDlapzub3S5...|2020|           2|
+-------+--------------------+----+------------+
only showing top 10 rows



- Number of observations in the window year.

In [103]:
fun_preprocessing_summary(sdf_checkings_by_year)

+-------+--------+--------+
|num_obs|min_year|max_year|
+-------+--------+--------+
|  55425|    2020|    2020|
+-------+--------+--------+



## Create number of business tips table

* `sdf_tips_by_year`: Number of tips per business in 2020

In [104]:
# Business aggregated tips by year
sdf_tips_by_year = (
    sdf_tip
        .groupBy(['cluster','business_id','year'])
        .agg(F.count('business_id').alias('num_tips'))
        .filter(F.col("year") == 2020)
)
sdf_tips_by_year.show(10)

+-------+--------------------+----+--------+
|cluster|         business_id|year|num_tips|
+-------+--------------------+----+--------+
|      1|mFbv0sujwZe2YTubu...|2020|       5|
|      1|SrTk40kI-kE_BZIWJ...|2020|       1|
|      1|NsYAEdSCD9M65TzsU...|2020|       5|
|      5|PazxNZjNV0VXR7rzG...|2020|       9|
|      5|5h4wdgWQv7JoFAKbZ...|2020|       4|
|      5|ZBkz87uk81ZNhjJIj...|2020|       2|
|      5|9CySyrNc61P26GYKO...|2020|       1|
|      5|YFud540A5RLLg2OT-...|2020|       1|
|      5|SrykgwUBIaqNwqvu7...|2020|       1|
|      5|rWvZ4wF1Oiu07vS6W...|2020|       3|
+-------+--------------------+----+--------+
only showing top 10 rows



* Number of observations ion 2020

In [105]:
fun_preprocessing_summary(sdf_tips_by_year)

+-------+--------+--------+
|num_obs|min_year|max_year|
+-------+--------+--------+
|  18604|    2020|    2020|
+-------+--------+--------+



## Create number of business reviews table

* `sdf_reviews_by_year`: Number of reviews and average reviews of businesses in 2020

In [106]:
# Business reviews by year 
sdf_reviews_by_year = (
    sdf_reviews
        .groupBy(['cluster','business_id','year'])
        .agg(
            F.count('business_id').alias('num_reviews'), 
            F.round( F.mean('stars'), 1).alias('mean_stars_reviews'))
        .filter(F.col("year") == 2020)
)
sdf_reviews_by_year.show(10)

+-------+--------------------+----+-----------+------------------+
|cluster|         business_id|year|num_reviews|mean_stars_reviews|
+-------+--------------------+----+-----------+------------------+
|      5|9CySyrNc61P26GYKO...|2020|          8|               5.0|
|      5|SSWEDe8DW-eGJdbe4...|2020|         26|               2.7|
|      5|Qe9qY1CdEQVn-vmv6...|2020|          1|               3.0|
|      5|lfHvNR9oNEAZlWSDi...|2020|          3|               1.7|
|      5|mZ-WU4y6YWOPhQiT0...|2020|          2|               1.5|
|      5|nAmn8eskp8elbi3qD...|2020|          1|               4.0|
|      5|eJF-qHXo_r1rTaGQ1...|2020|          1|               1.0|
|      3|-jxEILUSqsWL0Oo7c...|2020|         67|               4.6|
|      3|p8bGM_4JYsLF-W8Jf...|2020|          5|               5.0|
|      3|wr0C8Xphc9i1vStcn...|2020|          1|               1.0|
+-------+--------------------+----+-----------+------------------+
only showing top 10 rows



* Number of businesses with reviews in 2020.

In [107]:
fun_preprocessing_summary(sdf_reviews_by_year)

+-------+--------+--------+
|num_obs|min_year|max_year|
+-------+--------+--------+
|  89770|    2020|    2020|
+-------+--------+--------+



In [108]:
sdf_business.select(['cluster', 'metropolitan_area', 'business_id', 'name', 'review_count', 'stars']).count()

160585

- `sdf_business_union`: Join to `sdf_business` the following tables:
    + `sdf_checkings_by_year`
    + `sdf_tips_by_year`
    + `sdf_reviews_by_year`

## Create business union table

In [133]:
sdf_business_union = (
    # Business list
    sdf_business
        .select([
            'cluster', 
            'metropolitan_area', 
            'categories', 
            'business_id', 
            'name',
            'latitude',
            'longitude',
            'review_count', 
            'stars'])
        # Join number of business checkins
        .join(sdf_checkings_by_year, on = ['cluster','business_id'], how = 'left')
        # Join number of business tips
        .join(sdf_tips_by_year, on = ['cluster','business_id'], how = 'left')
        # Join the number of business reviews
        .join(sdf_reviews_by_year, on = ['cluster','business_id'], how = 'left')
)
sdf_business_union.show(10)

+-------+--------------------+-----------------+--------------------+--------------------+-------------+--------------+------------+-----+----+------------+----+--------+----+-----------+------------------+
|cluster|         business_id|metropolitan_area|          categories|                name|     latitude|     longitude|review_count|stars|year|num_checkins|year|num_tips|year|num_reviews|mean_stars_reviews|
+-------+--------------------+-----------------+--------------------+--------------------+-------------+--------------+------------+-----+----+------------+----+--------+----+-----------+------------------+
|      0|-uVhjP6dd0yc7NBni...| Denver, Colorado|restaurants, soup...|     Gyro & Teriyaki|    33.936583|    -84.377824|          89|  2.0|2020|           4|2020|       1|2020|          3|               2.0|
|      0|0aWgecomqCKN5HZK6...| Denver, Colorado|police department...|Atlanta Police De...|   33.8401005|   -84.3722226|           8|  4.0|null|        null|null|    null|20

In [134]:
sdf_business_union.printSchema()

root
 |-- cluster: long (nullable = true)
 |-- business_id: string (nullable = true)
 |-- metropolitan_area: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- review_count: long (nullable = true)
 |-- stars: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- num_checkins: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- num_tips: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- num_reviews: long (nullable = true)
 |-- mean_stars_reviews: double (nullable = true)



* Number of rows in business_union

In [135]:
sdf_business_union.count()

160585

* Show the top 1000 business categories.

## Create a Restaurants Catalog

Lets create a catalog to classify business categories in resutrants or others.

Steps:
1. From `sdf_business_union` extract the business `categories` field, count and sort in descending order.
2. Save table to CSV file
3. Manually in a spreadsheet application label `is_restaurant`.

In [112]:
# Save business categories to csv.
df_business_categories = (
    sdf_business_union
        .groupBy('categories')
        .count()
        .sort('count', ascending = False)
        .toPandas()
)
df_business_categories.to_csv('data/preprocessing/grouped_categories.csv', index=False)

## Create restaurants table

Read manually classified catalog of restaurants.

In [136]:
sdf_business_categories_catalog = spark.read.csv("data/preprocessing/grouped_categories_catalog.csv", header = True)

In [137]:
# Filter restaurant businesses
sdf_restaurants = (
    sdf_business_union
        .join(sdf_business_categories_catalog.select(['categories', 'is_restrautrant']), on = ['categories'], how = 'left')
        .filter(col('is_restrautrant') == 1)
)

In [138]:
sdf_restaurants.count()

66013

In [140]:
sdf_restaurants.printSchema()

root
 |-- categories: string (nullable = true)
 |-- cluster: long (nullable = true)
 |-- business_id: string (nullable = true)
 |-- metropolitan_area: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- review_count: long (nullable = true)
 |-- stars: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- num_checkins: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- num_tips: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- num_reviews: long (nullable = true)
 |-- mean_stars_reviews: double (nullable = true)
 |-- is_restrautrant: string (nullable = true)



In [139]:
# Save restaurants
sdf_restaurants.toPandas().to_csv("data/output/restaurants.csv", index = False)

In [96]:
spark.stop()