In [1]:
import os
import utils.config as config
from pathlib import Path
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Export the Spark-related environment variables in one call. This must run
# before the SparkSession is created so the settings are visible to it.
# NOTE(review): PYSPARK_DRIVER_PYTHON / PYSPARK_DRIVER_PYTHON_OPTS look like
# settings for the `pyspark` launcher script; presumably they are not needed
# when the session is created from an already-running kernel -- confirm.
os.environ.update({
    'SPARK_HOME': config.APP,
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python3',
})

In [3]:
# Start the notebook's Spark session, or reuse one that is already active
spark = (
    SparkSession.builder
    .appName("YelpDataLoader")
    .getOrCreate()
)

In [4]:
# Read the raw Yelp JSON datasets into DataFrames; both paths come from
# the project config module.
business = spark.read.format("json").load(config.BUSINESS_DATA_PATH)
review = spark.read.format("json").load(config.REVIEW_DATA_PATH)

In [5]:
# Print the business DataFrame's column names as a quick check of which
# fields are available before filtering and selecting.
print(business.columns)

['address', 'attributes', 'business_id', 'categories', 'city', 'hours', 'is_open', 'latitude', 'longitude', 'name', 'postal_code', 'review_count', 'stars', 'state']


In [6]:
# Print the review DataFrame's column names as a quick check of which
# fields are available before selecting and joining.
print(review.columns)

['business_id', 'cool', 'date', 'funny', 'review_id', 'stars', 'text', 'useful', 'user_id']


In [7]:
# Keep only businesses located in Philadelphia whose 'categories' string
# contains the substring 'Restaurant' (a case-sensitive match).
business_restaurants = business.where(
    col("categories").contains("Restaurant") & (col("city") == "Philadelphia")
)

In [8]:
# Keep only the business columns needed downstream. 'stars' is renamed to
# 'business_stars' because the review data has its own 'stars' column.
business_selected = business_restaurants.select(
    "business_id", "name", "city", "state", "categories", "stars", "review_count"
).withColumnRenamed("stars", "business_stars")

# Same for reviews: select the needed columns and rename 'stars' to
# 'review_stars' so the two ratings stay distinguishable after the join.
review_selected = review.select(
    "business_id", "review_id", "user_id", "stars", "text", "date", "useful", "funny", "cool"
).withColumnRenamed("stars", "review_stars")

# Left join on business_id: every selected restaurant is kept, paired with
# each of its reviews. The result is then repartitioned to a single
# partition to reduce the number of output files (the count can be tuned).
yelp_restaurant = business_selected.join(
    review_selected, on="business_id", how="left"
).repartition(1)

# Save as JSON, replacing any previous output at this path.
# No "header" option is set: it is a CSV option and has no effect on the
# JSON writer.
yelp_restaurant.write.mode("overwrite").json(config.PHILADELPHIA)

In [9]:
# Read the saved Philadelphia restaurant/review data back from disk to verify
# the write. It is bound to its own name rather than `review`, so the raw
# review DataFrame is not shadowed and the earlier select/join cell can still
# be re-run.
yelp_restaurant_saved = spark.read.json(config.PHILADELPHIA)

print(yelp_restaurant_saved.columns)

['business_id', 'business_stars', 'categories', 'city', 'cool', 'date', 'funny', 'name', 'review_count', 'review_id', 'review_stars', 'state', 'text', 'useful', 'user_id']


In [10]:
# Read the Yelp user JSON dataset (path from the project config module)
user = spark.read.format("json").load(config.USER)

# List the available user fields
print(user.columns)

['average_stars', 'compliment_cool', 'compliment_cute', 'compliment_funny', 'compliment_hot', 'compliment_list', 'compliment_more', 'compliment_note', 'compliment_photos', 'compliment_plain', 'compliment_profile', 'compliment_writer', 'cool', 'elite', 'fans', 'friends', 'funny', 'name', 'review_count', 'useful', 'user_id', 'yelping_since']


In [11]:
# Stop the Spark session now that all loading and writing is finished.
spark.stop()