In [1]:
from pyspark.sql import SparkSession
import utils.config as config


# Step 1: Obtain the Spark session used by every later cell.
# getOrCreate() reuses an already-active session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("BusinessSimilarityModel")
    .getOrCreate()
)

In [2]:
# Step 2: Load the input datasets (both are read as JSON).
# `df` is read from config.PHILADELPHIA; it is expected to include the
# 'name' and 'text' columns. Later cells refer to it as `df` (not `review_df`).
# `user` is read from config.USER.
df = spark.read.json(config.PHILADELPHIA)
user = spark.read.json(config.USER)

In [3]:
# Print the column names and types Spark inferred for `df` (no explicit schema was supplied on read).
df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- business_stars: double (nullable = true)
 |-- categories: string (nullable = true)
 |-- city: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_stars: double (nullable = true)
 |-- state: string (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [4]:
# Print the column names and types Spark inferred for `user`.
user.printSchema()

root
 |-- average_stars: double (nullable = true)
 |-- compliment_cool: long (nullable = true)
 |-- compliment_cute: long (nullable = true)
 |-- compliment_funny: long (nullable = true)
 |-- compliment_hot: long (nullable = true)
 |-- compliment_list: long (nullable = true)
 |-- compliment_more: long (nullable = true)
 |-- compliment_note: long (nullable = true)
 |-- compliment_photos: long (nullable = true)
 |-- compliment_plain: long (nullable = true)
 |-- compliment_profile: long (nullable = true)
 |-- compliment_writer: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- elite: string (nullable = true)
 |-- fans: long (nullable = true)
 |-- friends: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- yelping_since: string (nullable = true)



In [5]:
# Preview the first 5 rows of `df`; long string values are abbreviated in the printed table.
df.show(5)

+--------------------+--------------+--------------------+------------+----+-------------------+-----+--------------------+------------+--------------------+------------+-----+--------------------+------+--------------------+
|         business_id|business_stars|          categories|        city|cool|               date|funny|                name|review_count|           review_id|review_stars|state|                text|useful|             user_id|
+--------------------+--------------+--------------------+------------+----+-------------------+-----+--------------------+------------+--------------------+------------+-----+--------------------+------+--------------------+
|-0eUa8TsXFFy0FCxH...|           4.0|Caterers, Sandwic...|Philadelphia|   0|2017-02-09 19:28:57|    0|Waterfront Gourme...|          26|diZL4qZJqLM034BJD...|         4.0|   PA|Sandwiches had go...|     0|sqkiFAnk4gmL1LYmZ...|
|-0eUa8TsXFFy0FCxH...|           4.0|Caterers, Sandwic...|Philadelphia|   0|2016-07-26 03:08:22|

In [6]:
# Summary table (count and other statistics) for every column of `df`,
# including string columns such as `text` and the id columns.
df.describe().show()

+-------+--------------------+------------------+--------------------+------------+------------------+-------------------+------------------+--------------------+-----------------+--------------------+------------------+------+----------------------+------------------+--------------------+
|summary|         business_id|    business_stars|          categories|        city|              cool|               date|             funny|                name|     review_count|           review_id|      review_stars| state|                  text|            useful|             user_id|
+-------+--------------------+------------------+--------------------+------------+------------------+-------------------+------------------+--------------------+-----------------+--------------------+------------------+------+----------------------+------------------+--------------------+
|  count|              687307|            687307|              687307|      687307|            687307|             687307|     

In [7]:
from pyspark.sql.functions import col, sum

# Null audit: produce a single row with one column per input column, each
# holding the number of null values found in that column.
# NOTE(review): the import above rebinds `sum` to Spark's aggregate function,
# so Python's built-in `sum` is shadowed in every cell run after this one.
# Consider `from pyspark.sql import functions as F` once later cells are checked.
df.select(
    *(
        sum(col(column_name).isNull().cast("int")).alias(column_name)
        for column_name in df.columns
    )
).show()

+-----------+--------------+----------+----+----+----+-----+----+------------+---------+------------+-----+----+------+-------+
|business_id|business_stars|categories|city|cool|date|funny|name|review_count|review_id|review_stars|state|text|useful|user_id|
+-----------+--------------+----------+----+----+----+-----+----+------------+---------+------------+-----+----+------+-------+
|          0|             0|         0|   0|   0|   0|    0|   0|           0|        0|           0|    0|   0|     0|      0|
+-----------+--------------+----------+----+----+----+-----+----+------------+---------+------------+-----+----+------+-------+



In [8]:
# Rating distribution: number of reviews per `review_stars` value, lowest rating first.
(
    df.groupBy("review_stars")
    .count()
    .orderBy("review_stars")
    .show()
)

+------------+------+
|review_stars| count|
+------------+------+
|         1.0| 66626|
|         2.0| 57480|
|         3.0| 91706|
|         4.0|194373|
|         5.0|277122|
+------------+------+



In [9]:
from pyspark.sql.functions import explode, split

# Category frequency: split the comma-separated `categories` string into one
# row per category, count rows per category, and show the 20 most frequent.
(
    df.select(explode(split("categories", ", ")).alias("category"))
    .groupBy("category")
    .count()
    .orderBy("count", ascending=False)
    .show(20)
)


+--------------------+------+
|            category| count|
+--------------------+------+
|         Restaurants|687289|
|                Food|220033|
|           Nightlife|211502|
|                Bars|204367|
|      American (New)|153715|
|  Breakfast & Brunch|129387|
|American (Traditi...|114673|
|          Sandwiches|112915|
|             Italian| 82982|
|               Pizza| 68523|
|        Coffee & Tea| 67688|
|             Chinese| 56947|
|             Seafood| 53422|
|      Specialty Food| 44620|
|       Cocktail Bars| 42346|
|             Mexican| 42210|
|             Burgers| 41746|
|               Cafes| 41381|
|        Asian Fusion| 39592|
|                Beer| 39244|
+--------------------+------+
only showing top 20 rows



In [10]:
from pyspark.sql.functions import year, month

# Review volume over time: extract the year from the `date` column and count
# reviews per year in chronological order.
(
    df.select(year("date").alias("year"))
    .groupBy("year")
    .count()
    .orderBy("year")
    .show()
)


+----+-----+
|year|count|
+----+-----+
|2005|  101|
|2006|  773|
|2007| 4115|
|2008|10912|
|2009|18844|
|2010|29391|
|2011|39371|
|2012|44252|
|2013|51597|
|2014|60250|
|2015|68208|
|2016|68748|
|2017|71947|
|2018|73190|
|2019|72182|
|2020|34860|
|2021|36717|
|2022| 1849|
+----+-----+



In [11]:
# Most active reviewers: number of reviews per `user_id`, top 10 by count.
(
    df.groupBy("user_id")
    .count()
    .orderBy("count", ascending=False)
    .show(10)
)

+--------------------+-----+
|             user_id|count|
+--------------------+-----+
|ET8n-r7glWYqZhuR6...|  929|
|_BcWyKQL16ndpBdgg...|  787|
|bJ5FtCtZX3ZZacz2_...|  663|
|vFd8aBLg1kFcd0kCk...|  529|
|8EMU7d4pCkdqUnvlI...|  469|
|0DB3Irpf_ETVXu_Ou...|  453|
|ouODopBKF3AqfCkuQ...|  414|
|LnFIWZM_l__4t8Qxj...|  403|
|NNgQ8fV5ARQgHw-Ob...|  390|
|HxyLRaoH9PS09M6R3...|  386|
+--------------------+-----+
only showing top 10 rows

