In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType
# Build (or reuse) the Spark session. The MySQL connector jar is placed on the
# driver/executor classpath and in spark.jars; no JDBC read is visible in this
# part of the notebook, so it is presumably needed elsewhere.
spark = (
    SparkSession.builder.appName('PY-Spark INIT')
    .config('spark.driver.extraClassPath', 'mysql-connector-java-8.0.16.jar')
    .config('spark.executor.extraClassPath', 'mysql-connector-java-8.0.16.jar')
    .config('spark.jars', 'mysql-connector-java-8.0.16.jar')
    .enableHiveSupport()
    .getOrCreate()
)

# The word list is read as text, so each line lands in the "value" column.
curse_words = spark.read.text("inappropriate_words.txt")
curse_words.show()

# Explicit schema for the review records; every field is nullable.
schema = StructType([
    StructField("publishedAt", StringType(), True),
    StructField("rating", DoubleType(), True),
    StructField("restaurantId", IntegerType(), True),
    StructField("reviewId", IntegerType(), True),
    StructField("text", StringType(), True),
])
df = spark.read.json("reviews.jsonl", schema=schema)

# Pull the word list to the driver and normalise it:
#   * strip + lower-case, because reviews are lower-cased before matching;
#   * drop blank lines, because "" is a substring of every token and would
#     cause every single word of every review to be masked.
inappropriate_words_list = [
    row["value"].strip().lower()
    for row in curse_words.select("value").collect()
    if row["value"] and row["value"].strip()
]
print(inappropriate_words_list)
df.show()


In [None]:
from pyspark.sql.functions import col, udf

def filter_inappropriate(text, banned_words=None):
    """Mask every whitespace-separated token that contains a banned word.

    Matching is case-insensitive and substring-based: a token is masked when
    any banned word occurs anywhere inside it (so punctuation attached to a
    word does not prevent a match, but innocent words that merely contain a
    banned word are masked too).

    Args:
        text: Review text. May be None, which is how Spark hands SQL NULL to
            a Python UDF.
        banned_words: Optional iterable of banned words. When omitted, the
            notebook-level ``inappropriate_words_list`` is used.

    Returns:
        The text with each offending token replaced by "****" and all tokens
        re-joined with single spaces, or None when ``text`` is None.
    """
    if text is None:
        # NULL in -> NULL out, instead of AttributeError on None.split().
        return None
    if banned_words is None:
        banned_words = inappropriate_words_list

    # Normalise once per call: lower-case to match the lower-cased tokens and
    # drop blank entries, since "" is a substring of every token.
    needles = [w.strip().lower() for w in banned_words if w and w.strip()]

    masked_tokens = []
    for token in text.split():
        lowered = token.lower()
        if any(needle in lowered for needle in needles):
            masked_tokens.append("****")
        else:
            masked_tokens.append(token)
    return " ".join(masked_tokens)

def calculate_proportion(text):
    """Return the fraction of tokens in ``text`` that are masked placeholders.

    A token counts as masked when it is exactly "****", the placeholder that
    ``filter_inappropriate`` substitutes for an offending word. Counting whole
    tokens (rather than individual "*" characters) keeps the result in [0, 1].

    Args:
        text: Already-filtered review text; may be None or empty.

    Returns:
        A float in [0.0, 1.0]; 0.0 when there are no tokens at all.
    """
    if not text:
        return 0.0
    tokens = text.split()
    if not tokens:
        # Whitespace-only text is truthy but has no words; avoid dividing by 0.
        return 0.0
    masked_count = sum(1 for token in tokens if token == "****")
    return masked_count / len(tokens)

# Wrap the plain-Python helpers as Spark UDFs so they can run on columns.
filter_inappropriate_udf = udf(filter_inappropriate, StringType())
# The proportion is numeric, so the UDF is declared DoubleType. float() makes
# the Python value match that declaration even when the helper returns an int
# (a regular Python UDF yields NULL when the value's type does not match the
# declared return type).
calculate_proportion_udf = udf(
    lambda text: float(calculate_proportion(text)), DoubleType()
)

# Mask the text, score it, and keep only reviews below the 0.5 threshold.
review_data = (
    df.withColumn("filtered_text", filter_inappropriate_udf(col("text")))
    # F.round is Spark's column function; Python's builtin round() cannot
    # operate on a Column.
    .withColumn("inap_pct", F.round(calculate_proportion_udf(col("filtered_text")), 4))
    .filter(col("inap_pct") < 0.5)
)
review_data.show(500)

In [None]:
from pyspark.sql.window import Window
# Parse the timestamp string once and reuse the expression for every derived
# date column. The trailing 'Z' is matched as a literal character.
published_ts = F.to_timestamp("publishedAt", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")

# NOTE(review): the aggregates below are named "... Monthly Rating" but this
# window groups by restaurant and *year*. Left unchanged because the intended
# granularity cannot be determined from this notebook; partition by "month"
# instead if per-month statistics are what is wanted.
window_spec = Window.partitionBy("restaurantId", "year")

final_data = (
    review_data
    .withColumn("year", F.date_format(published_ts, "yyyy"))
    .withColumn("date", F.to_date(published_ts))
    .withColumn("month", F.date_format(published_ts, "yyyy-MM"))
    # Use the Spark column functions explicitly: the bare names avg/max/min/
    # round are either undefined here or resolve to Python builtins.
    .withColumn("Avg Monthly Rating", F.round(F.avg("rating").over(window_spec), 4))
    .withColumn("Max Monthly Rating", F.max("rating").over(window_spec))
    .withColumn("Min Monthly Rating", F.min("rating").over(window_spec))
)
final_data.drop("publishedAt", "date", "month").show(500)

# Single output partition; overwrite so that re-running the notebook does not
# fail because the output path already exists.
final_data.repartition(1).write.mode("overwrite").json("output1.json")