As an analyst at Amazon, you are responsible for ensuring the integrity of product ratings on the platform. Fake ratings can distort the perception of product quality and mislead customers. To maintain trust and reliability, you need to identify potential fake ratings that deviate significantly from the average ratings for each product.
Write an SQL query to identify the single rating that is farthest (in absolute value) from the average rating for each product. Display the rating details in ascending order of rating id.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.types import DecimalType
from pyspark.sql.functions import col, avg, abs


# Start (or reuse) the Spark session used by this notebook
spark = (
    SparkSession.builder
    .appName("Fake Ratings Table")
    .getOrCreate()
)

# Column layout of the ratings table: three integer ids plus a float rating,
# every column declared nullable
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("rating_id", IntegerType()),
        ("product_id", IntegerType()),
        ("user_id", IntegerType()),
        ("rating", FloatType()),
    ]
])

# Sample rows, each laid out as (rating_id, product_id, user_id, rating)
data = [
    (1, 101, 1001, 4.5),
    (2, 101, 1002, 4.8),
    (3, 101, 1003, 4.9),
    (4, 101, 1004, 5.0),
    (5, 101, 1005, 3.2),
    (6, 102, 1006, 4.7),
    (7, 102, 1007, 4.0),
    (8, 102, 1008, 4.1),
    (9, 102, 1009, 3.8),
    (10, 102, 1010, 3.9),
]

# Build the DataFrame from the sample rows and the explicit schema
fake_ratings = spark.createDataFrame(data, schema)

# Print the resulting table
fake_ratings.show()


+---------+----------+-------+------+
|rating_id|product_id|user_id|rating|
+---------+----------+-------+------+
|        1|       101|   1001|   4.5|
|        2|       101|   1002|   4.8|
|        3|       101|   1003|   4.9|
|        4|       101|   1004|   5.0|
|        5|       101|   1005|   3.2|
|        6|       102|   1006|   4.7|
|        7|       102|   1007|   4.0|
|        8|       102|   1008|   4.1|
|        9|       102|   1009|   3.8|
|       10|       102|   1010|   3.9|
+---------+----------+-------+------+



In [0]:

# Window over all ratings of one product; used to compute the per-product
# average rating.
window_spec = Window.partitionBy(col("product_id"))

# Window that ranks the ratings of one product from the largest deviation to
# the smallest. rating_id is a secondary sort key so that row_number() picks
# the same row on every run when two ratings deviate by the same amount
# (the 4-decimal rounding of diff_rating makes such ties more likely).
window_spec_dis = Window.partitionBy(col("product_id")).orderBy(
    col("diff_rating").desc(), col("rating_id").asc()
)

# For each product, keep the single rating whose abs(rating - avg_rating) is
# the largest, i.e. the biggest discrepancy from that product's average.
# `abs` here is the column function imported from pyspark.sql.functions,
# not the Python builtin.
(
    fake_ratings
    .withColumn("avg_rating", avg(col("rating")).over(window_spec))
    .withColumn(
        "diff_rating",
        abs(col("rating") - col("avg_rating")).cast(DecimalType(10, 4)),
    )
    .withColumn("discprency_rating", row_number().over(window_spec_dis))
    .filter(col("discprency_rating") == 1)
    .select("rating_id", "product_id", "user_id", "rating")
    # The task requires ascending rating_id; without an explicit sort the
    # row order of the result is not guaranteed.
    .orderBy(col("rating_id").asc())
    .display()
)



rating_id,product_id,user_id,rating
5,101,1005,3.2
6,102,1006,4.7


In [0]:
# Window that groups rows by rating_id. This rebinds window_spec, which an
# earlier cell defined as a partition over product_id.
# NOTE(review): rating_id is unique per row in the sample data above, so each
# partition would hold a single row - confirm this is intended.
window_spec=Window.partitionBy(col("rating_id"))