In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Define Spark session (if needed)
spark = SparkSession.builder.appName("PySparkTables").getOrCreate()

# Define schemas
movies_schema = StructType([
    StructField("movie_id", IntegerType(), True),
    StructField("title", StringType(), True)
])

users_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("name", StringType(), True)
])

movie_rating_schema = StructType([
    StructField("movie_id", IntegerType(), True),
    StructField("user_id", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("created_at", StringType(), True)  # Using StringType to avoid date parsing issues
])

# Create DataFrames
movies_df = spark.createDataFrame([
    (1, "Avengers"),
    (2, "Frozen 2"),
    (3, "Joker")
], schema=movies_schema)

users_df = spark.createDataFrame([
    (1, "Daniel"),
    (2, "Monica"),
    (3, "Maria"),
    (4, "James")
], schema=users_schema)

movie_rating_df = spark.createDataFrame([
    (1, 1, 3, "2020-01-12"),
    (1, 2, 4, "2020-02-11"),
    (1, 3, 2, "2020-02-12"),
    (1, 4, 1, "2020-01-01"),
    (2, 1, 5, "2020-02-17"),
    (2, 2, 2, "2020-02-01"),
    (2, 3, 2, "2020-03-01"),
    (3, 1, 3, "2020-02-22"),
    (3, 2, 4, "2020-02-25")
], schema=movie_rating_schema)

# Convert 'created_at' column to DateType
movie_rating_df = movie_rating_df.withColumn("created_at", movie_rating_df.created_at.cast(DateType()))

# Create temporary views for SQL queries
movies_df.createOrReplaceTempView("Movies")
users_df.createOrReplaceTempView("Users")
movie_rating_df.createOrReplaceTempView("MovieRating")

print("Tables and views created successfully.")


Tables and views created successfully.


In [27]:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [40]:
spark.sql("""select movie_id from MovieRating group by movie_id having AVG(rating) = (select max(avg_rating) from (select movie_id, AVG(rating) as avg_rating
from MovieRating 
where DATE_FORMAT(created_at, 'YYYY-MM') = '2020-02'
group by movie_id))""").show()

+--------+
|movie_id|
+--------+
|       3|
+--------+



In [21]:
spark.sql("""
with highestnoofrating as (
    select name from Users where user_id in (
    select user_id from MovieRating group by user_id  
    having count(*) = (select max(count1) from (select count(*) as count1 from MovieRating group by user_id))))


""").show()

+---------+
|min(name)|
+---------+
|   Daniel|
+---------+

