In [0]:
from pyspark.sql.functions import col, when, to_timestamp, year

# Fully qualified name (catalog.schema.table) of the source table.
RAW_TABLE = "raw_youtube.default.raw_youtube_recommendation_dataset"

# Read the source table through the notebook's `spark` session.
df_raw = spark.table(RAW_TABLE)

# Quick sanity check: preview the first 5 rows, then print column names and types.
df_raw.show(n=5)
df_raw.printSchema()


+--------------------+------------------+-------------------+-----------+----------+----------+-------------+--------------+--------+----------+-------+---------------+--------------------+-----------------------+----------------+--------------+
|               Title|     channel_title|       published_at|category_id|view_count|like_count|comment_count|favorite_count|duration|definition|caption|engagement_rate|likes_to_views_ratio|comments_to_views_ratio|duration_seconds|video_age_days|
+--------------------+------------------+-------------------+-----------+----------+----------+-------------+--------------+--------+----------+-------+---------------+--------------------+-----------------------+----------------+--------------+
|LA PERVERSA X LA ...|AlofokeMusicSounds|2025-11-16 15:34:55|         10|   1405647|    140463|         9063|             0| PT1M51S|        hd|  false|    0.106375138|       0.09992757789|         0.006447560129|             111|             1|
|Moana | Officia

In [0]:
from pyspark.sql.functions import col, when, to_timestamp, year

# Explicit Spark type for each count / ratio column carried over from df_raw.
NUMERIC_CASTS = {
    "view_count": "long",
    "like_count": "long",
    "comment_count": "long",
    "favorite_count": "long",
    "engagement_rate": "double",
    "likes_to_views_ratio": "double",
    "comments_to_views_ratio": "double",
    "duration_seconds": "long",
    "video_age_days": "long",
}

# Exclusive upper bounds of the buckets used for the derived label columns.
LOW_VIEWS_BELOW = 10_000
MEDIUM_VIEWS_BELOW = 100_000
LAST_30_DAYS_BELOW = 30
LAST_YEAR_DAYS_BELOW = 365

# Step 1: lowercase the "Title" column name, cast the numeric columns and
# parse published_at into a timestamp. withColumn on an existing name replaces
# the column in place, so the column order of df_raw is preserved.
df_typed = df_raw.withColumnRenamed("Title", "title")
for column_name, spark_type in NUMERIC_CASTS.items():
    df_typed = df_typed.withColumn(column_name, col(column_name).cast(spark_type))
df_typed = df_typed.withColumn("published_at", to_timestamp(col("published_at")))

# Step 2: derived columns.
views = col("view_count")
age_days = col("video_age_days")

# A comparison against NULL evaluates to NULL, so a missing value matches none
# of the threshold branches. Ending each chain with an isNotNull() branch and
# no otherwise() leaves the label NULL for missing input instead of silently
# placing it in the top bucket ("High" / "Older").
popularity_level = (
    when(views < LOW_VIEWS_BELOW, "Low")
    .when(views < MEDIUM_VIEWS_BELOW, "Medium")
    .when(views.isNotNull(), "High")
)
video_age_group = (
    when(age_days < LAST_30_DAYS_BELOW, "Last 30 days")
    .when(age_days < LAST_YEAR_DAYS_BELOW, "Last year")
    .when(age_days.isNotNull(), "Older")
)

df_clean = (
    df_typed
    # calendar year extracted from the parsed publish timestamp
    .withColumn("publish_year", year(col("published_at")))
    # Low / Medium / High bucket based on view_count
    .withColumn("popularity_level", popularity_level)
    # Last 30 days / Last year / Older bucket based on video_age_days
    .withColumn("video_age_group", video_age_group)
)

# Preview the first 5 rows and confirm the resulting column types.
df_clean.show(5)
df_clean.printSchema()


+--------------------+------------------+-------------------+-----------+----------+----------+-------------+--------------+--------+----------+-------+---------------+--------------------+-----------------------+----------------+--------------+------------+----------------+---------------+
|               title|     channel_title|       published_at|category_id|view_count|like_count|comment_count|favorite_count|duration|definition|caption|engagement_rate|likes_to_views_ratio|comments_to_views_ratio|duration_seconds|video_age_days|publish_year|popularity_level|video_age_group|
+--------------------+------------------+-------------------+-----------+----------+----------+-------------+--------------+--------+----------+-------+---------------+--------------------+-----------------------+----------------+--------------+------------+----------------+---------------+
|LA PERVERSA X LA ...|AlofokeMusicSounds|2025-11-16 15:34:55|         10|   1405647|    140463|         9063|             0|

In [0]:
# Fully qualified name (catalog.schema.table) of the output table.
CLEAN_TABLE = "raw_youtube.default.clean_youtube_recommendation_dataset"

# Persist df_clean as a table; "overwrite" mode replaces any existing contents.
(
    df_clean.write
    .mode("overwrite")
    .saveAsTable(CLEAN_TABLE)
)
