In [60]:
# import the PySpark module
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.window import Window
from pyspark.sql import functions as F

In [61]:
# initialize the SparkContext
spark

In [62]:
# read the songs data from the CSV file
songs = spark.read.csv('../data/songs/songs.csv', header=True, inferSchema=True)

In [63]:
songs.show(10)

+---+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
| id|            track_id|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|
+---+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
|  0|5SuOikwiRyPMVoIQD...|         Gen Hoshino|              Comedy|              Comedy|        73|     230666|   False|       0.676| 0.461|  1|  -6.746|   0|      0.143|      0.0322|         1.01E-6|   0.358|  0.715| 87.917|           4

In [64]:
# read the songs data from the CSV file
users = spark.read.csv('../data/users/users.csv', header=True, inferSchema=True)

In [65]:
users.show(10)

+-------+-----------------+--------+-------------+----------+
|user_id|        user_name|user_age| user_country|created_at|
+-------+-----------------+--------+-------------+----------+
|      1|     Norma Fisher|      65|United States|2024-02-07|
|      2|   Jorge Sullivan|      28|United States|2024-11-28|
|      3|  Elizabeth Woods|      19|United States|2024-11-16|
|      4|     Susan Wagner|      45|United States|2024-06-14|
|      5| Peter Montgomery|      61|United States|2024-07-24|
|      6| Theodore Mcgrath|      58|United States|2024-12-12|
|      7|Stephanie Collins|      68|United States|2024-04-16|
|      8| Stephanie Sutton|      53|United States|2024-05-04|
|      9|   Brian Hamilton|      34|United States|2024-09-15|
|     10|       Susan Levy|      18|United States|2024-09-08|
+-------+-----------------+--------+-------------+----------+
only showing top 10 rows



In [66]:
# read the streams data from the CSV file
streams1 = spark.read.csv('../data/streams/streams1.csv', header=True, inferSchema=True)
streams2 = spark.read.csv('../data/streams/streams2.csv', header=True, inferSchema=True)
streams3 = spark.read.csv('../data/streams/streams3.csv', header=True, inferSchema=True)

In [67]:
# concat the streams data
streams = streams1.union(streams2).union(streams3)
streams.show(10)


+-------+--------------------+-------------------+
|user_id|            track_id|        listen_time|
+-------+--------------------+-------------------+
|  26213|4dBa8T7oDV9WvGr7k...|2024-06-25 17:43:13|
|   6937|4osgfFTICMkcGbbig...|2024-06-25 07:26:00|
|  21407|2LoQWx41KeqOrSFra...|2024-06-25 13:25:26|
|  47146|7cfG5lFeJWEgpSnub...|2024-06-25 18:17:50|
|  38594|6tilCYbheGMHo3Hw4...|2024-06-25 17:33:21|
|  14209|2QuOheWJqShIBIYC1...|2024-06-25 02:52:20|
|  26986|6qBSGvyUzqNQv8Xtn...|2024-06-25 22:32:51|
|   8173|1wXSL0SAzd7mX0LM8...|2024-06-25 11:59:10|
|  12950|0L7Nv6ToXLRAWId4e...|2024-06-25 17:54:30|
|   2898|7tnE9vy6FCRtbZql5...|2024-06-25 18:30:31|
+-------+--------------------+-------------------+
only showing top 10 rows



In [68]:
streams.describe().show()

+-------+------------------+--------------------+
|summary|           user_id|            track_id|
+-------+------------------+--------------------+
|  count|             34038|               34038|
|   mean|24934.808420001176|                NULL|
| stddev|14444.125256146368|                NULL|
|    min|                 3|0000vdREvCVMxbQTk...|
|    max|             49999|7zxpdh3EqMq2JCkOI...|
+-------+------------------+--------------------+



In [69]:
songs.describe().show()

+-------+-----------------+--------------------+------------------+----------------------------+--------------+------------------+--------------------+------------------+-----------------+--------------------+-----------------+------------------+----------------+------------------+-------------------+------------------+-----------------+-------------------+-----------------+------------------+------------------+
|summary|               id|            track_id|           artists|                  album_name|    track_name|        popularity|         duration_ms|          explicit|     danceability|              energy|              key|          loudness|            mode|       speechiness|       acousticness|  instrumentalness|         liveness|            valence|            tempo|    time_signature|       track_genre|
+-------+-----------------+--------------------+------------------+----------------------------+--------------+------------------+--------------------+-----------------

In [70]:
users.describe().show()

+-------+-----------------+--------------+------------------+-------------+
|summary|          user_id|     user_name|          user_age| user_country|
+-------+-----------------+--------------+------------------+-------------+
|  count|            50000|         50000|             50000|        50000|
|   mean|          25000.5|          NULL|          43.56998|         NULL|
| stddev|14433.90106658626|          NULL|14.996324902949087|         NULL|
|    min|                1|Aaron Alvarado|                18|    Australia|
|    max|            50000|   Zoe Walters|                69|United States|
+-------+-----------------+--------------+------------------+-------------+



In [71]:
# merging the users data with the songs data
song_users = songs.join(users, songs.id == users.user_id, 'inner')

In [72]:
song_users.describe().show()

+-------+------------------+--------------------+--------------------+-----------------+--------------+--------------------+--------------------+--------------------+-----------------+--------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+--------------+------------------+-------------+
|summary|                id|            track_id|             artists|       album_name|    track_name|          popularity|         duration_ms|            explicit|     danceability|              energy|               key|          loudness|              mode|        speechiness|      acousticness|   instrumentalness|           liveness|            valence|             tempo|    time_signature|       track_genre|           user_id|     user_name|          user_age| user_country|
+-------+------------------+

In [73]:
song_users.columns

['id',
 'track_id',
 'artists',
 'album_name',
 'track_name',
 'popularity',
 'duration_ms',
 'explicit',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'track_genre',
 'user_id',
 'user_name',
 'user_age',
 'user_country',
 'created_at']

In [74]:
listen_count = song_users.groupBy("track_genre", F.to_date("created_at").alias("date")) \
                 .agg(F.count("track_id").alias("listen_count"))
listen_count.show()

+-----------+----------+------------+
|track_genre|      date|listen_count|
+-----------+----------+------------+
|   acoustic|2024-05-20|           5|
|   acoustic|2024-08-13|           2|
|   afrobeat|2024-11-23|           2|
|   alt-rock|2024-08-26|           2|
|   alt-rock|2024-09-25|           3|
|   alt-rock|2024-06-03|           4|
|alternative|2024-10-31|           8|
|alternative|2024-09-04|           1|
|alternative|2024-02-11|           2|
|alternative|2024-01-18|           1|
|    ambient|2024-11-06|           7|
|    ambient|2024-08-08|           3|
|      anime|2024-11-02|           4|
|      anime|2024-04-09|           6|
|black-metal|2024-04-10|           3|
|  bluegrass|2024-03-14|           2|
|  bluegrass|2024-04-22|           1|
|    british|2024-10-11|           3|
|   cantopop|2024-02-26|           4|
|   cantopop|2024-04-11|           1|
+-----------+----------+------------+
only showing top 20 rows



In [75]:
unique_listeners = song_users.groupBy("track_genre", F.to_date("created_at").alias("date")) \
                     .agg(F.countDistinct("user_id").alias("unique_listeners"))
unique_listeners.show()

+-----------+----------+----------------+
|track_genre|      date|unique_listeners|
+-----------+----------+----------------+
|  bluegrass|2024-04-22|               1|
|      anime|2024-11-02|               4|
|     comedy|2024-03-19|               4|
|     comedy|2024-02-13|               1|
|   cantopop|2024-02-26|               4|
|       club|2024-02-03|               5|
|   alt-rock|2024-09-25|               3|
|   afrobeat|2024-11-23|               2|
|     comedy|2024-02-26|               5|
|      anime|2024-04-09|               6|
|   alt-rock|2024-06-03|               4|
|alternative|2024-10-31|               8|
|    ambient|2024-11-06|               7|
|   cantopop|2024-04-11|               1|
|   acoustic|2024-05-20|               5|
|      anime|2024-08-24|               2|
|      disco|2024-09-23|               4|
|   children|2024-02-03|               2|
|   cantopop|2024-01-06|               4|
|alternative|2024-01-18|               1|
+-----------+----------+----------

In [76]:
total_listening_time = song_users.groupBy("track_genre", F.to_date("created_at").alias("date")) \
                         .agg(F.sum("duration_ms").alias("total_listening_time"))
total_listening_time.show()

+-----------+----------+--------------------+
|track_genre|      date|total_listening_time|
+-----------+----------+--------------------+
|   acoustic|2024-05-20|           1096693.0|
|   acoustic|2024-08-13|            288280.0|
|   afrobeat|2024-11-23|            782559.0|
|   alt-rock|2024-08-26|            437950.0|
|   alt-rock|2024-09-25|            653786.0|
|   alt-rock|2024-06-03|            948265.0|
|alternative|2024-10-31|           1983925.0|
|alternative|2024-09-04|            170771.0|
|alternative|2024-02-11|            287121.0|
|alternative|2024-01-18|            234760.0|
|    ambient|2024-11-06|           1938263.0|
|    ambient|2024-08-08|            744525.0|
|      anime|2024-11-02|            600424.0|
|      anime|2024-04-09|           1315769.0|
|black-metal|2024-04-10|            565733.0|
|  bluegrass|2024-03-14|            419593.0|
|  bluegrass|2024-04-22|             73000.0|
|    british|2024-10-11|            611801.0|
|   cantopop|2024-02-26|          

In [77]:
avg_listening_time_per_user = total_listening_time.join(unique_listeners, ["track_genre", "date"]) \
                                                  .withColumn("avg_listening_time_per_user", 
                                                              F.col("total_listening_time") / F.col("unique_listeners"))
avg_listening_time_per_user.show()

+-----------+----------+--------------------+----------------+---------------------------+
|track_genre|      date|total_listening_time|unique_listeners|avg_listening_time_per_user|
+-----------+----------+--------------------+----------------+---------------------------+
|  bluegrass|2024-04-22|             73000.0|               1|                    73000.0|
|      anime|2024-11-02|            600424.0|               4|                   150106.0|
|     comedy|2024-03-19|            959017.0|               4|                  239754.25|
|     comedy|2024-02-13|            415320.0|               1|                   415320.0|
|   cantopop|2024-02-26|            917196.0|               4|                   229299.0|
|       club|2024-02-03|            700923.0|               5|                   140184.6|
|   alt-rock|2024-09-25|            653786.0|               3|         217928.66666666666|
|   afrobeat|2024-11-23|            782559.0|               2|                   391279.5|

In [78]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Ensure `track_genre` is correctly cast as a string
song_users = song_users.withColumn("track_genre", F.col("track_genre").cast("string"))

# Compute listen count per song per genre per day
song_listen_count = song_users.groupBy(
    "track_genre", 
    F.to_date("created_at").alias("date"), 
    "track_name"
).agg(F.count("track_id").alias("listen_count"))

# Define a ranking window for top songs in each genre per day
song_rank_window = Window.partitionBy("track_genre", "date").orderBy(F.desc("listen_count"))

# Rank songs and filter for the top 3
top_songs_per_genre = song_listen_count.withColumn("rank", F.rank().over(song_rank_window)) \
                                       .filter(F.col("rank") <= 3)

top_songs_per_genre.show()

+-----------+----------+--------------------+------------+----+
|track_genre|      date|          track_name|listen_count|rank|
+-----------+----------+--------------------+------------+----+
|      0.576|2024-10-30|          Snuff Crew|           1|   1|
|    105.188|2024-01-21|"12 Variations in...|           1|   1|
|    114.211|2024-05-18|"12 Variations in...|           1|   1|
|     117.11|2024-10-03|"12 Variations in...|           1|   1|
|    125.262|2024-06-24|"12 Variations in...|           1|   1|
|    148.759|2024-10-19|"12 Variations in...|           1|   1|
|    151.539|2024-11-10|"12 Variations in...|           1|   1|
|          3|2024-02-24|"Christmas Orator...|           1|   1|
|          3|2024-03-22|"6 Variations on ...|           1|   1|
|          3|2024-06-05|"Christmas Orator...|           1|   1|
|          3|2024-11-16|"Christmas Orator...|           1|   1|
|          3|2024-11-22|"7 Variations on ...|           1|   1|
|          3|2024-12-30|"Sonata No. 14 "

In [79]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Ensure track_genre is correctly cast as a string
song_users = song_users.withColumn("track_genre", F.col("track_genre").cast("string"))

# Step 1: Compute listen count per genre per day
genre_listen_count = song_users.groupBy(
    F.to_date("created_at").alias("date"),
    "track_genre"
).agg(F.count("track_id").alias("listen_count"))

# Step 2: Define a ranking window for top genres per day
genre_rank_window = Window.partitionBy("date").orderBy(F.desc("listen_count"))

# Step 3: Apply ranking and filter for the top 5 genres per day
top_genres_per_day = genre_listen_count.withColumn("rank", F.rank().over(genre_rank_window)) \
                                       .filter(F.col("rank") <= 5)

top_genres_per_day.describe().show()


+-------+-----------+------------------+------------------+
|summary|track_genre|      listen_count|              rank|
+-------+-----------+------------------+------------------+
|  count|       2772|              2772|              2772|
|   mean|       NULL| 5.091991341991342| 2.884199134199134|
| stddev|       NULL|1.0386855307680234|1.4243624343221657|
|    min|   acoustic|                 4|                 1|
|    max|  hardstyle|                10|                 5|
+-------+-----------+------------------+------------------+



In [80]:
final_kpi_table = listen_count.join(unique_listeners, ["track_genre", "date"]) \
                              .join(total_listening_time, ["track_genre", "date"]) \
                              .join(avg_listening_time_per_user, ["track_genre", "date"]) \
                              .join(top_songs_per_genre, ["track_genre", "date"], "left") \
                              .join(top_genres_per_day, ["track_genre", "date"], "left")
final_kpi_table.show()

+-----------+----------+------------+----------------+--------------------+--------------------+----------------+---------------------------+--------------------+------------+----+------------+----+
|track_genre|      date|listen_count|unique_listeners|total_listening_time|total_listening_time|unique_listeners|avg_listening_time_per_user|          track_name|listen_count|rank|listen_count|rank|
+-----------+----------+------------+----------------+--------------------+--------------------+----------------+---------------------------+--------------------+------------+----+------------+----+
|      0.576|2024-10-30|           1|               1|                NULL|                NULL|               1|                       NULL|          Snuff Crew|           1|   1|        NULL|NULL|
|    105.188|2024-01-21|           1|               1|                NULL|                NULL|               1|                       NULL|"12 Variations in...|           1|   1|        NULL|NULL|
|    