In [1]:
# import the PySpark module
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.window import Window
from pyspark.sql import functions as F

In [2]:
# initialize the SparkContext
spark

In [67]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [3]:
# read the songs data from the CSV file
songs = spark.read.csv('../data/songs/songs.csv', header=True, inferSchema=True)

In [4]:
songs.show(10)

+---+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
| id|            track_id|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|
+---+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
|  0|5SuOikwiRyPMVoIQD...|         Gen Hoshino|              Comedy|              Comedy|        73|     230666|   False|       0.676| 0.461|  1|  -6.746|   0|      0.143|      0.0322|         1.01E-6|   0.358|  0.715| 87.917|           4

In [5]:
# read the songs data from the CSV file
users = spark.read.csv('../data/users/users.csv', header=True, inferSchema=True)

In [6]:
users.show(10)

+-------+-----------------+--------+-------------+----------+
|user_id|        user_name|user_age| user_country|created_at|
+-------+-----------------+--------+-------------+----------+
|      1|     Norma Fisher|      65|United States|2024-02-07|
|      2|   Jorge Sullivan|      28|United States|2024-11-28|
|      3|  Elizabeth Woods|      19|United States|2024-11-16|
|      4|     Susan Wagner|      45|United States|2024-06-14|
|      5| Peter Montgomery|      61|United States|2024-07-24|
|      6| Theodore Mcgrath|      58|United States|2024-12-12|
|      7|Stephanie Collins|      68|United States|2024-04-16|
|      8| Stephanie Sutton|      53|United States|2024-05-04|
|      9|   Brian Hamilton|      34|United States|2024-09-15|
|     10|       Susan Levy|      18|United States|2024-09-08|
+-------+-----------------+--------+-------------+----------+
only showing top 10 rows



In [7]:
# read the streams data from the CSV file
streams1 = spark.read.csv('../data/streams/streams1.csv', header=True, inferSchema=True)
streams2 = spark.read.csv('../data/streams/streams2.csv', header=True, inferSchema=True)
streams3 = spark.read.csv('../data/streams/streams3.csv', header=True, inferSchema=True)

In [8]:
# concat the streams data
streams = streams1.union(streams2).union(streams3)
streams.show(10)


+-------+--------------------+-------------------+
|user_id|            track_id|        listen_time|
+-------+--------------------+-------------------+
|  26213|4dBa8T7oDV9WvGr7k...|2024-06-25 17:43:13|
|   6937|4osgfFTICMkcGbbig...|2024-06-25 07:26:00|
|  21407|2LoQWx41KeqOrSFra...|2024-06-25 13:25:26|
|  47146|7cfG5lFeJWEgpSnub...|2024-06-25 18:17:50|
|  38594|6tilCYbheGMHo3Hw4...|2024-06-25 17:33:21|
|  14209|2QuOheWJqShIBIYC1...|2024-06-25 02:52:20|
|  26986|6qBSGvyUzqNQv8Xtn...|2024-06-25 22:32:51|
|   8173|1wXSL0SAzd7mX0LM8...|2024-06-25 11:59:10|
|  12950|0L7Nv6ToXLRAWId4e...|2024-06-25 17:54:30|
|   2898|7tnE9vy6FCRtbZql5...|2024-06-25 18:30:31|
+-------+--------------------+-------------------+
only showing top 10 rows



In [9]:
# rename the first track_id column to avoid ambiguity
streams = streams.withColumnRenamed("track_id", "track_id_1")

In [10]:
streams.describe().show()

+-------+------------------+--------------------+
|summary|           user_id|          track_id_1|
+-------+------------------+--------------------+
|  count|             34038|               34038|
|   mean|24934.808420001176|                NULL|
| stddev|14444.125256146368|                NULL|
|    min|                 3|0000vdREvCVMxbQTk...|
|    max|             49999|7zxpdh3EqMq2JCkOI...|
+-------+------------------+--------------------+



In [11]:
songs.describe().show()

+-------+-----------------+--------------------+------------------+----------------------------+--------------+------------------+--------------------+------------------+-----------------+--------------------+-----------------+------------------+----------------+------------------+-------------------+------------------+-----------------+-------------------+-----------------+------------------+------------------+
|summary|               id|            track_id|           artists|                  album_name|    track_name|        popularity|         duration_ms|          explicit|     danceability|              energy|              key|          loudness|            mode|       speechiness|       acousticness|  instrumentalness|         liveness|            valence|            tempo|    time_signature|       track_genre|
+-------+-----------------+--------------------+------------------+----------------------------+--------------+------------------+--------------------+-----------------

In [12]:
users.describe().show()

+-------+-----------------+--------------+------------------+-------------+
|summary|          user_id|     user_name|          user_age| user_country|
+-------+-----------------+--------------+------------------+-------------+
|  count|            50000|         50000|             50000|        50000|
|   mean|          25000.5|          NULL|          43.56998|         NULL|
| stddev|14433.90106658626|          NULL|14.996324902949087|         NULL|
|    min|                1|Aaron Alvarado|                18|    Australia|
|    max|            50000|   Zoe Walters|                69|United States|
+-------+-----------------+--------------+------------------+-------------+



In [13]:
song_users = songs.join(streams, songs.track_id == streams.track_id_1, 'inner') \
                  .join(users, streams.user_id == users.user_id, 'inner') \
                  .select(songs['*'], 
                         streams.user_id.alias('stream_user_id'), 
                         streams.track_id_1, 
                         streams.listen_time,
                         users.user_id.alias('user_user_id'),
                         users.user_name,
                         users.user_age,
                         users.user_country,
                         users.created_at)

song_users.show()

+---+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------+--------------------+-------------------+------------+-----------------+--------+-------------+----------+
| id|            track_id|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|stream_user_id|          track_id_1|        listen_time|user_user_id|        user_name|user_age| user_country|created_at|
+---+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+---------

In [14]:
song_users.describe().show()

+-------+------------------+--------------------+------------------+----------------------------+--------------+------------------+--------------------+------------------+-----------------+-------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+-----------------+------------------+------------------+------------------+------------------+------------------+--------------------+------------------+--------------+------------------+-------------+
|summary|                id|            track_id|           artists|                  album_name|    track_name|        popularity|         duration_ms|          explicit|     danceability|             energy|               key|          loudness|              mode|        speechiness|      acousticness|   instrumentalness|         liveness|           valence|             tempo|    time_signature|       track_genre|    stream_user_id|          track_id_1|      user_use

In [15]:
song_users.columns

['id',
 'track_id',
 'artists',
 'album_name',
 'track_name',
 'popularity',
 'duration_ms',
 'explicit',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'track_genre',
 'stream_user_id',
 'track_id_1',
 'listen_time',
 'user_user_id',
 'user_name',
 'user_age',
 'user_country',
 'created_at']

In [16]:
listen_count = song_users.groupBy("track_genre", F.to_date("created_at").alias("date")) \
                 .agg(F.count("track_id").alias("listen_count"))
listen_count.show()

+-------------+----------+------------+
|  track_genre|      date|listen_count|
+-------------+----------+------------+
|     acoustic|2024-05-20|           1|
|     acoustic|2024-08-13|           2|
|     afrobeat|2024-11-23|           3|
|     alt-rock|2024-06-03|           1|
|     alt-rock|2024-09-25|           2|
|      ambient|2024-11-06|           1|
|      ambient|2024-08-08|           1|
|  black-metal|2024-04-10|           3|
|    bluegrass|2024-04-22|           1|
|     cantopop|2024-02-26|           2|
|     cantopop|2024-04-11|           1|
|     cantopop|2024-01-06|           2|
|chicago-house|2024-06-27|           1|
|chicago-house|2024-02-08|           1|
|     children|2024-02-03|           1|
|        chill|2024-11-07|           1|
|    classical|2024-03-12|           1|
|         club|2024-08-24|           1|
|         club|2024-02-03|           2|
|       comedy|2024-03-19|           2|
+-------------+----------+------------+
only showing top 20 rows



In [17]:
song_users.columns

['id',
 'track_id',
 'artists',
 'album_name',
 'track_name',
 'popularity',
 'duration_ms',
 'explicit',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'track_genre',
 'stream_user_id',
 'track_id_1',
 'listen_time',
 'user_user_id',
 'user_name',
 'user_age',
 'user_country',
 'created_at']

In [18]:
unique_listeners = song_users.groupBy("track_genre", F.to_date("created_at").alias("date")) \
                     .agg(F.countDistinct("user_user_id").alias("unique_listeners"))
unique_listeners.show()

+-------------+----------+----------------+
|  track_genre|      date|unique_listeners|
+-------------+----------+----------------+
|         club|2024-02-03|               2|
|     afrobeat|2024-11-23|               3|
|chicago-house|2024-02-08|               1|
|        anime|2024-08-24|               2|
|     cantopop|2024-02-26|               2|
|    classical|2024-03-12|               1|
|        dance|2024-04-11|               2|
|     cantopop|2024-01-06|               2|
|     acoustic|2024-05-20|               1|
|    bluegrass|2024-04-22|               1|
|        anime|2024-11-02|               1|
|     children|2024-02-03|               1|
|chicago-house|2024-06-27|               1|
|     acoustic|2024-08-13|               2|
|        chill|2024-11-07|               1|
|     alt-rock|2024-09-25|               2|
|      country|2024-05-18|               1|
|  black-metal|2024-04-10|               3|
|     alt-rock|2024-06-03|               1|
|       comedy|2024-02-13|      

In [19]:
total_listening_time = song_users.groupBy("track_genre", F.to_date("created_at").alias("date")) \
                         .agg(F.sum("duration_ms").alias("total_listening_time"))
total_listening_time.show()

+-------------+----------+--------------------+
|  track_genre|      date|total_listening_time|
+-------------+----------+--------------------+
|     acoustic|2024-05-20|            131760.0|
|     acoustic|2024-08-13|            587081.0|
|     afrobeat|2024-11-23|            677106.0|
|     alt-rock|2024-06-03|            239373.0|
|     alt-rock|2024-09-25|            412917.0|
|      ambient|2024-11-06|            204000.0|
|      ambient|2024-08-08|            273466.0|
|  black-metal|2024-04-10|            821937.0|
|    bluegrass|2024-04-22|            177946.0|
|     cantopop|2024-02-26|            444346.0|
|     cantopop|2024-04-11|            230613.0|
|     cantopop|2024-01-06|            546319.0|
|chicago-house|2024-06-27|            469294.0|
|chicago-house|2024-02-08|            244218.0|
|     children|2024-02-03|            131897.0|
|        chill|2024-11-07|             97050.0|
|    classical|2024-03-12|            182158.0|
|         club|2024-08-24|            11

In [20]:
avg_listening_time_per_user = total_listening_time.join(unique_listeners, ["track_genre", "date"]) \
                                                  .withColumn("avg_listening_time_per_user", 
                                                              F.col("total_listening_time") / F.col("unique_listeners"))
avg_listening_time_per_user.show()

+-------------+----------+--------------------+----------------+---------------------------+
|  track_genre|      date|total_listening_time|unique_listeners|avg_listening_time_per_user|
+-------------+----------+--------------------+----------------+---------------------------+
|         club|2024-02-03|            439329.0|               2|                   219664.5|
|     afrobeat|2024-11-23|            677106.0|               3|                   225702.0|
|chicago-house|2024-02-08|            244218.0|               1|                   244218.0|
|        anime|2024-08-24|            391406.0|               2|                   195703.0|
|     cantopop|2024-02-26|            444346.0|               2|                   222173.0|
|    classical|2024-03-12|            182158.0|               1|                   182158.0|
|        dance|2024-04-11|            390530.0|               2|                   195265.0|
|     cantopop|2024-01-06|            546319.0|               2|      

In [66]:
# Step 2: Compute listen count per song per genre per day
song_listen_count = song_users.groupBy( 
    F.to_date("created_at").alias("date"), 
    "track_name",
    "track_genre"
).agg(F.countDistinct("track_id").alias("listen_count"))

# Step 3: Define ranking window for top songs per genre per day
song_rank_window = Window.partitionBy("date").orderBy(F.desc("listen_count"))

# Step 4: Rank songs and filter for the top 3 per genre per day
top_songs_per_genre = song_listen_count.withColumn("rank", F.rank().over(song_rank_window)) \
                                     .filter(F.col("rank") <= 3)

# Step 5: Show results
top_songs_per_genre.show()


+----------+--------------------+-------------+------------+----+
|      date|          track_name|  track_genre|listen_count|rank|
+----------+--------------------+-------------+------------+----+
|2024-01-01|Christmas Don't B...|        j-pop|           2|   1|
|2024-01-01|Sirens for the Co...|          idm|           1|   2|
|2024-01-01|                 Wot|          idm|           1|   2|
|2024-01-01| Aprender Com a Vida|       brazil|           1|   2|
|2024-01-01| The Christmas Waltz|         jazz|           1|   2|
|2024-01-01|Se Joga Na Minha ...|    sertanejo|           1|   2|
|2024-01-01|           miletones|        study|           1|   2|
|2024-01-01|           MoonShake|chicago-house|           1|   2|
|2024-01-01|Die Fischerin vom...|        party|           1|   2|
|2024-01-01|          Me Refazer|       gospel|           1|   2|
|2024-01-01|Intimeeda Sukka P...|    classical|           1|   2|
|2024-01-01|         Otra Salida|      spanish|           1|   2|
|2024-01-0

In [100]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window


# Step 1: Compute listen count per genre per day
genre_listen_count = song_users.groupBy(
    F.to_date("created_at").alias("date"),
    "track_genre"
).agg(F.count("track_id").alias("genre_listen_count"))

# Step 2: Define a ranking window for top genres per day
genre_rank_window = Window.partitionBy("date").orderBy(F.desc("genre_listen_count"))

# Step 3: Apply ranking and filter for the top 5 genres per day
top_genres_per_day = genre_listen_count.withColumn("5_gen_rank", F.rank().over(genre_rank_window)) \
                                       .filter(F.col("5_gen_rank") <= 5)

top_genres_per_day.show()


+----------+-----------------+------------------+----------+
|      date|      track_genre|genre_listen_count|5_gen_rank|
+----------+-----------------+------------------+----------+
|2024-01-01|            j-pop|                 5|         1|
|2024-01-01|            tango|                 4|         2|
|2024-01-01|        classical|                 3|         3|
|2024-01-01|        bluegrass|                 3|         3|
|2024-01-01|             goth|                 3|         3|
|2024-01-01|           gospel|                 3|         3|
|2024-01-01|         mandopop|                 3|         3|
|2024-01-01|progressive-house|                 3|         3|
|2024-01-01|          new-age|                 3|         3|
|2024-01-01|      world-music|                 3|         3|
|2024-01-01|          spanish|                 3|         3|
|2024-01-02|             club|                 4|         1|
|2024-01-02|         cantopop|                 4|         1|
|2024-01-02|      black-

In [None]:
final_kpi_table = listen_count.join(unique_listeners,["track_genre","date"]) \
                              .join(total_listening_time,["track_genre","date"]) \
                              .join(avg_listening_time_per_user,["track_genre","date"]) \
                              .join(top_songs_per_genre,["track_genre","date"]) \
                              .join(top_genres_per_day,["track_genre","date"])
final_kpi_table.show()

+-----------+----------+----+------------+--------------------+----------------+---------------------------+--------------------+------------+
|track_genre|      date|rank|listen_count|total_listening_time|unique_listeners|avg_listening_time_per_user|          track_name|listen_count|
+-----------+----------+----+------------+--------------------+----------------+---------------------------+--------------------+------------+
|     brazil|2024-12-28|   1|           4|            833488.0|               4|                   208372.0|Na Rua, Na Chuva,...|           1|
|     brazil|2024-12-28|   1|           4|            833488.0|               4|                   208372.0|             Domingo|           1|
|     brazil|2024-12-28|   1|           4|            833488.0|               4|                   208372.0|     Porque Ele Vive|           1|
|     brazil|2024-12-28|   1|           4|            833488.0|               4|                   208372.0|      Passos Escuros|           1|