In [21]:
%pip install kafka-python pyspark==3.3.2 findspark

Note: you may need to restart the kernel to use updated packages.


In [22]:
from kafka import KafkaConsumer, KafkaProducer
from kafka.consumer.fetcher import ConsumerRecord
import json
import asyncio
import os
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
import findspark
from pyspark.sql.functions import from_csv, col, avg, round as _round

In [23]:
findspark.init(os.environ["SPARK_HOME"])

In [24]:
TOPIC = "KafkaMusicStream"

In [25]:
consumer = KafkaConsumer(TOPIC, bootstrap_servers="localhost:9092", consumer_timeout_ms=5000)
producer = KafkaProducer(bootstrap_servers="localhost:9092")



In [26]:
async def run_consumer(consumer: KafkaConsumer):
    for msg in consumer:
        await asyncio.sleep(1)
        assert isinstance(msg, ConsumerRecord), "Invalid Type"
        print(json.loads(msg.value))
        
def run_producer(producer: KafkaProducer):
    with open("data/SpotifyFeatures.csv") as f:
        f.readline() # first line
        
        while((line := f.readline()) != ""):
            producer.send(TOPIC, line.encode())
        

In [27]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master("local[*]")
         .appName("KafkaSqlStream")
         .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0")
         .config("spark.sql.warehouse.dir", "spark-warehouse")  
         .config("spark.driver.memory", "4g")
         .enableHiveSupport()
         .getOrCreate())

spark.sparkContext.setLogLevel("WARN")
print("Spark started:", spark)


25/10/22 19:55:23 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
Spark started: <pyspark.sql.session.SparkSession object at 0x7cdd089f2e10>


In [28]:
# Read from Kafka
kafka_df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", TOPIC) \
    .load()
    
schema = StructType([
    StructField("genre", StringType(), True),
    StructField("artist_name", StringType(), True),
    StructField("track_name", StringType(), True),
    StructField("track_id", StringType(), True),
    StructField("popularity", IntegerType(), True),
    StructField("acousticness", DoubleType(), True),
    StructField("danceability", DoubleType(), True),
    StructField("duration_ms", IntegerType(), True),
    StructField("energy", DoubleType(), True),
    StructField("instrumentalness", DoubleType(), True),
    StructField("key", StringType(), True),
    StructField("liveness", DoubleType(), True),
    StructField("loudness", DoubleType(), True),
    StructField("mode", StringType(), True),
    StructField("speechiness", DoubleType(), True),
    StructField("tempo", DoubleType(), True),
    StructField("time_signature", StringType(), True),
    StructField("valence", DoubleType(), True)
])

In [29]:
try:
    csv_df = kafka_df.selectExpr("CAST(value AS STRING) as csv_value")
    parsed_df = csv_df.select(from_csv("csv_value", schema.simpleString()).alias("data")).select("data.*")

    parsed_df.createOrReplaceTempView("music_stream")

    sql_df = spark.sql("""
        SELECT genre, AVG(popularity) AS avg_popularity, COUNT(*) AS count
        FROM music_stream
        GROUP BY genre
    """)

    query = sql_df.writeStream \
        .outputMode("complete") \
        .format("console") \
        .option("truncate", False) \
        .start()

    query.awaitTermination()
except:
    pass

25/10/22 19:55:23 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-5b9bcfe6-58a5-4961-b56f-10fd208b3896. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/22 19:55:23 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----+--------------+-----+
|genre|avg_popularity|count|
+-----+--------------+-----+
+-----+--------------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------+------------------+-----+
|genre      |avg_popularity    |count|
+-----------+------------------+-----+
|Alternative|53.924119241192415|369  |
|Country    |40.41984732824427 |131  |
+-----------+------------------+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+----------------+------------------+-----+
|genre           |avg_popularity    |count|
+----------------+------------------+-----+
|World           |35.52188255992962 |9096 |
|Ska             |28.611067282767948|8874 |
|Rock            |59.61935066335886 |9272 |
|Jazz            |40.82528077982624 |9441 |
|Soundtrack      |33.96202268234315 |9646 |
|Folk            |49.94020862458329 |9299 |
|Children’s Music|54.65903988025232 |9353 |
|Classical       |29.296809680968096|9256 |
|Blues           |34.74318332963866 |9023 |
|R&B             |52.30871886120996 |8992 |
|Anime           |24.2599888080582  |8936 |
|Alternative     |50.21342977437116 |9263 |
|Movie           |12.157502569373072|7806 |
|A Capella       |9.302521008403362 |119  |
|Pop             |66.59066695077776 |9386 |
|﻿genre          |null              |1    |
|Reggaeton       |37.744313725490194|8927 |
|Soul            |47.02

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/omnissiah/miniconda3/envs/tfEnv/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/omnissiah/miniconda3/envs/tfEnv/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/omnissiah/miniconda3/envs/tfEnv/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


In [30]:
try:

    top_genres = (
        parsed_df.groupBy("genre")
                .agg(_round(avg("popularity"), 2).alias("avg_popularity"))
                .orderBy(col("avg_popularity").desc())
    )

    energy_dance = (
        parsed_df.groupBy("genre")
                .agg(
                    _round(avg("energy"), 3).alias("avg_energy"),
                    _round(avg("danceability"), 3).alias("avg_danceability")
                )
    )

    top_artists = (
        parsed_df.groupBy("artist_name")
                .count()
                .withColumnRenamed("count", "total_songs")
                .orderBy(col("total_songs").desc())
    )


    query1 = (
        top_genres.writeStream
                .outputMode("complete")
                .format("console")
                .option("truncate", False)
                .start()
    )

    query2 = (
        energy_dance.writeStream
                    .outputMode("complete")
                    .format("console")
                    .option("truncate", False)
                    .start()
    )

    query3 = (
        top_artists.writeStream
                .outputMode("complete")
                .format("console")
                .option("truncate", False)
                .start()
    )

    spark.streams.awaitAnyTermination()
except:
    pass

25/10/22 19:56:02 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-a73554f8-c8ec-4b8f-8300-49e0b522775a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/22 19:56:02 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/10/22 19:56:02 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-d11cda1b-ab8a-4af8-8e9e-3bd52bda49a8. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/22 19:56:02 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not support

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----+----------+----------------+
|genre|avg_energy|avg_danceability|
+-----+----------+----------------+
+-----+----------+----------------+



                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----+--------------+
|genre|avg_popularity|
+-----+--------------+
+-----+--------------+



                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+-----------+
|artist_name|total_songs|
+-----------+-----------+
+-----------+-----------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+----------------+------------------+-----+
|genre           |avg_popularity    |count|
+----------------+------------------+-----+
|World           |35.52188255992962 |9096 |
|Ska             |28.611067282767948|8874 |
|Rock            |59.61935066335886 |9272 |
|Jazz            |40.82528077982624 |9441 |
|Soundtrack      |33.96202268234315 |9646 |
|Folk            |49.94020862458329 |9299 |
|Children’s Music|54.65903988025232 |9353 |
|Classical       |29.296809680968096|9256 |
|Blues           |34.74318332963866 |9023 |
|R&B             |52.49343257443082 |9136 |
|Anime           |24.2599888080582  |8936 |
|Alternative     |50.91242198510167 |9934 |
|Movie           |11.6175537109375  |8214 |
|A Capella       |9.302521008403362 |238  |
|Pop             |66.59066695077776 |9386 |
|﻿genre          |null              |2    |
|Reggaeton       |37.744313725490194|8927 |
|Soul            |47.02

                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------+----------+----------------+
|genre      |avg_energy|avg_danceability|
+-----------+----------+----------------+
|R&B        |0.567     |0.651           |
|Alternative|0.697     |0.555           |
|Movie      |0.427     |0.537           |
|A Capella  |0.25      |0.412           |
|﻿genre     |null      |null            |
|Country    |0.62      |0.58            |
+-----------+----------+----------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------------------+-----------+
|artist_name            |total_songs|
+-----------------------+-----------+
|Chorus                 |102        |
|Henri Salvador         |88         |
|The Singers Unlimited  |36         |
|Jean Claude Corbel     |27         |
|Glad                   |21         |
|The King's Singers     |18         |
|Leopold Stokowski      |18         |
|Hyannis Sound          |18         |
|Bernard Minet          |17         |
|Frank Ocean            |17         |
|MC6 A Cappella         |15         |
|Randy Newman           |12         |
|Linkin Park            |12         |
|Blake Shelton          |12         |
|Joji                   |12         |
|Five Finger Death Punch|11         |
|Upchurch               |11         |
|Kenny Chesney          |11         |
|Martin & les fées      |10         |
|Hank Williams, Jr.     |10         |
+-----------------------+----

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+----------------+------------------+-----+
|genre           |avg_popularity    |count|
+----------------+------------------+-----+
|World           |35.52188255992962 |18192|
|Ska             |28.611067282767948|17748|
|Rock            |59.61935066335886 |18544|
|Jazz            |40.82528077982624 |18882|
|Soundtrack      |33.96202268234315 |19292|
|Folk            |49.94020862458329 |18598|
|Children’s Music|54.65903988025232 |18706|
|Classical       |29.296809680968096|18512|
|Blues           |34.74318332963866 |18046|
|R&B             |52.30871886120996 |17984|
|Anime           |24.2599888080582  |17872|
|Alternative     |50.21342977437116 |18526|
|Movie           |12.157502569373072|15612|
|A Capella       |9.302521008403362 |238  |
|Pop             |66.59066695077776 |18772|
|﻿genre          |null              |2    |
|Reggaeton       |37.744313725490194|17854|
|Soul            |47.02

                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+----------------+----------+----------------+
|genre           |avg_energy|avg_danceability|
+----------------+----------+----------------+
|World           |68.917    |0.443           |
|Ska             |20.108    |0.527           |
|Rock            |23.719    |0.538           |
|Jazz            |37.696    |0.593           |
|Soundtrack      |646.106   |0.276           |
|Folk            |0.492     |0.527           |
|Children’s Music|0.707     |0.542           |
|Classical       |4909.409  |0.416           |
|Blues           |33.481    |0.528           |
|R&B             |0.564     |0.642           |
|Anime           |26.325    |0.472           |
|Alternative     |0.712     |0.542           |
|Movie           |480.286   |0.517           |
|A Capella       |0.25      |0.412           |
|Pop             |0.642     |0.64            |
|﻿genre          |null      |null            |
|Reggaeton

                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------+--------------+
|genre      |avg_popularity|
+-----------+--------------+
|R&B        |64.03         |
|Alternative|60.56         |
|Country    |42.3          |
|A Capella  |9.3           |
|Movie      |1.32          |
|﻿genre     |null          |
+-----------+--------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-----------------------+-----------+
|artist_name            |total_songs|
+-----------------------+-----------+
|Giuseppe Verdi         |1394       |
|Giacomo Puccini        |1137       |
|Kimbo Children's Music |971        |
|Nobuo Uematsu          |825        |
|Richard Wagner         |804        |
|Wolfgang Amadeus Mozart|800        |
|Randy Newman           |757        |
|Georges Bizet          |701        |
|Juice Music            |684        |
|Johann Sebastian Bach  |632        |
|Ludwig van Beethoven   |596        |
|Hans Zimmer            |559        |
|Gioachino Rossini      |491        |
|Chorus                 |480        |
|Howard Shore           |479        |
|Drake                  |477        |
|Henri Salvador         |474        |
|John Williams          |450        |
|Frédéric Chopin        |436        |
|Alan Menken            |420        |
+-----------------------+----

                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+----------------+--------------+
|genre           |avg_popularity|
+----------------+--------------+
|Pop             |66.59         |
|Rap             |60.53         |
|Rock            |59.62         |
|Hip-Hop         |58.42         |
|Dance           |57.28         |
|Indie           |54.7          |
|Children’s Music|54.66         |
|R&B             |52.31         |
|Alternative     |50.21         |
|Folk            |49.94         |
|Soul            |47.03         |
|Country         |46.1          |
|Jazz            |40.83         |
|Electronic      |38.06         |
|Reggaeton       |37.74         |
|Reggae          |35.59         |
|World           |35.52         |
|Blues           |34.74         |
|Soundtrack      |33.96         |
|Classical       |29.3          |
+----------------+--------------+
only showing top 20 rows

