In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').getOrCreate()


22/06/24 09:26:08 WARN Utils: Your hostname, DESKTOP-8ASBMTF resolves to a loopback address: 127.0.1.1; using 172.20.184.225 instead (on interface eth0)
22/06/24 09:26:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/06/24 09:26:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
spark_df = (spark.read.format("csv").options(header="true").load("./data/spotify_artists.csv"))

## Profiling the Data:

In [5]:
# Show a description (summary) of the Spark DataFrame.
spark_df.describe

<bound method DataFrame.describe of DataFrame[index: string, artist_popularity: string, followers: string, genres: string, id: string, name: string, track_id: string, track_name_prev: string, type: string]>

In [24]:
# Print the schema of the DataFrame.
spark_df.printSchema()

root
 |-- index: string (nullable = true)
 |-- artist_popularity: string (nullable = true)
 |-- followers: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- track_name_prev: string (nullable = true)
 |-- type: string (nullable = true)



In [7]:
# Select and show just the first 10 values in the 'name' and 'genres' columns.
spark_df.select(spark_df.name, spark_df.genres).show(10)

+--------------------+--------------------+
|                name|              genres|
+--------------------+--------------------+
|       Juliano Cezar|['sertanejo', 'se...|
|      The Grenadines|                  []|
|             Gangway| ['danish pop rock']|
|               FADES|['uk alternative ...|
| Jean-Pierre Guignon|  ['french baroque']|
|              Filhos|                  []|
|                Eloq|                  []|
|              Fravær|                  []|
|       Camille Pépin|                  []|
|Pepe Willberg & T...|['classic finnish...|
+--------------------+--------------------+
only showing top 10 rows



## Cleaning the Data:

In [22]:
# Where the genre is an empty list, replace it with ['elevator music']
from pyspark.sql.functions import regexp_replace
spark_df.where(spark_df.genres == "[]").show(5)
spark_df = spark_df.withColumn('genres', regexp_replace('genres', r"\[\]", "['elevator music']"))
spark_df.where(spark_df.genres == "['elevator music']").show(5)


                                                                                

+-----+-----------------+---------+------+---+----+--------+---------------+----+
|index|artist_popularity|followers|genres| id|name|track_id|track_name_prev|type|
+-----+-----------------+---------+------+---+----+--------+---------------+----+
+-----+-----------------+---------+------+---+----+--------+---------------+----+

+-----+-----------------+---------+------------------+--------------------+--------------+--------------------+---------------+------+
|index|artist_popularity|followers|            genres|                  id|          name|            track_id|track_name_prev|  type|
+-----+-----------------+---------+------------------+--------------------+--------------+--------------------+---------------+------+
|    1|               22|      313|['elevator music']|1dLnVku4VQUOLswwD...|The Grenadines|4wqwj0gA8qPZKLl5W...|       track_30|artist|
|    5|               43|       81|['elevator music']|38VBjthd0szbS6wpD...|        Filhos|453KeZU566kjNfs1I...|       track_15|arti

In [29]:
# For the columns 'artist_popularity' and 'followers', cast the data type as integers.
from pyspark.sql.types import IntegerType

spark_df = spark_df.withColumn('artist_popularity', spark_df['artist_popularity'].cast(IntegerType()))
spark_df = spark_df.withColumn('followers', spark_df['followers'].cast(IntegerType()))
spark_df.select('artist_popularity', 'followers').printSchema()

root
 |-- artist_popularity: integer (nullable = true)
 |-- followers: integer (nullable = true)



In [33]:
# Sort the data in descending order by number of followers.

spark_df = spark_df.orderBy('followers', ascending=False)
spark_df.show(10)

+-----+-----------------+---------+--------------------+--------------------+-------------+--------------------+---------------+------+
|index|artist_popularity|followers|              genres|                  id|         name|            track_id|track_name_prev|  type|
+-----+-----------------+---------+--------------------+--------------------+-------------+--------------------+---------------+------+
|55251|               92| 41561693|   ['pop', 'uk pop']|6eUKZXaKkcviH0Ku9...|   Ed Sheeran|7qiZfU4dY1lWllzX7...|       track_35|artist|
|53392|               98| 34680740|['canadian hip ho...|3TVXtAsR1Inumwj47...|        Drake|116H0KvKr2Zl4RPuV...|       track_71|artist|
|52620|               90| 30560149|['dance pop', 'po...|5pKCCKE2ajJHZ9KAi...|      Rihanna|2Ce5IyMlVRVvN997Z...|       track_38|artist|
|54447|               88| 26824224|['canadian pop', ...|1uNFoZAHBGtllmzzn...|Justin Bieber|3A7qX2QjDlPnazUsR...|        track_2|artist|
|42872|              100| 26309771|['dance pop',

                                                                                