In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session that runs on the local master.
spark = (
    SparkSession.builder
    .master('local')
    .getOrCreate()
)

In [None]:
# Load the artists CSV, treating its first row as the column names.
csv_reader = spark.read.format("csv").options(header="true")
spark_df = csv_reader.load("./data/spotify_artists.csv")

## Profiling the Data:

In [None]:
# Show a description (summary) of the Spark DataFrame.
# describe() must be called, and its result shown, to actually compute and
# print the summary statistics; the bare attribute only references the method.
spark_df.describe().show()

In [None]:
# Print the DataFrame's schema (column names and their types).
spark_df.printSchema()

In [None]:
# Display the first 10 rows of only the 'name' and 'genres' columns.
spark_df.select('name', 'genres').show(10)

## Cleaning the Data:

In [None]:
# Substitute the placeholder ['elevator music'] wherever the genre list is empty.
from pyspark.sql.functions import regexp_replace

# Preview a few affected rows before the replacement.
spark_df.filter(spark_df['genres'] == "[]").show(5)

relabelled_genres = regexp_replace('genres', r"\[\]", "['elevator music']")
spark_df = spark_df.withColumn('genres', relabelled_genres)

# Confirm the placeholder value is now present.
spark_df.filter(spark_df['genres'] == "['elevator music']").show(5)


In [None]:
# Convert the 'artist_popularity' and 'followers' columns to integer type.
from pyspark.sql.types import IntegerType

numeric_columns = ['artist_popularity', 'followers']
for column_name in numeric_columns:
    spark_df = spark_df.withColumn(column_name, spark_df[column_name].cast(IntegerType()))

# Print the schema of just these two columns to verify the cast.
spark_df.select(*numeric_columns).printSchema()

In [None]:
# Order the rows from the most followers to the fewest.

spark_df = spark_df.sort(spark_df['followers'].desc())
spark_df.show(10)

In [None]:
# 'artist_popularity' is a rank out of 100. Write a user defined function that
# divides each popularity value by 100 and store the result in a new column
# named 'popularity_percent'.
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Declare the return type explicitly: udf() defaults to a string return type,
# which would make 'popularity_percent' a string column. Null popularity values
# are passed through as null instead of raising a TypeError in the lambda.
percent = udf(lambda x: x / 100 if x is not None else None, DoubleType())

# Keep the assignment separate from .show(): show() returns None, so chaining
# it onto the assignment would overwrite spark_df with None and break every
# cell that uses spark_df afterwards.
spark_df = spark_df.withColumn('popularity_percent', percent('artist_popularity'))
spark_df.show(10)

## Extracting Information

In [None]:
# Display rows whose name contains 'Queen', first through the DataFrame API
# and then through an equivalent SQL query on a temporary view.

queen_rows = spark_df.where(spark_df['name'].contains('Queen'))
queen_rows.show(5)

spark_df.createOrReplaceTempView('spotify')
queen_query = "SELECT * FROM spotify WHERE name LIKE '%Queen%'"
spark.sql(queen_query).show(5)


In [None]:
# Group the data by artist popularity, and show the count for each group.

# count() gives the number of rows in each popularity group; sum() would
# instead add up the popularity values themselves.
spark_df.groupBy('artist_popularity').count().show(10)

In [None]:
# Save the DataFrame as a Parquet file in the ./data directory.

# Use overwrite mode so re-running the notebook replaces the previous output;
# the default save mode raises an error when the target path already exists.
spark_df.write.mode("overwrite").parquet("./data/spotify_artists.parquet")