In [252]:
# imports
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import LongType, StringType, StructField, StructType, IntegerType
from pyspark.sql import SparkSession


In [253]:
# make an instance of a SparkSession called 'spark'
spark = SparkSession.builder.master('local').getOrCreate()


In [254]:
# create a custom schema to ensure data types and column names
custom_schema = StructType([
    StructField('index', IntegerType()),
    StructField('artist_popularity', LongType()),
    StructField('followers', LongType()),
    StructField('genres', StringType()),
    StructField('id', StringType()),
    StructField('name', StringType()),
    StructField('track_id', StringType()),
    StructField('track_name_prev', StringType()),
    StructField('type', StringType())])


In [255]:
"""Extract Information"""
# create a spark dataframa from a csv with our custom schema
spark_df = (spark.read.format("csv"
                        ).options(header="true"
                        ).schema(custom_schema
                        ).load("data/spotify_artists.csv"))


In [256]:
# Show a description (summary) of the Spark DataFrame.
spark_df.describe


<bound method DataFrame.describe of DataFrame[index: int, artist_popularity: bigint, followers: bigint, genres: string, id: string, name: string, track_id: string, track_name_prev: string, type: string]>

In [257]:
# Print the schema of the DataFrame.
spark_df.printSchema()


root
 |-- index: integer (nullable = true)
 |-- artist_popularity: long (nullable = true)
 |-- followers: long (nullable = true)
 |-- genres: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- track_name_prev: string (nullable = true)
 |-- type: string (nullable = true)



In [258]:
# Select and show just the first 10 values in the 'name' and 'genres' columns.
spark_df.select('name', 'genres').show(10)


+--------------------+--------------------+
|                name|              genres|
+--------------------+--------------------+
|       Juliano Cezar|['sertanejo', 'se...|
|      The Grenadines|                  []|
|             Gangway| ['danish pop rock']|
|               FADES|['uk alternative ...|
| Jean-Pierre Guignon|  ['french baroque']|
|              Filhos|                  []|
|                Eloq|                  []|
|              Fravær|                  []|
|       Camille Pépin|                  []|
|Pepe Willberg & T...|['classic finnish...|
+--------------------+--------------------+
only showing top 10 rows



In [259]:
# Where the genre is an empty list, replace it with ['elevator music'].
spark_df = spark_df.withColumn('genres', regexp_replace(
    'genres', r"\[\]", "['elevator music']"))
spark_df.select('name', 'genres').show(10)


+--------------------+--------------------+
|                name|              genres|
+--------------------+--------------------+
|       Juliano Cezar|['sertanejo', 'se...|
|      The Grenadines|  ['elevator music']|
|             Gangway| ['danish pop rock']|
|               FADES|['uk alternative ...|
| Jean-Pierre Guignon|  ['french baroque']|
|              Filhos|  ['elevator music']|
|                Eloq|  ['elevator music']|
|              Fravær|  ['elevator music']|
|       Camille Pépin|  ['elevator music']|
|Pepe Willberg & T...|['classic finnish...|
+--------------------+--------------------+
only showing top 10 rows



In [262]:
# For the columns 'artist_popularity' and 'followers', cast the data type as integers.
# done in the schema, but ill throw in the code, so you know i know it
spark_df = spark_df.withColumn('artist_popularity', col(
    'artist_popularity').cast(IntegerType()))
spark_df = spark_df.withColumn('followers', col(
    'followers').cast(IntegerType()))
spark_df.printSchema()


root
 |-- index: integer (nullable = true)
 |-- artist_popularity: integer (nullable = true)
 |-- followers: integer (nullable = true)
 |-- genres: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- track_name_prev: string (nullable = true)
 |-- type: string (nullable = true)



In [261]:
# Sort the data in descending order by number of followers.
spark_df.select('*').sort('followers', ascending=False).show(10)


+-----+-----------------+---------+--------------------+--------------------+-------------+--------------------+---------------+------+
|index|artist_popularity|followers|              genres|                  id|         name|            track_id|track_name_prev|  type|
+-----+-----------------+---------+--------------------+--------------------+-------------+--------------------+---------------+------+
|55251|               92| 41561693|   ['pop', 'uk pop']|6eUKZXaKkcviH0Ku9...|   Ed Sheeran|7qiZfU4dY1lWllzX7...|       track_35|artist|
|53392|               98| 34680740|['canadian hip ho...|3TVXtAsR1Inumwj47...|        Drake|116H0KvKr2Zl4RPuV...|       track_71|artist|
|52620|               90| 30560149|['dance pop', 'po...|5pKCCKE2ajJHZ9KAi...|      Rihanna|2Ce5IyMlVRVvN997Z...|       track_38|artist|
|54447|               88| 26824224|['canadian pop', ...|1uNFoZAHBGtllmzzn...|Justin Bieber|3A7qX2QjDlPnazUsR...|        track_2|artist|
|42872|              100| 26309771|['dance pop',

In [264]:
"""process to change the artist popularity to percent and rename the column"""
# user defined function to divide x by 100
pop_contest = udf(lambda x: x/100)
# apply our udf to the column artist popularity
pop_contest_df = spark_df.withColumn('artist_popularity',
                        pop_contest(spark_df['artist_popularity']))
# Rename the column 'popularity_percent'.
pop_contest_df = spark_df.withColumnRenamed('artist_popularity',
                                            'popularity_percent')
# change datatype of the percent column from a string to a float
pop_contest_df = pop_contest_df.withColumn('popularity_percent',
                        pop_contest_df['popularity_percent'
                                ].cast('float'))
pop_contest_df.show(10)


+-----+------------------+---------+--------------------+--------------------+--------------------+--------------------+---------------+------+
|index|popularity_percent|followers|              genres|                  id|                name|            track_id|track_name_prev|  type|
+-----+------------------+---------+--------------------+--------------------+--------------------+--------------------+---------------+------+
|    0|              44.0|    23230|['sertanejo', 'se...|4mGnpjhqgx4RUdsIJ...|       Juliano Cezar|0wmDmAILuW9e2aRtt...|        track_9|artist|
|    1|              22.0|      313|  ['elevator music']|1dLnVku4VQUOLswwD...|      The Grenadines|4wqwj0gA8qPZKLl5W...|       track_30|artist|
|    2|              26.0|     1596| ['danish pop rock']|6YVY310fjfUzKi8hi...|             Gangway|1bFqWDbvHmZe2f4Nf...|       track_38|artist|
|    3|              31.0|      149|['uk alternative ...|2VElyouiCfoYPDJlu...|               FADES|3MFSUBAidPzRBbIS7...|       track_34|

In [265]:
"""Extract Information"""
# Group the data by artist popularity, and show the count for each group.
spark_df.groupBy('artist_popularity').sum('artist_popularity').show(10)


+-----------------+----------------------+
|artist_popularity|sum(artist_popularity)|
+-----------------+----------------------+
|               31|                 36053|
|               85|                  1530|
|               65|                 19110|
|               53|                 37895|
|               78|                  3822|
|               34|                 41956|
|               81|                  2511|
|               28|                 30212|
|               76|                  5092|
|               26|                 29562|
+-----------------+----------------------+
only showing top 10 rows



In [267]:
"""save dataframe as a parquet"""
# write the code to save the DataFrame as a Parquet file in the /data directory.
spark_df.write.parquet('data/spotify.parquet')


                                                                                