In [185]:
from pyspark.sql import SparkSession

"""make an instance of a SparkSession called 'spark'."""
spark = SparkSession.builder.master('local').getOrCreate()


In [186]:
import os
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType

# custom_schema = StructType([
#     StructField('index', IntegerType()),
#     StructField('artist_popularity', LongType()),
#     StructField('followers', LongType()),
#     StructField('genres', StringType()),
#     StructField('id', StringType()),
#     StructField('name', StringType()),
#     StructField('track_id', StringType()),
#     StructField('track_name_prev', StringType()),
#     StructField('type', StringType())])
# Read the Spotify artists CSV file into a Spark DataFrame
spark_df = (spark.read.format("csv"
                        ).options(header="true"
                        ).schema("index int, artist_popularity int, followers int, genres string, id string, name string, track_id string, track_name_prev string, type string"
                        ).load("data/spotify_artists.csv"))


In [187]:

spark_df.withColumnRenamed('', 'index').columns



['index',
 'artist_popularity',
 'followers',
 'genres',
 'id',
 'name',
 'track_id',
 'track_name_prev',
 'type']

In [188]:
"""Profile the Data:"""
# Show a description (summary) of the Spark DataFrame.
spark_df.describe


<bound method DataFrame.describe of DataFrame[index: int, artist_popularity: int, followers: int, genres: string, id: string, name: string, track_id: string, track_name_prev: string, type: string]>

In [189]:
# Print the schema of the DataFrame.
spark_df.printSchema()

root
 |-- index: integer (nullable = true)
 |-- artist_popularity: integer (nullable = true)
 |-- followers: integer (nullable = true)
 |-- genres: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- track_name_prev: string (nullable = true)
 |-- type: string (nullable = true)



In [190]:
# Select and show just the first 10 values in the 'name' and 'genres' columns.
spark_df.select('name', 'genres').show(10)


+--------------------+--------------------+
|                name|              genres|
+--------------------+--------------------+
|       Juliano Cezar|['sertanejo', 'se...|
|      The Grenadines|                  []|
|             Gangway| ['danish pop rock']|
|               FADES|['uk alternative ...|
| Jean-Pierre Guignon|  ['french baroque']|
|              Filhos|                  []|
|                Eloq|                  []|
|              Fravær|                  []|
|       Camille Pépin|                  []|
|Pepe Willberg & T...|['classic finnish...|
+--------------------+--------------------+
only showing top 10 rows



In [191]:
from pyspark.sql.functions import regexp_replace
# Where the genre is an empty list, replace it with ['elevator music'].
spark_df = spark_df.withColumn('genres', regexp_replace(
    'genres', r"\[\]", "['elevator music']"))
spark_df.select('name', 'genres').show(10)


+--------------------+--------------------+
|                name|              genres|
+--------------------+--------------------+
|       Juliano Cezar|['sertanejo', 'se...|
|      The Grenadines|  ['elevator music']|
|             Gangway| ['danish pop rock']|
|               FADES|['uk alternative ...|
| Jean-Pierre Guignon|  ['french baroque']|
|              Filhos|  ['elevator music']|
|                Eloq|  ['elevator music']|
|              Fravær|  ['elevator music']|
|       Camille Pépin|  ['elevator music']|
|Pepe Willberg & T...|['classic finnish...|
+--------------------+--------------------+
only showing top 10 rows



In [192]:
from pyspark.sql.functions import col
"""For the columns 'artist_popularity' and 'followers', cast the data type as integers."""
# done in the schema, but ill throw in the code, so you know i know it
spark_df.withColumn('artist_popularity', col(
                    'artist_popularity').cast(IntegerType()))
spark_df.withColumn('followers', col(
                    'followers').cast(IntegerType()))



DataFrame[index: int, artist_popularity: int, followers: int, genres: string, id: string, name: string, track_id: string, track_name_prev: string, type: string]

In [193]:
# Sort the data in descending order by number of followers.
spark_df.select('*').sort('followers', ascending=False).show(5)


+-----+-----------------+---------+--------------------+--------------------+-------------+--------------------+---------------+------+
|index|artist_popularity|followers|              genres|                  id|         name|            track_id|track_name_prev|  type|
+-----+-----------------+---------+--------------------+--------------------+-------------+--------------------+---------------+------+
|55251|               92| 41561693|   ['pop', 'uk pop']|6eUKZXaKkcviH0Ku9...|   Ed Sheeran|7qiZfU4dY1lWllzX7...|       track_35|artist|
|53392|               98| 34680740|['canadian hip ho...|3TVXtAsR1Inumwj47...|        Drake|116H0KvKr2Zl4RPuV...|       track_71|artist|
|52620|               90| 30560149|['dance pop', 'po...|5pKCCKE2ajJHZ9KAi...|      Rihanna|2Ce5IyMlVRVvN997Z...|       track_38|artist|
|54447|               88| 26824224|['canadian pop', ...|1uNFoZAHBGtllmzzn...|Justin Bieber|3A7qX2QjDlPnazUsR...|        track_2|artist|
|42872|              100| 26309771|['dance pop',

In [194]:
"""process to change the artist popularity to percent and rename the column"""
from pyspark.sql.functions import udf

# user defined function to divide x by 100
pop_contest = udf(lambda x: x/100)
# apply our udf to the column artist popularity
spark_df = spark_df.withColumn('artist_popularity', 
                            pop_contest(spark_df['artist_popularity']))
# Rename the column 'popularity_percent'.
spark_df = spark_df.withColumnRenamed('artist_popularity', 'popularity_percent')
# print the least cool kids to chow our function is working
spark_df.select('*').sort('popularity_percent').show(20)


+-----+------------------+---------+--------------------+--------------------+--------------------+--------------------+---------------+------+
|index|popularity_percent|followers|              genres|                  id|                name|            track_id|track_name_prev|  type|
+-----+------------------+---------+--------------------+--------------------+--------------------+--------------------+---------------+------+
| 1138|               0.0|        0|  ['elevator music']|1sJQSEWn0wfjJ7DMx...|         Arjan Bajwa|4rSmmC6OYmjVOYxr8...|        track_5|artist|
| 1929|               0.0|       41|  ['elevator music']|44jSS4FbhkbDPdef7...|           Ultracode|5Aq60ju4bZPmyFNfZ...|        track_6|artist|
| 1728|               0.0|       62|['colombian black...|7eKrWT1MKG26WndNo...|   Vobiscum Lucipher|2T7MWgu0Oo2KpVPsk...|       track_26|artist|
| 1717|               0.0|        2|  ['elevator music']|6TgWMXdgudjAaxY5O...|         Sasha Banks|15RGTXA9B8veOavTj...|       track_15|

In [None]:
"""Extract Information"""
# Show only the values in the DataFrame that have 'Queen' in the name.
# Group the data by artist popularity, and show the count for each group.
# Save as Parquet
# Lastly, write the code to save the DataFrame as a Parquet file in the /data directory.
# Your /data directory shouldn't be pushed to GitHub, so this file won't show up in the submitted repository.


In [196]:
# Show only the values in the DataFrame that have 'Queen' in the name.
spark_df.select('*').filter(spark_df.name.contains('Queen')).show()

+-----+------------------+---------+--------------------+--------------------+--------------------+--------------------+---------------+------+
|index|popularity_percent|followers|              genres|                  id|                name|            track_id|track_name_prev|  type|
+-----+------------------+---------+--------------------+--------------------+--------------------+--------------------+---------------+------+
|   40|              0.16|      695|  ['elevator music']|4SK9OzAA0K00NVsXA...|       Queen Machine|6u3RWvO7ZIIdVci1N...|       track_56|artist|
|  901|              0.43|    18224|           ['strut']|71WL5bNm5jPPpwpDc...|  Bob the Drag Queen|5IsdA6g8IFKGmC1xl...|        track_8|artist|
| 1518|               0.3|     2297|           ['benga']|2FzYw9fn2ZtQ7sZma...|Muthoni Drummer Q...|4F0e4hx3bASeaqLqS...|       track_45|artist|
| 2152|              0.22|     3244|['afropop', 'kwai...|5LFWp4p0pMURif2d7...|Mahlathini & The ...|6WbcheHRcJNMaDIkO...|       track_15|