In [33]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Artists Pipeline")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")
# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

#  Google Storage File Path
gsc_file_path = 'gs://spotify_data_de/artists.csv'
# gsc_file_path = 'gs://spotify_data_de/tester2.csv'

# Create data frame
df = spark.read.format("csv").option("header", "true") \
       .load(gsc_file_path)

df.printSchema()

root
 |-- mbid: string (nullable = true)
 |-- artist_mb: string (nullable = true)
 |-- artist_lastfm: string (nullable = true)
 |-- country_mb: string (nullable = true)
 |-- country_lastfm: string (nullable = true)
 |-- tags_mb: string (nullable = true)
 |-- tags_lastfm: string (nullable = true)
 |-- listeners_lastfm: string (nullable = true)
 |-- scrobbles_lastfm: string (nullable = true)
 |-- ambiguous_artist: string (nullable = true)



In [34]:
#Change datatypings to the correct type hints and selecting subset
from pyspark.sql.functions import *
from pyspark.sql.types import NumericType, IntegerType
df = df.na.drop(subset=['artist_mb','country_mb', 'listeners_lastfm'])
df = df.select('artist_mb','country_mb', 'listeners_lastfm')

df = df.withColumn("listeners_lastfm",col("listeners_lastfm").cast(IntegerType()))

df.printSchema()

root
 |-- artist_mb: string (nullable = true)
 |-- country_mb: string (nullable = true)
 |-- listeners_lastfm: integer (nullable = true)



In [35]:
#Bestand worst performing artist per country
from pyspark.sql import Row, Window

windowdesc = Window.partitionBy(col('country_mb')).orderBy(col('listeners_lastfm').desc())
windowasc = Window.partitionBy(col('country_mb')).orderBy(col('listeners_lastfm').asc())

artists_merged_windowed = df.withColumn('rank_desc', dense_rank().over(windowdesc)).withColumn('rank_asc', dense_rank().over(windowasc))

worst_best_artist_table = artists_merged_windowed.where((col('rank_desc') == 1) | (col('rank_asc') == 1) ).select('*')

worst_best_artist_table2 = worst_best_artist_table.withColumn('performance_category', when(col('rank_desc') == 1, 'Most listeners')
                                                          .otherwise('Least listeners')).select('country_mb', 'performance_category', 'artist_mb', 'listeners_lastfm')
worst_best_artist_table2.show(10)

+-------------------+--------------------+--------------------+----------------+
|         country_mb|performance_category|           artist_mb|listeners_lastfm|
+-------------------+--------------------+--------------------+----------------+
|         Bumblefoot|      Most listeners| Steve “Lips” Kudlow|            null|
| id. Csillag Ferenc|      Most listeners|"Farkas Rudolf ""...|            null|
|        Afghanistan|     Least listeners|Abdulhakím Khudáy...|               0|
|        Afghanistan|     Least listeners|     Akhmad Bakhhshi|               0|
|        Afghanistan|      Most listeners|                Adam|           23540|
|            Albania|     Least listeners|            Dardanny|               1|
|            Albania|     Least listeners|    Florian Mumajesi|               1|
|            Albania|     Least listeners|      Jorgo Papingji|               1|
|            Albania|     Least listeners|         Sokol Marsi|               1|
|            Albania|      M

In [36]:
#Drop duplicates and keep first for each least/most per country
worst_best_artist_table2.dropDuplicates(['country_mb', 'performance_category']).show()

+--------------------+--------------------+--------------------+----------------+
|          country_mb|performance_category|           artist_mb|listeners_lastfm|
+--------------------+--------------------+--------------------+----------------+
|          Bumblefoot|      Most listeners| Steve “Lips” Kudlow|            null|
|  id. Csillag Ferenc|      Most listeners|"Farkas Rudolf ""...|            null|
|         Afghanistan|     Least listeners|Abdulhakím Khudáy...|               0|
|         Afghanistan|      Most listeners|                Adam|           23540|
|             Albania|     Least listeners|            Dardanny|               1|
|             Albania|      Most listeners|          King Ku$ha|           14046|
|             Algeria|     Least listeners|            H'sissen|               0|
|             Algeria|      Most listeners|              Khaled|          179375|
|              Angola|     Least listeners|      Isodora Campos|               0|
|              A

In [37]:
bucket = "spotify_data_de"
spark.conf.set('temporaryGcsBucket', bucket)
# Saving the data to BigQuery
worst_best_artist_table2.write.format('bigquery') \
  .option('table', 'de2022-362617.spotify.artists') \
  .mode("overwrite") \
  .save()

In [32]:
# Stop the spark context
spark.stop()