In [2]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

# Load the pipeline settings from the local dl.cfg file.
# ConfigParser.read() skips files it cannot open and returns the list of
# files it actually parsed; that list is what this cell displays.
config = configparser.ConfigParser()
config.read(['dl.cfg'])

['dl.cfg']

In [3]:
def create_spark_session():
    """Return a SparkSession configured with the hadoop-aws package.

    The session is obtained through ``getOrCreate()`` on a builder whose
    ``spark.jars.packages`` option is set to
    ``org.apache.hadoop:hadoop-aws:2.7.0``.

    Returns:
        The SparkSession produced by the builder.
    """
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return builder.getOrCreate()

In [4]:
# Session object used by the cells below for reading data and running SQL.
spark = create_spark_session()

# Process song-data

### Process songs

In [6]:
# Input and output locations come from the [LOCAL] section of dl.cfg.
local_cfg = config['LOCAL']
sd_input = local_cfg['SONG_DATA']
output_data = local_cfg['OUTPUT_DATA']

In [7]:
# Read the raw song JSON and register it as the `song_data` view so the
# following cells can query it with Spark SQL.
df_sd = spark.read.json(sd_input)
df_sd.createOrReplaceTempView("song_data")

# Show the schema Spark derived from the JSON files.
df_sd.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [8]:
# Number of rows in the raw song DataFrame.
df_sd.count()

71

In [9]:
# Songs table: the distinct song-level columns of the `song_data` view.
songs = spark.sql("""
    SELECT distinct song_id, title, artist_id, year, duration 
    FROM song_data
""")

In [10]:
# Preview the first five rows of the songs table.
songs.show(5)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOGNCJP12A58A80271|Do You Finally Ne...|ARB29H41187B98F0EF|1972|342.56934|
|SOOJPRH12A8C141995|   Loaded Like A Gun|ARBGXIG122988F409D|   0|173.19138|
|SOFCHDR12AB01866EF|         Living Hell|AREVWGE1187B9B890A|   0|282.43546|
|SOWTBJW12AC468AC6E|Broken-Down Merry...|ARQGYP71187FB44566|   0|151.84934|
|SOGOSOV12AF72A285E|   ¿Dónde va Chichi?|ARGUVEV1187B98BA17|1997|313.12934|
+------------------+--------------------+------------------+----+---------+
only showing top 5 rows



In [16]:
# Write the songs table as parquet in overwrite mode, partitioned by
# year and artist_id.
songs_path = output_data + 'songs.parquet'
songs_writer = songs.write.mode("overwrite").partitionBy("year", "artist_id")
songs_writer.parquet(songs_path)

### Process artists

In [25]:
%%time
# Register the same raw song DataFrame under a second view name,
# `artist_data`, which the artists query below reads from.
df_sd.createOrReplaceTempView("artist_data")

CPU times: user 898 µs, sys: 141 µs, total: 1.04 ms
Wall time: 21 ms


In [26]:
# Print the source schema again to see which artist_* columns are available.
df_sd.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [28]:
%%time
# Artists table: distinct artist columns from the `artist_data` view, with
# the `artist_` prefix dropped from the column names.
# NOTE(review): the variable name `atrists` and the alias `lattitude` are
# misspelled; later cells reference `atrists`, and the alias becomes a column
# name in the written output, so renaming either needs a coordinated change.
atrists = spark.sql("""
    SELECT distinct artist_id, 
        artist_name as name,
        artist_location as location, 
        artist_latitude as lattitude,
        artist_longitude as longitude
    FROM artist_data
""")

CPU times: user 931 µs, sys: 155 µs, total: 1.09 ms
Wall time: 23.4 ms


In [33]:
%%time
# Sanity check: show up to five artist_id values that occur in more than one
# row of the artists table.
atrists.groupby('artist_id').count().filter('count > 1').show(5)

+---------+-----+
|artist_id|count|
+---------+-----+
+---------+-----+

CPU times: user 0 ns, sys: 3.62 ms, total: 3.62 ms
Wall time: 3.75 s


In [34]:
# Confirm the renamed columns of the artists table.
atrists.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- lattitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [35]:
%%time
# Write the artists table as parquet in overwrite mode (no partitioning).
atrists_path = output_data + 'atrists_table'
atrists_writer = atrists.write.mode("overwrite")
atrists_writer.parquet(atrists_path)

CPU times: user 1.78 ms, sys: 312 µs, total: 2.09 ms
Wall time: 4.78 s


### Process song_data

In [40]:
def process_song_data(spark, input_data, output_data):
    """Build the songs and artists tables from song JSON and write them as parquet.

    Reads the JSON at ``input_data``, derives two distinct projections with
    Spark SQL (songs and artists), prints the schema, a five-row preview and
    the elapsed wall time of every step, and writes both tables in overwrite
    mode under ``output_data``.

    Args:
        spark: Session object used for ``spark.read.json`` and ``spark.sql``.
        input_data: Location of the song JSON, passed to ``spark.read.json``.
        output_data: Output location prefix. The directory names
            ``'songs_table'`` and ``'atrists_table'`` are appended by plain
            string concatenation, so the prefix must already end with a
            path separator.

    Returns:
        None. The results are the parquet output and the printed progress.
    """
    # get filepath to song data file
    song_data = input_data

    # read song data file
    print('start reading song data')
    start = datetime.now()
    df = spark.read.json(song_data)
    print('finished:', f"{(datetime.now() - start).total_seconds()} s")
    df.printSchema()
    print('------------')

    # extract columns to create songs table
    print('start songs columns extractaction')
    start = datetime.now()
    df.createOrReplaceTempView("songs_table")
    songs_table = spark.sql("""
        SELECT distinct song_id, title, artist_id, year, duration 
        FROM songs_table
    """)
    print('finished:', f"{(datetime.now() - start).total_seconds()} s")
    songs_table.printSchema()
    songs_table.show(5)
    print('------------')

    # write songs table to parquet files partitioned by year and artist
    print('Start songs table writting')
    start = datetime.now()
    songs_path = output_data + 'songs_table'
    songs_table.write.mode("overwrite").partitionBy("year", "artist_id")\
        .parquet(songs_path)
    print('finished:', f"{(datetime.now() - start).total_seconds()} s")
    print('------------')

    # extract columns to create artists table
    print('start atrists columns extraction')
    start = datetime.now()
    df.createOrReplaceTempView("artists_table")
    # Query the view registered on the line above, so the function works on
    # the DataFrame it just read and does not depend on a view created by
    # some other notebook cell.
    artists_table = spark.sql("""
        SELECT distinct artist_id, 
            artist_name as name,
            artist_location as location, 
            artist_latitude as lattitude,
            artist_longitude as longitude
        FROM artists_table
    """)
    print('finished:', f"{(datetime.now() - start).total_seconds()} s")
    artists_table.printSchema()
    artists_table.show(5)
    print('------------')

    # write artists table to parquet files
    print('Start artists table writing')
    start = datetime.now()
    # NOTE: the directory name 'atrists_table' is misspelled but kept as-is,
    # because anything reading the existing output expects this path.
    atrists_path = output_data + 'atrists_table'
    artists_table.write.mode("overwrite").parquet(atrists_path)
    print('finished:', f"{(datetime.now() - start).total_seconds()} s")
    print('------------')

In [42]:
# Run the song-data pipeline with the locations from the [LOCAL] section
# of dl.cfg.
local_cfg = config['LOCAL']
input_data = local_cfg['SONG_DATA']
output_data = local_cfg['OUTPUT_DATA']
process_song_data(spark, input_data, output_data)

start reading song data
finished: 1.374122 s
root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)

------------
start songs columns extractaction
finished: 0.02306 s
root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- year: long (nullable = true)
 |-- duration: double (nullable = true)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOGNCJP12A58A80271|Do You Finally Ne...|A