In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import col, udf
from datetime import datetime
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, dayofweek, monotonically_increasing_id
from pyspark.sql.types import TimestampType, DateType
import os
import configparser

In [2]:
# Load AWS credentials from the local 'dl.cfg' file and export them as
# environment variables for the rest of the notebook session.
config = configparser.ConfigParser()
# read_file() on an explicitly opened file raises FileNotFoundError when
# 'dl.cfg' is missing; config.read() would silently skip the file and the
# failure would only surface below as an opaque KeyError: 'AWS'.
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
# Create (or reuse) the Spark session, requesting the hadoop-aws package
# through the spark.jars.packages setting.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [4]:
# Exploratory read: only the JSON files under the A/A/A prefix of the song
# dataset are matched by this glob, not the whole song_data tree.
song_data = 's3a://udacity-dend/song_data/A/A/A/*.json'
    
# Read the matched song JSON files into a DataFrame named `df`,
# which the next cell previews.
df = spark.read.json(song_data)

In [5]:
# Preview the first 5 rows of the raw song DataFrame (long values are truncated by default).
df.show(5)

+------------------+---------------+--------------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|         artist_id|artist_latitude|     artist_location|artist_longitude|         artist_name| duration|num_songs|           song_id|               title|year|
+------------------+---------------+--------------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|ARTC1LV1187B9A4858|        51.4536|Goldsmith's Colle...|        -0.01802|  The Bonzo Dog Band|301.40036|        1|SOAFBCP12A8C13CC7D|King Of Scurf (20...|1972|
|ARA23XO1187B9AF18F|       40.57885|Carteret, New Jersey|       -74.21956|     The Smithereens|  192.522|        1|SOKTJDS12AF72A25E5|Drown In My Own T...|   0|
|ARSVTNL1187B992A91|       51.50632|     London, England|        -0.12714|       Jonathan King|129.85424|        1|SOEKAZG12AB018837E|I'll Slap Your Fa...|2001|
|AR73AIO1187B9AD57B|       37.7791

In [6]:
input_data = 's3a://udacity-dend/'
output_data = 's3a://sparkify-table-bucket/'


def process_song_data(spark=spark, input_data=input_data, output_data=output_data):
    """Build the songs and artists tables from the raw song JSON files.

    Reads the song files matched by 'song_data/A/A/A/*.json' under
    ``input_data``, derives the two tables, writes each one as parquet under
    ``output_data`` (overwriting any existing output) and prints a 5-row
    sample of every DataFrame along the way.

    Args:
        spark: Spark session used to read the JSON input. Defaults to the
            notebook-level ``spark`` as it was bound when this cell ran.
        input_data: Path prefix of the source data.
        output_data: Path prefix under which 'songs' and 'artists' are written.

    Returns:
        None. The results are the parquet outputs and the printed samples.
    """
    # Load the raw song records and preview them untruncated.
    raw_songs = spark.read.json(input_data + 'song_data/A/A/A/*.json')
    raw_songs.show(5, truncate=False)

    # songs table: song_id, title, artist_id, year, duration
    song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']
    songs_table = raw_songs.select(*song_columns).distinct()

    # Songs are stored partitioned by year and artist.
    songs_writer = songs_table.write.partitionBy('year', 'artist_id').mode('overwrite')
    songs_writer.parquet(output_data + 'songs')
    songs_table.show(5, truncate=False)

    # artists table: artist_id, name, location, latitude, longitude
    # NOTE(review): distinct() compares whole rows, so an artist_id whose
    # name/location varies between files would appear more than once --
    # confirm whether artist_id is expected to be unique here.
    artist_columns = [
        col('artist_id'),
        col('artist_name').alias('name'),
        col('artist_location').alias('location'),
        col('artist_latitude').alias('latitude'),
        col('artist_longitude').alias('longitude'),
    ]
    artists_table = raw_songs.select(*artist_columns).distinct()

    # Artists are written unpartitioned.
    artists_table.write.mode('overwrite').parquet(output_data + 'artists')
    artists_table.show(5, truncate=False)

In [7]:
# Run the song ETL with its default arguments (notebook-level spark, input_data, output_data).
process_song_data()

+------------------+---------------+---------------------------------+----------------+-------------------------+---------+---------+------------------+------------------------------------------------------+----+
|artist_id         |artist_latitude|artist_location                  |artist_longitude|artist_name              |duration |num_songs|song_id           |title                                                 |year|
+------------------+---------------+---------------------------------+----------------+-------------------------+---------+---------+------------------+------------------------------------------------------+----+
|ARTC1LV1187B9A4858|51.4536        |Goldsmith's College, Lewisham, Lo|-0.01802        |The Bonzo Dog Band       |301.40036|1        |SOAFBCP12A8C13CC7D|King Of Scurf (2007 Digital Remaster)                 |1972|
|ARA23XO1187B9AF18F|40.57885       |Carteret, New Jersey             |-74.21956       |The Smithereens          |192.522  |1        |SOKTJDS12AF72A2

In [8]:
input_data = 's3a://udacity-dend/'
output_data = 's3a://sparkify-table-bucket/'

def process_log_data(spark = spark, input_data = input_data, output_data = output_data):
    """Build the users, time and songplays tables from the event logs.

    Reads the log files matched by 'log_data/2018/11/*.json' under
    ``input_data``, keeps only the 'NextSong' page events, and writes three
    parquet outputs under ``output_data`` (each overwriting existing output):
    'users', 'time' (partitioned by year and month) and 'songplays'
    (partitioned by year and month). A 5-row sample of each intermediate
    DataFrame is printed along the way.

    Args:
        spark: Spark session used to read the JSON input. Defaults to the
            notebook-level ``spark`` as it was bound when this cell ran.
        input_data: Path prefix of the source data.
        output_data: Path prefix under which the tables are written.

    Returns:
        None. The results are the parquet outputs and the printed samples.
    """
    # read log data file
    df_log = spark.read.json(input_data + 'log_data/2018/11/*.json')

    # keep only the events that represent a song play
    df_log = df_log.filter(df_log.page == 'NextSong')
    df_log.show(5, truncate=False)

    # extract columns for users table
    # NOTE(review): 'level' is part of the distinct key, so a user whose level
    # differs between events yields one row per level -- confirm this is intended.
    users_table = df_log.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        'gender',
        'level',
    ).distinct()

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users')
    users_table.show(5, truncate=False)

    # Derive a timestamp column from the epoch-millisecond 'ts' column.
    # The udf return type must be declared as TimestampType, otherwise the
    # result would not be a timestamp column. A null 'ts' is passed through as
    # null instead of raising TypeError inside the udf and failing the job.
    get_timestamp = udf(
        lambda ts: None if ts is None else datetime.fromtimestamp(ts / 1000),
        TimestampType(),
    )
    df_log = df_log.withColumn('start_time', get_timestamp(df_log.ts))
    df_log.show(5, truncate=False)

    # extract columns to create time table
    # distinct() keeps one row per start_time; without it the table would
    # contain one row per play event, unlike the other tables which are all
    # de-duplicated before being written.
    time_table = (
        df_log.select('start_time')
        .distinct()
        .withColumn('hour', hour('start_time'))
        .withColumn('day', dayofmonth('start_time'))
        .withColumn('week', weekofyear('start_time'))
        .withColumn('month', month('start_time'))
        .withColumn('year', year('start_time'))
        .withColumn('weekday', dayofweek('start_time'))
    )

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').mode('overwrite').parquet(output_data + 'time')
    time_table.show(5, truncate=False)

    # read in song data to use for songplays table
    df_song = spark.read.json(input_data + 'song_data/A/A/A/*.json')

    # A log event matches a song record when title, artist name and
    # duration all agree.
    same_track = (
        (df_log.song == df_song.title)
        & (df_log.artist == df_song.artist_name)
        & (df_log.length == df_song.duration)
    )

    # extract columns from joined song and log datasets to create songplays table
    # The inner join keeps only the events that have a matching song record.
    songplays_table = (
        df_log.join(df_song, same_track, "inner")
        .distinct()
        .select(
            'start_time',
            col('userId').alias('user_id'),
            'level',
            'song_id',
            'artist_id',
            col('sessionId').alias('session_id'),
            'location',
            col('userAgent').alias('user_agent'),
        )
        .withColumn("songplay_id", monotonically_increasing_id())
        .withColumn('year', year('start_time'))
        .withColumn('month', month('start_time'))
    )

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').mode('overwrite').parquet(output_data + 'songplays')
    songplays_table.show(5, truncate=False)

In [9]:
# Run the log ETL with its default arguments (notebook-level spark, input_data, output_data).
process_log_data()

+-----------+---------+---------+------+-------------+--------+---------+-----+------------------------------------+------+--------+-----------------+---------+----------------------------------------------+------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------+------+
|artist     |auth     |firstName|gender|itemInSession|lastName|length   |level|location                            |method|page    |registration     |sessionId|song                                          |status|ts           |userAgent                                                                                                                                |userId|
+-----------+---------+---------+------+-------------+--------+---------+-----+------------------------------------+------+--------+-----------------+---------+----------------------------------------------+------+-------------+------------------------