In [53]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, to_timestamp, to_date
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql import types as t
from pyspark.sql import functions as F




In [None]:
config = configparser.ConfigParser()
config.read('dl.cfg')

os.environ['AWS_ACCESS_KEY_ID']=config['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS_SECRET_ACCESS_KEY']



In [2]:
spark = SparkSession \
        .builder \
        .appName("Udacity P4 Aleaume") \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

In [None]:
spark.sparkContext.getConf().getAll()

In [3]:
spark

# Handling Song Data 

In [4]:
path = "data/song_data/*/*/*/*.json"

df = spark.read.json(path)

In [29]:
df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [21]:
# extract columns to create songs table

songs_table= df.select(["song_id","title","artist_id","year","duration"])

In [26]:
songs_table.printSchema()

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- duration: double (nullable = true)



In [28]:
# write songs table to parquet files partitioned by year and artist

songs_table.write.option("header",True) \
           .partitionBy("artist_id","year") \
           .mode("overwrite") \
           .parquet("data/output/songs/")

In [34]:
# extract columns to create artists table
artists_table_raw = df.select(["artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude"])

artists_table = artists_table_raw.withColumnRenamed("artist_name", "name") \
                                 .withColumnRenamed("artist_location","location") \
                                 .withColumnRenamed("artist_latitude","latitude") \
                                 .withColumnRenamed("artist_longitude","longitutde")

In [35]:
artists_table.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitutde: double (nullable = true)



In [36]:
artists_table.write.save("data/output/artists/", format="parquet", header=True)

# Handling Log Data

In [4]:
# get filepath to log data file

log_data = "data/log_data/*.json"


In [116]:
# read log data file

df = spark.read.json(log_data)


In [18]:
df.printSchema()
df.count()


root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



8056

In [99]:
# filter by actions for song plays (page = NextSong)

df = df.select("*").where(df.page == "NextSong")
df.count()

6820

In [21]:
 # extract columns for users table 

users_table = df.select(["userId","firstName","lastName","gender","level"])

users_table = users_table.withColumnRenamed("userId", "user_id") \
                         .withColumnRenamed("firstName","first_name") \
                         .withColumnRenamed("lastName","last_name")


In [22]:
users_table.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)



In [23]:
# write users table to parquet files
users_table.write.save("data/output/users/", format="parquet", header=True)

In [52]:
df_pd = df.toPandas()
df_pd.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
3,,Logged In,Wyatt,M,0,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1540872000000.0,563,,200,1542247071796,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,9
4,,Logged In,Austin,M,0,Rosales,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1541060000000.0,521,,200,1542252577796,Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20...,12


In [117]:
# create timestamp column from original timestamp column
get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000), t.TimestampType())
df = df.withColumn("start_time", get_timestamp(df.ts))

df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- start_time: timestamp (nullable = true)



In [118]:
# create datetime column from original timestamp column
get_datetime = udf(lambda x: from_unixtime(x), t.DateType())

df = df.withColumn("date_time", to_date(df.start_time))
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- date_time: date (nullable = true)



In [119]:
df_pd = df.toPandas()
df_pd.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId,start_time,date_time
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:30:26.796,2018-11-15
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:41:21.796,2018-11-15
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:45:41.796,2018-11-15
3,,Logged In,Wyatt,M,0,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1540872000000.0,563,,200,1542247071796,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,9,2018-11-15 01:57:51.796,2018-11-15
4,,Logged In,Austin,M,0,Rosales,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1541060000000.0,521,,200,1542252577796,Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20...,12,2018-11-15 03:29:37.796,2018-11-15


In [120]:
# extract columns to create time table
df = df.withColumn("hour", hour(col("start_time"))) \
    .withColumn("day", dayofmonth(col("start_time"))) \
    .withColumn("week", weekofyear(col("start_time"))) \
    .withColumn("month", month(col("start_time"))) \
    .withColumn("year", year(col("start_time"))) \
    .withColumn("weekday", date_format(col("start_time"),"EEEE"))
    
time_table = df.select("start_time", "hour", "day", "week", "month", "year", "weekday")
    

In [121]:
time_table.printSchema()

root
 |-- start_time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekday: string (nullable = true)



In [122]:
# write time table to parquet files partitioned by year and month
time_table.write.save("data/output/time/", format="parquet", header=True)

In [123]:
# read in song data to use for songplays table
song_df = spark.read.parquet("data/output/songs")


In [124]:
song_df.printSchema()

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- year: integer (nullable = true)



In [135]:
# extract columns from joined song and log datasets to create songplays table 
songplays_table = df.join(song_df, (df.song == song_df.title) & (df.length == song_df.duration), "inner") \
                    .select(df.start_time, col("userId").alias("user_id"), df.level, \
                            song_df.song_id, song_df.artist_id, col("sessionId").alias("session_id"), \
                            df.location, col("userAgent").alias("user_agent")) \
                    .withColumn("songplay_id",F.monotonically_increasing_id())

songplays_table = songplays_table.select("songplay_id","user_id","level","song_id","artist_id","session_id","location","user_agent")

#https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.DataFrame.join.html+

In [136]:
songplays_table.printSchema()

root
 |-- songplay_id: long (nullable = false)
 |-- user_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)



In [137]:

# write songplays table to parquet files partitioned by year and month
songplays_table.write.save("data/output/songplays/", format="parquet", header=True)

# Path adaptation

In [138]:
input_data = "s3a://udacity-dend/" 
song_data = input_data+"song_data/*/*/*/*.json"

print(song_data)

s3a://udacity-dend/song_data/*/*/*/*.json
