# Testing Notebook 01

In [1]:
# count statistics
stats = {}

## Extract AWS keys from config file & store into environment variables

In [None]:
import configparser
import os

config = configparser.ConfigParser()
config.read_file(open('dl.cfg'))

os.environ["AWS_ACCESS_KEY_ID"] = config.get('AWS','KEY')
os.environ["AWS_SECRET_ACCESS_KEY"] = config.get('AWS','SECRET')

## Create spark session

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
                     .config("spark.jars.packages","org.apache.hadoop:hadoop-aws:2.7.0")\
                     .getOrCreate()

# Define schemas


In [4]:
from pyspark.sql.types import StructType as R, StructField as Fld, DoubleType as Dbl
from pyspark.sql.types import StringType as Str, IntegerType as Int, DateType as Date, LongType as Long

song_schema = R([
    Fld("artist_id", Str()),
    Fld("artist_latitude", Dbl()),
    Fld("artist_location", Str()),
    Fld("artist_longitude", Dbl()),
    Fld("artist_name", Str()),
    Fld("duration", Dbl()),
    Fld("num_songs", Long()),
    Fld("song_id", Str()),
    Fld("title", Str()),
    Fld("year", Long())
])

log_schema = R([
    Fld("artist", Str()),
    Fld("auth", Str()),
    Fld("firstName", Str()),
    Fld("gender", Str()),
    Fld("itemInSession", Long()),
    Fld("lastName", Str()),
    Fld("length", Dbl()),
    Fld("level", Str()),
    Fld("location", Str()),
    Fld("method", Str()),
    Fld("page", Str()),
    Fld("registration", Dbl()),
    Fld("sessionId", Long()),
    Fld("song", Str()),
    Fld("status", Long()),
    Fld("ts", Long()),
    Fld("userAgent", Str()),
    Fld("userId", Str())
])

## Load schema from S3

Access the public s3 bucket from http via: https://s3.console.aws.amazon.com/s3/buckets/udacity-dend/

In [5]:
# Declare S3 locations
s3_song = "s3://udacity-dend/song_data"
s3_log = "s3://udacity-dend/log_data"
input_data = "s3a://udacity-dend/"
output_data = "s3://udacity-dend-project-output-1995/"

In [6]:
# Create filepath for log data
log_data_folder = os.path.join(input_data, "log-data/")
log_files = "{}*/*/*events.json".format(log_data_folder)
log_files

's3a://udacity-dend/log-data/*/*/*events.json'

In [7]:
# Create filepath for song data
song_data_folder = os.path.join(input_data, "song-data/")
song_files = "{}*/*/*/*.json".format(song_data_folder)
song_files

's3a://udacity-dend/song-data/*/*/*/*.json'

In [8]:
# read song data file. NTS we have a lot of song data. 
# So only select a small amount for testing purposes.
song_files = "{}A/A/A/*.json".format(song_data_folder) # Select a small subset of the data
# song_files = "{}*/*/*/*.json".format(song_data_folder) # Select all the data
# song_files = "{}A/A/*/*.json".format(song_data_folder) # Only select files in the first A folder
print("Song files: ", song_files)
df_song = spark.read.json(song_files, schema = song_schema).dropDuplicates().cache()

Song files:  s3a://udacity-dend/song-data/A/A/A/*.json


In [9]:
# read log data file
df_log = spark.read.json(log_files, schema = log_schema).dropDuplicates().cache()

# Filter by nextSong action
df_log = df_log.filter(df_log.page == "NextSong").cache()

In [10]:
print("Song count: ", df_song.count())
print("Log count: ", df_log.count())

stats["df_log size"] = df_log.count()
stats["df_song size"] = df_song.count()

Song count:  24
Log count:  6820


In [11]:
# View the df schemas
df_log.printSchema()
df_log.show(5)

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)

+------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+
|            artist|     auth|firstName|gender|itemInSession| lastName|   length

In [12]:
# View the df schemas
df_song.printSchema()
df_song.show(5)

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)

+------------------+---------------+--------------------+----------------+--------------------+---------+---------+------------------+------------------+----+
|         artist_id|artist_latitude|     artist_location|artist_longitude|         artist_name| duration|num_songs|           song_id|             title|year|
+------------------+---------------+--------------------+----------------+--------------------+---------+---------+------------------+------------------+----+
|ARJNIUY12298900C91|           null|                    |            null|        Adelitas Way| 213.94

### NTS: Import whole data using the above schema

## Recall the flow of data
- Table: time
    - start_time - ts from df_log
    - hour - transform from start_time
    - day - transform from start_time
    - week - transform from start_time
    - month - transform from start_time
    - year - transform from start_time
- Table: users 
    - user_id - userId from df_log
    - first_name - firstName from df_logs
    - last_name - lastName from df_logs 
    - gender - gender from df_logs
    - level - level from df_logs
- Table: artists
    - artist_id - artist_id from df_song
    - name - artist_name from df_song
    - location - artist_location from df_song
    - lattitude - artist_latitude from df_song
    - longitude - artst_longitude from df_song
- Table: songs
    - song_id - song_id from df_song
    - title - title from df_song
    - artist_id - artist_id from df_song
    - year - year from df_song
    - duration - duration from df_song
- Table: songplays
    - songplay_id -
    - start_time -
    - user_id -
    - level -
    - song_id -
    - artist_id - 
    - session_id - 
    - location - 
    - user_agent - 

## Table: time

In [13]:
df_log.select("ts").dropDuplicates().sort("ts").show(5)

+-------------+
|           ts|
+-------------+
|1541106106796|
|1541106352796|
|1541106496796|
|1541106673796|
|1541107053796|
+-------------+
only showing top 5 rows



In [14]:
# Test a function for parsing to datetime
from pyspark.sql.functions import udf
from datetime import datetime
from pyspark.sql.types import TimestampType

# Input string is of type long. But since long doesn't exist in Python, use int.
sample_time_string_1 = int(1542296032796)
sample_time_string_2 = int(1541106496796)  

def convert_to_datetime(text):
    
    obj = datetime.fromtimestamp(text / 1000)
    return obj

datetime_obj = (convert_to_datetime(sample_time_string_1))
print(datetime_obj)

2018-11-15 15:33:52.796000


In [36]:
# Obtain the time stamp column
df_time = df_log.select("ts")
get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000), TimestampType())
df_time = df_time.withColumn("start_time", get_timestamp("ts"))
# df_time = df_time.drop("ts")
df_time.printSchema()

root
 |-- ts: long (nullable = true)
 |-- start_time: timestamp (nullable = true)



In [37]:
# Get the other columns: hour, day, week, month, year, weekday
from pyspark.sql.functions import hour, dayofmonth, month, year, weekofyear, dayofweek

df_time = df_time.withColumn("hour", hour("start_time"))
df_time = df_time.withColumn("day", dayofmonth("start_time"))
df_time = df_time.withColumn("month", month("start_time"))
df_time = df_time.withColumn("year", year("start_time"))
df_time = df_time.withColumn("week", weekofyear("start_time"))
df_time = df_time.withColumn("weekday", dayofweek("start_time"))

df_time.printSchema()
df_time.show(5)

root
 |-- ts: long (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)

+-------------+--------------------+----+---+-----+----+----+-------+
|           ts|          start_time|hour|day|month|year|week|weekday|
+-------------+--------------------+----+---+-----+----+----+-------+
|1542296032796|2018-11-15 15:33:...|  15| 15|   11|2018|  46|      5|
|1542299023796|2018-11-15 16:23:...|  16| 15|   11|2018|  46|      5|
|1542318319796|2018-11-15 21:45:...|  21| 15|   11|2018|  46|      5|
|1542321121796|2018-11-15 22:32:...|  22| 15|   11|2018|  46|      5|
|1542786093796|2018-11-21 07:41:...|   7| 21|   11|2018|  47|      4|
+-------------+--------------------+----+---+-----+----+----+-------+
only showing top 5 rows



In [41]:
# Create a datetime column
from pyspark.sql import functions as f
from pyspark.sql import types as t
df_time = df_time.withColumn('datetime', f.to_date(df_time.ts.cast(dataType=t.TimestampType())))
df_time.printSchema()
df_time.show(5)

root
 |-- ts: long (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- datetime: date (nullable = true)

+-------------+--------------------+----+---+-----+----+----+-------+-----------+
|           ts|          start_time|hour|day|month|year|week|weekday|   datetime|
+-------------+--------------------+----+---+-----+----+----+-------+-----------+
|1542296032796|2018-11-15 15:33:...|  15| 15|   11|2018|  46|      5|50843-06-01|
|1542299023796|2018-11-15 16:23:...|  16| 15|   11|2018|  46|      5|50843-07-06|
|1542318319796|2018-11-15 21:45:...|  21| 15|   11|2018|  46|      5|50844-02-14|
|1542321121796|2018-11-15 22:32:...|  22| 15|   11|2018|  46|      5|50844-03-17|
|1542786093796|2018-11-21 07:41:...|   7| 21|   11|2018|  47|      4|50858-12-11

In [17]:
# Get distinct rows only
from pyspark.sql.functions import col

time_table = df_time.select(col("start_time"), col("hour"), col("day"), col("week"), \
                           col("month"), col("year"), col("weekday")).distinct()

## Write the parquet file to local storage instead since cannot access S3

In [18]:
output_folder = "/home/workspace/parquet_output"
output = os.path.join(output_folder, 'time.parquet')
output

'/home/workspace/parquet_output/time.parquet'

In [19]:
# write time table to parquet files partitioned by year and month
time_table.write.partitionBy("year", "month").parquet(output, 'overwrite')

stats["df_time"] = time_table.count()

# Table: users

In [20]:
# Extract user data
df_users = df_log.select(col("firstName"), col("lastName"), col("gender"), col("level"), col("userId")).distinct()
df_users.printSchema()
df_users.show(5)

root
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)
 |-- userId: string (nullable = true)

+---------+--------+------+-----+------+
|firstName|lastName|gender|level|userId|
+---------+--------+------+-----+------+
|Katherine|     Gay|     F| free|    57|
|  Shakira|    Hunt|     F| free|    84|
|     Sean|  Wilson|     F| free|    22|
| Theodore|   Smith|     M| free|    52|
|    Tegan|  Levine|     F| paid|    80|
+---------+--------+------+-----+------+
only showing top 5 rows



In [21]:
# Get output path for user parquet
output = os.path.join(output_folder, 'users.parquet')

# write user table
df_users.write.parquet(output, 'overwrite')

stats["df_users"] = df_users.count()

# Table: artists

In [22]:
# Select artists columns
df_artists = df_song.select(col("artist_id"), col("artist_name"), col("artist_location"), \
                               col("artist_longitude"), col("artist_latitude")).distinct()

df_artists.printSchema()
df_artists.show(5)

root
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_latitude: double (nullable = true)

+------------------+-------------+---------------+----------------+---------------+
|         artist_id|  artist_name|artist_location|artist_longitude|artist_latitude|
+------------------+-------------+---------------+----------------+---------------+
|ARC1IHZ1187FB4E920| Jamie Cullum|               |            null|           null|
|ARZKCQM1257509D107|   Dataphiles|               |            null|           null|
|AREWD471187FB49873|     Son Kite|               |            null|           null|
|ARGE7G11187FB37E05| Cyndi Lauper|   Brooklyn, NY|            null|           null|
|ARSVTNL1187B992A91|Jonathan King|London, England|        -0.12714|       51.50632|
+------------------+-------------+---------------+----------------+---------------+
only showing

In [23]:
# Get output path for artist parquet file
output = os.path.join(output_folder, 'artist.parquet')

# write artist table
df_artists.write.parquet(output, 'overwrite')

stats["df_artists"] = df_artists.count()

## Table: Songs

In [24]:
# Select song columns
df_songs_table = df_song.select(col("song_id"), col("title"), col("artist_id"), col("year"), col("duration")).distinct()

df_songs_table.printSchema()
df_songs_table.show(5)

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- year: long (nullable = true)
 |-- duration: double (nullable = true)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SODZYPO12A8C13A91E|Burn My Body (Alb...|AR1C2IX1187B99BF74|   0|177.99791|
|SOIGHOD12A8C13B5A1|        Indian Angel|ARY589G1187B9A9F4E|2004|171.57179|
|SOOVHYF12A8C134892|     I'll Be Waiting|ARCLYBR1187FB53913|1989|304.56118|
|SOAPERH12A58A787DC|The One And Only ...|ARZ5H0P1187B98A1DD|   0|230.42567|
|SOHKNRJ12A6701D1F8|        Drop of Rain|AR10USD1187B99F3F1|   0|189.57016|
+------------------+--------------------+------------------+----+---------+
only showing top 5 rows



In [35]:
# Get output path for songs parquet file
output = os.path.join(output_folder, 'song.parquet')

# write songs table partition by year and artists
df_songs_table.write.partitionBy("year", "artist_id").parquet(output, 'overwrite')

stats["df_songs_table"] = df_songs_table.count()

# Table: songplays

In [26]:
# Register the SQL table as we have to run queries against it
df_song.createOrReplaceTempView("song_table_raw")

In [27]:
# Use the df_time table as we require the timestamp column
get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000), TimestampType())
df_time = df_log
df_time = df_time.withColumn("start_time", get_timestamp("ts"))
df_time = df_time.withColumn("month", month("start_time"))
df_time = df_time.withColumn("year", year("start_time"))
df_time.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)



In [28]:
# Extract unique song id, artist id, artist name
song_df = spark.sql("SELECT DISTINCT song_id, artist_id, artist_name FROM song_table_raw")

In [29]:
temp_table = df_time.join(song_df, song_df.artist_name == df_time.artist, "inner").distinct()
temp_table.printSchema()
temp_table.count()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)



10

In [30]:
from pyspark.sql.functions import monotonically_increasing_id

df_songplays = temp_table.select(["start_time", "userId", "level", "song_id", "artist_id", "sessionId", "location", "userAgent", "month", "year"])

df_songplays.printSchema()

root
 |-- start_time: timestamp (nullable = true)
 |-- userId: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- location: string (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)



In [31]:
# Add the id column
df_songplays = df_songplays.withColumn("songplay_id", monotonically_increasing_id()  )

In [32]:
# Get output path for songplays parquet
output = os.path.join(output_folder, 'songplays.parquet')
output

'/home/workspace/parquet_output/songplays.parquet'

In [50]:
# write user table
df_songplays.write.partitionBy("year", "month").parquet(output, 'overwrite')

stats["df_songplays"] = df_songplays.count()

In [34]:
print("OVerall statistics: ")
print(stats)

OVerall statistics: 
{'df_log size': 6820, 'df_song size': 24, 'df_time': 6813, 'df_users': 104, 'df_artists': 24, 'df_songs_table': 24, 'df_songplays': 10}


# Read parquet file of song table


In [44]:
song_parquet_filepath = os.path.join(output_folder, 'song.parquet')

In [45]:
parquetFile = spark.read.parquet(song_parquet_filepath)

In [46]:
parquetFile.printSchema()

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)



In [47]:
parquetFile.show(5)

+------------------+--------------------+---------+----+------------------+
|           song_id|               title| duration|year|         artist_id|
+------------------+--------------------+---------+----+------------------+
|SOBTCUI12A8AE48B70|Faust: Ballet Mus...| 94.56281|   0|ARSUVLW12454A4C8B8|
|SOVNKJI12A8C13CB0D|Take It To Da Hou...|227.10812|2001|ARWUNH81187FB4A3E0|
|SOYVBGZ12A6D4F92A8|Piano Sonata No. ...|221.70077|   0|ARLRWBW1242077EB29|
|SODBHKO12A58A77F36|Fingers Of Love (...|335.93424|   0|ARKGS2Z1187FB494B5|
|SOGXFIF12A58A78CC4|Hanging On (Mediu...|204.06812|   0|AR5LZJD1187FB4C5E5|
+------------------+--------------------+---------+----+------------------+
only showing top 5 rows



In [56]:
# read in song and artist data to use for songplays table
song_parquet_filepath = os.path.join(output_folder, 'song.parquet')
song_df = spark.read.parquet(song_parquet_filepath)

artist_parquet_filepath = os.path.join(output_folder, 'artist.parquet')
artist_df = spark.read.parquet(artist_parquet_filepath)

song_df.printSchema()
artist_df.printSchema()

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)

root
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_latitude: double (nullable = true)



In [73]:
# Join the song and artist df
temp_table = song_df.join(artist_df, ['artist_id'], "inner").distinct()
temp_table.printSchema()
temp_table.count()
temp_table.show(5)

root
 |-- artist_id: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_latitude: double (nullable = true)

+------------------+------------------+--------------------+---------+----+----------------+---------------+----------------+---------------+
|         artist_id|           song_id|               title| duration|year|     artist_name|artist_location|artist_longitude|artist_latitude|
+------------------+------------------+--------------------+---------+----+----------------+---------------+----------------+---------------+
|ARGE7G11187FB37E05|SONRWUU12AF72A4283|  Into The Nightlife|240.63955|2008|    Cyndi Lauper|   Brooklyn, NY|            null|           null|
|ARBZIN01187FB362CC|SOERIDA12A6D4F8506|I Want Y

In [71]:
temp_table.createOrReplaceTempView("song_table")

In [72]:
unique_song = spark.sql("SELECT DISTINCT song_id, artist_id, artist_name FROM song_table")
unique_song.printSchema()
unique_song.count()

root
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)



24

## Test writing output file to my own bucket
Access my s3 bucket from http via: https://s3.console.aws.amazon.com/s3/buckets/udacity-dend-project-output-1995/

In [60]:
import boto3

s3 = boto3.resource(
    's3',
    region_name='us-east-2',
    aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
)

content="String content to write to a new S3 file"
s3.Object('udacity-dend-project-output-1995', 'newfile.txt').put(Body=content)

{'ResponseMetadata': {'RequestId': '8E116C0B02D2ABC4',
  'HostId': '7992db/s2B+SPF+4ILkUrZj3Pb+JH5BEMF2TgPBTSS+r5w/NsayqZYLEqfBSyINpeWjRWYI0/8JW4m1KnB2McA==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '7992db/s2B+SPF+4ILkUrZj3Pb+JH5BEMF2TgPBTSS+r5w/NsayqZYLEqfBSyINpeWjRWYI0/8JW4m1KnB2McA==',
   'x-amz-request-id': '8E116C0B02D2ABC4',
   'date': 'Thu, 21 Jan 2021 05:39:13 GMT',
   'etag': '"4a4ba548fe7ddb965593f41a13e1df90"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"4a4ba548fe7ddb965593f41a13e1df90"'}

0