### Test queries

In [1]:
import configparser
import os
from pyspark.sql import SparkSession

In [2]:
# Load AWS credentials from dl.cfg and export them as environment variables
# (presumably consumed by the hadoop-aws S3A connector when output_data is an
# s3a:// path -- confirm).
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail here with a clear message instead of
# letting the lookups below die with a bare KeyError: 'AWS'.
if not config.read('dl.cfg'):
    raise FileNotFoundError("Could not read AWS settings from 'dl.cfg'")

os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
# Build (or reuse) the SparkSession, requesting the hadoop-aws package
# (presumably needed for s3a:// locations -- confirm).
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [4]:
# Root location that the parquet reads in this notebook are resolved against.
# Alternative S3 location, kept commented out:
#output_data = "s3a://sparkify-bp/analytics/"
# Active setting: a relative local directory.
output_data = "analytics/"

In [5]:
# Load the songs table from parquet, then show its schema, summary
# statistics for song_id, and a four-row sample.
songs_path = output_data + "songs.parquet"
songs_df = spark.read.parquet(songs_path)

songs_df.printSchema()
songs_id_summary = songs_df.describe("song_id")
songs_id_summary.show()
songs_df.limit(4).toPandas()

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)

+-------+------------------+
|summary|           song_id|
+-------+------------------+
|  count|                72|
|   mean|              null|
| stddev|              null|
|    min|***UNKNOWN_SONG***|
|    max|SOZVMJI12AB01808AF|
+-------+------------------+



Unnamed: 0,song_id,title,duration,year,artist_id
0,SOAOIBZ12AB01815BE,I Hold Your Hand In Mine [Live At Royal Albert...,43.36281,2000,ARPBNLO1187FB3D52F
1,SONYPOM12A8C13B2D7,I Think My Wife Is Running Around On Me (Taco ...,186.48771,2005,ARDNS031187B9924F0
2,SODREIN12A58A7F2E5,A Whiter Shade Of Pale (Live @ Fillmore West),326.00771,0,ARLTWXK1187FB5A3F8
3,SOYMRWW12A6D4FAB14,The Moon And I (Ordinary Day Album Version),267.7024,0,ARKFYS91187B98E58F


In [6]:
# Register songs_df as the temp view "songs" (replacing any existing view of
# that name) so spark.sql() queries can refer to it as a table.
songs_df.createOrReplaceTempView("songs") 

In [7]:
# q1: all songs ordered by song_id; display the first four rows.
q1_sql = """
SELECT song_id, title, artist_id, year, duration
  FROM songs
 ORDER BY song_id
"""
q1 = spark.sql(q1_sql)

q1.limit(4).toPandas()

Unnamed: 0,song_id,title,artist_id,year,duration
0,***UNKNOWN_SONG***,***Unknown Song***,***UNKNOWN_ARTIST***,0,0.0
1,SOAOIBZ12AB01815BE,I Hold Your Hand In Mine [Live At Royal Albert...,ARPBNLO1187FB3D52F,2000,43.36281
2,SOBAYLL12A8C138AF9,Sono andati? Fingevo di dormire,ARDR4AC1187FB371A1,0,511.16363
3,SOBBUGU12A8C13E95D,Setting Fire to Sleeping Giants,ARMAC4T1187FB3FA4C,2004,207.77751


In [8]:
# q2: compare the number of non-null song_id values with the number of
# distinct ones; equal counts mean no song_id is repeated.
q2_sql = """
SELECT COUNT(song_id) AS count,
       COUNT(DISTINCT song_id) as count_distinct
  FROM songs
"""
q2 = spark.sql(q2_sql)

q2.show()

+-----+--------------+
|count|count_distinct|
+-----+--------------+
|   72|            72|
+-----+--------------+



In [9]:
# Load the artists table from parquet, then show its schema, summary
# statistics for artist_id, and a four-row sample.
artists_path = output_data + "artists.parquet"
artists_df = spark.read.parquet(artists_path)

artists_df.printSchema()
artists_id_summary = artists_df.describe("artist_id")
artists_id_summary.show()
artists_df.limit(4).toPandas()

root
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)

+-------+--------------------+
|summary|           artist_id|
+-------+--------------------+
|  count|                  70|
|   mean|                null|
| stddev|                null|
|    min|***UNKNOWN_ARTIST***|
|    max|  ARYKCQI1187FB3B18F|
+-------+--------------------+



Unnamed: 0,artist_id,name,location,latitude,longitude
0,ARMAC4T1187FB3FA4C,The Dillinger Escape Plan,"Morris Plains, NJ",40.82624,-74.47995
1,ARNF6401187FB57032,Sophie B. Hawkins,"New York, NY [Manhattan]",40.79086,-73.96644
2,AROUOZZ1187B9ABE51,Willie Bobo,"New York, NY [Spanish Harlem]",40.79195,-73.94512
3,ARI2JSK1187FB496EF,Nick Ingman;Gavyn Wright,"London, England",51.50632,-0.12714


In [10]:
# Register artists_df as the temp view "artists" (replacing any existing view
# of that name) so spark.sql() queries can refer to it as a table.
artists_df.createOrReplaceTempView("artists") 

In [11]:
# q3: every artists column, ordered by artist_id; display the first four rows.
q3_sql = """
SELECT *
  FROM artists 
 ORDER BY artist_id
"""
q3 = spark.sql(q3_sql)

q3.limit(4).toPandas()

Unnamed: 0,artist_id,name,location,latitude,longitude
0,***UNKNOWN_ARTIST***,*** Unknown Artist ***,,0.0,0.0
1,AR051KA1187B98B2FF,Wilks,,,
2,AR0IAWL1187B9A96D0,Danilo Perez,Panama,8.4177,-80.11278
3,AR0RCMP1187FB3F427,Billie Jo Spears,"Beaumont, TX",30.08615,-94.10158


In [12]:
# q4: compare the number of non-null artist_id values with the number of
# distinct ones; equal counts mean no artist_id is repeated.
q4_sql = """
SELECT COUNT(artist_id) AS count,
       COUNT(DISTINCT artist_id) as count_distinct
  FROM artists
"""
q4 = spark.sql(q4_sql)

q4.show()

+-----+--------------+
|count|count_distinct|
+-----+--------------+
|   70|            70|
+-----+--------------+



In [13]:
# q5: inner-join artists to songs on artist_id, ordered by song_id.
# Show summary statistics for song_id, then the first four joined rows.
q5_sql = """
SELECT a.artist_id, a.name, a.location, a.latitude, a.longitude, s.song_id, s.title, s.duration, s.year
  FROM artists a
  JOIN songs s ON a.artist_id = s.artist_id
 ORDER BY s.song_id
"""
q5 = spark.sql(q5_sql)

q5_song_id_summary = q5.describe("song_id")
q5_song_id_summary.show()
q5.limit(4).toPandas()

+-------+------------------+
|summary|           song_id|
+-------+------------------+
|  count|                72|
|   mean|              null|
| stddev|              null|
|    min|***UNKNOWN_SONG***|
|    max|SOZVMJI12AB01808AF|
+-------+------------------+



Unnamed: 0,artist_id,name,location,latitude,longitude,song_id,title,duration,year
0,***UNKNOWN_ARTIST***,*** Unknown Artist ***,,0.0,0.0,***UNKNOWN_SONG***,***Unknown Song***,0.0,0
1,ARPBNLO1187FB3D52F,Tiny Tim,"New York, NY",40.71455,-74.00712,SOAOIBZ12AB01815BE,I Hold Your Hand In Mine [Live At Royal Albert...,43.36281,2000
2,ARDR4AC1187FB371A1,Montserrat Caballé;Placido Domingo;Vicente Sar...,,,,SOBAYLL12A8C138AF9,Sono andati? Fingevo di dormire,511.16363,0
3,ARMAC4T1187FB3FA4C,The Dillinger Escape Plan,"Morris Plains, NJ",40.82624,-74.47995,SOBBUGU12A8C13E95D,Setting Fire to Sleeping Giants,207.77751,2004


In [14]:
# Display the first four rows of q5 again (same expression that ends the cell
# defining q5).
q5.limit(4).toPandas()

Unnamed: 0,artist_id,name,location,latitude,longitude,song_id,title,duration,year
0,***UNKNOWN_ARTIST***,*** Unknown Artist ***,,0.0,0.0,***UNKNOWN_SONG***,***Unknown Song***,0.0,0
1,ARPBNLO1187FB3D52F,Tiny Tim,"New York, NY",40.71455,-74.00712,SOAOIBZ12AB01815BE,I Hold Your Hand In Mine [Live At Royal Albert...,43.36281,2000
2,ARDR4AC1187FB371A1,Montserrat Caballé;Placido Domingo;Vicente Sar...,,,,SOBAYLL12A8C138AF9,Sono andati? Fingevo di dormire,511.16363,0
3,ARMAC4T1187FB3FA4C,The Dillinger Escape Plan,"Morris Plains, NJ",40.82624,-74.47995,SOBBUGU12A8C13E95D,Setting Fire to Sleeping Giants,207.77751,2004


In [15]:
# Load the song keys table from parquet, then show its schema, summary
# statistics for artist_id, and a four-row sample.
song_keys_path = output_data + "song_keys.parquet"
song_keys_df = spark.read.parquet(song_keys_path)

song_keys_df.printSchema()
song_keys_artist_summary = song_keys_df.describe("artist_id")
song_keys_artist_summary.show()
song_keys_df.limit(4).toPandas()

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)

+-------+------------------+
|summary|         artist_id|
+-------+------------------+
|  count|                71|
|   mean|              null|
| stddev|              null|
|    min|AR051KA1187B98B2FF|
|    max|ARYKCQI1187FB3B18F|
+-------+------------------+



Unnamed: 0,song_id,title,duration,artist_id,artist_name
0,SOBAYLL12A8C138AF9,Sono andati? Fingevo di dormire,511.16363,ARDR4AC1187FB371A1,Montserrat Caballé;Placido Domingo;Vicente Sar...
1,SONYPOM12A8C13B2D7,I Think My Wife Is Running Around On Me (Taco ...,186.48771,ARDNS031187B9924F0,Tim Wilson
2,SOPEGZN12AB0181B3D,Get Your Head Stuck On Your Neck,45.66159,AREDL271187FB40F44,Soul Mekanik
3,SODUJBS12A8C132150,Wessex Loses a Bride,111.62077,ARI2JSK1187FB496EF,Nick Ingman;Gavyn Wright


In [16]:
# Register song_keys_df as the temp view "song_keys" (replacing any existing
# view of that name) so spark.sql() queries can refer to it as a table.
song_keys_df.createOrReplaceTempView("song_keys") 

In [17]:
# Load the users table from parquet, then show its schema, summary
# statistics for user_id, and a four-row sample.
users_path = output_data + "users.parquet"
users_df = spark.read.parquet(users_path)

users_df.printSchema()
users_id_summary = users_df.describe("user_id")
users_id_summary.show()
users_df.limit(4).toPandas()

root
 |-- user_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)

+-------+-----------------+
|summary|          user_id|
+-------+-----------------+
|  count|               96|
|   mean|         51.65625|
| stddev|29.05886154360708|
|    min|                2|
|    max|              101|
+-------+-----------------+



Unnamed: 0,user_id,first_name,last_name,gender,level
0,88,Mohammad,Rodriguez,M,free
1,4,Alivia,Terrell,F,free
2,55,Martin,Johnson,M,free
3,59,Lily,Cooper,F,free


In [18]:
# Register users_df as the temp view "users" (replacing any existing view of
# that name) so spark.sql() queries can refer to it as a table.
users_df.createOrReplaceTempView("users") 

In [19]:
# q6: every users column, ordered by user_id; display the first four rows.
q6_sql = """
SELECT *
  FROM users
 ORDER BY user_id
"""
q6 = spark.sql(q6_sql)

q6.limit(4).toPandas()

Unnamed: 0,user_id,first_name,last_name,gender,level
0,2,Jizelle,Benjamin,F,free
1,3,Isaac,Valdez,M,free
2,4,Alivia,Terrell,F,free
3,5,Elijah,Davis,M,free


In [20]:
# Display the first four rows of q6 again (same expression that ends the cell
# defining q6).
q6.limit(4).toPandas()

Unnamed: 0,user_id,first_name,last_name,gender,level
0,2,Jizelle,Benjamin,F,free
1,3,Isaac,Valdez,M,free
2,4,Alivia,Terrell,F,free
3,5,Elijah,Davis,M,free


In [21]:
# Load the time table from parquet, then show its schema, summary
# statistics for the day column, and a four-row sample.
time_path = output_data + "time.parquet"
time_df = spark.read.parquet(time_path)

time_df.printSchema()
time_day_summary = time_df.describe("day")
time_day_summary.show()
time_df.limit(4).toPandas()

root
 |-- start_time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

+-------+------------------+
|summary|               day|
+-------+------------------+
|  count|              6813|
|   mean|17.253192426243945|
| stddev| 8.109173624234918|
|    min|                 1|
|    max|                30|
+-------+------------------+



Unnamed: 0,start_time,hour,day,week,weekday,year,month
0,2018-11-15 18:48:45,18,15,46,5,2018,11
1,2018-11-15 19:26:51,19,15,46,5,2018,11
2,2018-11-21 03:39:23,3,21,47,4,2018,11
3,2018-11-21 13:52:04,13,21,47,4,2018,11


In [22]:
# Register time_df as the temp view "time" (replacing any existing view of
# that name) so spark.sql() queries can refer to it as a table.
time_df.createOrReplaceTempView("time") 

In [23]:
# q7: every time column, ordered by start_time. Show summary statistics for
# the day column, then the first four rows.
q7_sql = """
SELECT *
  FROM time
 ORDER BY start_time
"""
q7 = spark.sql(q7_sql)

q7_day_summary = q7.describe("day")
q7_day_summary.show()
q7.limit(4).toPandas()

+-------+------------------+
|summary|               day|
+-------+------------------+
|  count|              6813|
|   mean|17.253192426243945|
| stddev|  8.10917362423492|
|    min|                 1|
|    max|                30|
+-------+------------------+



Unnamed: 0,start_time,hour,day,week,weekday,year,month
0,2018-11-01 21:01:46,21,1,44,5,2018,11
1,2018-11-01 21:05:52,21,1,44,5,2018,11
2,2018-11-01 21:08:16,21,1,44,5,2018,11
3,2018-11-01 21:11:13,21,1,44,5,2018,11


In [24]:
# Display the first four rows of q7 again (same expression that ends the cell
# defining q7).
q7.limit(4).toPandas()

Unnamed: 0,start_time,hour,day,week,weekday,year,month
0,2018-11-01 21:01:46,21,1,44,5,2018,11
1,2018-11-01 21:05:52,21,1,44,5,2018,11
2,2018-11-01 21:08:16,21,1,44,5,2018,11
3,2018-11-01 21:11:13,21,1,44,5,2018,11


In [25]:
# q7a: number of time rows per (year, month, week, day), in calendar order;
# display at most 31 groups.
q7a_sql = """
SELECT t.year, t.month, t.week, t.day, count(*)
  FROM time t 
 GROUP BY t.year, t.month, t.week, t.day
 ORDER BY t.year, t.month, t.week, t.day
"""
q7a = spark.sql(q7a_sql)

q7a.limit(31).toPandas()

Unnamed: 0,year,month,week,day,count(1)
0,2018,11,44,1,11
1,2018,11,44,2,155
2,2018,11,44,3,100
3,2018,11,44,4,144
4,2018,11,45,5,356
5,2018,11,45,6,154
6,2018,11,45,7,174
7,2018,11,45,8,161
8,2018,11,45,9,252
9,2018,11,45,10,87


In [26]:
# Load the songplays table from parquet, then show its schema, summary
# statistics for session_id, and a four-row sample.
songplays_path = output_data + "songplays.parquet"
songplays_df = spark.read.parquet(songplays_path)

songplays_df.printSchema()
songplays_session_summary = songplays_df.describe("session_id")
songplays_session_summary.show()
songplays_df.limit(4).toPandas()

root
 |-- songplay_id: string (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

+-------+------------------+
|summary|        session_id|
+-------+------------------+
|  count|              6820|
|   mean| 599.1818181818181|
| stddev|284.95333284318497|
|    min|                 3|
|    max|              1114|
+-------+------------------+



Unnamed: 0,songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent,year,month
0,000026.000583.1542241826796,2018-11-15 00:30:26,26,free,***UNKNOWN_SONG***,***UNKNOWN_ARTIST***,583,"San Jose-Sunnyvale-Santa Clara, CA","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",2018,11
1,000026.000583.1542242481796,2018-11-15 00:41:21,26,free,***UNKNOWN_SONG***,***UNKNOWN_ARTIST***,583,"San Jose-Sunnyvale-Santa Clara, CA","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",2018,11
2,000026.000583.1542242741796,2018-11-15 00:45:41,26,free,***UNKNOWN_SONG***,***UNKNOWN_ARTIST***,583,"San Jose-Sunnyvale-Santa Clara, CA","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",2018,11
3,000061.000597.1542253449796,2018-11-15 03:44:09,61,free,***UNKNOWN_SONG***,***UNKNOWN_ARTIST***,597,"Houston-The Woodlands-Sugar Land, TX","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",2018,11


In [27]:
# Register songplays_df as the temp view "songplays" (replacing any existing
# view of that name) so spark.sql() queries can refer to it as a table.
songplays_df.createOrReplaceTempView("songplays") 

In [28]:
# q8: selected songplays columns ordered by start_time; display the first
# four rows.
q8_sql = """
SELECT start_time, user_id, level, song_id, artist_id, session_id, location
  FROM songplays
 ORDER BY start_time
"""
q8 = spark.sql(q8_sql)

q8.limit(4).toPandas()

Unnamed: 0,start_time,user_id,level,song_id,artist_id,session_id,location
0,2018-11-01 21:01:46,8,free,***UNKNOWN_SONG***,***UNKNOWN_ARTIST***,139,"Phoenix-Mesa-Scottsdale, AZ"
1,2018-11-01 21:05:52,8,free,***UNKNOWN_SONG***,***UNKNOWN_ARTIST***,139,"Phoenix-Mesa-Scottsdale, AZ"
2,2018-11-01 21:08:16,8,free,***UNKNOWN_SONG***,***UNKNOWN_ARTIST***,139,"Phoenix-Mesa-Scottsdale, AZ"
3,2018-11-01 21:11:13,8,free,***UNKNOWN_SONG***,***UNKNOWN_ARTIST***,139,"Phoenix-Mesa-Scottsdale, AZ"


In [29]:
# q9: join songplays to time on start_time and count the joined rows per
# (year, month, week, day), in calendar order; display at most 20 groups.
q9_sql = """
SELECT t.year, t.month, t.week, t.day, count(*)
  FROM songplays x JOIN time t ON x.start_time = t.start_time
 GROUP BY t.year, t.month, t.week, t.day
 ORDER BY t.year, t.month, t.week, t.day
"""
q9 = spark.sql(q9_sql)

q9.limit(20).toPandas()

Unnamed: 0,year,month,week,day,count(1)
0,2018,11,44,1,11
1,2018,11,44,2,155
2,2018,11,44,3,100
3,2018,11,44,4,144
4,2018,11,45,5,356
5,2018,11,45,6,154
6,2018,11,45,7,174
7,2018,11,45,8,161
8,2018,11,45,9,252
9,2018,11,45,10,87


In [30]:
# q9a: total number of rows produced by joining songplays to time on
# start_time.
q9a_sql = """
SELECT COUNT(*) AS count
  FROM songplays x JOIN time t ON x.start_time = t.start_time
"""
q9a = spark.sql(q9a_sql)

q9a_first_row = q9a.limit(1)
q9a_first_row.show()

+-----+
|count|
+-----+
| 6820|
+-----+



In [31]:
# q10: total number of rows produced by joining songplays to users on
# user_id.
q10_sql = """
SELECT count(*) AS count
  FROM songplays x JOIN users u ON x.user_id = u.user_id
"""
q10 = spark.sql(q10_sql)

q10.limit(1).toPandas()

Unnamed: 0,count
0,6820


In [32]:
# q11: inner-join songplays to time, users, songs and artists. Show summary
# statistics for session_id, then four rows of the result.
# The query has no ORDER BY, so which four rows appear is not guaranteed.
q11_sql = """
SELECT s.song_id, s.title, a.artist_id, a.name, u.user_id, u.last_name, u.first_name,
       t.year, t.month, t.day, p.session_id, p.level
  FROM songplays p
  JOIN time t on p.start_time = t.start_time
  JOIN users u on p.user_id=u.user_id
  JOIN songs s on p.song_id=s.song_id
  JOIN artists a on p.artist_id=a.artist_id
"""
q11 = spark.sql(q11_sql)

q11_session_summary = q11.describe("session_id")
q11_session_summary.show()
q11.limit(4).toPandas()

+-------+-----------------+
|summary|       session_id|
+-------+-----------------+
|  count|             6820|
|   mean|599.1818181818181|
| stddev|284.9533328431849|
|    min|                3|
|    max|             1114|
+-------+-----------------+



Unnamed: 0,song_id,title,artist_id,name,user_id,last_name,first_name,year,month,day,session_id,level
0,***UNKNOWN_SONG***,***Unknown Song***,***UNKNOWN_ARTIST***,*** Unknown Artist ***,97,Harrell,Kate,2018,11,15,605,paid
1,***UNKNOWN_SONG***,***Unknown Song***,***UNKNOWN_ARTIST***,*** Unknown Artist ***,44,Kirby,Aleena,2018,11,15,619,paid
2,***UNKNOWN_SONG***,***Unknown Song***,***UNKNOWN_ARTIST***,*** Unknown Artist ***,97,Harrell,Kate,2018,11,21,797,paid
3,***UNKNOWN_SONG***,***Unknown Song***,***UNKNOWN_ARTIST***,*** Unknown Artist ***,37,Hicks,Jordan,2018,11,21,715,free


In [33]:
# Display four rows of q11 again (same expression that ends the cell
# defining q11).
q11.limit(4).toPandas()

Unnamed: 0,song_id,title,artist_id,name,user_id,last_name,first_name,year,month,day,session_id,level
0,***UNKNOWN_SONG***,***Unknown Song***,***UNKNOWN_ARTIST***,*** Unknown Artist ***,97,Harrell,Kate,2018,11,15,605,paid
1,***UNKNOWN_SONG***,***Unknown Song***,***UNKNOWN_ARTIST***,*** Unknown Artist ***,44,Kirby,Aleena,2018,11,15,619,paid
2,***UNKNOWN_SONG***,***Unknown Song***,***UNKNOWN_ARTIST***,*** Unknown Artist ***,97,Harrell,Kate,2018,11,21,797,paid
3,***UNKNOWN_SONG***,***Unknown Song***,***UNKNOWN_ARTIST***,*** Unknown Artist ***,37,Hicks,Jordan,2018,11,21,715,free


In [34]:
# q11a: same four-way join as q11, keeping only rows whose song_id is not the
# '***UNKNOWN_SONG***' placeholder OR whose artist_id is not the
# '***UNKNOWN_ARTIST***' placeholder. Show summary statistics for session_id,
# then up to four rows.
# NOTE(review): with OR, a row survives if either side is known; if both must
# be known this should be AND -- confirm intent.
q11a_sql = """
SELECT s.song_id, s.title, a.artist_id, a.name, u.user_id, u.last_name, u.first_name,
       t.year, t.month, t.day, p.session_id, p.level
  FROM songplays p
  JOIN time t on p.start_time = t.start_time
  JOIN users u on p.user_id=u.user_id
  JOIN songs s on p.song_id=s.song_id
  JOIN artists a on p.artist_id=a.artist_id
 WHERE s.song_id <> '***UNKNOWN_SONG***' OR a.artist_id <> '***UNKNOWN_ARTIST***'
"""
q11a = spark.sql(q11a_sql)

q11a_session_summary = q11a.describe("session_id")
q11a_session_summary.show()
q11a.limit(4).toPandas()

+-------+----------+
|summary|session_id|
+-------+----------+
|  count|         1|
|   mean|     818.0|
| stddev|       NaN|
|    min|       818|
|    max|       818|
+-------+----------+



Unnamed: 0,song_id,title,artist_id,name,user_id,last_name,first_name,year,month,day,session_id,level
0,SOZCTXZ12AB0182364,Setanta matins,AR5KOSW1187FB35FF4,Elena,15,Koch,Lily,2018,11,21,818,paid
