# The purpose of this notebook is to load the Parquet data from the S3 output bucket into Spark DataFrames, one per table

In [4]:
import configparser
import os
from pyspark.sql import SparkSession

# Load the settings file. read_file() on an explicitly opened handle raises
# FileNotFoundError when dl.cfg is absent; ConfigParser.read() would silently
# skip the missing file and the lookup below would fail with a bare KeyError.
config = configparser.ConfigParser()
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

# Base path of the output data, taken from the OUTPUT_DATA_S3A key of the
# S3_BUCKET section.
output_data = config['S3_BUCKET']['OUTPUT_DATA_S3A']

In [5]:
# Build (or reuse) the Spark session. The hadoop-aws package is requested via
# spark.jars.packages, and the S3A credentials provider is set to the AWS
# default credentials provider chain.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
    .config(
        'spark.hadoop.fs.s3a.aws.credentials.provider',
        'com.amazonaws.auth.DefaultAWSCredentialsProviderChain',
    )
    .getOrCreate()
)

In [6]:
# Songs table: the glob matches parquet files two directory levels below songs/.
songs_path = os.path.join(output_data, 'songs/*/*/*.parquet')
songs_df = spark.read.parquet(songs_path)

In [7]:
# Preview the first 10 rows of the songs table.
songs_df.show(n=10)

+------------------+--------------------+------------------+----+----------+
|           song_id|               title|         artist_id|year|  duration|
+------------------+--------------------+------------------+----+----------+
|SOECXBG12A6D4F88EE|Le Bourgeois Gent...|ARTHSAE12131B4B70A|   0| 137.97832|
|SOCETGU12A8C1381E8|Pickin' & Singin'...|ARCS4GZ1187FB469EB|   0| 124.05506|
|SOCWCPW12A8C13C477|George Gershwin/ ...|AR18R7P1187B9A9570|   0| 370.59873|
|SODSKGE12A8C144435| Flowers On The Wall|ARJUNZD1187B9A5DAE|   0| 126.79791|
|SOTUPIO12A67021463|Take Me To The Wa...|ARZZXT51187FB4627E|   0| 254.35383|
|SOBGEIG12A6D4F6635|Raga Anandi Kalya...|ARSTA431187B9A3599|   0|1519.28118|
|SOYXCUN12A6310D884|The Grasshopper U...|ARLHO5Z1187FB4C861|   0| 181.10649|
|SOLVFLD12A8C1387BF|We Will Rock You ...|ARL4TII1187B9B46E1|   0| 186.14812|
|SOIPLJZ12A58A7F153|A Shawl Of Galway...|ARA04401187B991E6E|   0| 197.04118|
|SOLSRKX12A8C14466D|     Hey_ Snow White|ARFF4B41187FB48151|   0| 265.79546|

In [8]:
# Artists table: the glob matches parquet files directly under artists/.
artists_path = os.path.join(output_data, 'artists/*.parquet')
artists_df = spark.read.parquet(artists_path)

In [9]:
# Preview the first 10 rows of the artists table.
artists_df.show(n=10)

+------------------+--------------------+--------------------+---------+----------+
|         artist_id|                name|            location| latitude| longitude|
+------------------+--------------------+--------------------+---------+----------+
|AROU8WM1187B9B4620|   Milton Nascimento|Rio de Janeiro, B...|-22.97673| -43.19508|
|AR0ULD71187FB50317|       Desaparecidos| Omaha, Nebraska USA| 41.26069| -95.93995|
|AR5OH1V1187FB574C3|Kenny Wayne Shepherd|      Shreveport, LA|     null|      null|
|ARCBD0U1187FB466EF|Nelly / Lincoln U...|   HERNDON, Virginia|     null|      null|
|AR0F54F1187FB44536|      Ottmar Liebert|    Cologne, Germany| 50.94165|   6.95505|
|ARMQHX71187B9890D3|            Mastodon|         Atlanta, GA|     null|      null|
|ARFDCVP1187B9B40FC|The Geraldine Fib...|     Los Angeles, CA| 34.05349|-118.24532|
|ARF6ZTO1187FB3684F|   Story Of The Year|LONG BEACH, Calif...|     null|      null|
|ARKR0111187B99FD2A|Monica featuring ...|    Atlanta, Georgia|     null|    

In [10]:
# Users table: the glob matches parquet files directly under users/.
users_path = os.path.join(output_data, 'users/*.parquet')
users_df = spark.read.parquet(users_path)

In [11]:
# Preview the first 10 rows of the users table.
users_df.show(n=10)

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|    100|     Adler|  Barrera|     M| free|
|     89|   Kynnedi|  Sanchez|     F| free|
|     85|   Kinsley|    Young|     F| paid|
|     85|   Kinsley|    Young|     F| free|
|     64|    Hannah|  Calhoun|     F| free|
|     16|     Rylan|   George|     M| free|
|     66|     Kevin| Arellano|     M| free|
|     27|    Carlos|   Carter|     M| free|
|     38|    Gianna|    Jones|     F| free|
|     37|    Jordan|    Hicks|     F| free|
+-------+----------+---------+------+-----+
only showing top 10 rows



In [12]:
# Time table: the glob matches parquet files two directory levels below time/.
time_path = os.path.join(output_data, 'time/*/*/*.parquet')
time_df = spark.read.parquet(time_path)

In [13]:
# Preview the first 10 rows of the time table.
time_df.show(n=10)

+--------------------+----+---+----+-----+----+-------+
|          start_time|hour|day|week|month|year|weekday|
+--------------------+----+---+----+-----+----+-------+
|2018-11-20 10:59:...|  10| 20|  47|   11|2018|      3|
|2018-11-30 16:38:...|  16| 30|  48|   11|2018|      6|
|2018-11-16 23:34:...|  23| 16|  46|   11|2018|      6|
|2018-11-20 02:05:...|   2| 20|  47|   11|2018|      3|
|2018-11-16 10:12:...|  10| 16|  46|   11|2018|      6|
|2018-11-20 01:33:...|   1| 20|  47|   11|2018|      3|
|2018-11-16 14:07:...|  14| 16|  46|   11|2018|      6|
|2018-11-20 11:43:...|  11| 20|  47|   11|2018|      3|
|2018-11-16 19:06:...|  19| 16|  46|   11|2018|      6|
|2018-11-16 22:59:...|  22| 16|  46|   11|2018|      6|
+--------------------+----+---+----+-----+----+-------+
only showing top 10 rows



In [14]:
# Songplays table: the glob matches parquet files two directory levels below
# songplays/.
songplays_path = os.path.join(output_data, 'songplays/*/*/*.parquet')
songplays_df = spark.read.parquet(songplays_path)

In [15]:
# Preview the first 10 rows of the songplays table.
songplays_df.show(n=10)

+------------+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+
| songplay_id|          start_time|user_id|level|           song_id|         artist_id|session_id|            location|          user_agent|
+------------+--------------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+
| 17179869223|2018-11-05 02:59:...|     52| free|SOTEOMG12AB0184E70|ARAFI9R1187B9AC5B5|       226|Houston-The Woodl...|Mozilla/5.0 (Wind...|
| 17179869219|2018-11-12 18:51:...|     73| paid|SOBYGNT12A8C1378FB|AR0Q7DR1187B9AC35D|       294|Tampa-St. Petersb...|"Mozilla/5.0 (Mac...|
| 17179869213|2018-11-15 14:46:...|     30| paid|SOTCOTZ12A8C136BCB|AR7WK5411A348EF5EA|       324|San Jose-Sunnyval...|Mozilla/5.0 (Wind...|
| 17179869267|2018-11-20 17:50:...|     63| free|SOYEKUR12AAF3B5274|ARSU2921187FB51029|       729|      Santa Rosa, CA|"Mozilla/5.0 (Mac...|
| 42949672960