In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, dayofweek, date_format
from pyspark.sql.types import StructType as Struct, StructField as Fld, DoubleType as Double, StringType as Str, IntegerType as Int, LongType as Long

In [2]:
def create_spark_session():
    spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()
    return spark
spark = create_spark_session()

In [3]:
songs_schema = Struct([
    Fld('num_songs', Int()),
    Fld('artist_id', Str()),
    Fld('artist_latitude', Double()),
    Fld('artist_longtitude', Double()),
    Fld('artist_location', Str()),
    Fld('artist_name', Str()),
    Fld('song_id', Str()),
    Fld('title', Str()),
    Fld('duration', Double()),
    Fld('year', Int())
])

In [4]:
df = spark.read.json('song_data/*/*/*/*.json', songs_schema)

In [5]:
songs_table = df[['song_id', 'title', 'artist_id', 'year', 'duration']].dropDuplicates(['song_id'])

In [6]:
songs_table.write.parquet(os.path.join('./', 'songs_table'), 'overwrite', partitionBy=['year', 'artist_id'])

In [7]:
songs_table.show(5)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOGOSOV12AF72A285E|   ¿Dónde va Chichi?|ARGUVEV1187B98BA17|1997|313.12934|
|SOMZWCG12A8C13C480|    I Didn't Mean To|ARD7TVE1187B99BFB1|   0|218.93179|
|SOUPIRU12A6D4FA1E1| Der Kleine Dompfaff|ARJIE2Y1187B994AB7|   0|152.92036|
|SOXVLOJ12AB0189215|     Amor De Cabaret|ARKRRTF1187B9984DA|   0|177.47546|
|SOWTBJW12AC468AC6E|Broken-Down Merry...|ARQGYP71187FB44566|   0|151.84934|
+------------------+--------------------+------------------+----+---------+
only showing top 5 rows



In [10]:
artists_table = df[['artist_id', 'artist_name', 'artist_location',
                      'artist_latitude', 'artist_longtitude']].dropDuplicates(['artist_id'])

In [11]:
artists_table.write.parquet(os.path.join('./', 'artist_table'), 'overwrite')

In [12]:
artists_table.show(5)

+------------------+--------------------+--------------------+---------------+-----------------+
|         artist_id|         artist_name|     artist_location|artist_latitude|artist_longtitude|
+------------------+--------------------+--------------------+---------------+-----------------+
|AR9AWNF1187B9AB0B4|Kenny G featuring...|Seattle, Washingt...|           null|             null|
|AR0IAWL1187B9A96D0|        Danilo Perez|              Panama|         8.4177|             null|
|AR0RCMP1187FB3F427|    Billie Jo Spears|        Beaumont, TX|       30.08615|             null|
|AREDL271187FB40F44|        Soul Mekanik|                    |           null|             null|
|ARI3BMM1187FB4255E|        Alice Stuart|          Washington|        38.8991|             null|
+------------------+--------------------+--------------------+---------------+-----------------+
only showing top 5 rows



In [13]:
df = spark.read.json('log_data/*.json')

In [14]:
df = df.filter(df.page == 'NextSong')
df = df.withColumn('userId', df['userId'].cast('integer'))

In [15]:
users_table = df[['userId', 'firstName', 'lastName', 'gender', 'level']].dropDuplicates(['userId'])

In [16]:
users_table.write.parquet(os.path.join('./', 'users_table'), 'overwrite')

In [17]:
users_table.show(5)

+------+---------+--------+------+-----+
|userId|firstName|lastName|gender|level|
+------+---------+--------+------+-----+
|    85|  Kinsley|   Young|     F| paid|
|    65|    Amiya|Davidson|     F| paid|
|    53|  Celeste|Williams|     F| free|
|    78|    Chloe|    Roth|     F| free|
|    34|   Evelin|   Ayala|     F| free|
+------+---------+--------+------+-----+
only showing top 5 rows



In [18]:
get_timestamp = udf(lambda x: x / 1000, Double())

In [19]:
df = df.withColumn('startTime', get_timestamp(df.ts))

In [20]:
df = df.withColumn('datetime', df.startTime.cast('timestamp'))

In [21]:
time_table = df[df.startTime, 
                hour(df.datetime).alias('hour'), 
                dayofmonth(df.datetime).alias('day'),
                weekofyear(df.datetime).alias('week'), 
                month(df.datetime).alias('month'), 
                year(df.datetime).alias('year'),
                dayofweek(df.datetime).alias('weekday')]

In [22]:
time_table.write.parquet(os.path.join('./', 'time_table'), 'overwrite', partitionBy=['year', 'month'])

In [23]:
time_table.show(5)

+----------------+----+---+----+-----+----+-------+
|       startTime|hour|day|week|month|year|weekday|
+----------------+----+---+----+-----+----+-------+
|1.542241826796E9|   0| 15|  46|   11|2018|      5|
|1.542242481796E9|   0| 15|  46|   11|2018|      5|
|1.542242741796E9|   0| 15|  46|   11|2018|      5|
|1.542253449796E9|   3| 15|  46|   11|2018|      5|
|1.542260935796E9|   5| 15|  46|   11|2018|      5|
+----------------+----+---+----+-----+----+-------+
only showing top 5 rows



In [24]:
song_df = spark.read.json('song_data/*/*/*/*.json', songs_schema)

In [26]:
songplays_cols = ('datetime', 'userId', 'level', 'song_id', 'artist_id', 'sessionId', 'location', 'userAgent')
conditions = [df.song == song_df.title, df.artist == song_df.artist_name, df.length == song_df.duration]

In [27]:
songplays_table = df.join(song_df, conditions).select(*songplays_cols)

In [29]:
songplays_table = songplays_table.withColumn('month', month(df.datetime))
songplays_table = songplays_table.withColumn('year', year(df.datetime))

In [30]:
songplays_table.write.parquet(os.path.join('./', 'songplays_table'), 'overwrite', partitionBy=['year', 'month'])

In [34]:
songplays_table[songplays_table.columns[:5]].show(5)
songplays_table[songplays_table.columns[5:]].show(5)

+--------------------+------+-----+------------------+------------------+
|            datetime|userId|level|           song_id|         artist_id|
+--------------------+------+-----+------------------+------------------+
|2018-11-21 21:56:...|    15| paid|SOZCTXZ12AB0182364|AR5KOSW1187FB35FF4|
+--------------------+------+-----+------------------+------------------+

+---------+--------------------+--------------------+-----+----+
|sessionId|            location|           userAgent|month|year|
+---------+--------------------+--------------------+-----+----+
|      818|Chicago-Napervill...|"Mozilla/5.0 (X11...|   11|2018|
+---------+--------------------+--------------------+-----+----+

