In [16]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql import types as py_types
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [2]:
# Obtain the Spark session used by every cell below. getOrCreate() hands back
# an already-running session when there is one, otherwise it starts a new one.
# spark.jars.packages requests the hadoop-aws 2.7.0 artifact
# (presumably for S3 access -- TODO confirm; only local paths are read here).
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)


# Songs table

In [19]:
# Explicit schema for the song JSON records, declared as (column name, Spark type)
# pairs. Field order matches the schema handed to spark.read.json below, and
# every field keeps StructField's default nullable=True.
song_fields = [
    ("num_songs", py_types.IntegerType()),
    ("artist_id", py_types.StringType()),
    ("artist_latitude", py_types.FloatType()),
    ("artist_longitude", py_types.FloatType()),
    ("artist_location", py_types.StringType()),
    ("artist_name", py_types.StringType()),
    ("song_id", py_types.StringType()),
    ("title", py_types.StringType()),
    ("duration", py_types.FloatType()),
    ("year", py_types.IntegerType()),
]

song_schema = py_types.StructType(
    [py_types.StructField(field_name, field_type) for field_name, field_type in song_fields]
)

# Load every song JSON file matched by the glob, applying the explicit schema
# instead of letting Spark infer one.
song_data = spark.read.json('work/data/song-data/song_data/*/*/*/*.json', schema=song_schema)
song_data.printSchema()

# Songs table: the song-level columns only.
df_songs = song_data.select('song_id', 'title', 'artist_id', 'year', 'duration')

# mode='overwrite' keeps this cell re-runnable: the writer's default mode raises
# an error when the target path already exists, which breaks every run after
# the first (e.g. Restart Kernel -> Run All).
# Output is partitioned on disk by year, then artist_id.
df_songs.write.parquet(path=os.path.join(os.getcwd(), 'work', 'data', 'songs.parquet'),
                       mode='overwrite',
                       partitionBy=['year', 'artist_id'])

# Artists table
# song_data holds one record per song, so an artist with several songs would
# otherwise be written several times. Keep a single row per artist_id.
# NOTE: when an artist's name/location/coordinates differ between song records,
# which of those rows survives dropDuplicates is not deterministic.
df_artists = (
    song_data
    .select('artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude')
    .dropDuplicates(['artist_id'])
    # Drop the "artist_" prefix from the descriptive columns.
    .withColumnRenamed('artist_name', 'name')
    .withColumnRenamed('artist_location', 'location')
    .withColumnRenamed('artist_latitude', 'latitude')
    .withColumnRenamed('artist_longitude', 'longitude')
)

# mode='overwrite' keeps this cell re-runnable: the writer's default mode raises
# an error when the target path already exists.
df_artists.write.parquet(path=os.path.join(os.getcwd(), 'work', 'data', 'artists.parquet'),
                         mode='overwrite')

# Logs table