In [None]:
'''
To set up my local environment properly:
- pip install boto3
- I had to install Spark, Java and Hadoop on my Windows 10 machine,
following the instructions at this link: https://www.youtube.com/watch?v=g7Qpnmi0Q-s for the Java/Hadoop portion.
'''

In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql import types as T
import  pyspark.sql.functions as F
import time

In [2]:
# Parse dl.cfg from the current working directory; its [AWS] section is read
# further down. ConfigParser.read() returns the list of files it managed to
# parse and silently skips missing ones, so an empty list as this cell's
# output means dl.cfg was not found.
config = configparser.ConfigParser()
config.read('dl.cfg')

['dl.cfg']

In [3]:
# Copy the AWS credentials from the [AWS] section of the parsed config into
# the process environment under the same names.
aws_settings = config['AWS']
for env_name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    os.environ[env_name] = aws_settings[env_name]

In [4]:
def create_spark_session():
    """Return a SparkSession (new or already existing) configured with the hadoop-aws package."""
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return builder.getOrCreate()

In [6]:
# Create (or reuse) the Spark session used by every cell below; the bare
# `spark` expression makes the session object this cell's displayed output.
spark = create_spark_session()
spark

In [7]:
# use this to speed up parquet write
# Sets the Hadoop FileOutputCommitter algorithm to version 2 on the JVM-side
# Hadoop configuration, reached through the SparkContext's private `_jsc` handle.
sc = spark.sparkContext
sc._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.algorithm.version", "2")

In [11]:
# Timing notes from earlier experiments with different ways of reading the song data
# (kept for reference; only the uncommented code at the bottom of this cell runs).
# Not sure that second line is faster than first
#df = spark.read.format("json").load("s3a://udacity-dend/song_data/*/*/*") # runs in 13 minutes
#df = spark.read.format("json").load(file_locations)
# df = spark.read.format("json").load(songs_paths)
# song_data = input_data+'song_data/*/*/*/*.json
# song_data = input_data + 'song_data/*/*/*/*.json # runs in 14 minutes
# df = spark.read.json(song_data) # runs in 14 minutes
# df = spark.read.json(songs_paths) # runs in 26 minutes

# Time the read of the song JSON files into the global DataFrame `df`.
start = time.time()
input_data = "s3a://udacity-dend/"
# NOTE(review): song_data is assigned here, but the load on the next line uses a
# hard-coded path with a different glob ('song_data/*/*/*'), so this variable is
# not used in this cell.
song_data =  input_data + 'song_data/*/*/*/*.json'
df = spark.read.format("json").load("s3a://udacity-dend/song_data/*/*/*") # runs in 13 minutes
end = time.time()
print('runtime (s):', end-start)

runtime (s): 950.199615240097


In [11]:
# Quick sanity check of the loaded song DataFrame: its type, its column
# summary, and the row count.
start = time.time()
print(type(df))
print(df)
# count() is a Spark action, so a job actually runs here; that is what the
# runtime printed at the end of this cell mostly measures.
print('count', df.count())
#df.printSchema()
end = time.time()
print('runtime (s):', end-start)

<class 'pyspark.sql.dataframe.DataFrame'>
DataFrame[artist_id: string, artist_latitude: double, artist_location: string, artist_longitude: double, artist_name: string, duration: double, num_songs: bigint, song_id: string, title: string, year: bigint]
count 14896
runtime (s): 339.83151745796204


In [15]:
# S3 prefixes used by the cells below: the source of the raw JSON data and the
# destination for the parquet tables. Both end in '/' because table paths are
# built by plain string concatenation (e.g. output_data + "songs/").
input_data = "s3a://udacity-dend/"
output_data = "s3a://dend-sparkify-amiri/"

In [17]:
# extract columns to create songs table
# NOTE: select()/distinct() only define the DataFrame; no Spark job runs at this
# point, so the timing printed for this step is a few hundredths of a second.
start = time.time()
songs_table = df.select("song_id", "title", "artist_id", "year", "duration").distinct()
end = time.time()
print('extract columns to create songs table runtime (s):', end-start)

# write songs table to parquet files partitioned by year and artist
# NOTE(review): the frame written here is rebuilt from df with an extra "artist"
# column (artist_name renamed); songs_table from above is not the one written.
start = time.time()
songs_table_parquet = df.select("song_id", "title", "artist_id", "year", "duration", col("artist_name").alias("artist")).distinct()
songs_table_parquet.write.mode('overwrite').partitionBy("year", "artist_id").parquet(output_data + "songs/")
end = time.time()
print('write songs table to parquet files runtime (s):', end-start)

extract columns to create songs table runtime (s): 0.04885745048522949


In [16]:
# extract columns to create artists table
# NOTE: select()/distinct() only define the DataFrame; the actual work happens
# during the write below, which is where the measured runtime comes from.
start = time.time()
artists_table = df.select("artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude").distinct()
end = time.time()
print('extract columns to create artists table runtime (s):', end-start)

# write artists table to parquet files
# NOTE(review): DataFrameWriter.parquet() returns None, so artists_table_parquet
# ends up holding None rather than a DataFrame.
start = time.time()
artists_table_parquet = artists_table.write.mode('overwrite').parquet(output_data + "artists/")
end = time.time()
print('write artists table to parquet files runtime (s):', end-start)

extract columns to create artists table runtime (s): 0.0378720760345459
write artists table to parquet files runtime (s): 330.1275327205658


## Put finalized code from above in sections below, then transfer to .py file. 

In [None]:
def process_song_data(spark, input_data, output_data):
    '''
    Read the raw song JSON files, derive the songs and artists tables with Spark,
    and write both tables as parquet.

    Parameters:
    - spark: The spark session
    - input_data: The S3 path location up to, but not including `song_data`
    - output_data: The S3 bucket/prefix where the new dimensional tables will be written to

    Both paths may be given with or without a trailing slash.

    Output (under output_data):
    - songs/   : song_id, title, artist_id, year, duration, artist
                 (partitioned by year and artist_id)
    - artists/ : artist_id, artist_name, artist_location, artist_latitude, artist_longitude

    Returns None; progress/timing lines are printed for the read and the two writes.
    '''
    def as_prefix(path):
        # Guarantee a non-empty path ends with '/', so that "s3a://bucket/out" and
        # "s3a://bucket/out/" resolve to the same table locations.
        return path if not path or path.endswith('/') else path + '/'

    # get filepath to song data file
    song_data = as_prefix(input_data) + 'song_data/*/*/*/*.json'
    output_root = as_prefix(output_data)

    # read song data file
    start = time.time()
    df = spark.read.json(song_data)
    end = time.time()
    print('read song data file runtime (s):', end-start)

    # extract columns to create songs table
    # (select/distinct are lazy, so there is nothing worth timing here; the work
    # is done as part of the write below)
    songs_table = df.select(
        "song_id", "title", "artist_id", "year", "duration",
        df["artist_name"].alias("artist"),
    ).distinct()

    # write songs table to parquet files partitioned by year and artist
    start = time.time()
    songs_table.write.mode('overwrite').partitionBy("year", "artist_id").parquet(output_root + "songs/")
    end = time.time()
    print('write songs table to parquet files runtime (s):', end-start)

    # extract columns to create artists table
    artists_table = df.select(
        "artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude"
    ).distinct()

    # write artists table to parquet files
    start = time.time()
    artists_table.write.mode('overwrite').parquet(output_root + "artists/")
    end = time.time()
    print('write artists table to parquet files runtime (s):', end-start)

In [None]:
def process_log_data(spark, input_data, output_data):
    '''
    Read the raw event-log JSON files, derive the users, time and songplays
    tables with Spark, and write them as parquet.

    Parameters:
    - spark: The spark session
    - input_data: The S3 path location up to, but not including `log_data` / `song_data`
    - output_data: The S3 bucket/prefix where the new tables will be written to

    Both paths may be given with or without a trailing slash.

    Output (under output_data):
    - users/     : user_id, first_name, last_name, gender, level
    - time/      : start_time, hour, day, week, month, year, weekday
                   (partitioned by year and month)
    - songplays/ : songplay_id, start_time, user_id, level, song_id, artist_id,
                   session_id, location, user_agent, year, month
                   (partitioned by year and month)

    NOTE(review): the log field names used below (userId, firstName, lastName,
    sessionId, userAgent, song, artist, length, location) are assumed from the
    Sparkify event-log format; this notebook never prints the log schema, so
    confirm them with df.printSchema() before relying on this function.
    '''
    def as_prefix(path):
        # Guarantee a non-empty path ends with '/', so that "s3a://bucket/out" and
        # "s3a://bucket/out/" resolve to the same table locations.
        return path if not path or path.endswith('/') else path + '/'

    input_root = as_prefix(input_data)
    output_root = as_prefix(output_data)

    # get filepath to log data file
    log_data = input_root + 'log_data/*/*/*.json'

    # read log data file
    start = time.time()
    df = spark.read.json(log_data)
    end = time.time()
    print('read log data file runtime (s):', end-start)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    # (transformations are lazy; the work is done as part of the write below)
    users_table = df.select(
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"),
        "gender",
        "level",
    ).distinct()

    # write users table to parquet files
    # (not partitioned: the users table has no year/artist_id columns)
    start = time.time()
    users_table.write.mode('overwrite').parquet(output_root + "users/")
    end = time.time()
    print('write users table to parquet files runtime (s):', end-start)

    # create timestamp column from original timestamp column
    # (ts is divided by 1000 because fromtimestamp() expects seconds)
    get_timestamp = F.udf(lambda ms: datetime.fromtimestamp(ms / 1000.0), T.TimestampType())
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    # (same value as "timestamp", as in the original draft of this function)
    df = df.withColumn("datetime", col("timestamp"))

    # extract columns to create time table
    time_table = (
        df.select(col("timestamp").alias("start_time"))
        .distinct()
        .withColumn("hour", hour("start_time"))
        .withColumn("day", dayofmonth("start_time"))
        .withColumn("week", weekofyear("start_time"))
        .withColumn("month", month("start_time"))
        .withColumn("year", year("start_time"))
        .withColumn("weekday", F.dayofweek("start_time"))
    )

    # write time table to parquet files partitioned by year and month
    start = time.time()
    time_table.write.mode('overwrite').partitionBy("year", "month").parquet(output_root + "time/")
    end = time.time()
    print('write time table to parquet files runtime (s):', end-start)

    # read in song data to use for songplays table
    # Only the columns needed for the join/output are kept, which also avoids a
    # clash between the song "year" column and the play year derived below.
    song_df = (
        spark.read.json(input_root + 'song_data/*/*/*/*.json')
        .select("song_id", "artist_id", "title", "artist_name", "duration")
        .distinct()
    )

    # extract columns from joined song and log datasets to create songplays table
    # Left join: plays without a matching song are kept with null song_id/artist_id.
    # NOTE(review): matching on title + artist name + duration is an assumption
    # about how log events relate to song records -- confirm.
    song_match = (
        (df.song == song_df.title)
        & (df.artist == song_df.artist_name)
        & (df.length == song_df.duration)
    )
    songplays_table = df.join(song_df, song_match, "left").select(
        F.monotonically_increasing_id().alias("songplay_id"),
        col("timestamp").alias("start_time"),
        col("userId").alias("user_id"),
        "level",
        "song_id",
        "artist_id",
        col("sessionId").alias("session_id"),
        "location",
        col("userAgent").alias("user_agent"),
        year("timestamp").alias("year"),
        month("timestamp").alias("month"),
    )

    # write songplays table to parquet files partitioned by year and month
    start = time.time()
    songplays_table.write.mode('overwrite').partitionBy("year", "month").parquet(output_root + "songplays/")
    end = time.time()
    print('write songplays table to parquet files runtime (s):', end-start)

In [None]:
def main():
    '''
    Create Spark session, provide paths to input/output data, load songs/log data
    and create parquet tables (columnar format) with star schema DB.
    '''
    spark = create_spark_session()
    input_data = "s3a://udacity-dend/"
    # Keep the trailing slash: table locations are formed by appending
    # "songs/", "artists/", ... to this prefix.
    output_data = "s3a://dend-sparkify-amiri/output_data/"

    process_song_data(spark, input_data, output_data)
    process_log_data(spark, input_data, output_data)

In [None]:
# Entry point for when this code is transferred to a .py script. Inside a
# notebook kernel __name__ is also "__main__", so running this cell calls main().
if __name__ == "__main__":
    main()

In [None]:
# SANDBOX AREA BELOW ############

In [None]:
'''
# This code is from the internet, and it works
from pyspark import SparkContext
sc = SparkContext(master = 'local')
create_spark_session()
'''

In [None]:
# Stops the Spark session (and its underlying SparkContext).
# stop() is an instance method, so it is called on the `spark` session object
# created earlier in this notebook rather than on the SparkSession class.
spark.stop()

In [None]:
# !pip install boto3
# Also had to install Java SDK

In [None]:
# Link to udacity bucket with the console:  https://s3.console.aws.amazon.com/s3/buckets/udacity-dend/?region=us-west-2
#s3path = 'https://s3.console.aws.amazon.com/s3/buckets/udacity-dend/?region=us-west-2#'
#s3pathsong = 'https://s3.console.aws.amazon.com/s3/buckets/udacity-dend/song_data/A/?region=us-west-2'
