# Testing Notebook 02
## See last few cells to see why I cannot save to parquet

## Extract AWS keys from config file & store into environment variables

In [None]:
import configparser
import os

# Parse the local credentials file. The context manager closes the file handle
# deterministically instead of relying on garbage collection to do it.
config = configparser.ConfigParser()
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

# Export the [AWS] KEY / SECRET entries as environment variables.
os.environ["AWS_ACCESS_KEY_ID"] = config.get('AWS','KEY')
os.environ["AWS_SECRET_ACCESS_KEY"] = config.get('AWS','SECRET')

## Create spark session

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session, requesting the hadoop-aws package so
# that s3a:// paths can be read and written.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

## Load schema from S3

The public S3 bucket can be browsed in the AWS console at: https://s3.console.aws.amazon.com/s3/buckets/udacity-dend/

In [3]:
# S3 buckets and prefixes used in this notebook.
# NOTE(review): only input_data uses the s3a:// scheme; the other three
# locations use s3://.
input_data = "s3a://udacity-dend/"
output_data = "s3://udacity-dend-project-output-1995/"
s3_song = "s3://udacity-dend/song_data"
s3_log = "s3://udacity-dend/log_data"

In [4]:
# Glob pattern matching every *events.json file two directory levels below
# the log-data/ prefix of the input bucket.
log_data_folder = os.path.join(input_data, "log-data/")
log_files = log_data_folder + "*/*/*events.json"
log_files

's3a://udacity-dend/log-data/*/*/*events.json'

In [5]:
# Glob pattern matching every .json file three directory levels below the
# song-data/ prefix of the input bucket.
song_data_folder = os.path.join(input_data, "song-data/")
song_files = song_data_folder + "*/*/*/*.json"
song_files

's3a://udacity-dend/song-data/*/*/*/*.json'

In [6]:
# The full song dataset is large, so for testing only the files under the
# A/A/A/ prefix are loaded. This overrides the song_files glob built above.
song_files = song_data_folder + "A/A/A/*.json"
print("Song files: ", song_files)

# Read the JSON files, drop exact duplicate rows and cache the result.
df_song = (
    spark.read
    .json(song_files)
    .dropDuplicates()
    .cache()
)

Song files:  s3a://udacity-dend/song-data/A/A/A/*.json


In [7]:
# Read every log-event JSON file matched by log_files, drop exact duplicate
# rows and cache the result.
df_log = (
    spark.read
    .json(log_files)
    .dropDuplicates()
    .cache()
)

In [8]:
# Report the number of rows in each DataFrame.
for label, frame in (("Song count: ", df_song), ("Log count: ", df_log)):
    print(label, frame.count())

Song count:  24
Log count:  8056


In [None]:
# Print the schema of the log-event DataFrame (df_log).
df_log.printSchema()

In [None]:
# Print the schema of the song-metadata DataFrame (df_song).
df_song.printSchema()

### Note to self: import the full dataset using the schemas above

## Recall the flow of data
- Table: time
    - start_time - ts from df_log
    - hour - transform from start_time
    - day - transform from start_time
    - week - transform from start_time
    - month - transform from start_time
    - year - transform from start_time
- Table: users (filter by nextSong)
    - user_id - userId from df_log
    - first_name - firstName from df_log
    - last_name - lastName from df_log
    - gender - gender from df_log
    - level - level from df_log
- Table: artists
    - artist_id - artist_id from df_song
    - name - artist_name from df_song
    - location - artist_location from df_song
    - lattitude - artist_latitude from df_song
    - longitude - artist_longitude from df_song
- Table: songs
    - song_id - song_id from df_song
    - title - title from df_song
    - artist_id - artist_id from df_song
    - year - year from df_song
    - duration - duration from df_song
- Table: songplays
    - songplay_id -
    - start_time -
    - user_id -
    - level -
    - song_id -
    - artist_id - 
    - session_id - 
    - location - 
    - user_agent - 

## Table: time

In [9]:
# Show the five smallest distinct ts values from the log data.
(
    df_log
    .select("ts")
    .distinct()
    .orderBy("ts")
    .show(5)
)

+-------------+
|           ts|
+-------------+
|1541105830796|
|1541106106796|
|1541106132796|
|1541106352796|
|1541106496796|
+-------------+
only showing top 5 rows



In [10]:
# Test a function for parsing to datetime
from pyspark.sql.functions import udf
from datetime import datetime
from pyspark.sql.types import TimestampType

# Sample ts values. They are integers (epoch time in milliseconds), which
# Python represents as plain ints.
sample_time_string_1 = 1542296032796
sample_time_string_2 = 1541106496796


def convert_to_datetime(text):
    """Convert an epoch timestamp given in milliseconds to a datetime.

    The value is divided by 1000 to obtain seconds and passed to
    datetime.fromtimestamp without a tz argument, so the result is a naive
    datetime expressed in the machine's local time zone.
    """
    return datetime.fromtimestamp(text / 1000)


datetime_obj = convert_to_datetime(sample_time_string_1)
print(datetime_obj)

2018-11-15 15:33:52.796000


In [11]:
# Obtain the time stamp column
df_time = df_log.select("ts")
get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000), TimestampType())
df_time = df_time.withColumn("start_time", get_timestamp("ts"))
df_time = df_time.drop("ts")
df_time.printSchema()

root
 |-- start_time: timestamp (nullable = true)



In [12]:
# Get the other columns: hour, day, week, month, year, weekday
from pyspark.sql.functions import hour, dayofmonth, month, year, weekofyear, dayofweek

# Add one integer column per calendar component of start_time.
df_time = (
    df_time
    .withColumn("hour", hour("start_time"))
    .withColumn("day", dayofmonth("start_time"))
    .withColumn("month", month("start_time"))
    .withColumn("year", year("start_time"))
    .withColumn("week", weekofyear("start_time"))
    .withColumn("weekday", dayofweek("start_time"))
)

df_time.printSchema()
df_time.show(5)

root
 |-- start_time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)

+--------------------+----+---+-----+----+----+-------+
|          start_time|hour|day|month|year|week|weekday|
+--------------------+----+---+-----+----+----+-------+
|2018-11-15 15:33:...|  15| 15|   11|2018|  46|      5|
|2018-11-15 16:23:...|  16| 15|   11|2018|  46|      5|
|2018-11-15 21:45:...|  21| 15|   11|2018|  46|      5|
|2018-11-15 22:32:...|  22| 15|   11|2018|  46|      5|
|2018-11-21 02:42:...|   2| 21|   11|2018|  47|      4|
+--------------------+----+---+-----+----+----+-------+
only showing top 5 rows



In [13]:
# Get distinct rows only
from pyspark.sql.functions import col

# Final time table: the columns in their output order, one row per distinct
# combination of values.
time_table = df_time.select(
    "start_time", "hour", "day", "week", "month", "year", "weekday"
).distinct()

In [15]:
# Build the output location for the time table by appending 'time' to
# output_data, then display it.
# NOTE(review): output_data uses the s3:// scheme, whereas the write call
# further down passes a hardcoded s3a:// URL instead of this variable.
path = os.path.join(output_data, 'time')
path

's3://udacity-dend-project-output-1995/time'

In [16]:
# Write the time table as parquet to the output bucket. The destination is a
# hardcoded s3a:// URL rather than the `path` variable built above.
# In the recorded run this call fails with an S3 400 Bad Request raised from
# the bucket-existence check during S3AFileSystem initialisation.
time_table.write.parquet("s3a://udacity-dend-project-output-1995/time")

Py4JJavaError: An error occurred while calling o103.parquet.
: com.amazonaws.services.s3.model.AmazonS3Exception: Status Code: 400, AWS Service: Amazon S3, AWS Request ID: 57B90AED0273DA50, AWS Error Code: null, AWS Error Message: Bad Request, S3 Extended Request ID: JsgulVVuhNetRzbFkPYLi+VZgnPn2H6nA81FSFcX8izaimFxtrpOiAY++qQPkS3LN8288ytnuJE=
	at com.amazonaws.http.AmazonHttpClient.handleErrorResponse(AmazonHttpClient.java:798)
	at com.amazonaws.http.AmazonHttpClient.executeHelper(AmazonHttpClient.java:421)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:232)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3528)
	at com.amazonaws.services.s3.AmazonS3Client.headBucket(AmazonS3Client.java:1031)
	at com.amazonaws.services.s3.AmazonS3Client.doesBucketExist(AmazonS3Client.java:994)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:297)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2669)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
	at org.apache.spark.sql.execution.datasources.DataSource.planForWritingFileFormat(DataSource.scala:424)
	at org.apache.spark.sql.execution.datasources.DataSource.planForWriting(DataSource.scala:524)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:290)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:566)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [39]:
# Enable AWS Signature Version 4 for the S3 client. The AWS SDK reads this flag
# from the JVM system properties, so it is set there; an entry with the same
# name in the Hadoop configuration is not consulted by the SDK.
# NOTE(review): this only reaches the driver JVM. On a cluster the executors
# presumably need -Dcom.amazonaws.services.s3.enableV4=true passed through
# spark.executor.extraJavaOptions — confirm for the target deployment.
spark.sparkContext.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")

# S3A connector settings: filesystem implementation, path-style access, the
# regional endpoint, and the credentials taken from the environment variables.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.path.style.access", "true")
hadoop_conf.set("fs.s3a.endpoint", "s3.us-east-2.amazonaws.com")
hadoop_conf.set("fs.s3a.access.key", os.environ['AWS_ACCESS_KEY_ID'])
hadoop_conf.set("fs.s3a.secret.key", os.environ['AWS_SECRET_ACCESS_KEY'])

## Test writing output file to my own bucket
My S3 bucket can be browsed in the AWS console at: https://s3.console.aws.amazon.com/s3/buckets/udacity-dend-project-output-1995/

In [60]:
import boto3

# Connection settings for the boto3 S3 resource; the credentials come from the
# environment variables.
s3_settings = {
    "region_name": 'us-east-2',
    "aws_access_key_id": os.environ["AWS_ACCESS_KEY_ID"],
    "aws_secret_access_key": os.environ["AWS_SECRET_ACCESS_KEY"],
}
s3 = boto3.resource('s3', **s3_settings)

# Upload a small text object to the output bucket; the response of the put
# call is the cell's displayed value.
content = "String content to write to a new S3 file"
s3.Object('udacity-dend-project-output-1995', 'newfile.txt').put(Body=content)

{'ResponseMetadata': {'RequestId': '8E116C0B02D2ABC4',
  'HostId': '7992db/s2B+SPF+4ILkUrZj3Pb+JH5BEMF2TgPBTSS+r5w/NsayqZYLEqfBSyINpeWjRWYI0/8JW4m1KnB2McA==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '7992db/s2B+SPF+4ILkUrZj3Pb+JH5BEMF2TgPBTSS+r5w/NsayqZYLEqfBSyINpeWjRWYI0/8JW4m1KnB2McA==',
   'x-amz-request-id': '8E116C0B02D2ABC4',
   'date': 'Thu, 21 Jan 2021 05:39:13 GMT',
   'etag': '"4a4ba548fe7ddb965593f41a13e1df90"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"4a4ba548fe7ddb965593f41a13e1df90"'}