In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, StringType, TimestampType
import boto3
import pandas as pd
import gc
!pip install s3fs

Collecting s3fs
  Downloading https://files.pythonhosted.org/packages/72/5c/ec84c7ec49fde2c3b0d885ecae4504fa40fc77fef7684e9f2939c50f9b94/s3fs-0.4.0-py3-none-any.whl
Collecting boto3>=1.9.91 (from s3fs)
[?25l  Downloading https://files.pythonhosted.org/packages/ff/b5/3595b837d2aaf45b93adb8db44bb4ed07c04b3ce9ff6c399350314c779d2/boto3-1.11.8-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 3.6MB/s ta 0:00:01
[?25hCollecting botocore>=1.12.91 (from s3fs)
[?25l  Downloading https://files.pythonhosted.org/packages/61/f3/f06005f90a09bbdd4bc6df76400f0ac279f7e1f556d635ab60fb1f916d1b/botocore-1.14.8-py2.py3-none-any.whl (5.9MB)
[K    100% |████████████████████████████████| 5.9MB 2.4MB/s eta 0:00:01    27% |████████▉                       | 1.6MB 30.4MB/s eta 0:00:01    46% |███████████████                 | 2.8MB 25.5MB/s eta 0:00:01    81% |██████████████████████████      | 4.8MB 19.2MB/s eta 0:00:01
[?25hCollecting fsspec>=0.6.0 (from s3fs)
[?25l  Downloa

In [2]:
config = configparser.ConfigParser()
config.read('dl.cfg')

os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
def create_spark_session():
    spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()
    return spark

In [4]:
spark = create_spark_session()

In [5]:
s3 = boto3.client("s3", region_name="us-west-2", aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'])

In [6]:
searchobj = "log_data"
lcobj = list(s3.list_objects_v2(Bucket="udacity-dend", 
                                Prefix=searchobj + "/").values())

In [7]:
log_data_list = []
for k in lcobj[2]:
    if k["Key"].find(".json") > -1:
        log_data_list.append("s3a://udacity-dend/" + k["Key"])

In [8]:
searchobj = "song_data"
lcobj = list(s3.list_objects_v2(Bucket="udacity-dend", 
                                Prefix=searchobj + "/").values())

In [9]:
song_data_list = []
for k in lcobj[2]:
    if k["Key"].find(".json") > -1:
        song_data_list.append("s3a://udacity-dend/" + k["Key"])

In [10]:
#df = pd.read_json(log_data_list[0], lines=True)

In [11]:
%%time
sdf = spark.read.json(log_data_list)

CPU times: user 11.4 ms, sys: 614 µs, total: 12 ms
Wall time: 22.6 s


In [12]:
#print(sdf.count())
#assert df.shape[1] == len(sdf.columns)

In [13]:
#sdf.limit(5).toPandas()

In [14]:
sdf = sdf.filter(F.col("page")=="NextSong")

In [15]:
userdf = sdf.select(F.col("userId").alias("user_id"), 
                 F.col("firstname").alias("first_name"), 
                 F.col("lastname").alias("last_name"), 
                 "gender", "level").distinct()\
         .orderBy("userId")

In [16]:
#userdf.count()

In [17]:
#userdf.limit(5).toPandas()

In [18]:
%%time
userdf.write.mode("overwrite").parquet("s3a://christophndde4/user_table/")

CPU times: user 82.8 ms, sys: 24.5 ms, total: 107 ms
Wall time: 10min 39s


In [17]:
gc.collect()

161

In [18]:
sdf = sdf.withColumn("timestamp", F.expr("cast(ts / 1000 as timestamp)"))

In [19]:
sdf = sdf.withColumn("datetime", F.expr("cast(timestamp as date)"))

In [20]:
#sdf.limit(3).toPandas()

In [21]:
sdf.registerTempTable("sdftab")

In [22]:
tdf = spark.sql("""
    SELECT DISTINCT
    timestamp AS start_time,
    HOUR(timestamp) AS hour,
    DAY(timestamp) AS day,
    WEEKOFYEAR(timestamp) as week,
    MONTH(timestamp) as month,
    YEAR(timestamp) as year,
    DAYOFWEEK(timestamp) as weekday
    FROM
    sdftab
    ORDER BY 1
    """)

In [25]:
%%time
tdf.write.mode("overwrite").parquet("s3a://christophndde4/time_table/")

CPU times: user 168 ms, sys: 44.8 ms, total: 213 ms
Wall time: 22min 4s


In [23]:
gc.collect()

30

In [24]:
#tdf.count()

In [25]:
#tdf.printSchema()

In [26]:
#tdf.limit(5).toPandas()

In [27]:
%%time
songstage_df = spark.read.json(song_data_list)

CPU times: user 165 ms, sys: 19.8 ms, total: 184 ms
Wall time: 6min 16s


In [29]:
#songstage_df.count()

In [30]:
#songstage_df.printSchema()

In [28]:
songsdf = songstage_df.select("song_id", "title", "artist_id", "year", 
                              "duration")\
                      .orderBy(F.col("song_id"))

In [32]:
%%time
songsdf.write.mode("overwrite").parquet("s3a://christophndde4/song_table/")

CPU times: user 193 ms, sys: 52.2 ms, total: 245 ms
Wall time: 26min 31s


In [33]:
#songsdf.count()

In [34]:
#songsdf.printSchema()

In [35]:
#songsdf.limit(5).toPandas()

In [29]:
gc.collect()

107

In [30]:
artistdf = songstage_df.select("artist_id", 
                               F.col("artist_name").alias("name"),
                               F.col("artist_location").alias("location"),
                               F.col("artist_latitude").alias("latitude"),
                               F.col("artist_longitude").alias("longitude"))\
                       .distinct().orderBy(F.col("artist_id"))

In [31]:
%%time
artistdf.write.mode("overwrite").parquet("s3a://christophndde4/artist_table/")

CPU times: user 270 ms, sys: 54 ms, total: 324 ms
Wall time: 24min 28s


In [None]:
#artistdf.count()

In [None]:
#artistdf.printSchema()

In [None]:
#artistdf.limit(5).toPandas()

In [33]:
gc.collect()

0