In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, StringType, TimestampType
import boto3
import pandas as pd
import gc
!pip install s3fs

Collecting s3fs
  Downloading https://files.pythonhosted.org/packages/72/5c/ec84c7ec49fde2c3b0d885ecae4504fa40fc77fef7684e9f2939c50f9b94/s3fs-0.4.0-py3-none-any.whl
Collecting boto3>=1.9.91 (from s3fs)
[?25l  Downloading https://files.pythonhosted.org/packages/d5/57/e9675a5a8d0ee586594ff19cb9a601334fbf24fa2fb29052d2a900ee5d23/boto3-1.11.9-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 5.1MB/s ta 0:00:01
[?25hCollecting fsspec>=0.6.0 (from s3fs)
[?25l  Downloading https://files.pythonhosted.org/packages/dd/1f/7028dacd3c28f34ce48130aae73a88fa5cc27b6b0e494fcf2739f7954d9d/fsspec-0.6.2-py3-none-any.whl (62kB)
[K    100% |████████████████████████████████| 71kB 16.1MB/s ta 0:00:01
[?25hCollecting botocore>=1.12.91 (from s3fs)
[?25l  Downloading https://files.pythonhosted.org/packages/64/4c/b0b0d3b6f84a05f9135051b56d3eb8708012a289c4b82ee21c8c766f47b5/botocore-1.14.9-py2.py3-none-any.whl (5.9MB)
[K    100% |████████████████████████████████| 5.9MB 2.5MB/

In [2]:
# Load the AWS credentials from dl.cfg and export them as environment
# variables so that later cells can read them from os.environ.
config = configparser.ConfigParser()
# read_file() raises if dl.cfg is missing; config.read() would silently skip
# the file and the problem would only surface as an opaque KeyError on 'AWS'.
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

for key in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    os.environ[key] = config['AWS'][key]

In [3]:
def create_spark_session():
    """Return a SparkSession configured to pull in the hadoop-aws 2.7.0 package.

    The session is obtained through ``getOrCreate()``, so an already running
    session is reused instead of a second one being started.
    """
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return builder.getOrCreate()

In [4]:
# Build (or reuse) the Spark session that the read and SQL cells below go through.
spark = create_spark_session()

In [5]:
# Low-level S3 client for us-west-2, authenticated with the credentials that
# were exported to the environment.
s3 = boto3.client(
    "s3",
    region_name="us-west-2",
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
)

In [6]:
# Ask S3 for the objects stored under the "log_data/" prefix and keep the
# values of the response as a positional list.
searchobj = "log_data"
lcobj = [*s3.list_objects_v2(Bucket="udacity-dend", Prefix=searchobj + "/").values()]

In [7]:
# Collect the s3a:// path of every JSON file stored under log_data/.
# A single list_objects_v2 call returns at most 1000 keys, and the position of
# "Contents" among the response values is not guaranteed, so walk every result
# page and read the entry by name.
log_pages = s3.get_paginator("list_objects_v2").paginate(
    Bucket="udacity-dend", Prefix="log_data/"
)
log_data_list = [
    "s3a://udacity-dend/" + obj["Key"]
    for page in log_pages
    for obj in page.get("Contents", [])  # "Contents" is absent on an empty page
    if obj["Key"].endswith(".json")
]

In [8]:
# Ask S3 for the objects stored under the "song_data/" prefix and keep the
# values of the response as a positional list.
searchobj = "song_data"
lcobj = [*s3.list_objects_v2(Bucket="udacity-dend", Prefix=searchobj + "/").values()]

In [9]:
# Collect the s3a:// path of every JSON file stored under song_data/.
# A single list_objects_v2 call returns at most 1000 keys, so a larger prefix
# would be silently truncated; the position of "Contents" among the response
# values is not guaranteed either. Walk every result page and read the entry
# by name.
song_pages = s3.get_paginator("list_objects_v2").paginate(
    Bucket="udacity-dend", Prefix="song_data/"
)
song_data_list = [
    "s3a://udacity-dend/" + obj["Key"]
    for page in song_pages
    for obj in page.get("Contents", [])  # "Contents" is absent on an empty page
    if obj["Key"].endswith(".json")
]

In [10]:
#df = pd.read_json(log_data_list[0], lines=True)

In [11]:
%%time
# Load every listed log file into one DataFrame; the schema is inferred from the JSON.
sdf = spark.read.format("json").load(log_data_list)

CPU times: user 5.12 ms, sys: 4.39 ms, total: 9.51 ms
Wall time: 23.3 s


In [12]:
#print(sdf.count())
#assert df.shape[1] == len(sdf.columns)

In [13]:
#sdf.limit(5).toPandas()

In [14]:
# Keep only the events whose page is "NextSong".
sdf = sdf.where(F.col("page") == "NextSong")

In [15]:
# Users dimension built from the play events.
# distinct() works on all five columns, so a user who appears with more than
# one value of "level" produces one row per level.
userdf = (
    sdf.select(
        F.col("userId").alias("user_id"),
        F.col("firstName").alias("first_name"),
        F.col("lastName").alias("last_name"),
        "gender",
        "level",
    )
    .distinct()
    # Sort on the projected name: "userId" is not part of this schema any more,
    # so ordering by it relied on the analyzer reaching back into the source.
    .orderBy("user_id")
)

In [16]:
#userdf.count()

In [17]:
#userdf.limit(5).toPandas()

In [18]:
%%time
# Persist the users dimension as parquet, replacing whatever is already at the target.
userdf.write.parquet("s3a://christophndde4/user_table/", mode="overwrite")

CPU times: user 82.8 ms, sys: 24.5 ms, total: 107 ms
Wall time: 10min 39s


In [18]:
# Run a full Python garbage-collection pass in this kernel; the number echoed
# below is gc.collect()'s return value.
# NOTE(review): this only touches Python objects in the kernel; presumably it
# does not release any Spark/JVM memory. Confirm it is still needed.
gc.collect()

161

In [19]:
# "ts" is divided by 1000 before the cast, i.e. it is treated as epoch milliseconds.
sdf = sdf.withColumn("timestamp", (F.col("ts") / 1000).cast(TimestampType()))

In [20]:
# Despite its name, "datetime" holds only the calendar date of the event.
sdf = sdf.withColumn("datetime", F.col("timestamp").cast("date"))

In [21]:
#sdf.limit(3).toPandas()

In [22]:
# Expose the play events to Spark SQL under the name "sdftab".
# createOrReplaceTempView replaces the deprecated registerTempTable and can be
# re-run safely because it overwrites an existing view of the same name.
sdf.createOrReplaceTempView("sdftab")

In [23]:
# Time dimension: one row per distinct play timestamp, broken down into its
# calendar parts and sorted by the timestamp (ORDER BY 1 = first column).
# NOTE(review): Spark's DAYOFWEEK is 1-based starting at Sunday; confirm that
# this is the convention wanted for "weekday".
time_table_query = """
    SELECT DISTINCT
    timestamp AS start_time,
    HOUR(timestamp) AS hour,
    DAY(timestamp) AS day,
    WEEKOFYEAR(timestamp) as week,
    MONTH(timestamp) as month,
    YEAR(timestamp) as year,
    DAYOFWEEK(timestamp) as weekday
    FROM
    sdftab
    ORDER BY 1
    """
tdf = spark.sql(time_table_query)

In [25]:
%%time
# Persist the time dimension as parquet, replacing whatever is already at the target.
tdf.write.parquet("s3a://christophndde4/time_table/", mode="overwrite")

CPU times: user 168 ms, sys: 44.8 ms, total: 213 ms
Wall time: 22min 4s


In [24]:
# Force a Python garbage-collection pass in the kernel after the time-table
# write; the echoed number is gc.collect()'s return value.
gc.collect()

30

In [25]:
#tdf.count()

In [26]:
#tdf.printSchema()

In [27]:
#tdf.limit(5).toPandas()

In [28]:
%%time
# Load every listed song file into one staging DataFrame; the schema is inferred from the JSON.
songstage_df = spark.read.format("json").load(song_data_list)

CPU times: user 143 ms, sys: 30.5 ms, total: 174 ms
Wall time: 6min 28s


In [29]:
#songstage_df.count()

In [30]:
#songstage_df.printSchema()

In [31]:
# Songs dimension: the song-level columns of the staging data, sorted by song_id.
song_columns = ["song_id", "title", "artist_id", "year", "duration"]
songsdf = songstage_df.select(*song_columns).orderBy("song_id")

In [32]:
%%time
# Persist the songs dimension as parquet, replacing whatever is already at the target.
songsdf.write.parquet("s3a://christophndde4/song_table/", mode="overwrite")

CPU times: user 193 ms, sys: 52.2 ms, total: 245 ms
Wall time: 26min 31s


In [32]:
#songsdf.count()

In [33]:
#songsdf.printSchema()

In [34]:
#songsdf.limit(5).toPandas()

In [35]:
# Force a Python garbage-collection pass in the kernel after the song-table
# write; the echoed number is gc.collect()'s return value.
gc.collect()

107

In [36]:
# Artists dimension: artist_id plus the artist_* attributes with the prefix
# dropped, de-duplicated over all five columns and sorted by artist_id.
artist_renames = [
    ("artist_name", "name"),
    ("artist_location", "location"),
    ("artist_latitude", "latitude"),
    ("artist_longitude", "longitude"),
]
artistdf = (
    songstage_df
    .select("artist_id", *[F.col(src).alias(dst) for src, dst in artist_renames])
    .distinct()
    .orderBy("artist_id")
)

In [31]:
%%time
# Persist the artists dimension as parquet, replacing whatever is already at the target.
artistdf.write.parquet("s3a://christophndde4/artist_table/", mode="overwrite")

CPU times: user 270 ms, sys: 54 ms, total: 324 ms
Wall time: 24min 28s


In [37]:
#artistdf.count()

In [38]:
#artistdf.printSchema()

In [39]:
#artistdf.limit(5).toPandas()

In [71]:
# Force a Python garbage-collection pass in the kernel after the artist-table
# write; the echoed number is gc.collect()'s return value.
gc.collect()

815

In [42]:
# Show a three-row sample of both staging frames.
# A notebook cell only renders its last expression, so the first preview used
# to be computed and then thrown away; display() renders each one explicitly.
# display is the helper IPython makes available inside notebook kernels.
display(songstage_df.limit(3).toPandas())
display(sdf.limit(3).toPandas())

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId,timestamp,datetime
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:30:26.796,2018-11-15
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:41:21.796,2018-11-15
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:45:41.796,2018-11-15


In [46]:
# Tag every play event with a surrogate key.
# NOTE(review): monotonically_increasing_id() yields unique but not consecutive
# values, and they can differ between runs or partitionings. Confirm that is
# acceptable for songplay_id.
sdf = sdf.withColumn("songplay_id", F.monotonically_increasing_id())

In [82]:
# Publish the three frames to Spark SQL under the names the join query uses.
for view_name, frame in (
    ("sdftab", sdf),
    ("artisttab", artistdf),
    ("songstab", songsdf),
):
    frame.createOrReplaceTempView(view_name)

In [83]:
# Five-row preview of the songplays data: play events from the log, enriched
# with the matching artist and song where one exists (the LEFT JOINs keep
# events that have no match).
# so.song_id is returned so that the join against songstab contributes to the
# result; before, songstab was joined but none of its columns were selected.
# NOTE(review): artisttab is distinct over id, name, location and coordinates,
# so one artist name may match several rows and repeat a songplay_id. Confirm
# whether the result needs de-duplicating.
songplay_df = spark.sql("""
    SELECT 
    s.songplay_id,
    s.timestamp AS start_time,
    s.userId AS user_id,
    s.level,
    a.artist_id,
    a.name,
    s.artist,
    so.song_id
    FROM
    sdftab s
    LEFT JOIN artisttab a
    ON a.name = s.artist
    LEFT JOIN songstab so
    ON so.artist_id = a.artist_id
    AND so.title = s.song
    LIMIT 5""")

In [84]:
# Pull the query result to the driver as a pandas DataFrame; this is the action
# that actually executes the joins defined in the query above.
songplay_df.toPandas()

KeyboardInterrupt: 