In [72]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, StringType, TimestampType
import boto3
import pandas as pd
!pip install s3fs



In [57]:
config = configparser.ConfigParser()
config.read('dl.cfg')

os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY']

In [58]:
def create_spark_session():
    spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()
    return spark

In [59]:
spark = create_spark_session()

In [60]:
s3 = boto3.client("s3", region_name="us-west-2", aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'])

In [61]:
searchobj = "log_data"
lcobj = list(s3.list_objects_v2(Bucket="udacity-dend", 
                                Prefix=searchobj + "/").values())

In [62]:
log_data_list = []
for k in lcobj[2]:
    if k["Key"].find(".json") > -1:
        log_data_list.append("s3a://udacity-dend/" + k["Key"])

In [141]:
searchobj = "song_data"
lcobj = list(s3.list_objects_v2(Bucket="udacity-dend", 
                                Prefix=searchobj + "/").values())

In [142]:
song_data_list = []
for k in lcobj[2]:
    if k["Key"].find(".json") > -1:
        song_data_list.append("s3a://udacity-dend/" + k["Key"])

In [63]:
df = pd.read_json(log_data_list[0], lines=True)

In [100]:
sdf = spark.read.json(log_data_list)

In [101]:
print(sdf.count())
assert df.shape[1] == len(sdf.columns)

8056


In [102]:
sdf.limit(5).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
3,,Logged In,Wyatt,M,0,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1540872000000.0,563,,200,1542247071796,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,9
4,,Logged In,Austin,M,0,Rosales,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1541060000000.0,521,,200,1542252577796,Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20...,12


In [103]:
sdf = sdf.filter(F.col("page")=="NextSong")

In [104]:
#userdf = sdf.select(col("userId").alias("user_id"), 
#                 col("firstname").alias("first_name"), 
#                 col("lastname").alias("last_name"), 
#                 "gender", "level").distinct()\
#         .orderBy("userId")

In [105]:
#userdf.count()

104

In [106]:
#userdf.limit(5).toPandas()

Unnamed: 0,user_id,first_name,last_name,gender,level
0,10,Sylvie,Cruz,F,free
1,100,Adler,Barrera,M,free
2,101,Jayden,Fox,M,free
3,11,Christian,Porter,F,free
4,12,Austin,Rosales,M,free


In [108]:
#%%time
#userdf.write.mode("overwrite").parquet("s3a://christophndde4/user_table/")

In [109]:
sdf = sdf.withColumn("timestamp", F.expr("cast(ts / 1000 as timestamp)"))

In [114]:
sdf = sdf.withColumn("datetime", F.expr("cast(timestamp as date)"))

In [115]:
sdf.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- datetime: date (nullable = true)



In [117]:
sdf.limit(3).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId,timestamp,datetime
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:30:26.796,2018-11-15
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:41:21.796,2018-11-15
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:45:41.796,2018-11-15


In [125]:
sdf.registerTempTable("sdftab")

In [136]:
tdf = spark.sql("""
    SELECT DISTINCT
    timestamp AS start_time,
    HOUR(timestamp) AS hour,
    DAY(timestamp) AS day,
    WEEKOFYEAR(timestamp) as week,
    MONTH(timestamp) as month,
    YEAR(timestamp) as year,
    DAYOFWEEK(timestamp) as weekday
    FROM
    sdftab
    ORDER BY 1
    """)

In [137]:
tdf.count()

6813

In [138]:
tdf.printSchema()

root
 |-- start_time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekday: integer (nullable = true)



In [139]:
tdf.limit(5).toPandas()

Unnamed: 0,start_time,hour,day,week,month,year,weekday
0,2018-11-01 21:01:46.796,21,1,44,11,2018,5
1,2018-11-01 21:05:52.796,21,1,44,11,2018,5
2,2018-11-01 21:08:16.796,21,1,44,11,2018,5
3,2018-11-01 21:11:13.796,21,1,44,11,2018,5
4,2018-11-01 21:17:33.796,21,1,44,11,2018,5


In [140]:
sdf.limit(3).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId,timestamp,datetime
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:30:26.796,2018-11-15
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:41:21.796,2018-11-15
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:45:41.796,2018-11-15


In [None]:
songplay_df = sdf.select("")

In [144]:
%%time
songstage_df = spark.read.json(song_data_list)

In [160]:
len(song_data_list)

999

In [148]:
songstage_df.count()

999

In [147]:
songstage_df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [146]:
songstage_df.limit(5).toPandas()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARSUVLW12454A4C8B8,35.83073,Tennessee,-85.97874,Royal Philharmonic Orchestra/Sir Thomas Beecham,94.56281,1,SOBTCUI12A8AE48B70,Faust: Ballet Music (1959 Digital Remaster): V...,0
1,ARA04401187B991E6E,54.99241,"Londonderry, Northern Ireland",-7.31923,JOSEF LOCKE & ORCHESTRA,184.16281,1,SOXKFTF12A6D4FBF31,Isle Of Innisfree (Film 'Quiet Man') (1992 Dig...,0
2,ARXQC081187FB4AD42,54.31407,UK,-2.23001,William Shatner_ David Itkin_ The Arkansas Sym...,1047.71873,1,SOXRPUH12AB017F769,Exodus: Part I: Moses and Pharaoh,0
3,ARWUNH81187FB4A3E0,,"Miami , Florida",,Trick Daddy,227.10812,1,SOVNKJI12A8C13CB0D,Take It To Da House (Featuring The Slip N' Sli...,2001
4,ARNU0OM1187FB3F14A,32.77815,"Dallas, TX",-96.7954,Larry Groce/Disneyland Children's Sing-Along C...,90.04363,1,SOPEJZP12A8C1369E6,He's Got The Whole World In His Hands,0


In [158]:
songsdf = songstage_df.select("song_id", "title", "artist_id", "year", 
                              "duration")\
                      .orderBy(F.col("song_id"))

In [159]:
songsdf.count()

999

In [151]:
songsdf.printSchema()

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- year: long (nullable = true)
 |-- duration: double (nullable = true)



In [152]:
songsdf.limit(5).toPandas()

Unnamed: 0,song_id,title,artist_id,year,duration
0,SOAADAD12A8C13D5B0,One Shot (Album Version),ARQTC851187B9B03AF,2005,263.99302
1,SOABCEU12A8C132027,Cold Waste,ARL6NP61187B98C1FC,2007,385.43628
2,SOABNPC12A8C13A9CC,Après Le Show,ARFM1EQ1187FB533ED,2005,223.4771
3,SOABWAP12A8C13F82A,Take Time,AR5LMPY1187FB573FE,1978,258.89914
4,SOABYIT12AB0183026,Vilda vindar,AR98ZSW1187B98E82C,1985,266.13506


In [154]:
artistdf = songstage_df.select("artist_id", 
                               F.col("artist_name").alias("name"),
                               F.col("artist_location").alias("location"),
                               F.col("artist_latitude").alias("latitude"),
                               F.col("artist_longitude").alias("longitude"))\
                       .distinct().orderBy(F.col("artist_id"))

In [155]:
artistdf.count()

967

In [156]:
artistdf.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [157]:
artistdf.limit(5).toPandas()

Unnamed: 0,artist_id,name,location,latitude,longitude
0,AR00Y9I1187B999412,Akercocke,,,
1,AR065TW1187FB4C3A5,Tricky / The Mad Dog Reflex,"Knowle West, Bristol, Avon, Engla",51.43558,-2.57518
2,AR06EB01187FB40150,NOFX,"Berkeley, CA",,
3,AR06XSY1187B9B279E,Little River Band,"Melbourne, Australia",,
4,AR08LXJ1187B9995A4,Tungtvann,,,


In [161]:
input_data = "s3a://udacity-dend/"

In [168]:
bucketname = input_data[6: ]
bucketname = bucketname[0:bucketname.find("/")]

lcobj = list(s3.list_objects_v2(Bucket=bucketname, 
                            Prefix="log_data/").values())

In [171]:
log_data = []
for k in lcobj[2]:
    if k["Key"].find(".json") > -1:
        log_data.append("s3a://" + bucketname + "/" + k["Key"])

In [170]:
log_data

['s3a://udacity-dendlog_data/2018/11/2018-11-01-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-02-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-03-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-04-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-05-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-06-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-07-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-08-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-09-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-10-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-11-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-12-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-13-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-14-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-15-events.json',
 's3a://udacity-dendlog_data/2018/11/2018-11-16-events.json',
 's3a://