### Test notebook for prototyping

**Name: Darren Foley**

**Email: darren.foley@ucdconnect.ie**

In [1]:
import os
import configparser
from datetime import datetime
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, IntegerType, TimestampType
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [2]:
!pip install findspark

Collecting findspark
  Downloading https://files.pythonhosted.org/packages/fc/2d/2e39f9a023479ea798eed4351cd66f163ce61e00c717e03c37109f00c0f2/findspark-1.4.2-py2.py3-none-any.whl
Installing collected packages: findspark
Successfully installed findspark-1.4.2


In [3]:
import findspark
# Per findspark's documentation, init() makes the local Spark installation importable.
# NOTE(review): pyspark is already imported in the first cell, i.e. before this call;
# findspark.init() is normally run before importing pyspark - confirm the cell order.
findspark.init()

In [4]:
# Load the AWS credentials from the [AWS] section of dl.cfg.
# read_file() is used instead of read(): read() silently skips a missing file,
# which would only surface later as a confusing NoSectionError.
config = configparser.ConfigParser()
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

# Look each key up once and reuse the value below and in later cells.
AWS_ACCESS_KEY = config.get('AWS', 'AWS_ACCESS_KEY_ID')
AWS_SECRET_KEY = config.get('AWS', 'AWS_SECRET_ACCESS_KEY')

# Also export the credentials as environment variables of this process.
os.environ['AWS_ACCESS_KEY_ID'] = AWS_ACCESS_KEY
os.environ['AWS_SECRET_ACCESS_KEY'] = AWS_SECRET_KEY

# Do not echo these values in a cell: the output would be saved with the notebook.

In [5]:

# Spark configuration: pull in the hadoop-aws package that provides the S3 connectors.
conf = SparkConf()
conf.set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:2.7.0')
sc = SparkContext(conf=conf)

hadoop_conf = sc._jsc.hadoopConfiguration()

# Every path in this notebook uses the s3a:// scheme, so the credentials must be
# supplied under the S3A property names; the fs.s3n.* keys are not read by S3A.
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.access.key", AWS_ACCESS_KEY)
hadoop_conf.set("fs.s3a.secret.key", AWS_SECRET_KEY)

# Kept from the original cell so that any s3n:// path keeps working.
hadoop_conf.set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", AWS_SECRET_KEY)

# SQL entry point used by the read cells below.
sqlContext = SQLContext(sc)

In [6]:
# Explicit schema for the song metadata JSON files; every field is nullable.
song_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("artist_id", StringType()),
        ("artist_latitude", DoubleType()),
        ("artist_location", StringType()),
        ("artist_longitude", DoubleType()),
        ("artist_name", StringType()),
        ("duration", DoubleType()),
        ("num_songs", LongType()),
        ("song_id", StringType()),
        ("title", StringType()),
        ("year", LongType()),
    ]
])

In [7]:
#reading the first csv file and store it in an RDD
song_data = sqlContext.read.json("s3a://udacity-dend/song_data/*/*/*/*.json", schema = song_schema)
#song_data = sqlContext.read.json("s3a://udacity-dend/song_data/A/A/A/*.json")

In [8]:
# Preview at most five rows of song_data as a pandas DataFrame.
song_data.limit(5).toPandas()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,AR4T2IF1187B9ADBB7,63.96027,"<a href=""http://billyidol.net"" onmousedown='Un...",10.22442,Billy Idol,233.22077,1,SOVIYJY12AF72A4B00,The Dead Next Door (Digitally Remastered 99),1983
1,AR4T2IF1187B9ADBB7,63.96027,"<a href=""http://billyidol.net"" onmousedown='Un...",10.22442,Billy Idol,287.92118,1,SOVYXYL12AF72A3373,Rebel Yell (1999 Digital Remaster),1983
2,ARQ846I1187B9A7083,,,,Yvonne S. Moriarty / Walt Fowler / Ladd McInto...,196.04853,1,SOEPTVC12A67ADD0DA,"To Zucchabar [""Gladiator"" - Music from the Mot...",0
3,AR4T2IF1187B9ADBB7,63.96027,"<a href=""http://billyidol.net"" onmousedown='Un...",10.22442,Billy Idol,247.53587,1,SOLQYSZ12AB0181F97,Mony Mony (Live),1987
4,AR3TZ691187FB3DBB1,,,,Russell Watson / Pino Palladino / Robbie McInt...,273.44934,1,SOVPFJK12A6701CB16,Barcelona - (Friends until the end),2000


In [9]:
# Number of rows in song_data.
song_data.count()

14896

In [9]:
# Print the column names, types and nullability of song_data.
song_data.printSchema()

73

#### Extract to songs table

In [8]:
# songs table columns: song_id, title, artist_id, year, duration
song_columns = ["song_id", "title", "artist_id", "year", "duration"]
songs = song_data.select(*song_columns).distinct()

# Row count of the de-duplicated songs table.
songs.count()

73

#### Extract to artist table

In [9]:
# artists table columns: artist_id, name, location, latitude, longitude
artist_columns = [
    "artist_id",
    "artist_name",
    "artist_location",
    "artist_latitude",
    "artist_longitude",
]
artist = song_data.select(*artist_columns).distinct()

# Row count of the de-duplicated artist table (projected onto artist_name).
artist.select("artist_name").count()

73

#### Partitioning

Songs table files are partitioned by year and then by artist ID. - Songs(year, artist_id)

Time table files are partitioned by year and month. - Time(year, month)

Songplays table files are partitioned by year and month. - songPlays(year, month)

In [15]:
# Each table gets its own output directory. mode("overwrite") replaces whatever
# is already at the target path, so writing both tables to one shared path would
# leave only the table written last.
songs.write.format("parquet").partitionBy("year", "artist_id").mode("overwrite").save("tmp/songs_test.parquet")
artist.write.format("parquet").partitionBy("artist_name").mode("overwrite").save("tmp/artist_test.parquet")

#### Writing parquet to S3 bucket in us-west-2

In [19]:
# Write the artist table to S3 as parquet, partitioned by artist_name and
# replacing any previous output under this prefix.
artist_writer = artist.write.partitionBy("artist_name").mode("overwrite")
artist_writer.parquet("s3a://sparkify-data-lake-df/test/artist")

### Processing log data

s3a://udacity-dend/log_data/


Sample path: s3a://udacity-dend/log_data/2018/11/2018-11-13-events.json


In [10]:
# Read every event-log JSON file into a DataFrame; the schema is inferred by Spark.
# (A single file such as log_data/2018/11/2018-11-13-events.json can be used for a quick test.)
log_data = sqlContext.read.json(
    path="s3a://udacity-dend/log_data/*/*/*.json",
)

In [11]:
# Preview at most five rows of log_data as a pandas DataFrame.
log_data.limit(5).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
3,,Logged In,Wyatt,M,0,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1540872000000.0,563,,200,1542247071796,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,9
4,,Logged In,Austin,M,0,Rosales,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1541060000000.0,521,,200,1542252577796,Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20...,12


In [12]:
# Number of rows in log_data.
log_data.count()

8056

In [8]:
# Print the inferred column names, types and nullability of log_data.
log_data.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



#### SongPlay: songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent
#### Users: user_id, first_name, last_name, gender, level
#### Time: start_time, hour, day, week, month, year, weekday

In [12]:
# Users
users = log_data.select(["userId", "firstName", "lastName", "gender", "level"]).distinct()
users.limit(5).toPandas()

Unnamed: 0,userId,firstName,lastName,gender,level
0,80,Tegan,Levine,F,paid
1,15,Lily,Koch,F,paid
2,37,Jordan,Hicks,F,free
3,44,Aleena,Kirby,F,paid
4,50,Ava,Robinson,F,free


In [49]:
# Time table columns: start_time, hour, day, week, month, year, weekday
#
# udf() assumes a StringType result unless a return type is given, so every
# helper declares the type it actually produces. In particular, to_datetime
# must be declared as TimestampType, otherwise the helpers below would not
# receive a datetime object and attribute access such as `x.hour` would fail.


def _null_safe(func):
    """Wrap `func` so that a null (None) input yields null instead of raising."""
    return lambda value: None if value is None else func(value)


def _utc_datetime_to_epoch_ms(moment):
    """Convert a naive UTC datetime to integer epoch milliseconds.

    Plain datetime arithmetic is used so that the result does not depend on
    the machine's local time zone, and round() avoids losing a millisecond to
    float truncation.
    """
    return int(round((moment - datetime(1970, 1, 1)).total_seconds() * 1000))


get_hour = udf(_null_safe(lambda x: x.hour), IntegerType())
get_day = udf(_null_safe(lambda x: x.day), IntegerType())
get_week = udf(_null_safe(lambda x: x.isocalendar()[1]), IntegerType())
get_month = udf(_null_safe(lambda x: x.month), IntegerType())
get_year = udf(_null_safe(lambda x: x.year), IntegerType())
get_weekday = udf(_null_safe(lambda x: x.isoweekday()), IntegerType())
to_timestamp = udf(_null_safe(_utc_datetime_to_epoch_ms), LongType())
# Epoch milliseconds -> naive datetime holding the UTC wall-clock time.
# NOTE(review): Spark stores naive datetimes relative to the local time zone;
# run with a UTC time zone if ts_m itself is ever written out - confirm.
to_datetime = udf(_null_safe(lambda x: datetime.utcfromtimestamp(x / 1000.0)), TimestampType())

# log_data plus a `ts_m` timestamp column derived from the epoch-millisecond `ts`.
time_df = log_data.withColumn("ts_m", to_datetime("ts"))

In [None]:
# Time table: one row per distinct event timestamp with its calendar parts.
# The original epoch-millisecond `ts` column is carried through unchanged rather
# than being rebuilt from `ts_m`, so it keeps the same value and long type as
# the `ts` column selected for the songplay table.
time = (
    time_df.select("ts", "ts_m")
    .withColumn("hour", get_hour("ts_m"))
    .withColumn("day", get_day("ts_m"))
    .withColumn("week", get_week("ts_m"))
    .withColumn("month", get_month("ts_m"))
    .withColumn("year", get_year("ts_m"))
    .withColumn("weekday", get_weekday("ts_m"))
    .select("ts", "hour", "day", "week", "month", "year", "weekday")
    .distinct()
)

#time.filter(time.ts == 1542071549796).limit(1).toPandas()

In [13]:
# songplays table columns: songplay_id, start_time, user_id, level, song_id,
# artist_id, session_id, location, user_agent
songplay_columns = [
    "ts",
    "userId",
    "level",
    "song",
    "artist",
    "sessionId",
    "location",
    "userAgent",
]
songplay = log_data.select(*songplay_columns)

# Preview at most five rows of the selection.
songplay.limit(5).toPandas()

Unnamed: 0,ts,userId,level,song,artist,sessionId,location,userAgent
0,1542069417796,66,free,,,514,"Harrisburg-Carlisle, PA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
1,1542069637796,66,free,Ja I Ty,Fu,514,"Harrisburg-Carlisle, PA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
2,1542071524796,51,free,,,510,"Houston-The Woodlands-Sugar Land, TX","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK..."
3,1542071549796,51,free,A Party Song (The Walk of Shame),All Time Low,510,"Houston-The Woodlands-Sugar Land, TX","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK..."
4,1542079142796,9,free,Pop-Pop!,Nik & Jay,379,"Eureka-Arcata-Fortuna, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....


In [16]:
# Lookup tables derived from the song metadata.
artist_only = artist.select("artist_id", "artist_name").distinct()
# artist_id is carried along so that each song stays tied to its own artist.
songs_only = songs.select("song_id", "title", "artist_id").distinct()

# One row per (song, artist name). Joining on the column *name* lets Spark
# resolve artist_id on each side even though both frames come from song_data.
song_lookup = songs_only.join(artist_only, on="artist_id", how="inner")

# A log event matches only when BOTH the title and the artist name agree with
# the same song row. Matching artist and title in two independent joins could
# pair a song_id with the artist_id of an unrelated artist whenever two artists
# have a song with the same title.
result = (
    songplay
    .join(
        song_lookup,
        (song_lookup.title == songplay.song)
        & (song_lookup.artist_name == songplay.artist),
        "inner",
    )
    .select(
        songplay.ts,
        songplay.userId,
        songplay.level,
        song_lookup.song_id,
        song_lookup.artist_id,
        songplay.sessionId,
        songplay.location,
        songplay.userAgent,
    )
)


In [17]:
# Preview at most five rows of the joined songplay result.
result.limit(5).toPandas()

Unnamed: 0,ts,userId,level,song_id,artist_id,sessionId,location,userAgent
0,1542140035796,95,paid,SOUPKAB12AB0185DF9,AR12F2S1187FB56EEF,411,"Winston-Salem, NC","""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like..."
