## QUIZ

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.functions import desc, asc, sum as Fsum

In [2]:
import datetime
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
# Obtain the Spark session for this quiz, named so it is easy to spot in the Spark UI.
spark = (
    SparkSession.builder
    .appName("Quiz on Wrangling data")
    .getOrCreate()
)

In [4]:
# Load the small Sparkify event-log sample (JSON) into a DataFrame.
path = "file:/home/workspace/data/sparkify_log_small.json"
user_log = spark.read.format("json").load(path)

### QUESTION 1
`Which page did user id "" (empty string) NOT visit?`

In [5]:
# Distinct pages requested by rows whose userId is the empty string.
# Filter first, then project: the predicate needs `userId`, which the
# projection drops.
blank_pages = user_log.where("userId = ''").select("page").dropDuplicates()

# Distinct pages requested by anyone in the log.
all_pages = user_log.select("page").dropDuplicates()

# Single-quoted literal so the "" is actually printed; the previous
# double-quoted form was two adjacent literals that concatenated and
# silently dropped the quotes.
print('Pages UserId "" did not visit: \n')

# subtract() returns the distinct rows of all_pages that are absent from
# blank_pages, so the difference is computed by Spark rather than by
# collecting both frames and diffing Python sets on the driver.
for row in all_pages.subtract(blank_pages).collect():
    print(row.page)

Pages UserId  did not visit: 

NextSong
Error
Save Settings
Downgrade
Upgrade
Settings
Logout
Submit Upgrade
Submit Downgrade


In [6]:
# Show the distinct pages that the blank-userId rows did request.
blank_pages.show()

+-----+
| page|
+-----+
| Home|
|About|
|Login|
| Help|
+-----+



In [7]:
# Peek at a single raw record to see which fields each log row carries.
user_log.take(1)

[Row(artist='Showaddywaddy', auth='Logged In', firstName='Kenneth', gender='M', itemInSession=112, lastName='Matthews', length=232.93342, level='paid', location='Charlotte-Concord-Gastonia, NC-SC', method='PUT', page='NextSong', registration=1509380319284, sessionId=5132, song='Christmas Tears Will Fall', status=200, ts=1513720872284, userAgent='"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"', userId='1046')]

### QUESTION 2
`How many female users do we have in the data set?`

In [8]:
# Method 1: SQL-string predicate, then count the distinct user ids.
(
    user_log
    .where("gender = 'F'")
    .select("userId")
    .distinct()
    .count()
)

462

In [9]:
# Method 2: column-expression predicate, counting distinct (userId, gender) pairs.
(
    user_log
    .where(user_log["gender"] == "F")
    .select("userId", "gender")
    .distinct()
    .count()
)

462

### QUESTION 3
`How many songs were played from the most played artist?`

In [10]:
# Method 1: count rows per artist (skipping empty artist names) and show the top one.
(
    user_log
    .where(user_log["artist"] != "")
    .groupBy("artist")
    .count()
    .orderBy(desc("count"))
    .show(1)
)

+--------+-----+
|  artist|count|
+--------+-----+
|Coldplay|   83|
+--------+-----+
only showing top 1 row



In [11]:
# Method 2: restrict to song plays (page == 'NextSong'), count plays per
# artist under an explicit column name, and show the most played artist.
(
    user_log
    .where(user_log["page"] == "NextSong")
    .select("artist")
    .groupBy("artist")
    .agg({"artist": "count"})
    .withColumnRenamed("count(artist)", "artist_count")
    .orderBy(desc("artist_count"))
    .show(1)
)

+--------+------------+
|  artist|artist_count|
+--------+------------+
|Coldplay|          83|
+--------+------------+
only showing top 1 row



### QUESTION 4
`How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer.`

In [12]:
from pyspark.sql.window import Window

In [13]:
# Flag each row with 1 when its page is 'Home' and 0 otherwise.
function = udf(lambda ishome: int(ishome == 'Home'), IntegerType())

# Per-user window ordered from the latest event to the earliest. With the
# frame running from the start of the partition to the current row, a sum
# over this window counts the flagged rows at or after the current timestamp.
user_window = (
    Window
    .partitionBy("userId")
    .orderBy(desc("ts"))
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)

# Keep only song plays and home-page visits, then label every row with a
# `period`: the running count of Home visits over `user_window`. Song plays
# that share a (userId, period) pair fall between the same two Home visits.
# The chain is parenthesised instead of using trailing backslashes; the
# original ended with a dangling `\` that only parsed because a blank line
# happened to follow it.
cumsum = (
    user_log
    .filter((user_log.page == 'NextSong') | (user_log.page == 'Home'))
    .select('userId', 'page', 'ts')  # spelled as in the schema (was 'userID')
    .withColumn('homevisit', function("page"))
    .withColumn('period', Fsum('homevisit').over(user_window))
)

# Count the songs in each (userId, period) bucket, then average those counts
# over all buckets to get the mean number of songs played between Home visits.
(
    cumsum
    .filter(cumsum.page == 'NextSong')
    .groupBy('userId', 'period')
    .agg({'period': 'count'})
    .agg({'count(period)': 'avg'})
    .withColumnRenamed('avg(count(period))', "average_song_played_after_visiting_home_page_per_user")
    .show()
)

+-----------------------------------------------------+
|average_song_played_after_visiting_home_page_per_user|
+-----------------------------------------------------+
|                                    6.898347107438017|
+-----------------------------------------------------+



In [14]:
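# NOTE(review): disabled draft, kept for reference. `songs` is not a field of
# user_log (the sample row shows `song`), so this query would fail as written.
# # To use SQL, we create a temp view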
# user_log.createTempView("user_log_view")

# spark.sql("""
# SELECT COUNT(songs)
# FROM user_log_view
# WHERE userId IS NOT NULL AND page = "Home"
# """);

In [15]:
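# NOTE(review): disabled draft of the Question 4 pipeline. `df` and `col` are
# not defined in the cells shown, and `.over()` is called without a window
# spec; the working version is the cell that builds `cumsum`.
# df.filter((df.page == 'NextSong') | (df.page == 'Home')) \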
#     .select('userID', 'page', 'ts') \
#     .withColumn('homevisit', function(col('page'))) \
#     .withColumn('period', Fsum("homevisit").over())