# Data Wrangling with DataFrames Coding Quiz

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.functions import udf

In [2]:
spark = SparkSession\
        .builder\
        .appName("data wrangling quiz")\
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/12 01:26:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
DATA_PATH = '../../data/sparkify_log_small.json'
user_log = spark.read.json(DATA_PATH)



## Question 1
### Which page did user id "" (empty string) NOT visit?

First we will see all the posibilities that a user had to visit a page

In [10]:
pages_to_visit = user_log.select('page').dropDuplicates()
pages_to_visit.show()

+----------------+
|            page|
+----------------+
|Submit Downgrade|
|            Home|
|       Downgrade|
|          Logout|
|   Save Settings|
|           About|
|        Settings|
|           Login|
|        NextSong|
|            Help|
|         Upgrade|
|           Error|
|  Submit Upgrade|
+----------------+



Then, let's see which pages do the _empty user_ visited.

In [16]:
# show pages that the empty user visited
empty_user_visited_pages = set(
    user_log.filter(user_log.userId == '').select(['userId','page']).dropDuplicates().toPandas().page.unique()
)
empty_user_visited_pages

{'About', 'Help', 'Home', 'Login'}

Then the pages not visited by the empty user are:


In [17]:
pages_list = set(pages_to_visit.toPandas().page.unique())
# pages the empty user did not visit
pages_list - empty_user_visited_pages

{'Downgrade',
 'Error',
 'Logout',
 'NextSong',
 'Save Settings',
 'Settings',
 'Submit Downgrade',
 'Submit Upgrade',
 'Upgrade'}

## Question 2 - Reflect 
### What type of user does the empty string user id most likely refer to?

In [19]:
user_log.filter(user_log.userId == '').head(5)

[Row(artist=None, auth='Logged Out', firstName=None, gender=None, itemInSession=0, lastName=None, length=None, level='free', location=None, method='PUT', page='Login', registration=None, sessionId=5598, song=None, status=307, ts=1513721196284, userAgent=None, userId=''),
 Row(artist=None, auth='Logged Out', firstName=None, gender=None, itemInSession=26, lastName=None, length=None, level='paid', location=None, method='GET', page='Home', registration=None, sessionId=428, song=None, status=200, ts=1513721274284, userAgent=None, userId=''),
 Row(artist=None, auth='Logged Out', firstName=None, gender=None, itemInSession=5, lastName=None, length=None, level='free', location=None, method='GET', page='Home', registration=None, sessionId=2941, song=None, status=200, ts=1513722009284, userAgent=None, userId=''),
 Row(artist=None, auth='Logged Out', firstName=None, gender=None, itemInSession=5, lastName=None, length=None, level='paid', location=None, method='GET', page='Home', registration=None, 

## Question 3
### How many female users do we have in the data set?

In [25]:
user_log.filter(user_log.gender == 'F').dropDuplicates(subset = ['userId']).count()

462

## Question 4
### How many songs were played from the most played artist?

First lets see who is the most played artist

In [33]:
# every row of the next df, means that the user played a song
had_play_songs = user_log.filter(user_log.page == 'NextSong')
# compute the times an artist was played
had_play_songs.groupBy('artist').count().orderBy('count', ascending = False).show()

+--------------------+-----+
|              artist|count|
+--------------------+-----+
|            Coldplay|   83|
|       Kings Of Leon|   69|
|Florence + The Ma...|   52|
|            BjÃÂ¶rk|   46|
|       Dwight Yoakam|   45|
|       Justin Bieber|   43|
|      The Black Keys|   40|
|         OneRepublic|   37|
|        Jack Johnson|   36|
|                Muse|   36|
|           Radiohead|   31|
|        Taylor Swift|   29|
|          Lily Allen|   28|
|Barry Tuckwell/Ac...|   28|
|               Train|   28|
|           Daft Punk|   27|
|          Nickelback|   27|
|           Metallica|   27|
|          Kanye West|   26|
|          John Mayer|   24|
+--------------------+-----+
only showing top 20 rows



Thus, **Coldplay** was the most played artist, with **83 reproductions**.

## Question 5 (challenge)
### How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer.


In [87]:
from pyspark.sql import Window
from pyspark.sql.functions import desc, asc, count

In [85]:
home_and_reproduced_pages = user_log.filter(user_log.page.isin(['Home', 'NextSong']))

In [113]:
# Use a window function and cumulative sum to distinguish as either pre or post played song and home events.
windowval = Window.partitionBy("page").orderBy(desc("ts")).rangeBetween(Window.unboundedPreceding, 0)

In [127]:
# create a function to flag the home event
visited_home = udf(lambda x : 1 if x == 'Home' else 0, IntegerType())
#reproduced_song = udf(lambda x : 2 if x == 'NextSong' else 0, IntegerType())
# añade indicadoras de visita a HOme y reproducción de cnación
home_and_reproduced_pages = (
    home_and_reproduced_pages
    .select(['userId', 'page', 'visit_home','ts'])
    .withColumn("visit_home", visited_home('page'))
    .withColumn("visit_home_counter", Fsum("visit_home").over(windowval))
)

In [132]:
home_and_reproduced_pages.filter((home_and_reproduced_pages.page == 'NextSong')) \
    .groupBy('userID', 'visit_home_counter') \
    .agg({'visit_home_counter':'count'}) \
    .agg({'count(visit_home_counter)' : 'avg'})\
    .show()


+------------------------------+
|avg(count(visit_home_counter))|
+------------------------------+
|              9.87810650887574|
+------------------------------+



Thus, in average the users play 10 songs between visiting home page