In [None]:
import numpy as np
import pandas as pd
import json

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

The streaming history is read in and filtered to plays that ended on or after 1 January 2021 (change the cutoff date to analyse a different period). If you have downloaded your own streaming history, change the JSON file path.

In [None]:
# Load the exported streaming history, then keep only plays from 2021 onwards.
# endTime is compared as text against the ISO-style date prefix.
data = pd.read_json(path_or_buf='../data/ExampleData/StreamingHistory0.json')
data = data.loc[data['endTime'] >= '2021-01-01']
data

# Can We Get A List Of Songs
A unique list of songs (artist name and track name).

In [None]:
# One row per distinct (artistName, trackName) pair, re-indexed from 0.
all_songs = data.loc[:, ['artistName', 'trackName']].drop_duplicates(ignore_index=True)
all_songs

# Statistics Without API

## Which Artists Have You Listened To The Most?
Orders artists by the number of plays (regardless of play length). Artists are only counted if they are the main artist on the track.

In [None]:
# Number of plays per main artist, 20 most-played first.
(
    data.groupby('artistName')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .head(20)
)

## Which Songs Have You Listened To The Most?
Orders songs by the number of plays (regardless of play length).

In [None]:
# Number of plays per (artist, track) pair, 20 most-played first.
(
    data.groupby(['artistName', 'trackName'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .head(20)
)

## How Many Total Songs Have You Listened To?
Counts the number of songs played overall (regardless of play length).

In [None]:
# Every row of the history is one play, so the row count is the total.
len(data)

## How Many Unique Songs Have You Listened To?
Counts the number of different songs played overall (regardless of play length).

In [None]:
# Number of distinct (artist, track) groups in the history.
data.groupby(['artistName', 'trackName']).ngroups

## How Many Unique Artists Have You Listened To?
Counts the number of different artists played overall, where artists are only counted if they were a main artist on at least 1 track.

In [None]:
# Number of distinct main-artist groups in the history.
data.groupby(['artistName']).ngroups

## How Long Have You Been Listening To Songs?
Sums the amount of time listening to tracks overall, and outputs that as a single string of days, hours, minutes, and seconds.

In [None]:
# Break the total play time down into days / hours / minutes / seconds.
# Each divmod peels off the next larger unit and leaves the remainder behind.
total_s, total_ms = divmod(data.msPlayed.sum(), 1000)
total_m, total_s = divmod(total_s, 60)
total_h, total_m = divmod(total_m, 60)
total_d, total_h = divmod(total_h, 24)

# Only units with a non-zero amount appear in the final string.
labelled_amounts = (
    (total_d, ' Days'),
    (total_h, ' Hours'),
    (total_m, ' Minutes'),
    (total_s, ' Seconds'),
)
time_string = ', '.join(str(amount) + label for amount, label in labelled_amounts if amount > 0)

print(time_string)

## Which Days Did You Listen To Songs The Most?
Orders days of the year by amount of time listening to tracks.

In [None]:
# Work on a copy so the extra 'day' column never leaks into `data`.
data_days = data.copy()
# The first 10 characters of endTime are the calendar date part of the timestamp text.
data_days['day'] = data_days['endTime'].str[:10]
(
    data_days.groupby('day')['msPlayed']
    .sum()
    .reset_index()
    .sort_values(by='msPlayed', ascending=False)
    .head(10)
)

# Get Songs Through API
Obtain the Spotify developer keys from the key file, which allows use of the API.

In [None]:
def get_dev_keys(path='../data/api_keys/api_dev_keys.txt'):
    """Read the Spotify developer credentials from a plain-text key file.

    The file is expected to hold the client ID on its first non-blank line
    and the client secret on its second non-blank line.

    Parameters
    ----------
    path : str, optional
        Location of the key file. Defaults to the notebook's standard
        ``../data/api_keys/api_dev_keys.txt``.

    Returns
    -------
    tuple of (str, str)
        ``(client_id, client_secret)`` with surrounding whitespace removed.

    Raises
    ------
    FileNotFoundError
        If the key file does not exist.
    ValueError
        If the file contains fewer than two non-blank lines.
    """
    with open(path, 'r') as f:
        # Strip each line so stray spaces or carriage returns never end up
        # inside the credentials, and ignore blank lines entirely.
        keys = [line.strip() for line in f if line.strip()]

    if len(keys) < 2:
        raise ValueError(
            'Expected the client ID and client secret on separate lines in ' + repr(path)
        )
    return keys[0], keys[1]

In [None]:
# get_dev_keys() returns the first two lines of the key file, in order;
# they are used below as the client ID and client secret.
SPOTIPY_CLIENT_ID, SPOTIPY_CLIENT_SECRET = get_dev_keys()

In [None]:
# Build the credentials manager from the developer keys, then the API client
# that every later lookup goes through.
client_credentials_manager = SpotifyClientCredentials(
    client_id=SPOTIPY_CLIENT_ID,
    client_secret=SPOTIPY_CLIENT_SECRET,
)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

## Get Artists For Each Song
The method below will attempt to return the artists' details for a given track (where a track is a main artist name and track name). All main and featured artists will be returned.

In [None]:
def get_song_artists_through_api(artist, track):
    """Return the artist entries of the top search hit for a song.

    The search text is the artist name and track name joined by a single
    space, and only one result is requested from ``sp.search``.

    Raises whatever the lookup raises; in particular an ``IndexError`` when
    the search returns no items.
    """
    query_text = ' '.join((artist, track))
    top_hit = sp.search(query_text, limit=1)['tracks']['items'][0]
    return top_hit['artists']

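Each song is then looked up in turn to build a table with one row per artist on each track, containing the main artist name, the track name, and that artist's URI. If a lookup fails and the track name contains ' - ', the search is retried with the text after the last ' - ' removed; songs that still cannot be found are printed. The unique artist URIs are also stored as a list in the `artist_uris` variable.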

In [None]:
# Rows of [main artist name, track name, artist URI]; one row per artist on each song.
all_songs_with_artists = []

for index, row in all_songs.iterrows():
    # Search with the full track name first. If the name contains ' - ',
    # fall back to the name with the text after the last ' - ' removed.
    candidate_titles = [row.trackName]
    if ' - ' in row.trackName:
        candidate_titles.append(' - '.join(row.trackName.split(' - ')[:-1]))

    for title in candidate_titles:
        try:
            # Get song artists
            song_artists = get_song_artists_through_api(row.artistName, title)
            uris = [artist['uri'] for artist in song_artists]
        except Exception:
            # Lookup failed (API error or no search result): try the next
            # candidate. Catching Exception rather than using a bare except
            # keeps Ctrl-C / kernel interrupts working during this long loop.
            continue

        # Store artists for song, always under the original track name so the
        # rows still match the streaming history.
        all_songs_with_artists.extend([row.artistName, row.trackName, uri] for uri in uris)
        break
    else:
        # Every candidate failed: report the song so it can be checked by hand.
        print(index, row.artistName, row.trackName)

In [None]:
# Turn the collected [artist name, track name, artist URI] rows into a table.
all_songs_with_artists = pd.DataFrame(
    data=all_songs_with_artists,
    columns=['artistName', 'trackName', 'artistURI'],
)

In [None]:
# Distinct artist URIs, in order of first appearance.
artist_uris = all_songs_with_artists['artistURI'].unique().tolist()

## Get Artist Details
The Spotify artist details for each artist found previously are now obtained through the Spotify API. Artist info is retrieved in batches of 50, which is the maximum the API allows per request.

In [None]:
# Map each artist URI to the full artist record returned by the API.
artist_search = {}
for batch_start in range(0, len(artist_uris), 50):
    # Request the artists in slices of at most 50 URIs.
    uri_batch = artist_uris[batch_start:batch_start + 50]
    batch_details = sp.artists(uri_batch)['artists']
    artist_search.update({details['uri']: details for details in batch_details})

# Statistics With API

## Which Artists Have You Listened To The Most?

The method below will check if an artist appears in a given song.

In [None]:
def check_uri_for_song(uri, trackName, artistName):
    """Tell whether the artist `uri` is listed for the given song.

    A song is identified by its main artist name and track name; the lookup
    is made against the rows of `all_songs_with_artists`.
    """
    same_artist = all_songs_with_artists.artistName == artistName
    same_track = all_songs_with_artists.trackName == trackName
    uris_for_song = all_songs_with_artists.loc[same_artist & same_track, 'artistURI']
    return uri in uris_for_song.unique()

Using our data from the Spotify API, we can now calculate the total amount of time spent listening to each artist (including tracks on which the artist is only featured).

In [None]:
# Get play time for each artist
df_artists = data[['trackName', 'artistName', 'msPlayed']]
artist_play_times = []
for uri in list(artist_search.keys()):
    artist_play_time = df_artists[df_artists.apply(lambda x: check_uri_for_song(uri, x.trackName, x.artistName), axis=1)].msPlayed.sum()
    artist_play_times.append((uri, artist_play_time))

# Get top 5 artists
top_artists = pd.DataFrame(artist_play_times, columns=['URI', 'Ms'])
top_artists['Name'] = top_artists.URI.apply(lambda x: artist_search[x]['name'])
top_artists['Minutes'] = top_artists.Ms / 60000
top_artists

The top 20 artists are outputted below.

In [None]:
# Rank artists by minutes played and show the name and minutes of the top 20.
(
    top_artists
    .sort_values(by='Minutes', ascending=False)
    .loc[:, ['Name', 'Minutes']]
    .head(20)
)

## Which Genres Have You Listened To The Most?

We can also find the most listened to genres. A song's genre is based upon the genres of the main artist, as Spotify does not store genres for each individual song.

Firstly, we will go through each artist to get a set of genres.

In [None]:
# Collect every genre attached to any artist, de-duplicated through a set.
genres = list({
    genre
    for artist_details in artist_search.values()
    for genre in artist_details['genres']
})

We can then go through each song to get the play times for each genre.

In [None]:
# Set up genre time dict
genre_times = {}
for genre in genres:
    genre_times[genre] = 0

for index, row in data.iterrows():
    # Get main artist
    song_data = all_songs_with_artists[(all_songs_with_artists.trackName==row.trackName) & (all_songs_with_artists.artistName==row.artistName)]
    song_data = song_data.copy().reset_index()
    try:
        main_artist = song_data.iloc[0].artistURI
    except:
        main_artist = None
    
    # Add genres of main artist to counter
    if main_artist != None:
        artist_genres = artist_search[main_artist]['genres']
        for genre in artist_genres:
            genre_times[genre] += row.msPlayed

The top 20 genres are outputted below.

In [None]:
# Tabulate the per-genre totals, convert milliseconds to minutes, and show
# the 20 genres with the most listening time.
df_genre = pd.DataFrame({
    'Genre': list(genre_times.keys()),
    'Ms': list(genre_times.values()),
})
df_genre['Minutes'] = df_genre['Ms'] / 60000
(
    df_genre
    .sort_values(by='Minutes', ascending=False)
    .loc[:, ['Genre', 'Minutes']]
    .head(20)
)