In [15]:
import ast
import json
import os
from os import listdir

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import settings

## Connect to Spotify API with spotipy
Set `client_id` and `client_secret` for Client Credentials authentication and connect to the Spotify API through spotipy. 


In [16]:
#client_id and client_secret set in settings.py
# Build the Client Credentials auth manager from the values stored in the local `settings` module, 
# so that no secret is written in the notebook itself.
client_credentials_manager = SpotifyClientCredentials(client_id = settings.client_id, 
                                                      client_secret = settings.client_secret)
# `sp` is the shared spotipy client; the helper functions defined further down read it as a global.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Read Spotify json Data into Python
We'll start by reading the provided json listening history files into python as a list of dictionaries containing the following information: <br>

1. endTime - The date and time at which listening to the track ended. <br>
2. artistName - The name of the artist of the track (only the releasing artist, not features or collaborations). <br>
3. trackName - The name of the track. Note that this includes songs and podcast episodes. <br>
4. msPlayed - The number of milliseconds of the track that were played. <br> <br>

We will also export this list of dictionaries as a csv of the full listening history for later analysis. 

In [17]:
def _history_file_order(file_name: str) -> tuple:
    '''Sort key that orders history files by their numeric suffix, so that StreamingHistory2 
    comes before StreamingHistory10 (plain string sorting would put 10 first).'''
    stem = file_name.rsplit('.', 1)[0]
    prefix = stem.rstrip('0123456789')
    digits = stem[len(prefix):]
    return prefix, int(digits) if digits else -1


def convert_json(path: str = 'MyData') -> list:
    
    '''Read every Spotify listening history file in a folder into one list of dictionaries.

    Parameters:
        path: folder holding the exported files. Every file whose name starts with 'StreamingHistory' 
              and ends with '.json' is read (StreamingHistory0.json, StreamingHistory1.json, ..., 
              StreamingHistory10.json, etc).

    Returns:
        A list with one dictionary per listening event, exactly as stored in the files (endTime, 
        artistName, trackName and msPlayed). Files are read in numeric order of their suffix. 
        The list is empty when the folder holds no history files.

    Raises:
        FileNotFoundError if the folder does not exist, json.JSONDecodeError if a file is not valid json.'''
    
    # Collect the history file names. The folder given in `path` is used both for listing and for 
    # opening the files, so the function works for folders other than the default.
    history_files = sorted(
        (name for name in listdir(path)
         if name.startswith('StreamingHistory') and name.lower().endswith('.json')),
        key=_history_file_order,
    )
    # Parse each file with the json module (the files are json, so they may contain true/false/null, 
    # which a Python literal parser would reject) and append its records to the final list.
    all_streamings = []
    for file_name in history_files:
        with open(os.path.join(path, file_name), 'r', encoding = 'UTF-8') as f:
            all_streamings.extend(json.load(f))
    return all_streamings

#Run the function with its default folder ('MyData'), which is where the listening history files are located
streaming_list = convert_json()
#Display the first 15 items of the list.
streaming_list[:15]

[{'endTime': '2020-05-01 02:01',
  'artistName': 'SAINt JHN',
  'trackName': 'Roses - Imanbek Remix',
  'msPlayed': 176218},
 {'endTime': '2020-05-01 02:04',
  'artistName': 'Ehrling',
  'trackName': 'In My Soul',
  'msPlayed': 1909},
 {'endTime': '2020-05-01 02:04',
  'artistName': 'The Middle Coast',
  'trackName': 'Me Tonight',
  'msPlayed': 177426},
 {'endTime': '2020-05-01 02:07',
  'artistName': 'Kygo',
  'trackName': 'Like It Is',
  'msPlayed': 183067},
 {'endTime': '2020-05-01 02:10',
  'artistName': 'Tep No',
  'trackName': 'Breathe, Be Happy - French Braids Remix',
  'msPlayed': 171250},
 {'endTime': '2020-05-01 02:14',
  'artistName': 'Alesso',
  'trackName': 'One Last Time',
  'msPlayed': 240383},
 {'endTime': '2020-05-01 02:16',
  'artistName': 'MEDUZA',
  'trackName': 'Born To Love (feat. SHELLS)',
  'msPlayed': 162295},
 {'endTime': '2020-05-01 02:20',
  'artistName': 'Field Report',
  'trackName': 'Home (Leave the Lights On)',
  'msPlayed': 221146},
 {'endTime': '2020-0

In [12]:
# Tabulate the listening events (one row per event, one column per key) and write the 
# full history out as a csv for later analysis.
df_history = pd.DataFrame(data=streaming_list)
df_history.to_csv(path_or_buf='full_streaming_history.csv')

## Create Dictionary of TrackName: ArtistName Pairs for Unique Tracks
We will be querying the Spotify API for additional track information using the track name and artist name, so we will create a dictionary of all the unique tracks in our listening history to search over. We then check the length of this new dictionary and see that there are 3432 unique tracks in my listening history over the last year. 

In [5]:
# Walk the full listening history and remember the artist of the first play of every track name.
# Track names that were already recorded are skipped, so each name appears exactly once.
track_artist_dict = {} #unique track artist pairs
for item in streaming_list:
    if item['trackName'] in track_artist_dict:
        continue
    track_artist_dict[item['trackName']] = item['artistName']

print(len(track_artist_dict))
track_artist_dict

3432


{'Roses - Imanbek Remix': 'SAINt JHN',
 'In My Soul': 'Ehrling',
 'Me Tonight': 'The Middle Coast',
 'Like It Is': 'Kygo',
 'Breathe, Be Happy - French Braids Remix': 'Tep No',
 'One Last Time': 'Alesso',
 'Born To Love (feat. SHELLS)': 'MEDUZA',
 'Home (Leave the Lights On)': 'Field Report',
 'Little Less Polite': 'Ripe',
 'Hoops - Acoustic': 'JONES',
 'Hoops': 'The Rubens',
 'Everytime': 'Medasin',
 'Love Is Alive (feat. Elohim) - Chet Porter Remix': 'Louis The Child',
 "Sure Don't Miss You (feat. The Dip)": 'BUNT.',
 'Backup': 'Ripe',
 'Heartless (with Julia Michaels & Morgan Wallen)': 'Diplo',
 'Freaks': 'FISHER',
 'Line Of Sight - Chet Porter Remix': 'ODESZA',
 'Modern Loneliness': 'Lauv',
 'The Difference': 'Flume',
 'HEART ATTACK (feat. lau.ra)': 'BRONSON',
 'Joe': 'Lemaitre',
 'Paradise': 'Golden Features',
 'Memories That You Call (feat. Monsoonsiren) - ODESZA & Golden Features VIP Remix': 'ODESZA',
 'VAULTS': 'BRONSON',
 "Don't Let Me Down": 'Milky Chance',
 'Coffee': 'Quinn 

## Counting the Number of Plays per Track
We'll now create a dictionary of the number of times each track in the total listening history was played. We'll merge this to our full data frame of tracks on the track name prior to our analysis. 

In [6]:
# Tally the plays per track name: a name that has not been seen yet starts from 0, 
# and every occurrence in the listening history adds 1 to its count.
track_count_dict = {}
for item in streaming_list:
    track_count_dict[item['trackName']] = track_count_dict.get(item['trackName'], 0) + 1

## Write Functions Using spotipy to Retrieve Track Information
[spotipy](https://spotipy.readthedocs.io/en/2.18.0/) is a handy python package that makes it straightforward to access the [Spotify API](https://developer.spotify.com/documentation/web-api/), so we'll be using it to gather additional information on the tracks in our listening history.  Spotify only gives us the Track Name and Artist Name in the listening history, and not their actual internal unique iD for the track that we can use to search for all of its attributes. Therefore, through spotipy and the Spotify API, the first function we'll write retrieves the Spotify iD of each track by searching for the trackName and artistName. We'll also get the track type (song vs. podcast) while we're there, so that we can analyze our song vs. podcast listening habits. 

In [7]:
def search_track_name(track_name: str, artist_name: str) -> tuple:
    
    '''Look up the Spotify id and item type for a track name / artist name pair.

       The two names are joined into one query string. Spotify is searched for tracks first; only 
       when that returns no results is it searched for podcast episodes with the same query.

       In the sp.search function type refers to the type of Spotify item to search (e.g. track, episode, 
       artist, album).

       Parameters:
           track_name: the name of the track (or podcast episode) from the listening history.
           artist_name: the name of the artist (or podcast show) from the listening history.

       Returns:
           A (track_id, track_type) tuple taken from the first search hit, where track_type is the 
           'type' field Spotify reports for that hit. (None, None) is returned when neither search 
           finds anything, or when the episode search response does not have the expected shape.'''
    
    query = track_name + ' ' + artist_name

    track_hits = sp.search(q=query, offset=0, type='track', market='US')['tracks']['items']
    if len(track_hits) > 0:
        best_track = track_hits[0]
        return best_track['id'], best_track['type']

    search_episode = sp.search(q=query, offset=0, type='episode', market = 'US')
    try:
        # The episode result list can contain None placeholders (the original code special-cased 
        # a list of exactly [None]); skip them all so that an empty list, [None], or a list with a 
        # leading None are handled the same way instead of relying on an exception.
        episode_hits = [hit for hit in search_episode['episodes']['items'] if hit is not None]
        if not episode_hits:
            return None, None
        return episode_hits[0]['id'], episode_hits[0]['type']
    except (KeyError, TypeError):
        # Only malformed responses end up here; interrupts and API errors are no longer swallowed.
        print('search failed', track_name)
        return None, None

## Function to Return Audio Features from Track iD
Audio features are attributes assigned to every song by Spotify. There are 18 items included in the Spotify AudioFeaturesObject for each song and they are described by Spotify as follows. These descriptions can be found from Spotify [here](https://developer.spotify.com/documentation/web-api/reference/#object-audiofeaturesobject): <br>

<b>acousticness</b> - "A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic."<br>
analysis_url - "An HTTP URL to access the full audio analysis of this track. An access token is required to access this data." <br>
<b>danceability</b> - "Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable." <br>
<b>duration_ms</b> - "The duration of the track in milliseconds." <br>
<b>energy</b> - "Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy." <br>
<b>id</b> - "The Spotify ID for the track." <br>
<b>instrumentalness</b> - "Predicts whether a track contains no vocals. “Ooh” and “aah” sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly “vocal”. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0." <br>
<b>key</b> - "The key the track is in. Integers map to pitches using standard Pitch Class notation . E.g. 0 = C, 1 = C♯/D♭, 2 = D, and so on." <br>
<b>liveness</b> - "Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live." <br>
<b>loudness</b> - "The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude). Values typical range between -60 and 0 db." <br>
<b>mode</b> - "Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor is 0." <br>
<b>speechiness</b> - "Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. Values below 0.33 most likely represent music and other non-speech-like tracks." <br>
<b>tempo</b> - "The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration." <br>
<b>time_signature</b> - "An estimated overall time signature of a track. The time signature (meter) is a notational convention to specify how many beats are in each bar (or measure)." <br>
<b>track_href</b> - "A link to the Web API endpoint providing full details of the track." <br>
<b>type</b> - "The object type: “audio_features”"<br>
<b>uri</b> - "The Spotify URI for the track."<br>
<b>valence</b> - "A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry)."

In [32]:
   '''Note that only songs have audio features, podcasts do not. If we search an episode iD in this 
      function it will return none'''
    
def get_features(track_id: str) -> dict:
    '''Return the Spotify audio features of one track.

    Note that only songs have audio features, podcasts do not.

    Parameters:
        track_id: the Spotify id of the track, or None when the search found nothing.

    Returns:
        The first element of the list returned by sp.audio_features for this id, or None when 
        track_id is None or the lookup fails.'''
    if track_id is None:
        return None
    try:
        features = sp.audio_features([track_id])
        return features[0]
    except Exception:
        # Best effort: a failed lookup yields None. Catching Exception (rather than everything) 
        # lets KeyboardInterrupt through, so a long-running loop can still be stopped by hand.
        return None

## Function to Return Track Popularity from Track iD
Note that podcast episodes do not have popularity. 

In [33]:
def track_popularity(track_id: str):
    '''Return the popularity of one track.

    Parameters:
        track_id: the Spotify id of the track, or None when the search found nothing.

    Returns:
        The 'popularity' field of the object returned by sp.track, or None when track_id is None 
        or the lookup fails (for example for a podcast episode id).'''
    if track_id is None:
        return None
    try:
        track_info = sp.track(track_id)
        return track_info['popularity']
    except Exception:
        # Best effort: a failed lookup yields None. Catching Exception (rather than everything) 
        # lets KeyboardInterrupt through, so a long-running loop can still be stopped by hand.
        return None

## Function to Return Artist Genres, id, Popularity, and Name from Track iD
We'll now retrieve the artist info for the primary artist on the track. It is worth noting that the genres returned here are all of the genres associated with the artist in their Spotify profile, not specifically associated with the track we are searching for. Also note that podcast artists will not have this information. 

In [34]:
def get_artist_info(track_id: str):
    '''Return information about the first artist listed on a track.

    Parameters:
        track_id: the Spotify id of the track, or None when the search found nothing.

    Returns:
        A tuple (artist_id, artist_genres, artist_popularity, artist_name). The id comes from the 
        first entry of the track's 'artists' list; the other three come from the sp.artist lookup 
        for that id. (None, None, None, None) is returned when track_id is None or a lookup fails.'''
    if track_id is None:
        return None, None, None, None
    try:
        track_info = sp.track(track_id)
        artist_id = track_info['artists'][0]['id']
        artist_info = sp.artist(artist_id)
        return artist_id, artist_info['genres'], artist_info['popularity'], artist_info['name']
    except Exception:
        # Best effort: a failed lookup yields Nones. Catching Exception (rather than everything) 
        # lets KeyboardInterrupt through, so a long-running loop can still be stopped by hand.
        return None, None, None, None

## Loop Over Dictionary of Unique trackName:artistName Pairs to retrieve all Track and Artist Info
Now it's time to put it all together and retrieve all this information for each track. We'll be searching for each track, storing all found information in a dictionary of dictionaries, and dealing with podcast episodes and tracks that search could not find at all. Step by step our process will be: <br><br>
    1. Set up an optional counter to be able to check in on the loop's progress.<br>
    2. Loop through the trackName:artistName dictionary of unique songs we created above. For each pair we will perform each step listed below. <br>
    3. Connect to the API.<br>
    4. Retrieve the track ID and type and store them as variables with the first function we wrote, `search_track_name()`. <br>
    5. Retrieve and store the track popularity using the `track_popularity()` function and the track ID. <br>
    6. Retrieve and store the artist info with the `get_artist_info()` function. <br>
    7. If the track type is episode or None we have enough information to know that we shouldn't waste time running the rest of the functions, so we will fill out the fields in the final dictionary with the information we know. <br>
    8. If the track type is track we need to run the popularity, artist info, and features functions. We put those in an if statement with the type, run the functions and fill the information into the dictionary for the track. <br>
 

In [None]:
#Create dictionary that will house trackName:{trackFeature} key value pairs.
all_features = {}

# Placeholder row used whenever there are no audio features for an item. The keys are in the same 
# order as the rows built from real audio features (the audio feature fields first, then the four 
# artist / popularity fields), so every entry of `all_features` ends up with the same columns.
blank_features = {'danceability': None, 'energy': None, 'key': None, 'loudness': None, 'mode': None, 
                  'speechiness': None, 'acousticness': None, 'instrumentalness': None, 'liveness': None, 
                  'valence': None, 'tempo': None, 'type': None, 'id': None, 'uri': None, 
                  'track_href': None, 'analysis_url': None, 'duration_ms': 0, 'time_signature': None, 
                  'artist_name': None, 'artist_genres': None, 'artist_popularity': None, 
                  'track_popularity': None}

# Connect to the API once, before the loop. A fresh client per track is not needed: the credentials 
# manager is asked for a token on each request (NOTE(review): spotipy is expected to renew an 
# expired token there - confirm against the installed spotipy version).
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

#Counter to keep track of the loop's progress as it takes a fair bit of time. 
counter = 0
for track, artist in track_artist_dict.items():
    #Search for track and retrieve track id and track type (if search succeeds)
    track_id, track_type = search_track_name(track, artist)

    #If the track_type is episode, fill in what we know, as no other functions work for podcasts
    if track_type == 'episode':
        all_features[track] = {**blank_features, 'type': 'episode', 'id': track_id, 
                               'artist_name': artist, 'artist_genres': 'podcast'}

    #If the search did not find the track add a row that only holds the artist name
    elif track_type is None:
        all_features[track] = {**blank_features, 'artist_name': artist}

    # If the track_type is track retrieve the track popularity, audio features, and artist info. Then fill out the
    # dictionary entry for the track with this information
    elif track_type == 'track':
        track_pop = track_popularity(track_id)
        artist_id, artist_genres, artist_popularity, artist_name = get_artist_info(track_id)
        # The artist name from the listening history is stored (not the one returned by the API).
        additional_features = {'artist_name': artist, 'artist_genres': artist_genres, 
                               'artist_popularity': artist_popularity, 'track_popularity': track_pop, 
                               'type': 'track'}
        features = get_features(track_id)
        if features:
            all_features[track] = {**features, **additional_features}
        else:  #In case there is a song with an id and without audio features. 
            # Keep the id we already found, so the song can still be looked up later.
            all_features[track] = {**blank_features, 'id': track_id, **additional_features}

    #Optional progress tracking code: report after 50 tracks and then after every 100 tracks,
    #however many unique tracks the listening history contains.
    counter += 1
    if counter == 50 or counter % 100 == 0:
        print(counter,' done')
        print('length all features ',len(all_features))


print('IDs and Features retrieved')
    

## Finalize Formatting and Save Necessary .csv files
Our previous loop returned a dictionary with trackNames as keys and dictionaries with all the track information as values. To convert to a dataframe and save as a csv we'll make this into a list of dictionaries. Each dictionary will have the name of the track and then all of the track information we just retrieved. 

In [None]:
# Flatten `all_features` into one record per track: the track name goes under 'name' and the 
# track's feature dictionary is unpacked next to it.
with_features = [
    {'name': track_name, **features}
    for track_name, features in all_features.items()
]

In [9]:
# Split the play count dictionary into two parallel lists (names and counts, in the same order) 
# and store them as the two columns of the play count table.
track_names = list(track_count_dict)
track_plays = list(track_count_dict.values())
play_count_df = {'trackName': track_names, 'trackPlays': track_plays}

In [None]:
# One row per unique track with all the retrieved track and artist information, saved as a csv.
df = pd.DataFrame(data=with_features)
df.to_csv(path_or_buf='listening_history_unique_songs.csv')

In [10]:
# One row per track name with the number of times it was played, saved as a csv.
df = pd.DataFrame(data=play_count_df)
df.to_csv(path_or_buf='track_play_counts.csv')