## Collect audio features per track per genre per year from Spotify API
We use the spotipy library to connect to Spotify's Web API, retrieve up to 200 tracks per genre per year (2002–2020), and then fetch the audio features for each of those tracks.

In [1]:
import time

import numpy as np
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

The class below takes a genre, a range of years, and the number of tracks to retrieve per year for that genre.

In [2]:
class SpotifyAudioFeatures:
    """Collect Spotify audio features for one genre across a range of years.

    The pipeline (see ``main``) searches for tracks per year, de-duplicates
    them, fetches the audio features of every remaining track, tags the result
    with the genre and writes it to ``./<genre>_audio_features.csv``.

    Attributes set by ``main`` / ``_get_audio_features_dfs``:
        search_table_by_ids: search results indexed by track URI.
        afs: flat list of one-row audio-feature DataFrames (one per track).
        audio_features_dfs: the single concatenated audio-feature DataFrame.
    """

    # Page size used for every search request (the Web API's documented maximum).
    SEARCH_PAGE_SIZE = 50
    # Number of track URIs sent in one audio-features request
    # (the Web API's documented maximum for that endpoint).
    AUDIO_FEATURES_BATCH_SIZE = 100
    # Number of track ids processed per outer chunk; a failure only loses one chunk.
    CHUNK_SIZE = 500

    def __init__(self, genre, years, n_tracks, client_id=None, client_secret=None):
        """
            genre: genre name used in the search query and in the CSV filename.
            years: iterable of years to search.
            n_tracks: number of tracks requested per year.
            client_id / client_secret: optional Spotify API credentials. When
                omitted, spotipy reads them from the SPOTIPY_CLIENT_ID and
                SPOTIPY_CLIENT_SECRET environment variables, so no secret has
                to be stored in the notebook.
        """
        self.genre = genre
        self.years = years
        self.sp = spotipy.Spotify(
            auth_manager=SpotifyClientCredentials(client_id=client_id,
                                                  client_secret=client_secret))
        self.n_tracks = n_tracks  # number of tracks we want to get per year
        self.search_table_by_ids = None
        self.afs = []
        self.audio_features_dfs = None

    def _get_tracks(self, genre, year):
        """
            Get up to n_tracks search results for a single genre in a single year.
            n_tracks does not have to be a multiple of the page size: the last
            page is shortened so no more than n_tracks items are requested.
            Returns a list of track dicts as returned by the search endpoint.
        """
        tracks = []
        query = f'year:{year} genre:{genre}'
        for offset in range(0, self.n_tracks, self.SEARCH_PAGE_SIZE):
            limit = min(self.SEARCH_PAGE_SIZE, self.n_tracks - offset)
            # results for tracks in the genre in a year
            results = self.sp.search(q=query, type='track', market='US',
                                     offset=offset, limit=limit)
            items = results['tracks']['items']
            if not items:
                # An empty page is taken to mean the results are exhausted,
                # so the remaining requests are skipped.
                break
            # Defensive: skip null entries so the DataFrame built from this list stays clean.
            tracks.extend(item for item in items if item is not None)
        return tracks

    def _get_tracks_for_x_years(self, year_range, genre):
        """
            Get search results for every year in year_range for one genre.
            Returns a list with one DataFrame per year, each carrying a 'year' column.
        """
        all_tracks = []
        for year in year_range:
            df = pd.DataFrame(self._get_tracks(genre, year))
            df["year"] = year
            all_tracks.append(df)
        return all_tracks

    def _get_search_table_indexed_by_uri(self, table):
        """
            Return a copy of table indexed by its 'uri' column, keeping only
            the first row for every distinct track 'id'.
        """
        return (
            pd.DataFrame(table, copy=True)
            .set_index('uri')
            .drop_duplicates(subset=['id'])
        )

    def _get_track_ids(self, table):
        """Return the index of table (the track URIs once indexed by uri)."""
        return table.index

    def _get_audio_features(self, ids, table):
        """
            Fetch audio features for ids and enrich them with search metadata.

            ids: iterable of index labels of table (track URIs).
            table: search table indexed by URI; 'name', 'explicit',
                'popularity', 'year' and album 'release_date' are copied from it.
            Ids missing from table, and tracks for which the API returns no
            features, are reported and skipped.
            Returns a list of one-row DataFrames (one per track).
        """
        # Resolve metadata first so unknown ids never cost an API request.
        known = []
        for track_id in ids:
            try:
                row = table.loc[track_id]
            except KeyError as exc:
                print("ID: ", track_id)
                print(repr(exc))
                continue
            known.append((track_id, row))

        track_afs = []
        counter = 0
        for start in range(0, len(known), self.AUDIO_FEATURES_BATCH_SIZE):
            batch = known[start:start + self.AUDIO_FEATURES_BATCH_SIZE]
            # One request per batch instead of one request per track.
            results = self.sp.audio_features(tracks=[track_id for track_id, _ in batch]) or []
            for (track_id, row), features in zip(batch, results):
                counter += 1
                if counter % 100 == 0:
                    print("Track ", counter)
                if features is None:
                    # No features available for this track: skip it rather
                    # than emitting a row with no feature columns.
                    print("No audio features for ID: ", track_id)
                    continue
                r = pd.DataFrame([features])
                # Add explicit, popularity, & name from the search table to the audio_features row
                r['name'] = row['name']
                r['explicit'] = row['explicit']
                r['popularity'] = row['popularity']
                r['year'] = row['year']
                r['release_date'] = row['album']['release_date']
                track_afs.append(r)
        # Returns a list of DataFrames
        return track_afs

    def _get_audio_features_dfs(self, track_ids):
        """
            Fetch audio features for all track_ids in chunks and combine them.

            Reads self.search_table_by_ids for the per-track metadata and stores
            the flat list of one-row DataFrames in self.afs. A chunk that fails
            is reported and skipped so the remaining chunks are still collected.
            Returns one DataFrame with a 'genre' column; it is empty when no
            audio features could be collected.
        """
        track_ids_split = [track_ids[i:i + self.CHUNK_SIZE]
                           for i in range(0, len(track_ids), self.CHUNK_SIZE)]
        print("Track IDs split length (500 ids per split): ", len(track_ids_split))

        audio_features_dfs = []
        for counter, tid in enumerate(track_ids_split):
            try:
                af_dfs = self._get_audio_features(tid, self.search_table_by_ids)
            except Exception as exc:
                # Best effort: keep going, but say which chunk was lost and why.
                print("Skipping loop ", counter, ": ", repr(exc))
                continue
            audio_features_dfs.append(af_dfs)
            print("Done with loop ", counter)

        # flatten audio_features_dfs (list of lists of DataFrames) to one list of DataFrames
        self.afs = [item for sublist in audio_features_dfs for item in sublist]
        if not self.afs:
            # pd.concat raises on an empty list; return an empty table instead.
            return pd.DataFrame(columns=['genre'])

        # ignore_index gives every track its own row label instead of repeating 0.
        combined = pd.concat(self.afs, ignore_index=True)
        combined['genre'] = self.genre

        # Returns 1 DataFrame
        return combined

    def _export_to_csv(self, table):
        """Write table to ./<genre>_audio_features.csv and report the path."""
        filename = f'./{self.genre}_audio_features.csv'
        table.to_csv(filename)
        print(f'\nEXPORTED {filename}')

    def main(self):
        """
            Run the whole pipeline: search tracks for every year, de-duplicate,
            fetch audio features, and export the result to CSV.
        """
        tracks_df = self._get_tracks_for_x_years(self.years, self.genre)
        d3 = pd.concat(tracks_df)
        print("Tracks returned shape: ", d3.shape)

        self.search_table_by_ids = self._get_search_table_indexed_by_uri(d3)
        track_ids = self._get_track_ids(self.search_table_by_ids)
        print("Length of track ids: ", len(track_ids))

        # save Audio Feature DataFrames for later observation (so we don't have to run this each time)
        self.audio_features_dfs = self._get_audio_features_dfs(track_ids)
        print("Audio Features DataFrame shape: ", self.audio_features_dfs.shape)

        # export resulting table to csv
        self._export_to_csv(self.audio_features_dfs)
        
        

In [3]:
# Genres to collect, one CSV per genre.
genres = [
    'country',
    'jazz',
    'latin',
    'pop',
    'r&b',
]
# Years 2002 through 2020 inclusive (the stop value is exclusive).
timeframe = np.arange(2002, 2020 + 1)
# Tracks requested per genre per year.
n_tracks = 200

Example of a small sample size:

In [None]:
# Small demo run: first genre only, years 2018-2020, 100 tracks per year.
test_years = np.arange(2018, 2020 + 1)
SAF = SpotifyAudioFeatures(
    genre=genres[0],
    years=test_years,
    n_tracks=100,
)

In [None]:
# Run the pipeline for the small sample: search, fetch audio features, export to CSV.
SAF.main()

In [None]:
# First entry of the flattened list of per-track audio-feature DataFrames.
SAF.afs[0]

In [None]:
# Preview the combined audio-features table produced by main().
SAF.audio_features_dfs.head()

In [None]:
# Column names of the combined audio-features table.
SAF.audio_features_dfs.columns

In [None]:
# Pause between genres, in seconds.
seconds = 1

# Run the full pipeline once per genre; each run exports its own CSV.
for genre in genres:
    SAF = SpotifyAudioFeatures(genre=genre, years=timeframe, n_tracks=n_tracks)
    SAF.main()
    # Reached MAX tries halfway through genres.
    # Need a break before trying again
    time.sleep(seconds)

Country genre

In [4]:
# Collect and export audio features for genres[0] ('country') over the full timeframe.
SAF = SpotifyAudioFeatures(genre=genres[0], years=timeframe, n_tracks=n_tracks)
SAF.main()

Tracks returned shape:  (3800, 18)
Length of track ids:  3800
Track IDs split length (500 ids per split):  8
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  0
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  1
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  2
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  3
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  4
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  5
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  6
Track  100
Track  200
Track  300
Done with loop  7
Audio Features DataFrame shape:  (3800, 24)

EXPORTED ./country_audio_features.csv


Jazz

In [5]:
# Collect and export audio features for genres[1] ('jazz') over the full timeframe.
SAF = SpotifyAudioFeatures(genre=genres[1], years=timeframe, n_tracks=n_tracks)
SAF.main()

Tracks returned shape:  (3762, 18)
Length of track ids:  3694
Track IDs split length (500 ids per split):  8
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  0
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  1
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  2
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  3
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  4
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  5
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  6
Track  100
Done with loop  7
Audio Features DataFrame shape:  (3694, 24)

EXPORTED ./jazz_audio_features.csv


Latin

In [6]:
# Collect and export audio features for genres[2] ('latin') over the full timeframe.
SAF = SpotifyAudioFeatures(genre=genres[2], years=timeframe, n_tracks=n_tracks)
SAF.main()

Tracks returned shape:  (3800, 18)
Length of track ids:  3800
Track IDs split length (500 ids per split):  8
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  0
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  1
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  2
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  3
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  4
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  5
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  6
Track  100
Track  200
Track  300
Done with loop  7
Audio Features DataFrame shape:  (3800, 24)

EXPORTED ./latin_audio_features.csv


Pop

In [7]:
# Collect and export audio features for genres[3] ('pop') over the full timeframe.
SAF = SpotifyAudioFeatures(genre=genres[3], years=timeframe, n_tracks=n_tracks)
SAF.main()

Tracks returned shape:  (3800, 18)
Length of track ids:  3749
Track IDs split length (500 ids per split):  8
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  0
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  1
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  2
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  3
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  4
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  5
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  6
Track  100
Track  200
Done with loop  7
Audio Features DataFrame shape:  (3749, 24)

EXPORTED ./pop_audio_features.csv


R&B

In [8]:
# Collect and export audio features for genres[4] ('r&b') over the full timeframe.
SAF = SpotifyAudioFeatures(genre=genres[4], years=timeframe, n_tracks=n_tracks)
SAF.main()

Tracks returned shape:  (3800, 18)
Length of track ids:  3796
Track IDs split length (500 ids per split):  8
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  0
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  1
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  2
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  3
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  4
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  5
Track  100
Track  200
Track  300
Track  400
Track  500
Done with loop  6
Track  100
Track  200
Done with loop  7
Audio Features DataFrame shape:  (3796, 24)

EXPORTED ./r&b_audio_features.csv
