In [1]:
import numpy as np
import pandas as pd
import random
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm
SPOTIPY_CLIENT_ID='4f89cd57785747e1b1d7ee0b95e61985'
secret = '61708a8ace7647ba888be760fbc822d1'
genres_to_collect = ('alt-rock','classical', 'country',
                       'edm', 'heavy-metal',  'hip-hop',
                       'latin')

In [3]:
# newest approach is to only search for genre and not let the spotify rec system provide tracks.
def create_credentials_obj():
    """
    return a spotipy object
    """
    client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID, client_secret=secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    return sp

In [25]:
def collect_artist_track_using_search(sp, genres_to_collect):
    """
    provide spotipy creds and list of genres

    return a dataframe of track artist name and id
    return 1k examples of each genre
    """
    # define some search criteria. This could be expanded in a better version of the function
    # for example iterate through each decade for each genre.
    year = '2000-2022'
    type = 'track'
    market = 'US'
    limit = 50
    num_batches = 20

    # initialize a df to store everything
    artist_genre_id_df = pd.DataFrame(columns=['artist_name', 'artist_id', 'track_name', 'track_id', 'genre'])

    for genre in tqdm(genres_to_collect):
        offset = 0
        for i in range(num_batches):
            temp_search = sp.search(q=f'genre:{genre} AND year:{year}', type=type, market=market, limit=limit, offset=offset)
            for result in temp_search['tracks']['items']:
                temp_artist = result['artists'][0]['name']
                temp_a_id = result['artists'][0]['id']
                temp_track = result['name']
                temp_t_id = result['id']
                artist_genre_id_df.loc[len(artist_genre_id_df.index)] = [temp_artist, temp_a_id, temp_track, temp_t_id, genre]
            offset = offset + 50
    return artist_genre_id_df


In [36]:
def collect_track_attributes(sp, artist_track_df, logging=False, write_results=False):
    """
    provide a dataframe containing track_id
    return a merged df containing both features and artist info
    """
    num_tracks = len(artist_track_df)
    num_groups = int(num_tracks/95)
    track_features_df = pd.DataFrame()
    subDFs = np.array_split(artist_track_df, num_groups)
    for df in tqdm(subDFs):
        try:
            temp_features_list = sp.audio_features(df['track_id'].values)
            temp_features_list = [track for track in temp_features_list if track != None]
            temp_features_df = pd.DataFrame(temp_features_list)
            temp_features_df.rename(columns={'id': 'track_id'}, inplace=True)
            track_features_df = pd.concat([track_features_df, temp_features_df])
            # save the csv incase it gets stuck along the way.
            if logging:
                track_features_df.to_csv('iterative_features.csv')
        except:
            print('failure to load track values')
    result_df = pd.merge(track_features_df, artist_track_df, how='inner', on='track_id')
    result_df = result_df.drop_duplicates()
    if write_results:
        result_df.to_csv('pure_genre_data.csv', index=False)
    return result_df

In [29]:
sp = create_credentials_obj()
artist_track_df = collect_artist_track_using_search(sp, genres_to_collect)

100%|██████████| 7/7 [00:42<00:00,  6.14s/it]
100%|██████████| 73/73 [00:12<00:00,  5.62it/s]


In [37]:
track_features_df = collect_track_attributes(sp, artist_track_df, write_results=True)

100%|██████████| 73/73 [00:13<00:00,  5.55it/s]


7000