In [None]:
import os
import random

import numpy as np
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm
# Spotify API credentials.
# Values are read from the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment
# variables when they are set, so secrets do not have to live in the notebook.
# FIXME(security): the literal fallbacks below are credentials committed to
# source control. Rotate them and delete the fallbacks once the environment
# variables are configured everywhere this notebook runs.
SPOTIPY_CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID', '4f89cd57785747e1b1d7ee0b95e61985')
secret = os.environ.get('SPOTIPY_CLIENT_SECRET', '61708a8ace7647ba888be760fbc822d1')

# Genre seeds passed to the recommendation calls; every collected track is
# labelled with the genre seed that produced it.
genres_to_collect = (
    'alt-rock', 'classical', 'country',
    'edm', 'heavy-metal', 'hip-hop',
    'latin',
)

In [None]:
def create_credentials_obj():
    """
    build the spotipy client used by the rest of the notebook

    A SpotifyClientCredentials manager is created from the module-level
    SPOTIPY_CLIENT_ID and secret values and handed to spotipy.Spotify.

    return the resulting spotipy.Spotify object
    """
    return spotipy.Spotify(
        client_credentials_manager=SpotifyClientCredentials(
            client_id=SPOTIPY_CLIENT_ID,
            client_secret=secret,
        )
    )

In [None]:
def collect_seed_tracks(sp, genre_list, num_tracks_per_genre):
    """
    provide a genre list, number of tracks per genre
    return a dataframe of artist, track, genre

    sp: client object exposing ``recommendations(seed_genres=..., limit=..., market=...)``
    genre_list: iterable of genre seed strings
    num_tracks_per_genre: how many recommendations to ask for per genre.
        A single request asks for at most 100 tracks; larger values are
        rounded UP to whole batches of 100 (e.g. 120 -> 2 requests per genre),
        so the function never asks for fewer tracks than requested.

    The returned frame has the columns artist_name, artist_id, track_name,
    track_id and genre, a 0..n-1 index, and one row per unique track id.
    A track returned for several genres keeps the first genre that produced it.
    Only the first listed artist of each track is recorded.
    """
    columns = ['artist_name', 'artist_id', 'track_name', 'track_id', 'genre']
    max_per_request = 100

    if num_tracks_per_genre > max_per_request:
        # Ceiling division: round() would drop the partial batch for 101-149
        # and rounds exact halves inconsistently (banker's rounding).
        num_iterations = int(-(-num_tracks_per_genre // max_per_request))
        num_tracks_per_genre = max_per_request
    else:
        num_iterations = 1

    genres = list(genre_list)
    seen_track_ids = set()  # O(1) duplicate check instead of rebuilding a set per track
    rows = []               # collected rows; the frame is built once at the end
    for _ in tqdm(range(num_iterations)):
        for genre in genres:
            recs = sp.recommendations(seed_genres=[genre], limit=num_tracks_per_genre, market='US')
            for track in recs['tracks']:
                track_id = track['id']
                # only keep a track the first time it is seen;
                # this avoids having tracks assigned to multiple genres
                if track_id in seen_track_ids:
                    continue
                seen_track_ids.add(track_id)
                lead_artist = track['artists'][0]
                rows.append([lead_artist['name'], lead_artist['id'], track['name'], track_id, genre])
    return pd.DataFrame(rows, columns=columns)


In [None]:
def collect_track_attributes(sp, artist_track_df, batch_size=95):
    """
    provide a dataframe containing track_id
    return a merged df containing both features and artist info

    sp: client object exposing ``audio_features(list_of_track_ids)``
    artist_track_df: dataframe with (at least) a ``track_id`` column
    batch_size: number of track ids sent per ``audio_features`` call.
        The default of 95 is carried over from the original code
        (presumably chosen to stay below the endpoint's per-request id
        limit - confirm against the Spotify Web API docs).

    Side effects: after every successful batch the features collected so far
    are written to 'iterative_features.csv' (checkpoint), and the final merged
    frame is written to 'moderate_song_data.csv'.

    A batch that fails is reported on stdout and skipped (best effort); its
    tracks are simply absent from the result because the merge is an inner join.
    Raises ValueError if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    num_tracks = len(artist_track_df)
    feature_frames = []
    track_features_df = pd.DataFrame()

    # Fixed-size slices: works for fewer than batch_size tracks (the previous
    # int(num_tracks/95) produced 0 groups and crashed) and never yields a
    # batch larger than batch_size.
    for start in tqdm(range(0, num_tracks, batch_size)):
        batch_ids = artist_track_df['track_id'].iloc[start:start + batch_size].tolist()
        try:
            batch_features = sp.audio_features(batch_ids)
            # entries are None for tracks the API has no features for
            batch_features = [track for track in (batch_features or []) if track is not None]
            if not batch_features:
                continue
            batch_df = pd.DataFrame(batch_features).rename(columns={'id': 'track_id'})
            feature_frames.append(batch_df)
            track_features_df = pd.concat(feature_frames)
            # save the csv in case it gets stuck along the way.
            track_features_df.to_csv('iterative_features.csv')
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C still interrupts the run.
            print(f'failure to load track values: {exc}')

    if 'track_id' not in track_features_df.columns:
        # Nothing was collected: keep the merge key so the join returns an
        # empty frame instead of raising KeyError.
        track_features_df = pd.DataFrame(
            {'track_id': pd.Series(dtype=artist_track_df['track_id'].dtype)}
        )

    result_df = pd.merge(track_features_df, artist_track_df, how='inner', on='track_id')
    result_df.to_csv('moderate_song_data.csv')
    return result_df

In [None]:
# Quick sanity check of the collected feature table.
# NOTE: feature_artist_track_df is created by the pipeline cell further down
# (collect_track_attributes); that cell has to be run before this one.
print(len(feature_artist_track_df))
# Kept as the LAST expression so the notebook renders the per-genre counts;
# as a non-final bare expression the result was computed and discarded.
feature_artist_track_df.groupby('genre').count()

165116


In [None]:


def spotify_data_booster(sp, tracks_df, num_iterations, genre_list):
    """
    read in a tracks dataframe

    use the tracks to seed more tracks

    return an updated version of the original dataframe.

    sp: client object exposing
        ``recommendations(seed_genres=..., seed_tracks=..., limit=..., market=...)``
    tracks_df: dataframe with the columns artist_name, artist_id, track_name,
        track_id and genre
    num_iterations: number of passes over genre_list; every pass issues one
        recommendation request per genre
    genre_list: iterable of genre seed strings

    For each genre up to 3 already-known tracks of that genre are drawn at
    random as seeds (fewer when the genre has fewer than 3 tracks, none when
    it has no tracks yet). Tracks found in earlier passes can be drawn as
    seeds in later passes. A track id is only ever added once, labelled with
    the genre whose request returned it first.

    The result is a NEW dataframe (existing rows followed by the newly found
    ones, index reset to 0..n-1); tracks_df itself is not modified.
    """
    columns = ['artist_name', 'artist_id', 'track_name', 'track_id', 'genre']
    genres = list(genre_list)

    # O(1) duplicate check instead of rebuilding a set for every track.
    seen_track_ids = set(tracks_df['track_id'].tolist())
    # Candidate seed ids per genre, extended as new tracks are discovered.
    seed_pools = {
        genre: tracks_df.loc[tracks_df['genre'] == genre, 'track_id'].tolist()
        for genre in genres
    }

    new_rows = []
    for _ in tqdm(range(num_iterations)):
        for genre in genres:
            pool = seed_pools[genre]
            # never ask for more seeds than the genre currently has
            seed_tracks = random.sample(pool, min(3, len(pool)))
            recs = sp.recommendations(
                seed_genres=[genre],
                seed_tracks=seed_tracks if seed_tracks else None,
                limit=95,
                market='US',
            )
            for track in recs['tracks']:
                track_id = track['id']
                # only keep a track the first time it is seen;
                # this avoids having tracks assigned to multiple genres
                if track_id in seen_track_ids:
                    continue
                seen_track_ids.add(track_id)
                pool.append(track_id)
                lead_artist = track['artists'][0]
                new_rows.append({
                    'artist_name': lead_artist['name'],
                    'artist_id': lead_artist['id'],
                    'track_name': track['name'],
                    'track_id': track_id,
                    'genre': genre,
                })

    if not new_rows:
        return tracks_df.copy()
    # One concat at the end instead of growing the frame row by row; rows are
    # aligned by column name rather than by position.
    return pd.concat([tracks_df, pd.DataFrame(new_rows, columns=columns)], ignore_index=True)

In [None]:
# Run the whole collection pipeline end to end.

# 1. Build the spotipy client.
sp_creds = create_credentials_obj()

# 2. Gather the initial seed tracks for every genre and report how many were found.
artist_track_df = collect_seed_tracks(
    sp=sp_creds,
    genre_list=genres_to_collect,
    num_tracks_per_genre=1000,
)
print(len(artist_track_df))

# 3. Use the known tracks as seeds to discover more, then report the new total.
artist_track_df = spotify_data_booster(
    sp=sp_creds,
    tracks_df=artist_track_df,
    num_iterations=100,
    genre_list=genres_to_collect,
)
print(len(artist_track_df))

# 4. Fetch the audio features for every collected track and merge them in.
feature_artist_track_df = collect_track_attributes(
    sp=sp_creds,
    artist_track_df=artist_track_df,
)




100%|██████████| 10/10 [00:38<00:00,  3.82s/it]


2715


100%|██████████| 100/100 [08:03<00:00,  4.84s/it]


30990


100%|██████████| 326/326 [02:12<00:00,  2.45it/s]


In [None]:
#artist_track_df.to_csv('track_artist_165k.csv')

In [None]:
# Per-genre row counts of the final feature table.
# (The closing quote after genre was missing, which made this cell a SyntaxError.)
feature_artist_track_df.groupby('genre').count()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=659c715d-e2b5-478e-9116-4d32a5174810' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>