In [None]:
import os
import sys
import time
from shutil import copy2
from urllib import request
from urllib.request import urlretrieve

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import skimage
import skimage.io
import spotipy
from librosa import display
from spotipy.oauth2 import SpotifyClientCredentials

In [None]:
#get authorization to access spotify

def spotify_login(cid, secret):
    """Return a spotipy client that authenticates with the given client id and secret.

    The id/secret pair is wrapped in a SpotifyClientCredentials manager, which is
    handed to the spotipy.Spotify constructor.
    """
    return spotipy.Spotify(
        client_credentials_manager=SpotifyClientCredentials(
            client_id=cid,
            client_secret=secret,
        )
    )

# Read the API credentials from the environment so they are never stored in the notebook.
# SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names used in spotipy's own docs.
# Any id/secret pair that was ever committed in plain text should be treated as leaked and rotated.
cid = os.environ.get("SPOTIPY_CLIENT_ID")
secret = os.environ.get("SPOTIPY_CLIENT_SECRET")
if not cid or not secret:
    # Fail early with an actionable message instead of an opaque auth error on the first request.
    raise RuntimeError(
        "Set the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment variables "
        "before running this notebook."
    )

# Shared client used by every Spotify helper defined in this notebook.
sp = spotify_login(cid, secret)

In [None]:
# Accumulator for scraped tracks: track id -> dict of metadata fields.
tracks = dict()

In [None]:
#load spotify data into a dictionary

def add_playlist_to_dict(dictionary, user_id, playlist_id, genre):
    """Store every identifiable track of one playlist in `dictionary`.

    For each playlist item that has a track with an id, a fresh entry keyed by the
    track id is created (replacing any existing entry for that id) and filled with
    the playlist name, the basic track fields, the playlist-level genre and the
    genres of the track's artists.
    """
    playlist_tracks = get_playlist_tracks(user_id, playlist_id)
    playlist_name = get_playlist_name(user_id, playlist_id)
    print("Adding " + playlist_name + ".")

    for item in playlist_tracks:
        current_track = item['track']
        if not current_track:
            continue  # item carries no track object

        track_id = current_track['id']
        if not track_id:
            continue  # nothing to key the entry on

        # Start from an empty record; the same dict is stored and filled in below.
        track_data = dictionary[track_id] = {}
        artists = current_track['artists']
        track_data['playlist'] = playlist_name
        add_track_data(track_data, current_track, artists)
        add_playlist_genre(track_data, genre)
        add_artist_genres(track_data, artists)
                
def get_playlist_name(user_id, playlist_id):
    """Fetch the playlist through the shared `sp` client and return its 'name' field."""
    return sp.user_playlist(user_id, playlist_id)['name']

def get_playlist_tracks(user_id, playlist_id):
    """Return all items of a playlist, following the paging links.

    The first page is requested with `sp.user_playlist_tracks`; while a page has a
    truthy 'next' entry, the following page is fetched with `sp.next` and its items
    are appended to the first page's item list, which is what gets returned.
    """
    page = sp.user_playlist_tracks(user_id, playlist_id)
    collected = page['items']
    while page['next']:
        page = sp.next(page)
        collected += page['items']  # in-place extend of the first page's list
    return collected
                
def add_track_data(track_data, current_track, artists):
    """Copy the track's name, its artists' names and its preview URL into `track_data`."""
    track_data['track_name'] = current_track['name']

    artist_names = []
    for artist in artists:
        artist_names.append(artist['name'])
    track_data['track_artists'] = artist_names

    track_data['preview_url'] = current_track['preview_url']
    
def add_playlist_genre(track_data, genre):
    """Record `genre` under the 'genre_short' key of `track_data`."""
    track_data.update({'genre_short': genre})
        
def add_artist_genres(track_data, artists):
    """Store the combined genres of all the track's artists under 'genre_long'.

    `artists` are the simplified artist objects of a track; their ids are sent to
    `sp.artists` to obtain the full artist objects, whose 'genres' lists are then
    concatenated in artist order (duplicates are kept).
    """
    artist_ids = []
    for artist in artists:
        artist_ids.append(artist['id'])

    full_artists = sp.artists(artist_ids)

    combined_genres = []
    for full_artist in full_artists['artists']:
        combined_genres.extend(full_artist['genres'])
    track_data['genre_long'] = combined_genres

In [None]:
#query spotify for track data

def get_playlists_by_genre(genre, offset):
    """Run a playlist search for `genre` and return the playlist items of that page.

    The page size comes from the module-level `results_per_page`; `offset` is passed
    through to the search call.
    """
    search_page = sp.search(
        genre,
        type='playlist',
        limit=results_per_page,
        offset=offset,
    )
    playlist_section = search_page['playlists']
    return playlist_section['items']

def add_playlists(playlists, playlist_genre=None):
    """Add the tracks of every usable playlist in `playlists` to the global `tracks` dict.

    Parameters
    ----------
    playlists : iterable
        Playlist objects as returned by a playlist search. Entries that are empty
        (e.g. None), or that lack a playlist id or an owner id, are skipped instead
        of aborting the whole scrape.
    playlist_genre : str, optional
        Genre label stored with every track. When omitted, the module-level `genre`
        variable (the loop variable of the scraping loop) is used, which keeps
        existing `add_playlists(playlists)` calls working unchanged.
    """
    if playlist_genre is None:
        # Backward-compatible fallback to the implicit global this function used to rely on.
        playlist_genre = genre

    for playlist in playlists:
        if not playlist:
            continue  # empty search entry: nothing to look up

        playlist_id = playlist.get('id')
        owner = playlist.get('owner') or {}  # tolerate a missing/None owner
        user_id = owner.get('id')
        if user_id and playlist_id:
            add_playlist_to_dict(tracks, user_id, playlist_id, playlist_genre)

def delay(count, interval):
    """Sleep for a random 2-5 seconds whenever `count` is a multiple of `interval`.

    For any other `count` the function returns immediately.
    """
    if count % interval != 0:
        return
    sleep_min, sleep_max = 2, 5
    pause_seconds = np.random.uniform(sleep_min, sleep_max)
    time.sleep(pause_seconds)


# Search configuration: which genres to query and how the result pages are walked.
genres = ["pop", "rock", "rap", "metal", "house", "r&b", "classical", "techno", "jazz", "folk"]
results_per_page = 50   # page size, also read by get_playlists_by_genre
num_pages = 2
start_from = 0          # offset of the first page requested
request_count = 0       # number of search requests issued so far

# `genre` must stay a module-level loop variable: add_playlists reads it as a global.
for genre in genres:
    print("=====Getting data for " + genre + ".=====")
    page_offsets = range(start_from, (results_per_page * num_pages), results_per_page)
    for offset in page_offsets:
        playlists = get_playlists_by_genre(genre, offset)
        add_playlists(playlists)
        request_count = request_count + 1
        # Pause for a few seconds after every 5th search request.
        delay(request_count, 5)


In [None]:
#convert the tracks dictionary to a dataframe with one row per track

# from_dict puts one track per column, so the frame is transposed first.
# Adding an 'index' column before reset_index makes pandas name the old index
# (the track ids) 'level_0', which is then renamed to 'track_id'.
df = (
    pd.DataFrame.from_dict(tracks)
    .T
    .assign(index=lambda frame: range(0, len(frame)))
    .reset_index()
    .rename(columns={"level_0": "track_id"})
    .drop_duplicates('track_id')
)
df.sample(5)

In [None]:
# Persist the scraped metadata in the directory given by sys.path[0],
# then show how many tracks were collected per genre.
track_csv_path = os.path.join(sys.path[0], "spotify_track_preview_data_2.csv")
df.to_csv(track_csv_path)

genre_counts = df['genre_short'].value_counts()
genre_counts

In [None]:
#download track previews from urls in dataframe

mp3_folder = "C:/Users/adamm/Documents/previews_from_spotify/"

# urlretrieve does not create missing directories; without this every download would
# fail and be silently counted as "unavailable".
os.makedirs(mp3_folder, exist_ok=True)

# Index labels of the rows for which no preview file ends up on disk.
preview_unavailable = []

for index, row in df.iterrows():
    track_url = row.preview_url
    full_path = os.path.join(mp3_folder, str(row.track_id) + ".mp3")

    if os.path.exists(full_path):
        # Already downloaded by an earlier run: the preview is available, keep the row.
        continue

    if pd.isnull(track_url) or not track_url:
        # No preview url for this track (None, NaN or empty string).
        preview_unavailable.append(index)
        continue

    # Download to a temporary name first so an interrupted transfer can never be
    # mistaken for a complete mp3 on the next run.
    partial_path = full_path + ".part"
    try:
        urlretrieve(track_url, partial_path) #downloads the file from url
        os.replace(partial_path, full_path)
    except Exception:
        # Best effort: any failure simply marks the preview as unavailable.
        preview_unavailable.append(index)
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Counts every row that has a preview on disk, including files from earlier runs.
num_downloaded = len(df.index) - len(preview_unavailable)
print("Downloaded " + str(num_downloaded) + "/" + str(len(df.index)) + " mp3 previews.")

# Drop by index label (not by position) so this stays correct even if the index has gaps.
df.drop(index=preview_unavailable, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
#convert each mp3 preview to a mel spectrogram and save it as a png

# Root output folder for the spectrogram images; the loop below appends
# "<genre>/<first character of the track id>/" to it for each track.
spectogram_folder = "C:/Users/adamm/Documents/preview_spectograms/"

def scale_minmax(X, min=0.0, max=1.0):
    """Linearly rescale the values of array `X` to the range [min, max].

    The smallest value of `X` maps to `min` and the largest to `max`. When every
    element of `X` is identical there is no range to stretch, so an array filled
    with `min` is returned instead of dividing by zero (which would produce NaN).

    Note: the parameter names shadow the `min`/`max` builtins inside this function;
    they are kept because callers may pass them by keyword.
    """
    lowest = X.min()
    span = X.max() - lowest
    if span == 0:
        # Constant input (e.g. a silent clip): avoid 0/0.
        return np.full_like(X, min, dtype=float)
    return (X - lowest) / span * (max - min) + min

for index, row in df.iterrows():
    track_id = str(row.track_id)
    genre_folder = str(row.genre_short) + "/"
    prefix = track_id[0] + "/"  # sub-folder named after the first character of the track id
    full_directory = spectogram_folder + genre_folder + prefix
    image_path = full_directory + track_id + ".png"

    if os.path.exists(image_path):
        continue  # spectrogram already rendered by an earlier run

    #make a spectrogram from an mp3 file
    file_name = mp3_folder + track_id + ".mp3"
    y, sr = librosa.load(file_name)
    spect = librosa.feature.melspectrogram(y=y, sr=sr)
    spect = np.log(spect + 1e-9)  # small offset keeps log() finite where the spectrogram is 0
    spect = scale_minmax(spect, 0, 255).astype(np.uint8)  # 8-bit greyscale values
    spect = np.flip(spect, axis=0) #low freq. at the bottom
    spect = 255 - spect  # invert the greyscale values

    #save the spectrogram in the relevant sub-folder
    # (os.makedirs replaces the former Path(...).mkdirs() call, which could not run:
    # Path was never imported and pathlib has no mkdirs method)
    os.makedirs(full_directory, exist_ok=True)
    skimage.io.imsave(image_path, spect)

In [None]:
#create a small sample dataset

sample_size = 100
sample_categories = ['techno', 'classical']
# Fixed seed so re-running the cell selects the same tracks and the sample CSV stays
# in step with any images already copied for it.
sample_seed = 0
# NOTE(review): the export cell of this notebook writes "spotify_track_preview_data_2.csv",
# while this cell reads "spotify_track_preview_data.csv" - confirm which file is intended.
full_dataset = pd.read_csv(os.path.join(sys.path[0], "spotify_track_preview_data.csv"))
samples = []

for category in sample_categories:
    # Keep only this genre's tracks that have a preview url.
    has_preview = pd.notnull(full_dataset['preview_url'])
    genre_data = full_dataset[(full_dataset['genre_short'] == category) & has_preview]

    # DataFrame.sample raises when asked for more rows than exist, so clamp the size.
    rows_to_take = min(sample_size, len(genre_data))
    if rows_to_take < sample_size:
        print("Only " + str(rows_to_take) + " '" + category + "' tracks with a preview; using all of them.")
    samples.append(genre_data.sample(rows_to_take, random_state=sample_seed))

sample_data = pd.concat(samples)
sample_data.reset_index(drop=True, inplace=True)
# 'Unnamed: 0' is the index column written by to_csv; tolerate files saved without it.
sample_data.drop(columns='Unnamed: 0', errors='ignore', inplace=True)

sample_data.to_csv(os.path.join(sys.path[0], "spotify_track_preview_data_sample.csv"))

In [None]:
#copy spectrograms of selected sample into a separate folder

dataset_folder = "C:/Users/adamm/Documents/preview_spectograms/"
sample_folder = "C:/Users/adamm/Documents/preview_spectogram_samples/"

# copy2 does not create the destination directory.
os.makedirs(sample_folder, exist_ok=True)

for index, row in sample_data.iterrows():
    track_id = str(row.track_id)
    image_name = track_id + ".png"

    # The spectrogram cell stores images as <genre>/<first char of id>/<id>.png.
    # Look there first and fall back to the flat layout this cell originally assumed.
    nested_path = dataset_folder + str(row.genre_short) + "/" + track_id[0] + "/" + image_name
    flat_path = dataset_folder + image_name
    source_path = nested_path if os.path.exists(nested_path) else flat_path

    copy2(source_path, sample_folder + image_name)