# Helper Functions

In [1]:
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm
import pymongo
from pymongo import MongoClient
from requests.exceptions import ReadTimeout
import time

def connect_mongo():
    """Connect to the MongoDB Atlas cluster and return the two staging collections.

    The username and password are read from the ``MONGO_USERNAME`` and
    ``MONGO_PASSWORD`` environment variables when they are set, falling back
    to the placeholder literals below otherwise, so real secrets do not have
    to be stored in the notebook.

    Returns:
        tuple: ``(client, collection, collection2)`` where ``collection`` is
        the "Neo4J Staging Tracks" collection and ``collection2`` is the
        "Neo4J Staging Playlists" collection. The caller is responsible for
        calling ``client.close()``.
    """
    # Local imports keep this function self-contained with respect to the
    # notebook's import cell.
    import os
    from urllib.parse import quote_plus

    # MongoDB credentials (environment variables take precedence)
    username = os.environ.get("MONGO_USERNAME", "...")
    password = os.environ.get("MONGO_PASSWORD", "...")
    cluster_name = "tune-playlists-serverle.04muk"
    dbname = "tune-playlists"
    cname = "Neo4J Staging Tracks"
    cname2 = "Neo4J Staging Playlists"

    # Reserved characters such as '@', ':' or '/' in the credentials would
    # corrupt the connection string, so percent-escape them before embedding.
    uri = (
        f"mongodb+srv://{quote_plus(username)}:{quote_plus(password)}"
        f"@{cluster_name}.mongodb.net/{dbname}?retryWrites=true&w=majority"
    )

    # Set up a connection to the MongoDB Atlas cluster
    client = MongoClient(uri)

    # Get the database, then the tracks and playlists staging collections
    db = client[dbname]
    collection = db[cname]
    collection2 = db[cname2]

    return client, collection, collection2

def connect_spotify():
    """Create and return a spotipy client authenticated with client credentials."""
    credentials = SpotifyClientCredentials(client_id='...', client_secret="...")
    return spotipy.Spotify(client_credentials_manager=credentials)

def _call_with_retry(api_call, *args, retries=3, wait_seconds=60, **kwargs):
    """Call ``api_call(*args, **kwargs)``, pausing and retrying on ReadTimeout.

    Args:
        api_call: Callable to invoke (e.g. a bound spotipy client method).
        retries: How many additional attempts to make after the first timeout.
        wait_seconds: Pause between attempts, in seconds.

    Returns:
        Whatever ``api_call`` returns.

    Raises:
        ReadTimeout: If every attempt times out.
    """
    for attempt in range(retries + 1):
        try:
            return api_call(*args, **kwargs)
        except ReadTimeout:
            if attempt == retries:
                raise
            time.sleep(wait_seconds)


def _enrich_track(sp, track):
    """Add artist details, genres and audio features to ``track`` in place."""
    track_id = track['id']
    album_id = track['album']['id']
    artist_ids = []
    artists = []
    track_genres = []

    # Look up the full artist record for every artist credited on the track
    for artist in track['artists']:
        artist_id = artist['id']
        artist_ids.append(artist_id)

        artist_details = _call_with_retry(sp.artist, artist_id)
        artists.append(artist_details)
        track_genres.extend(artist_details['genres'])

    # Retrieve track audio features from the API (first element of the reply)
    audio_features = _call_with_retry(sp.audio_features, track_id)[0]

    # Add audio features, genres, artist_ids and album_id to the doc for easy access
    track['audio_features'] = audio_features
    # De-duplicate genres; sorted so the stored order is deterministic
    track['genre'] = sorted(set(track_genres))
    track['artists'] = artists
    track['artist_ids'] = artist_ids
    track['album_id'] = album_id


def _load_recommendations(seed_kwarg, seed_ids, header):
    """Fetch recommendations for each seed and store them in MongoDB.

    For every seed id, requests 100 recommendations, enriches each returned
    track, inserts the tracks into the tracks collection and inserts one
    ``{'tracklist': [...]}`` document into the playlists collection.

    Args:
        seed_kwarg: Name of the ``sp.recommendations`` seed keyword to use
            ("seed_tracks" or "seed_artists").
        seed_ids: Iterable of seed ids; one request is made per id.
        header: Text printed above the progress bar.
    """
    # Connect to Spotify Web API
    sp = connect_spotify()

    # Connect to the MongoDB Atlas Database
    client, collection_tracks, collection_playlists = connect_mongo()

    try:
        # Print progress-bar header
        print(header)

        for seed_id in tqdm(seed_ids):
            # Retrieve 100 recommendations for this seed (wait & retry on timeout)
            doc = _call_with_retry(
                sp.recommendations, limit=100, **{seed_kwarg: [seed_id]}
            )

            inserts = []        # enriched track documents for this seed
            doc_trackList = []  # track ids forming this seed's "playlist"

            for track in doc['tracks']:
                _enrich_track(sp, track)
                doc_trackList.append(track['id'])
                inserts.append(track)

            # insert_many raises TypeError on an empty list; skip such seeds
            # explicitly rather than swallowing the exception.
            if not inserts:
                continue

            # Insert the tracks, then the playlist that references them
            collection_tracks.insert_many(inserts)
            collection_playlists.insert_one({'tracklist': doc_trackList})
    finally:
        # End MongoDB connection even if an API or database call failed
        client.close()


def API_seed_tracks(track_ids_set):
    """Load recommendations seeded by each track id into MongoDB.

    Args:
        track_ids_set: Iterable of Spotify track ids; each one is used as the
            single seed track of a recommendations request.
    """
    _load_recommendations("seed_tracks", track_ids_set, "Loads using seed tracks:")


def API_seed_artists(artist_ids_set):
    """Load recommendations seeded by each artist id into MongoDB.

    Args:
        artist_ids_set: Iterable of Spotify artist ids; each one is used as
            the single seed artist of a recommendations request.
    """
    _load_recommendations("seed_artists", artist_ids_set, "Loads using seed artists:")


def sample_ids(num_playlists):
    """Randomly sample seed track ids and seed artist ids from the tracks collection.

    Half of ``num_playlists`` (rounded down) is used as the sample size for
    each of the two samples.

    Args:
        num_playlists: Total number of seeds wanted across both samples.

    Returns:
        tuple: ``(sampled_track_ids, sampled_artist_ids)``, two lists of ids.
        Both are empty when the computed sample size is not positive.
    """
    # $sample needs a whole-number size, so use integer division
    sample_size = int(num_playlists) // 2

    # Nothing to sample: skip the database round trip entirely
    if sample_size <= 0:
        return [], []

    # Use aggregate() with the $sample stage to sample 'id's
    track_ids_pipeline = [
        {'$sample': {'size': sample_size}},
        {'$project': {'_id': 0, 'id': 1}}
    ]

    # Use aggregate() with the $unwind and $sample stages to sample random
    # 'artist_ids'. $unwind emits one row per (track, artist) pair, so the
    # same artist id may be drawn more than once.
    artist_ids_pipeline = [
        {'$unwind': '$artist_ids'},
        {'$sample': {'size': sample_size}},
        {'$project': {'_id': 0, 'artist_id': '$artist_ids'}}
    ]

    # Connect to MongoDB; only the tracks collection is needed here
    client, collection_tracks, _ = connect_mongo()
    try:
        sampled_track_ids = [
            item['id'] for item in collection_tracks.aggregate(track_ids_pipeline)
        ]
        sampled_artist_ids = [
            item['artist_id']
            for item in collection_tracks.aggregate(artist_ids_pipeline)
        ]
    finally:
        # Release the connection whether or not the queries succeeded
        client.close()

    return sampled_track_ids, sampled_artist_ids

def API_2_MONGO(num_playlists):
    """Sample seed ids, then load track-seeded and artist-seeded recommendations.

    Args:
        num_playlists: Passed to ``sample_ids`` to decide how many seeds to draw.
    """
    track_seeds, artist_seeds = sample_ids(num_playlists)
    API_seed_tracks(track_seeds)
    API_seed_artists(artist_seeds)

# Call Method

In [None]:
# Run the full load. sample_ids halves the 250 into a sample size of 125 for
# the seed tracks and 125 for the seed artists; every seed that returns
# recommendations produces one playlist document.
API_2_MONGO(250)

Loads using seed tracks:


  3%|██████▏                                                                                                                                                                                           | 4/125 [01:07<34:49, 17.27s/it]