# Data collection

## Importing libraries

In [1]:
import json
import os
import sys
import time
from copy import deepcopy
from random_words import RandomWords
import spotipy
from spotipy.client import Spotify
from spotipy.oauth2 import SpotifyClientCredentials
import collections
import numpy as np

## Helper functions used to scrape Spotify

In [None]:
def get_auth_spotipy() -> Spotify:
    """Return a Spotify client authorized with client credentials.

    ``SpotifyClientCredentials`` is constructed without arguments, so the
    client id/secret are taken from the ``SPOTIPY_CLIENT_ID`` and
    ``SPOTIPY_CLIENT_SECRET`` environment variables. Values already present
    in the environment take precedence; the literals below are only a
    fallback for when nothing has been configured.

    Returns:
        A ``spotipy.Spotify`` client bound to a client-credentials manager.
    """
    # NOTE(security): these fallback credentials are committed to the source.
    # They should be rotated and supplied through the environment instead;
    # they are kept here only so existing runs keep working unchanged.
    os.environ.setdefault('SPOTIPY_CLIENT_ID', 'c4cd8ee33b624ca6b224debdef35ba58')
    os.environ.setdefault('SPOTIPY_CLIENT_SECRET', '3d5127677713483d99ded163e45198c6')

    client_credentials_manager = SpotifyClientCredentials()
    return spotipy.Spotify(client_credentials_manager=client_credentials_manager)


def dump_data(data_to_json, file):
    """Serialize ``data_to_json`` as JSON into the file at path ``file``.

    The file is opened in text write mode, so any existing content is
    replaced.
    """
    with open(file, mode='w') as json_out:
        json.dump(data_to_json, fp=json_out)


def load_data(file):
    """Read the JSON document stored at path ``file`` and return the parsed object."""
    with open(file, mode='r') as json_in:
        parsed = json.load(json_in)
    return parsed


def generate_search_word(predefined_words, number):
    """Build a de-duplicated list of search words.

    ``number`` random words are requested from ``RandomWords`` and appended
    to ``predefined_words``; duplicates are removed by passing the combined
    list through a set, so the order of the returned list is not defined.
    """
    extra_words = RandomWords().random_words(letter=None, count=number)
    unique_words = set(predefined_words + extra_words)
    return list(unique_words)


def get_playlists_by_search_word(words_to_search):
    """Retrieve playlists from the Spotify API for each search word.

    For every word, up to 50 playlists are requested through the
    module-level client ``sp``; each hit is then fetched individually to
    read its tracks and metadata.

    Scraping is best-effort: a failing search or a failing playlist is
    reported on stderr and skipped, and the remaining work continues. Only
    ``Exception`` subclasses are handled, so Ctrl-C still stops the run.

    Args:
        words_to_search: iterable of search strings.

    Returns:
        A list of dicts with the keys ``track_ids``, ``num_followers``,
        ``collaborative``, ``id``, ``owner_id`` and ``num_tracks``.
    """
    all_playlists = []

    for word in words_to_search:
        print(word)
        try:
            playlists = sp.search(word, type='playlist', limit=50)
            # Treat a missing/empty item list as "no results" for this word.
            found_playlists = playlists['playlists']['items'] or []
        except Exception as err:
            sys.stderr.write('search for {!r} failed: {!r}\n'.format(word, err))
            continue

        for playlist in found_playlists:
            try:
                # Reading the owner inside this try keeps one malformed
                # search item from discarding the rest of the word's results.
                user = playlist['owner']['id']
                current_playlist = sp.user_playlist(user, playlist_id=playlist['id'])

                # NOTE(review): only the tracks embedded in this response are
                # read; presumably longer playlists are paginated - confirm.
                track_ids = {
                    item['track']['id']
                    for item in current_playlist['tracks']['items']
                    if item['track']
                }

                playlist_data = {
                    # Stored as a list so the record can be dumped to JSON.
                    'track_ids': list(track_ids),
                    'num_followers': current_playlist['followers']['total'],
                    'collaborative': current_playlist['collaborative'],
                    'id': current_playlist['id'],
                    'owner_id': current_playlist['owner']['id'],
                    'num_tracks': current_playlist['tracks']['total'],
                }
            except Exception as err:
                sys.stderr.write(
                    'skipping a playlist found for {!r}: {!r}\n'.format(word, err))
                continue

            all_playlists.append(playlist_data)

    return all_playlists


def get_tracks_in_playlists(playlists):
    """Extract track, album and artist features for the tracks in playlists.

    Each distinct track id found in ``playlists`` is looked up once through
    the module-level client ``sp``. Progress is written to stdout.

    Scraping is best-effort: a track whose lookup fails is reported on
    stderr and left out of the result. Only ``Exception`` subclasses are
    handled, so Ctrl-C still stops the run.

    Args:
        playlists: sequence of dicts, each with a ``track_ids`` list.

    Returns:
        A dict mapping track id to a dict of collected features.
    """
    audio_features_list = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
                           'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

    all_tracks = {}
    track_ids_set = set()
    len_playlists = len(playlists)
    for i, playlist in enumerate(playlists):
        sys.stdout.write('\r{0}% completed.'.format((float(i+1)/len_playlists)*100))
        sys.stdout.flush()

        # Drop falsy ids (e.g. None or empty strings).
        track_ids = [track_id for track_id in playlist['track_ids'] if track_id]

        for track_id in track_ids:
            try:
                # A track is attempted once per call, even if its lookup fails.
                if track_id in track_ids_set:
                    continue
                track_ids_set.add(track_id)
                track_data = {}

                # Features related to the TRACK itself
                track = sp.track(track_id)
                track_data['id'] = track_id
                track_data['name'] = track['name']

                if 'explicit' in track:
                    track_data['explicit'] = track['explicit']
                # A track without an ISRC is kept; it just has no 'isrc' key.
                external_ids = track.get('external_ids') or {}
                if 'isrc' in external_ids:
                    track_data['isrc'] = external_ids['isrc']

                track_data['num_available_markets'] = len(track['available_markets'])

                # Audio features of the track (may be unavailable)
                track_audio_feature = sp.audio_features(track_id)[0]
                if track_audio_feature:
                    for key in audio_features_list:
                        track_data[key] = track_audio_feature[key]
                track_data['popularity'] = track['popularity']

                # Features related to the ALBUM
                album = sp.album(track['album']['id'])
                track_data['album_name'] = album['name']
                track_data['album_id'] = album['id']
                track_data['album_genres'] = album['genres']
                track_data['album_popularity'] = album['popularity']

                # Features related to the ARTISTS
                track_data['artists_names'] = []
                track_data['artists_ids'] = []
                track_data['artists_popularities'] = []
                track_data['artists_num_followers'] = []
                track_data['artists_genres'] = []

                for artist in track['artists']:
                    current_artist = sp.artist(artist['id'])
                    track_data['artists_names'].append(current_artist['name'])
                    track_data['artists_ids'].append(current_artist['id'])
                    track_data['artists_popularities'].append(current_artist['popularity'])
                    track_data['artists_num_followers'].append(current_artist['followers']['total'])
                    track_data['artists_genres'].extend(current_artist['genres'])

                track_data['avg_artist_popularity'] = np.mean(track_data['artists_popularities'])
                track_data['std_artist_popularity'] = np.std(track_data['artists_popularities'])
                track_data['avg_artist_num_followers'] = np.mean(track_data['artists_num_followers'])
                track_data['std_artist_num_followers'] = np.std(track_data['artists_num_followers'])

                if track_data['artists_genres']:
                    # Most frequent genre across all artists of the track
                    counter = collections.Counter(track_data['artists_genres'])
                    track_data['mode_artist_genre'] = counter.most_common()[0][0]

                all_tracks[track_id] = track_data

            except Exception as err:
                # Leading newline: the progress line above ends with no newline.
                sys.stderr.write('\nskipping track {!r}: {!r}\n'.format(track_id, err))
                continue
    return all_tracks


def missing_tracks(tracks_db, playlists):
    """Return the set of playlist track ids that are absent from ``tracks_db``.

    Every playlist's ``track_ids`` list is scanned and each id that is not a
    key of ``tracks_db`` is recorded. The printed count is the number of
    missing occurrences (an id that appears in several playlists is counted
    each time), whereas the returned set holds each missing id once.
    """
    known_ids = tracks_db.keys()

    absent_ids = []
    for entry in playlists:
        absent_ids.extend(
            candidate for candidate in entry['track_ids']
            if candidate not in known_ids
        )

    print('tracks that are missing : {}'.format(len(absent_ids)))
    return set(absent_ids)


def get_tracks_by_track_ids(track_ids):
    """Retrieve track, album and artist features for the given track ids.

    Each id is looked up through the module-level client ``sp`` with a
    one-second pause before every lookup. Progress is written to stdout.

    Scraping is best-effort: a track whose lookup fails is reported on
    stderr and left out of the result. Only ``Exception`` subclasses are
    handled, so Ctrl-C still stops the run.

    Args:
        track_ids: sized iterable (e.g. list or set) of track ids.

    Returns:
        A dict mapping track id to a dict of collected features.
    """
    audio_features_list = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
                           'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

    all_tracks = {}
    num_track_ids = len(track_ids)
    for i, track_id in enumerate(track_ids):
        time.sleep(1)
        sys.stdout.write('\r{0}% completed.'.format((float(i+1)/num_track_ids)*100))
        sys.stdout.flush()

        try:
            track_data = {}

            # Features related to the TRACK itself
            track = sp.track(track_id)
            track_data['id'] = track_id
            track_data['name'] = track['name']

            if 'explicit' in track:
                track_data['explicit'] = track['explicit']
            # A track without an ISRC is kept; it just has no 'isrc' key.
            external_ids = track.get('external_ids') or {}
            if 'isrc' in external_ids:
                track_data['isrc'] = external_ids['isrc']

            track_data['num_available_markets'] = len(track['available_markets'])

            # Audio features of the track (may be unavailable)
            track_audio_feature = sp.audio_features(track_id)[0]
            if track_audio_feature:
                for key in audio_features_list:
                    track_data[key] = track_audio_feature[key]
            track_data['popularity'] = track['popularity']

            # Features related to the ALBUM
            album = sp.album(track['album']['id'])
            track_data['album_name'] = album['name']
            track_data['album_id'] = album['id']
            track_data['album_genres'] = album['genres']
            track_data['album_popularity'] = album['popularity']

            # Features related to the ARTISTS
            track_data['artists_names'] = []
            track_data['artists_ids'] = []
            track_data['artists_popularities'] = []
            track_data['artists_num_followers'] = []
            track_data['artists_genres'] = []

            for artist in track['artists']:
                current_artist = sp.artist(artist['id'])
                track_data['artists_names'].append(current_artist['name'])
                track_data['artists_ids'].append(current_artist['id'])
                track_data['artists_popularities'].append(current_artist['popularity'])
                track_data['artists_num_followers'].append(current_artist['followers']['total'])
                track_data['artists_genres'].extend(current_artist['genres'])

            track_data['avg_artist_popularity'] = np.mean(track_data['artists_popularities'])
            track_data['std_artist_popularity'] = np.std(track_data['artists_popularities'])
            track_data['avg_artist_num_followers'] = np.mean(track_data['artists_num_followers'])
            track_data['std_artist_num_followers'] = np.std(track_data['artists_num_followers'])

            if track_data['artists_genres']:
                # Most frequent genre across all artists of the track
                counter = collections.Counter(track_data['artists_genres'])
                track_data['mode_artist_genre'] = counter.most_common()[0][0]
        except Exception as err:
            # Leading newline: the progress line above ends with no newline.
            sys.stderr.write('\nskipping track {!r}: {!r}\n'.format(track_id, err))
            continue

        all_tracks[track_id] = track_data

    return all_tracks

# Create the module-level Spotify client. The scraping helpers in this file
# read it through the global name `sp`, so it must be bound before they run.
sp = get_auth_spotipy()

## We define the search words

In [None]:
# Predefined search words (roughly 150).
# NOTE: a comma was missing between 'blue' and 'dirty', which silently
# concatenated them into the single word 'bluedirty'; they are now two entries.
predefined_words = ['your', 'my', 'are', 'the', 'is', 'a', 'can', 'love', 'hate', 'holiday', 'work', 'workday',
              'weekend', 'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'beautiful',
              'fall', 'summer', 'spring', 'winter', 'classics',  'throwback', 'car', 'morning', 'shower', 'current',
              'jesus', 'party', 'gym','late', 'night', 'old', 'chill', 'country', 'new', 'feel', 'good', 'workout',
              'slow','hood','tropical', 'EDM', 'wedding', 'sex', 'honeymoon', 'senior', 'cool', 'house', 'jam',
              'today', 'top', 'hits', 'dance', 'mix', 'teen', 'cardio', 'hangouts', 'hot', 'ultimate',
              'hip hop', 'mega', 'upbeat', 'acoustic', 'deep', 'girls', 'baby', 'indie', 'punk', 'rock', 'family',
              'funk', 'jazz', 'instrumental', 'rap', 'beats', 'future', 'happy', 'emotional', 'great', 'magic',
              'finds', 'escape', 'fresh', 'high', 'low', 'buzz', 'kill', 'mood', 'blue', 'dirty', 'soul', 'pop',
              'beach', 'dream', 'shuffle', 'date', 'romantic', 'prom', 'college', 'kids', 'sleep', 'serenade',
              'calm', 'light', 'heavy', 'soft', 'strong', 'drama', 'confession', 'blink', 'sad', 'heart',
              'trend', 'trending', 'max', 'folk', 'blues', 'contemporary', 'electric', 'R&B', 'alternative',
              'easy', 'metal', 'reggae', 'southern', 'cozy', 'darling', 'like', 'you', 'I', 'club', 'mind',
              'waltz', 'glow', 'crazy', 'women', 'men', 'vibes', 'wave', 'trip', 'crave', 'him', 'break',
              'true', 'different', 'her']
# Generate the search words: the predefined words plus 50 random words.
# generate_search_word de-duplicates, so the final count is about 200.
words_to_search = generate_search_word(predefined_words=predefined_words, number=50)

# Save search words (keywords) to json
dump_data(words_to_search, '../data/200_words_to_search.json')

## We query the Spotify API with spotipy

In [None]:
# Load the search keywords saved earlier
keywords = load_data('../data/200_words_to_search.json')

# Scrape the Spotify API for playlists that match these keywords
all_playlists = get_playlists_by_search_word(keywords)

# Save playlists to json
dump_data(all_playlists, '../data/playlists_from_200_words_to_search.json')

# Load all playlists back from the file that was just written
all_playlists = load_data('../data/playlists_from_200_words_to_search.json')

# Scrape track information from Spotify, one slice of playlists at a time.
# Each call de-duplicates track ids only within its own slice.
tracks_2000 = get_tracks_in_playlists(all_playlists[:2000])
tracks_4000 = get_tracks_in_playlists(all_playlists[2000:4000])
tracks_6000 = get_tracks_in_playlists(all_playlists[4000:6000])
tracks_8000 = get_tracks_in_playlists(all_playlists[6000:8000])
tracks_9000 = get_tracks_in_playlists(all_playlists[8000:9000])
tracks_last = get_tracks_in_playlists(all_playlists[9000:])

# Save track information to json, one file per slice
dump_data(tracks_2000, '../data/tracks_2000.json')
dump_data(tracks_4000, '../data/tracks_2000_4000.json')
dump_data(tracks_6000, '../data/tracks_4000_6000.json')
dump_data(tracks_8000, '../data/tracks_6000_8000.json')
dump_data(tracks_9000, '../data/tracks_8000_9000.json')
dump_data(tracks_last, '../data/tracks_last.json')


# Load all track databases
# NOTE(review): the files above are written to '../data/', but these loads
# read from '../data_archive/', and the last slice is saved as
# 'tracks_last.json' yet loaded as 'tracks_9000.json'. Presumably the files
# were moved/renamed by hand between runs - confirm before re-running.
tracks = {}
tracks_2000 = load_data('../data_archive/tracks_2000.json')
tracks_4000 = load_data('../data_archive/tracks_2000_4000.json')
tracks_6000 = load_data('../data_archive/tracks_4000_6000.json')
tracks_8000 = load_data('../data_archive/tracks_6000_8000.json')
tracks_9000 = load_data('../data_archive/tracks_8000_9000.json')
tracks_last = load_data('../data_archive/tracks_9000.json')

# Merge track dictionaries; for an id present in several slices the entry
# from the later update call replaces the earlier one
tracks.update(tracks_2000)
tracks.update(tracks_4000)
tracks.update(tracks_6000)
tracks.update(tracks_8000)
tracks.update(tracks_9000)
tracks.update(tracks_last)

## Search for missing tracks

In [None]:
# Find tracks that appear in each slice of playlists but are absent from the
# track database scraped for that same slice
missing_2000 = missing_tracks(tracks_2000, all_playlists[:2000])
missing_4000 = missing_tracks(tracks_4000, all_playlists[2000:4000])
missing_6000 = missing_tracks(tracks_6000, all_playlists[4000:6000])
missing_8000 = missing_tracks(tracks_8000, all_playlists[6000:8000])
missing_9000 = missing_tracks(tracks_9000, all_playlists[8000:9000])
missing_last = missing_tracks(tracks_last, all_playlists[9000:])

# Re-scrape the API for the missing tracks (one lookup per second per track)
missing_tracks_2000 = get_tracks_by_track_ids(missing_2000)
missing_tracks_4000 = get_tracks_by_track_ids(missing_4000)
missing_tracks_6000 = get_tracks_by_track_ids(missing_6000)
missing_tracks_8000 = get_tracks_by_track_ids(missing_8000)
missing_tracks_9000 = get_tracks_by_track_ids(missing_9000)
missing_tracks_last = get_tracks_by_track_ids(missing_last)

# Merge re-scraped track dictionaries into the combined track database
tracks.update(missing_tracks_2000)
tracks.update(missing_tracks_4000)
tracks.update(missing_tracks_6000)
tracks.update(missing_tracks_8000)
tracks.update(missing_tracks_9000)
tracks.update(missing_tracks_last)

## Save the file

In [None]:
# Save the merged track dictionary (initial scrape plus re-scraped missing
# tracks) as a single json file
dump_data(tracks, '../data_archive/tracks.json')