### How does a playlist’s genre diversity affect its popularity?

In [35]:
import os
import json
import pickle
from collections import defaultdict
from collections import Counter
import numpy as np
import matplotlib.pyplot
import matplotlib.axes
import matplotlib.colors
import math
from scipy.spatial.distance import cityblock, euclidean, braycurtis, cosine, minkowski, canberra

In [36]:
def get_distances(a,b):
    dist= []
    dist.append(cityblock(a,b))
    dist.append(euclidean(a,b))
    dist.append(braycurtis(a,b))
    dist.append(cosine(a,b))
    dist.append(minkowski(a,b))
    dist.append(canberra(a,b))
    return dist

In [37]:
MPD = 'mpd/'
# MPD = r'C:\Users\19142\Documents\gpjDATA\mpd\data'
playlist_files = [os.path.join(MPD, x) for x in os.listdir(MPD)]

In [38]:
SONG_DATA = 'mappings/songs'
# SONG_DATA = r'C:\Users\19142\Documents\gpjDATA\mappings\songs'
song_files = [os.path.join(SONG_DATA, x) for x in os.listdir(SONG_DATA)]

In [39]:
ARTIST_DATA = 'mappings/artists/artist_matching.pkl'
with open(ARTIST_DATA, 'rb') as fp:
    artist_dict = pickle.load(fp)

In [40]:
all_genres = set()
for ID, artist in artist_dict.items():
    all_genres.update(artist['genres'])
genres = list(all_genres)
genres.sort()
genre_to_ix = {g:ix for ix,g in enumerate(genres)}
ix_to_genre = {ix:g for ix,g in enumerate(genres)}
N = len(genres)
ideal_dist = np.ones(N)*1/N

In [41]:
N

5604

In [42]:
# Note: Here we compute this pet track but we can also take list with all genres in playlist as a whole

def calculate_playlist_vector(genres_per_track, dim=N, g_ix=genre_to_ix):
    k = len(genres_per_track) # weight uniformly across tracks
    dist = np.zeros(dim)
    for track_genres in genres_per_track:
        q = len(track_genres) # weight uniformly across genres per track
        for genre in track_genres:
            ix = g_ix[genre]
            dist[ix] += (1/k)*(1/q)
    return dist

In [43]:
songID_to_genres = defaultdict(list)
for file in song_files:
    with open(file, 'rb') as fp:
        data = pickle.load(fp)
    for songID,song in data.items():
        genres_list = []
        for artist in song['artists']:
            artistID = artist['id']
            genres = artist_dict[artistID]['genres']
            genres_list.extend(genres) # Note: by doing this we allow repeating
        songID_to_genres[songID] = genres_list

In [44]:
list(songID_to_genres.items())[:5]

[('spotify:track:1fyTBapjw8q9MlzLVdU6wg',
  ['alternative rock',
   'madchester',
   'new wave',
   'post-punk',
   'rock',
   'uk post-punk']),
 ('spotify:track:4cC9nQMgClmz8cix6l5CSX', ['filter house']),
 ('spotify:track:3qwh1awyjkXFUER1Pp64qv',
  ['christian rock', 'dreamo', 'piano rock']),
 ('spotify:track:6NHONOfLaDTtSG8WtCTpex', ['disco house', 'vocal house']),
 ('spotify:track:7HxecasMeh6aCAvQPGiFgP', ['deep ambient'])]

In [45]:
playlist_data = {}
for file in playlist_files:
    print('.', end='')
   # assert os.path.isfile(file) 
    with open(file, 'rb') as fp:
        data = json.load(fp)
    for playlist in data['playlists']:
        empty = False
        ID = playlist['pid']
        genres_per_track = []
        for track in playlist['tracks']:
            tID = track['track_uri']
            if len(songID_to_genres[tID]) != 0:
                genres_per_track.append(songID_to_genres[tID])
            else:
                empty = True
                break
        if empty:
            continue
        followers = playlist['num_followers']
        dist = calculate_playlist_vector(genres_per_track)
        diversity = get_distances(ideal_dist, dist)
        playlist_data[ID] = (diversity, followers)

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [46]:
len(playlist_data)

409039

In [47]:
list(playlist_data.items())[:5]

[(962001,
  ([1.9835831548893639,
    0.43318300041595537,
    0.9917915774446825,
    0.9691771258611092,
    0.43318300041595537,
    5595.618427603336],
   1)),
 (962003,
  ([1.9718058529621698,
    0.1869062507074241,
    0.9859029264810849,
    0.9287112930205207,
    0.1869062507074241,
    5597.316792024452],
   1)),
 (962005,
  ([1.9807280513918628,
    0.2886070579527828,
    0.9903640256959314,
    0.9537641033959948,
    0.2886070579527828,
    5599.382363331856],
   1)),
 (962006,
  ([1.9600285510349749,
    0.20375050145651705,
    0.9800142755174874,
    0.9345784438880881,
    0.20375050145651705,
    5591.318864665253],
   1)),
 (962011,
  ([1.997501784439686,
    0.3798236439136597,
    0.998750892219843,
    0.964852008293013,
    0.3798236439136597,
    5603.982356514815],
   1))]

In [50]:
with open('mappings/Q1/playlist_dataMULTIPLE.pkl', 'wb') as fp:
    pickle.dump(playlist_data, fp)

In [51]:
list(playlist_data.items())[:5]

[(962001,
  ([1.9835831548893639,
    0.43318300041595537,
    0.9917915774446825,
    0.9691771258611092,
    0.43318300041595537,
    5595.618427603336],
   1)),
 (962003,
  ([1.9718058529621698,
    0.1869062507074241,
    0.9859029264810849,
    0.9287112930205207,
    0.1869062507074241,
    5597.316792024452],
   1)),
 (962005,
  ([1.9807280513918628,
    0.2886070579527828,
    0.9903640256959314,
    0.9537641033959948,
    0.2886070579527828,
    5599.382363331856],
   1)),
 (962006,
  ([1.9600285510349749,
    0.20375050145651705,
    0.9800142755174874,
    0.9345784438880881,
    0.20375050145651705,
    5591.318864665253],
   1)),
 (962011,
  ([1.997501784439686,
    0.3798236439136597,
    0.998750892219843,
    0.964852008293013,
    0.3798236439136597,
    5603.982356514815],
   1))]