### How does a playlist’s genre diversity affect its popularity?

In [1]:
import os
import json
import pickle
from collections import defaultdict
from collections import Counter
import numpy as np

In [2]:
# Directory containing the playlist JSON files.
MPD = 'mpd/'
# os.listdir() returns entries in arbitrary order; sort so that the files are
# processed in the same order on every run and on every machine.
playlist_files = [os.path.join(MPD, name) for name in sorted(os.listdir(MPD))]

In [3]:
# Directory containing the pickled song-metadata files.
SONG_DATA = 'mappings/songs'
# os.listdir() returns entries in arbitrary order; sort so that the files are
# processed in the same order on every run and on every machine.
song_files = [os.path.join(SONG_DATA, name) for name in sorted(os.listdir(SONG_DATA))]

In [4]:
# Pickled mapping of artist records; the cells below look records up by
# artist ID and read each record's 'genres' list.
ARTIST_DATA = 'mappings/artists/artist_matching.pkl'
with open(ARTIST_DATA, mode='rb') as artist_file:
    artist_dict = pickle.load(artist_file)

In [5]:
# Genre vocabulary: every distinct genre attached to any artist record.
all_genres = {
    genre
    for artist_record in artist_dict.values()
    for genre in artist_record['genres']
}
genres = sorted(all_genres)

# Two-way lookup between a genre name and its index in the sorted vocabulary.
ix_to_genre = dict(enumerate(genres))
genre_to_ix = {genre: ix for ix, genre in ix_to_genre.items()}

# N is the vocabulary size; ideal_dist is the uniform distribution over all
# N genres, used later as the reference point for playlist distributions.
N = len(genres)
ideal_dist = np.ones(N) / N

In [6]:
# Number of distinct genres, i.e. the default dimension of each playlist vector.
N

5604

In [8]:
# Note: here the distribution is computed per track (every track gets equal weight);
# alternatively, all genres in the playlist could be pooled into one list and weighted as a whole.

def calculate_playlist_vector(genres_per_track, dim=None, g_ix=None):
    """Build the genre distribution vector of a playlist.

    Every track contributes a total mass of ``1/k`` (``k`` = number of tracks),
    split evenly across that track's genre entries. A genre that appears more
    than once in a track's list is counted once per occurrence.

    Parameters
    ----------
    genres_per_track : list of list of str
        One list of genre names per track in the playlist.
    dim : int, optional
        Length of the returned vector. Defaults to the notebook-level ``N``.
    g_ix : dict, optional
        Mapping from genre name to vector index. Defaults to the
        notebook-level ``genre_to_ix``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``dim``. It is all zeros when
        ``genres_per_track`` is empty; tracks with an empty genre list
        contribute nothing.

    Raises
    ------
    KeyError
        If a genre name is not present in ``g_ix``.
    """
    # Resolve the defaults at call time rather than at definition time, so
    # that re-running the vocabulary cell is picked up without having to
    # re-run this definition as well.
    if dim is None:
        dim = N
    if g_ix is None:
        g_ix = genre_to_ix

    k = len(genres_per_track)  # weight uniformly across tracks
    dist = np.zeros(dim)
    for track_genres in genres_per_track:
        q = len(track_genres)  # weight uniformly across genres per track
        if q == 0:
            continue
        # Same value for every genre of this track, so compute it once.
        weight = (1 / k) * (1 / q)
        for genre in track_genres:
            dist[g_ix[genre]] += weight
    return dist

In [12]:
# Map every song ID to the concatenated genre lists of all its artists.
# Kept as a defaultdict(list) so that an unknown song ID yields an empty list.
songID_to_genres = defaultdict(list)
for song_file in song_files:
    with open(song_file, 'rb') as fp:
        songs = pickle.load(fp)
    for songID, song in songs.items():
        # The comprehension keeps its loop variables local, so the notebook-level
        # `genres` list (the sorted genre vocabulary) is no longer overwritten.
        # Note: genres shared by several artists of a song are repeated on purpose.
        songID_to_genres[songID] = [
            genre
            for artist in song['artists']
            for genre in artist_dict[artist['id']]['genres']
        ]

In [13]:
# Sanity check: show the first five (song ID, genre list) pairs.
list(songID_to_genres.items())[:5]

[('spotify:track:1fyTBapjw8q9MlzLVdU6wg',
  ['alternative rock',
   'madchester',
   'new wave',
   'post-punk',
   'rock',
   'uk post-punk']),
 ('spotify:track:4cC9nQMgClmz8cix6l5CSX', ['filter house']),
 ('spotify:track:3qwh1awyjkXFUER1Pp64qv',
  ['christian rock', 'dreamo', 'piano rock']),
 ('spotify:track:6NHONOfLaDTtSG8WtCTpex', ['disco house', 'vocal house']),
 ('spotify:track:7HxecasMeh6aCAvQPGiFgP', ['deep ambient'])]

In [14]:
# For every playlist whose tracks all have at least one known genre, store
# (diversity, followers) keyed by the playlist's 'pid'.
#
# `diversity` is the L2 distance between the playlist's genre distribution and
# the uniform distribution `ideal_dist`, so a SMALLER value means the playlist
# is closer to uniform (i.e. more diverse).
playlist_data = {}
for playlist_file in playlist_files:
    with open(playlist_file, 'rb') as fp:
        mpd_slice = json.load(fp)
    for playlist in mpd_slice['playlists']:
        genres_per_track = []
        for track in playlist['tracks']:
            # .get() instead of [] so that unknown track URIs are not inserted
            # into the defaultdict as a side effect of the lookup.
            track_genres = songID_to_genres.get(track['track_uri'])
            if not track_genres:
                # A track without genre information: skip the whole playlist.
                break
            genres_per_track.append(track_genres)
        else:
            # Only reached when no track was missing genre information.
            ID = playlist['pid']
            followers = playlist['num_followers']
            dist = calculate_playlist_vector(genres_per_track)
            diversity = np.linalg.norm(ideal_dist - dist, 2)
            playlist_data[ID] = (diversity, followers)

In [15]:
# Sanity check: show the first five (playlist ID, (diversity, followers)) entries.
list(playlist_data.items())[:5]

[(962001, (0.4331830004159568, 1)),
 (962003, (0.18690625070742495, 1)),
 (962005, (0.28860705795278213, 1)),
 (962006, (0.20375050145651752, 1)),
 (962011, (0.3798236439136591, 1))]

In [16]:
# Persist the (diversity, followers) results for later analysis.
OUTPUT_PATH = 'mappings/Q1/playlist_data.pkl'
# Create the output directory first so that the results of the long
# computation above are not lost to a FileNotFoundError.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
with open(OUTPUT_PATH, 'wb') as fp:
    pickle.dump(playlist_data, fp)