In [2]:
import os
import json
import pandas as pd
from collections import defaultdict
from tqdm.notebook import tqdm

## Preprocessing

In [3]:
# Carve the type-6 and type-7 test playlists (indices 5000-5999 and 6000-6999
# of the challenge set) out of the full file and save each subset on its own.
with open('challenge_set.json', 'r') as file:
    challenge_set = json.load(file)

type_6 = challenge_set['playlists'][5000:6000]
type_7 = challenge_set['playlists'][6000:7000]

with open('challenge_type_6.json', 'w') as f:
    json.dump(type_6, f)

with open('challenge_type_7.json', 'w') as f:
    json.dump(type_7, f)

## Jaccard

In [4]:
import pandas as pd


def jaccard(s1, s2):
    """Return the Jaccard similarity of two sets: |intersection| / |union|.

    When both sets are empty the union is empty as well; 0 is returned in
    that case instead of dividing by zero.
    """
    shared = len(s1 & s2)
    combined = len(s1 | s2)
    if not combined:
        return 0
    return shared / combined


def pre_calc_jaccard(dataset):
    """Precompute Jaccard similarities between all known tracks and the seed tracks.

    Parameters
    ----------
    dataset : iterable of dict
        Challenge playlists; each must have a ``"tracks"`` list whose items
        carry a ``"track_uri"`` key.

    Returns
    -------
    dict
        Maps ``(track, ref_track)`` to the Jaccard similarity of the two
        tracks' playlist-membership sets, for every ``track`` in the global
        ``all_tracks`` and every distinct ``ref_track`` found in ``dataset``.

    Notes
    -----
    Reads the notebook-level globals ``all_tracks`` and ``playlist_per_track``.
    The result has ``len(all_tracks) * (number of distinct seed tracks)``
    entries, so it can become very large.
    """
    # Gather the distinct seed tracks of the dataset that was passed in.
    # This must iterate ``dataset`` (not a notebook global such as ``type_7``),
    # otherwise the table is silently built for the wrong playlists.
    unique_track = set()
    for sample in tqdm(dataset):
        unique_track.update(t["track_uri"] for t in sample["tracks"])

    jaccard_dict = {}
    for track in tqdm(all_tracks):
        # Loop-invariant for the inner loop: look it up once per track.
        track_playlists = playlist_per_track[track]
        for ref_track in unique_track:
            jaccard_dict[(track, ref_track)] = jaccard(track_playlists, playlist_per_track[ref_track])
    return jaccard_dict


def apc(dataset):
    """Complete each playlist with the candidates most similar to its seed tracks.

    For every sample, each track in the global ``all_tracks`` that is not
    already in the playlist is scored by the sum of its Jaccard similarities
    (over playlist-membership sets) to every seed track. The
    ``num_holdouts`` highest-scoring candidates are appended to the seeds;
    ties on score are broken by track URI in descending order.

    Parameters
    ----------
    dataset : iterable of dict
        Challenge playlists with ``"pid"``, ``"num_holdouts"`` and a
        ``"tracks"`` list whose items carry a ``"track_uri"`` key.

    Returns
    -------
    pandas.DataFrame
        One row per sample, built from dicts with keys ``"pid"`` and
        ``"tracks"`` (seed tracks followed by the recommended tracks).

    Notes
    -----
    Reads the notebook-level globals ``all_tracks`` and ``playlist_per_track``.
    """
    predictions = []
    for sample in tqdm(dataset):
        k = sample["num_holdouts"]
        playlist = [t["track_uri"] for t in sample["tracks"]]

        # Set for O(1) membership tests; the list is kept for scoring so that
        # a seed track listed twice still contributes twice, as before.
        seed_tracks = set(playlist)
        # Resolve each seed's playlist set once instead of once per candidate.
        seed_playlists = [playlist_per_track[ref_track] for ref_track in playlist]

        tracks_scored = []
        for track in tqdm(all_tracks):
            if track in seed_tracks:  # skip tracks that are already in `playlist`
                continue

            candidate_playlists = playlist_per_track[track]
            jaccard_sum = 0
            for ref_playlists in seed_playlists:
                jaccard_sum += jaccard(candidate_playlists, ref_playlists)

            tracks_scored.append((jaccard_sum, track))

        # Highest score first; tuples compare by score, then by track URI.
        tracks_scored.sort(reverse=True)
        recommend_tracks = [t[1] for t in tracks_scored[:k]]

        pred = {
            "pid": sample['pid'],
            "tracks": playlist + recommend_tracks
        }
        predictions.append(pred)

    df = pd.DataFrame(predictions)
    return df

In [5]:
# Build lookup tables from the first 100 slice files in the data directory:
#   playlist_name      : pid       -> playlist name
#   track_per_playlist : pid       -> set of track URIs in that playlist
#   playlist_per_track : track URI -> set of pids containing that track
playlist_name = {}

track_per_playlist = defaultdict(set)
playlist_per_track = defaultdict(set)

path_home = os.path.expanduser('~')
path_data = os.path.join(path_home, 'spotify_mpd/data')


# os.listdir returns entries in arbitrary order, so sort before slicing to
# load the same 100 files on every run and on every machine.
for filename in tqdm(sorted(os.listdir(path_data))[:100]):
    full_path = os.path.join(path_data, filename)
    with open(full_path, 'r') as file:
        mpd_slice = json.load(file)

    # The file is fully parsed at this point; process it after closing it.
    for playlist in mpd_slice['playlists']:
        pid = playlist['pid']

        playlist_name[pid] = playlist['name']

        for track in playlist['tracks']:
            track_uri = track['track_uri']

            track_per_playlist[pid].add(track_uri)
            playlist_per_track[track_uri].add(pid)

  0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
# Snapshot of every track URI that is a key of `playlist_per_track`.
all_tracks = list(playlist_per_track)

## Precalculated

In [None]:
# Reload the type-6 challenge playlists from disk and precompute the Jaccard
# table for them. The table has one entry per (known track, seed track) pair.
with open('challenge_type_6.json', 'r') as f:
    type_6 = json.load(f)

jaccard_dict = pre_calc_jaccard(type_6)

## Non-precalculated

In [None]:
# Reload the type-7 challenge playlists and run the on-the-fly (non-precalculated)
# scorer on the first playlist only.
with open('challenge_type_7.json', 'r') as f:
    type_7 = json.load(f)
    
df_pred_type7 = apc(type_7[:1]) 

In [None]:
# Lift the column-width limit for this display only, so the track lists are
# shown in full rather than truncated.
with pd.option_context('display.max_colwidth', None):
    display(df_pred_type7)

In [None]:
# Show the raw first type-7 challenge playlist (the one passed to `apc` above).
type_7[0]