# Spotify MPD Collaborative Filtering
- Idé: Rekommendera ett namn för givna spellistan genom att ge den samma namn som den närmaste playlisten.
- Skapa en funktion som konverterar spellistorna att ha bara artisten och namnet i sig.

In [1]:
import json
import os
import numpy as np
import math
from typing import List, Tuple

playlists = []
DATA_PATH = "mpd/data_samples"
REWRITE_DATA = False # Set to True to not use pickled vectors 

num_files = len(os.listdir(DATA_PATH))

# Load in every json file from the mpd/data_samples directory
for filename in os.listdir(DATA_PATH):
    with open(os.path.join(DATA_PATH, filename), "rt", encoding="utf-8") as f:
        playlists.extend(json.load(f)["playlists"])


In [2]:
unique_songs = sorted({f"{song['artist_name']} - {song['track_name']}"
                           for playlist in playlists
                           for i, song in enumerate(playlist["tracks"])})
print("Number of jsons: ", num_files)
print("Unique songs:", len(unique_songs))

Number of jsons:  1
Unique songs: 34250


In [3]:
# Sample of the song names
[f"{s['artist_name']} - {s['track_name']}" for i, s in enumerate(playlists[69]["tracks"][:5])]

['The Civil Wars - From This Valley',
 'Josh Garrels - Farther Along',
 'Ethan Pierce - Dark Skies',
 'Paramore - We Are Broken',
 "Brandi Carlile - That Wasn't Me"]

In [4]:
def make_playlist_vector(playlist: dict) -> List[int]:
    """
    Given a list of interests, produce a vector whose ith element is 1
    if unique_interests[i] is in the list, 0 otherwise
    """
    return [
        1
        if song
        in [f"{s['artist_name']} - {s['track_name']}" for i, s in enumerate(playlist["tracks"])]
        else 0
        for song in unique_songs #[:100] for testing
    ]

import time
import pickle

t0 = time.time()

try:
    if REWRITE_DATA:
        print("Rewriting data: Generating vectors and dumping them")
        playlist_vectors = [make_playlist_vector(playlist) for playlist in playlists] # [:100] for testing
        f = open("playlist_vectors.pickle", "wb")
        pickle.dump(playlist_vectors, f)
    else:
        f = open("playlist_vectors.pickle", "rb")
        print("Pickled data found.")
        playlist_vectors = pickle.load(f)
except(FileNotFoundError):
    print("Pickled data not found, generating vectors...")
    playlist_vectors = [make_playlist_vector(playlist) for playlist in playlists] # [:100] for testing
    f = open("playlist_vectors.pickle", "wb")
    pickle.dump(playlist_vectors, f)
finally:
    f.close()

t1 = time.time()
print(f"Time taken: {round(t1-t0, 2)} s")
print("vectors length:", len(playlist_vectors))

p = np.array(playlist_vectors)
sum(p.flatten())

Pickled data not found, generating vectors...
Time taken: 292.45 s
vectors length: 1000


66666

In [5]:
Vector = List[float]
pelles_interests = playlist_vectors[0]

def dot(v: Vector, w: Vector) -> float:
    """Computes v_1 * w_1 + ... + v_n * w_n"""
    assert len(v) == len(w), "vectors must be same length"

    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def cosine_similarity(v1: Vector, v2: Vector) -> float:
    return dot(v1, v2) / math.sqrt(dot(v1, v1) * dot(v2, v2))

pelles_similarities = [cosine_similarity(pelles_interests, playlist_vector_i)
                     for playlist_vector_i in playlist_vectors]

# Users 0 and 9 share interests in Hadoop, Java, and Big Data 
#assert 0.56 < user_similarities[0][9] < 0.58, "several shared interests" 
 
# Users 0 and 8 share only one interest: Big Data 
#assert 0.18 < user_similarities[0][8] < 0.20, "only one shared interest"

pelles_similarities[9]

0.0

In [6]:
for i, sim in enumerate(pelles_similarities):
    if (sim > 0.08):
        print(f"Similarity to playlists[{i}]: ", sim)

Similarity to playlists[0]:  1.0
Similarity to playlists[38]:  0.15387460962256408
Similarity to playlists[123]:  0.11774956564686527
Similarity to playlists[225]:  0.10958925093990117
Similarity to playlists[245]:  0.08964374649555988
Similarity to playlists[262]:  0.23501778858208544
Similarity to playlists[355]:  0.15062893357603013
Similarity to playlists[359]:  0.09527699174583824
Similarity to playlists[380]:  0.13509395646699554
Similarity to playlists[663]:  0.1165103456070926
Similarity to playlists[717]:  0.16840826742715195
Similarity to playlists[721]:  0.15695698526580623
Similarity to playlists[734]:  0.09061915769440536
Similarity to playlists[747]:  0.1732981998492349
Similarity to playlists[812]:  0.1190826279056167
Similarity to playlists[844]:  0.0805716197153229
Similarity to playlists[908]:  0.13464028341974354
Similarity to playlists[944]:  0.1278274981412284
