# Spotify MPD Collaborative Filtering
- Idé: Rekommendera ett namn för den givna spellistan genom att ge den samma namn som den närmaste spellistan.
- Skapa en funktion som konverterar spellistorna så att de bara innehåller artisten och låtnamnet.

In [1]:
import json
import math
import os
import pickle
import random
import time
from collections import defaultdict
from typing import Dict, Iterable, List, Optional, Tuple, Union

import numpy as np


class Collaborative_Recommender:
    """User- and item-based collaborative filtering over Spotify MPD playlists.

    Every song is identified by the string ``"<artist_name> - <track_name>"``.
    Expected call order: construct, ``extract_unique_songs()``,
    ``make_user_based_vectors()``, optionally ``make_item_based_vectors()``,
    then ``user_based_recommendation()`` / ``most_similar_songs_to()``.
    """

    Vector = List[int]
    a = "heyo"  # scratch attribute, kept so existing references keep working

    # Cache files are read from / written to the current working directory.
    # NOTE(review): the cache is not keyed by data path or playlist limit, so a
    # stale cache built from other data will be loaded as-is.
    VECTOR_PICKLE = "playlist_vectors.pickle"
    PLAYLIST_PICKLE = "playlists.pickle"

    def __init__(self, path: str, rewrite_data: bool = False, playlist_limit: int = 0) -> None:
        """Load playlists from ``path`` and precompute their song-name lists.

        Args:
            path: Directory holding JSON files that each contain a top-level
                ``"playlists"`` list.
            rewrite_data: When true, ``make_user_based_vectors`` regenerates
                and overwrites the pickle caches instead of reading them.
            playlist_limit: When greater than 0, keep only that many playlists
                (taken after shuffling).
        """
        self.playlists = []
        self.path = path
        self.rewrite_data = rewrite_data
        self.load_data(path, playlist_limit)
        self.named_playlists = self.to_track_names(self.playlists)
        self.a = "privet"

    @staticmethod
    def _song_key(track: dict) -> str:
        """Return the ``"artist - track"`` identifier used for a track dict."""
        return f"{track['artist_name']} - {track['track_name']}"

    def _song_names(self, playlist) -> set:
        """Return the set of song identifiers contained in ``playlist``.

        Accepts either a playlist dict (with a ``"tracks"`` list) or a
        collection of ready-made song identifier strings.

        Raises:
            TypeError: If ``playlist`` is neither of the supported shapes.
        """
        if isinstance(playlist, dict):
            return {self._song_key(track) for track in playlist["tracks"]}
        if isinstance(playlist, (list, tuple, set, frozenset)):
            return set(playlist)
        raise TypeError(
            "playlist must be a playlist dict or a collection of song names, "
            f"got {type(playlist).__name__}"
        )

    def load_data(self, data_path: str, limit: int = 0):
        """Read every ``*.json`` file in ``data_path`` into ``self.playlists``.

        The playlists are shuffled afterwards and, when ``limit`` is greater
        than 0, truncated to ``limit`` entries. A missing directory or file is
        reported on stdout rather than raised.
        """
        try:
            # Only parse JSON files; stray entries (hidden files, folders)
            # would otherwise make json.load fail.
            filenames = sorted(
                name for name in os.listdir(data_path) if name.lower().endswith(".json")
            )
            for filename in filenames:
                with open(os.path.join(data_path, filename), "rt", encoding="utf-8") as f:
                    self.playlists.extend(json.load(f)["playlists"])
            print(f"Loaded {len(filenames)} jsons")

            # Shuffle playlist as to reduce possible bias (a pickled playlist would overwrite this)
            random.shuffle(self.playlists)
            if limit > 0:
                self.playlists = self.playlists[:limit]
        except FileNotFoundError as e:
            print("File not found:", e)

    def extract_unique_songs(self):
        """Store the sorted list of distinct song identifiers in ``self.unique_songs``."""
        self.unique_songs = sorted(
            {
                self._song_key(song)
                for playlist in self.playlists
                for song in playlist["tracks"]
            }
        )
        print(f"Found {len(self.unique_songs)} unique songs")

    def sample_tracks(self, pos: int, num: int):
        """Print the first ``num`` songs of the playlist at index ``pos``."""
        for s in self.playlists[pos]["tracks"][:num]:
            print(f"{s['artist_name']} - {s['track_name']}")

    def to_track_names(self, playlists):
        """Convert playlist dicts into lists of song identifiers, keeping order."""
        return [
            [self._song_key(track) for track in playlist["tracks"]]
            for playlist in playlists
        ]

    def make_playlist_vector(self, playlist: Union[dict, List[str]]) -> List[int]:
        """Encode ``playlist`` as a 0/1 vector over ``self.unique_songs``.

        Element ``i`` is 1 if ``unique_songs[i]`` occurs in the playlist and 0
        otherwise. Requires ``extract_unique_songs()`` to have been called.

        Raises:
            TypeError: If ``playlist`` is neither a playlist dict nor a
                collection of song names.
        """
        # Build the lookup set once instead of rebuilding the playlist's song
        # list for every unique song.
        wanted = self._song_names(playlist)
        return [1 if song in wanted else 0 for song in self.unique_songs]

    def _generate_and_dump_vectors(self) -> None:
        """Build the playlist vectors and write both pickle caches."""
        self.playlist_vectors = [
            self.make_playlist_vector(playlist) for playlist in self.playlists
        ]
        print("Pickling...")
        with open(self.VECTOR_PICKLE, "wb") as vector_file:
            pickle.dump(self.playlist_vectors, vector_file)
        with open(self.PLAYLIST_PICKLE, "wb") as playlist_file:
            pickle.dump(self.playlists, playlist_file)

    def _load_pickled_vectors(self) -> bool:
        """Load vectors and playlists from the pickle caches.

        Returns:
            True when both cache files were found and loaded, False when
            either file is missing.
        """
        try:
            with open(self.VECTOR_PICKLE, "rb") as vector_file, open(
                self.PLAYLIST_PICKLE, "rb"
            ) as playlist_file:
                print("Pickled data found.")
                # SECURITY: pickle.load executes arbitrary code from the file;
                # only load caches this program wrote itself.
                vectors = pickle.load(vector_file)
                playlists = pickle.load(playlist_file)
        except FileNotFoundError:
            return False

        self.playlist_vectors = vectors
        self.playlists = playlists
        # The cached playlists replace the freshly shuffled ones, so the name
        # lists must be rebuilt to stay index-aligned with playlist_vectors.
        self.named_playlists = self.to_track_names(self.playlists)
        return True

    def make_user_based_vectors(self):
        """Populate ``self.playlist_vectors``, using the pickle caches when allowed.

        With ``rewrite_data`` set, the vectors are regenerated and the caches
        overwritten. Otherwise the caches are loaded, falling back to
        generation when they do not exist. Timing and size information is
        printed afterwards.
        """
        t0 = time.time()

        if self.rewrite_data:
            print("Rewriting data: Generating vectors and dumping them")
            self._generate_and_dump_vectors()
        elif not self._load_pickled_vectors():
            print("Pickled data not found, generating vectors...")
            self._generate_and_dump_vectors()

        t1 = time.time()
        print(f"Time taken: {round(t1-t0, 2)} s")
        print("playlist_vectors length:", len(self.playlist_vectors))
        # Summing row by row avoids materialising one huge flattened array.
        print("Sum of the vectors(useless metric):", sum(map(sum, self.playlist_vectors)))

    def make_item_based_vectors(self):
        """Transpose the playlist vectors into one row per unique song.

        Row ``i`` of ``self.song_playlist_matrix`` holds, for every playlist,
        whether it contains ``unique_songs[i]``.
        """
        self.song_playlist_matrix: List[List[int]] = [
            [playlist_vector[song_index] for playlist_vector in self.playlist_vectors]
            for song_index in range(len(self.unique_songs))
        ]

    def dot(self, v: Vector, w: Vector) -> float:
        """Computes v_1 * w_1 + ... + v_n * w_n"""
        assert len(v) == len(w), "vectors must be same length"

        return sum(v_i * w_i for v_i, w_i in zip(v, w))

    def cosine_similarity(self, v1: Vector, v2: Vector) -> float:
        """Return the cosine of the angle between ``v1`` and ``v2``.

        Returns 0.0 when either vector has zero length (for example a playlist
        containing no known songs) instead of dividing by zero.
        """
        numerator = self.dot(v1, v2)
        norm_product = self.dot(v1, v1) * self.dot(v2, v2)
        if norm_product == 0:
            return 0.0
        return numerator / math.sqrt(norm_product)

    def compute_similarities(self, pv) -> List[float]:
        """Return the cosine similarity of ``pv`` to every stored playlist vector."""
        return [self.cosine_similarity(pv, pv_i) for pv_i in self.playlist_vectors]

    def sort_similar_playlists(self, similarities: List[float], user_id: int):
        """Return ``(playlist_index, similarity)`` pairs, most similar first.

        The playlist at ``user_id`` and playlists with a similarity of 0 or
        less are left out.
        """
        ranked = [
            (playlist_id, similarity)
            for playlist_id, similarity in enumerate(similarities)
            if playlist_id != user_id and similarity > 0
        ]
        return sorted(ranked, key=lambda pair: pair[-1], reverse=True)

    def user_based_suggestions(
        self,
        similarities,
        max_suggestions: int = 10,
        exclude: Optional[Iterable[str]] = None,
    ):
        """Rank songs from similar playlists by their summed similarity.

        Args:
            similarities: Similarity of the query to each stored playlist.
            max_suggestions: Maximum number of suggestions returned.
            exclude: Song identifiers that must not be suggested (typically
                the songs the user supplied).

        Returns:
            A list of ``(song, weight)`` pairs sorted by descending weight.
        """
        user_id = getattr(self, "user_id", -1)

        # Sum up the similarities
        weights: Dict[str, float] = defaultdict(float)
        for other_user_id, similarity in self.sort_similar_playlists(similarities, user_id):
            for song in self.named_playlists[other_user_id]:
                weights[song] += similarity

        # Convert them to a sorted list
        ranked = sorted(weights.items(), key=lambda pair: pair[-1], reverse=True)

        # Songs the user already has. A negative user_id means "no stored
        # playlist", so it must not be used as an index (it would silently
        # pick the last playlist).
        already_known = set(exclude) if exclude is not None else set()
        if 0 <= user_id < len(self.named_playlists):
            already_known.update(self.named_playlists[user_id])

        return [
            (suggestion, weight)
            for suggestion, weight in ranked
            if suggestion not in already_known
        ][:max_suggestions]

    def user_based_recommendation(
        self, playlist: Union[dict, List[str]], limit: int = 10, user_id: int = -1
    ):
        """Suggest up to ``limit`` songs for ``playlist``.

        Args:
            playlist: A playlist dict or a list of song identifiers.
            limit: Maximum number of suggestions.
            user_id: Index of the stored playlist this query corresponds to,
                or -1 when the query is not one of the stored playlists.

        Returns:
            A list of ``(song, weight)`` pairs; the supplied songs are never
            among them.
        """
        self.user_id = user_id
        similarities = self.compute_similarities(self.make_playlist_vector(playlist))
        return self.user_based_suggestions(
            similarities=similarities,
            max_suggestions=limit,
            exclude=self._song_names(playlist),
        )

    def compute_song_similarities(self, song_id: int) -> List[float]:
        """Return the cosine similarity of song ``song_id`` to every song row."""
        target = self.song_playlist_matrix[song_id]
        return [
            self.cosine_similarity(target, pl_vector_j)
            for pl_vector_j in self.song_playlist_matrix
        ]

    def most_similar_songs_to(self, song_id: int) -> List[Tuple[str, float]]:
        """Return ``(song, similarity)`` pairs for songs similar to ``song_id``.

        The song itself and songs with a similarity of 0 or less are left out;
        the result is sorted by descending similarity.
        """
        similarities = self.compute_song_similarities(song_id)
        song_similarity_pairs = [
            (self.unique_songs[other_song_id], similarity)
            for other_song_id, similarity in enumerate(similarities)
            if song_id != other_song_id and similarity > 0
        ]
        return sorted(song_similarity_pairs, key=lambda pair: pair[-1], reverse=True)

# Build the recommender from the sample data folder, then prepare the song
# vocabulary and both vector representations (per playlist and per song).
recommender = Collaborative_Recommender(path="mpd/data_samples", rewrite_data=False)
recommender.extract_unique_songs()
recommender.sample_tracks(pos=69, num=5)
recommender.make_user_based_vectors()
recommender.make_item_based_vectors()


Loaded 1 jsons
Found 34250 unique songs
Ted Leo and the Pharmacists - Counting Down the Hours
(Sandy) Alex G - Kicker
Edward Sharpe & The Magnetic Zeros - Hot Coals
Coldplay - Birds
Hippo Campus - Violet
Pickled data found.


UnsupportedOperation: read

In [None]:
# Report the position of one specific song in the list of unique songs.
for i, song in enumerate(recommender.unique_songs):
    if song != 'Nirvana - Smells Like Teen Spirit':
        continue
    print("Song is at index:", i, ":", song)

In [66]:
# Recommend songs for one stored playlist, identified by its index.
pelle_id = 302
pelles_playlist = recommender.playlists[pelle_id]
sug = recommender.user_based_recommendation(
    playlist=pelles_playlist,
    limit=12,
    user_id=pelle_id,
)
print("Suggestions:")
display(sug)

Suggestions:


[('Nirvana - Smells Like Teen Spirit', 0.1404878717372541),
 ('Pearl Jam - Even Flow', 0.1404878717372541),
 ('Fugees - Killing Me Softly with His Song', 0.11407099568136256),
 ('Sir Mix-A-Lot - Baby Got Back', 0.11194039176134767),
 ('Cypress Hill - Insane in the Brain', 0.10191046736635467),
 ('Rob Base & DJ EZ Rock - It Takes Two', 0.09851875642131602),
 ('Weezer - Island In The Sun', 0.0936585811581694),
 ('Nirvana - Come As You Are', 0.0936585811581694),
 ('Foo Fighters - Everlong', 0.0936585811581694),
 ('Tag Team - Whoomp! There It Is', 0.0936585811581694),
 ('Depeche Mode - Personal Jesus', 0.0936585811581694),
 ('Fuel - Shimmer - Single Version', 0.0936585811581694)]

In [68]:
# The songs of the playlist at index pelle_id, written out by hand as
# "artist - track" strings to exercise the list-of-names input path.
pelles_custom_playlist = [
    "Digital Underground - The Humpty Dance",
    "Naughty By Nature - Hip Hop Hooray",
    "Naughty By Nature - O.P.P.",
    "Busta Rhymes - Woo Hah!! Got You All In Check - Explicit LP Version",
    "Black Sheep - The Choice Is Yours (Revisited)",
    "Luke - I Wanna Rock",
    "A Tribe Called Quest - Buggin' Out",
    "Eric B. & Rakim - Know The Ledge",
    "Nas - The World Is Yours",
    "A Tribe Called Quest - Scenario - LP Mix",
    "Gil Scott-Heron - Home Is Where the Hatred Is",
    "Digable Planets - Rebirth of Slick (Cool Like Dat)",
    "Public Enemy - Rebel Without A Pause",
    "Onyx - Slam",
    "Parliament - Let's Play House",
    "AMG - Bitch Betta Have My Money",
]

print("The same playlist but supplied as a list of track names")
display(
    recommender.user_based_recommendation(
        playlist=pelles_custom_playlist,
        limit=12,
        user_id=pelle_id,
    )
)

The same playlist but supplied as a list of track names


[('Nirvana - Smells Like Teen Spirit', 0.1404878717372541),
 ('Pearl Jam - Even Flow', 0.1404878717372541),
 ('Fugees - Killing Me Softly with His Song', 0.11407099568136256),
 ('Sir Mix-A-Lot - Baby Got Back', 0.11194039176134767),
 ('Cypress Hill - Insane in the Brain', 0.10191046736635467),
 ('Rob Base & DJ EZ Rock - It Takes Two', 0.09851875642131602),
 ('Weezer - Island In The Sun', 0.0936585811581694),
 ('Nirvana - Come As You Are', 0.0936585811581694),
 ('Foo Fighters - Everlong', 0.0936585811581694),
 ('Tag Team - Whoomp! There It Is', 0.0936585811581694),
 ('Depeche Mode - Personal Jesus', 0.0936585811581694),
 ('Fuel - Shimmer - Single Version', 0.0936585811581694)]

In [69]:
# Show the stored song names of the playlist at index pelle_id; the trailing
# bare expression is what the notebook renders as the cell output.
print("Pelles own playlist:")
recommender.named_playlists[pelle_id]

Pelles own playlist:


['Digital Underground - The Humpty Dance',
 'Naughty By Nature - Hip Hop Hooray',
 'Naughty By Nature - O.P.P.',
 'Busta Rhymes - Woo Hah!! Got You All In Check - Explicit LP Version',
 'Black Sheep - The Choice Is Yours (Revisited)',
 'Luke - I Wanna Rock',
 "A Tribe Called Quest - Buggin' Out",
 'Eric B. & Rakim - Know The Ledge',
 'Nas - The World Is Yours',
 'A Tribe Called Quest - Scenario - LP Mix',
 'Gil Scott-Heron - Home Is Where the Hatred Is',
 'Digable Planets - Rebirth of Slick (Cool Like Dat)',
 'Public Enemy - Rebel Without A Pause',
 'Onyx - Slam',
 "Parliament - Let's Play House",
 'AMG - Bitch Betta Have My Money']

## Förklara och visualisera datan
lorem ipsum

## Bygg rekommenderaren

### Item based collaborative filtering
1k playlists would take 18 hours to calculate on a Ryzen 3800X.

In [None]:
# The item-based matrix is built by a method on the recommender; no
# module-level make_item_based_vectors / playlist_vectors / unique_songs
# exist in this notebook. Build it on demand if it has not been built yet.
if not hasattr(recommender, "song_playlist_matrix"):
    recommender.make_item_based_vectors()
song_playlist_matrix = np.array(recommender.song_playlist_matrix)

print("Matrix shape (unique_songs, playlists):",song_playlist_matrix.shape)

In [None]:
def time_taken(t0, t1, dec=2):
    """Return the elapsed time ``t1 - t0`` rounded to ``dec`` decimals."""
    elapsed = t1 - t0
    return round(elapsed, dec)
t0 = time.time()
# Build a list rather than a generator expression: a generator is lazy, so
# timing its creation would measure nothing, and it could only be consumed
# once by later cells. cosine_similarity is a method on the recommender.
song_similarities = [
    recommender.cosine_similarity(song_playlist_matrix[1000], pl_vector_j)
    for pl_vector_j in song_playlist_matrix
]

t1 = time.time()
print("Time taken:", time_taken(t0, t1))

In [None]:
# Adds up all similarity values; if song_similarities is a generator, this
# consumes it, so the cell gives a meaningful result only once.
sum(song_similarities)

In [None]:
# Scratch check: enumerate() works directly on a generator expression.
w = (asd for asd in (4, 6, 7, 3, 2, 4, 45, 56))
for i, j in enumerate(w):
    print(f"{i} {j}")

In [None]:
import multiprocessing as mp 
# NOTE(review): bigboi is a (songs x songs) buffer that nothing in the visible
# cells reads or writes — confirm whether it is still needed.
bigboi = np.empty(shape=(song_playlist_matrix.shape[0],song_playlist_matrix.shape[0]))

def do_print(s):
    """Compute one song's similarity to every song row.

    Args:
        s: An ``(index, song_vector)`` pair as produced by ``enumerate``.

    Returns:
        ``(index, similarities)`` where ``similarities`` is a tuple with one
        cosine similarity per row of ``song_playlist_matrix``.
    """
    index, song_vector = s
    # cosine_similarity is a method on the recommender, not a module-level
    # function.
    foo = tuple(
        recommender.cosine_similarity(song_vector, pl_vector_j)
        for pl_vector_j in song_playlist_matrix
    )
    print("s", index, "done")
    return (index, foo)


t0 = time.time()
p=mp.Pool(14)
try:
    items_user_tuple = p.map(do_print,enumerate(song_playlist_matrix))
    t1 = time.time()
    print("Time taken:", time_taken(t0, t1))
finally:
    # Shut the worker processes down even when map() raises.
    p.close()
    p.join()

In [None]:
# Sum of the similarity tuple stored in the first entry of items_user_tuple
# (each entry is an (index, similarities) pair returned by do_print).
sum(items_user_tuple[0][1])

In [None]:
# Preview the first 100 song identifiers; the list is an attribute of the
# recommender, not a module-level variable.
recommender.unique_songs[:100]

In [None]:
# Persist the computed similarities; the context manager flushes and closes
# the file, which the bare open() call never did.
with open("items_user_tuple.pickle", "wb") as f:
    pickle.dump(items_user_tuple, f)