# Playlist recommender using Locality-Sensitive Hashing

In [None]:
import pandas as pd
from datasketch import MinHash
import numpy as np
from math import log2
import random

##  Load train data

In [None]:

df = pd.read_csv("train.csv")
df.head()


Unnamed: 0,playlist_id,playlist_name,artist_id,artist_name,track_id,track_name,album_id,album_name,duration_ms,position,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,461206,Miranda Lambert,66lH4jAE7pqPlOlzUKbwA0,Miranda Lambert,4Gyhy413uPALzaVg4S1DpX,Keeper of the Flame,563h536tB6n8Dn62jr4RZG,The Weight of These Wings,239733.0,95.0,...,6.0,-6.372,1.0,0.0361,0.45,2.3e-05,0.13,0.337,116.567,4.0
1,545224,mhm,3zLWOB0I86EiVgG5NrX1ht,Jarrod Alonge,3jKrgzpPb23ZmEIAipUKnZ,"Hey Jarrod, What's That Song Again?",0XFMnZrWpEYMVRrBdHmsGZ,Beating a Dead Horse: Deluxe Ultra-Limited Exc...,255185.0,4.0,...,,,,,,,,,,
2,0dMexqq0XIWS3QJ74z3ZhD,Hip Hop 2000s Music - Best Hip Hop Hits of the...,,,5dL5jv5GSCRoDhTtnY8maL,Mesmerize,,,,,...,,,,,,,,,,
3,279103,litty,46SHBwWsqBkxI7EeeBEQG7,Kodak Black,34oWbFBfGEElvgO0a5c9V4,No Flockin,12YTH28wiBXQ16gvWOCMLU,No Flockin,165290.0,18.0,...,5.0,-8.372,0.0,0.191,0.0673,0.0,0.0839,0.815,117.532,4.0
4,1WH6WVBwPBz35ZbWsgCpgr,Top Pop Hits 2015-2025,,"Topic, A7S",3H7ihDc1dqLriiWXwsc2po,Breaking Me,,,166794.0,,...,8.0,-5.652,0.0,0.218,0.223,0.0,0.129,0.664,122.031,4.0


### Group tracks by playlist

To work with playlists as sets, we group all track IDs belonging to the same playlist. 
Each playlist is therefore represented as a Python set, where duplicate tracks are removed automatically.  

In [73]:

playlist_dict = (
    df.groupby("playlist_id")["track_id"]
      .apply(set)
      .to_dict()
)


sizes = {pid: len(s) for pid, s in playlist_dict.items()}
min(sizes.values()), max(sizes.values())

(1, 439)

## MinHash signatures 

Each playlist is first transformed into a MinHash signature—a compact representation that preserves similarity.  
Playlists with similar signatures have a high probability of being placed in the same LSH buckets, allowing us to efficiently retrieve likely neighbors without scanning the full dataset.

In this notebook, we use datasketch’s MinHashLSH implementation.

In [None]:

def create_minhash(s, num_perm=512):
    m = MinHash(num_perm=num_perm)
    for track in s:
        m.update(str(track).encode("utf8"))
    return m

minhash_dict = {pid: create_minhash(tracks) 
                for pid, tracks in playlist_dict.items()}


### Locality-Sensitive Hashing (LSH) to find neighbors fast

Once we can retrieve similar playlists through LSH, we generate recommendations by aggregating tracks from the nearest neighbors.  
The intuition is that playlists that share many songs with a query playlist likely contain additional relevant tracks.

In [None]:
from datasketch import MinHashLSH

lsh = MinHashLSH(threshold=0.02, num_perm=512)

for pid, mh in minhash_dict.items():
    lsh.insert(str(pid), mh)


#  Query LSH for similar playlists for testing
query_pid = list(playlist_dict.keys())[0]
query_mh = minhash_dict[query_pid]

similar_playlists = lsh.query(query_mh)
similar_playlists



### Generating recommendations

Once we can retrieve similar playlists through LSH, we generate recommendations by aggregating tracks from the nearest neighbors.  
The intuition is that playlists that share many songs with a query playlist likely contain additional relevant tracks.

For a given playlist:
1. LSH retrieves a set of similar playlists.
2. Tracks from these neighbors are scored based on their overlap with the query playlist.
3. Tracks already present in the playlist are removed.
4. The highest-scoring tracks are returned as recommendations.

This simple neighborhood-based strategy provides a fast and effective baseline recommender system.

In [None]:

def jaccard_similarity(set1, set2):
    if not set1 and not set2:
        return 0.0
    return len(set1 & set2) / len(set1 | set2)


def recommend(pid, top_k=10, top_neighbors=200):
    """
    Recommend tracks for a given playlist ID using Jaccard similarity for scoring.
    pid should be a string if your keys are strings.
    """

    # get MinHash signature for the query playlist
    mh = minhash_dict[pid]

    # Retrieve candidate neighbors from LSH buckets
    candidates = lsh.query(mh)

    # Remove itself if present
    candidates = [c for c in candidates if c != pid]

    # Visible tracks of query playlist (if testing)
    # or full playlist if training
    query_tracks = playlist_dict[pid]

    # ---- Score neighbors by Jaccard similarity ----
    neighbor_scores = []
    for c in candidates:
        sim = jaccard_similarity(query_tracks, playlist_dict[c])
        neighbor_scores.append((sim, c))

    # Sort neighbors by similarity
    neighbor_scores.sort(reverse=True)

    # Keep top-N most similar neighbor playlists
    top_neighbors = neighbor_scores[:top_neighbors]

    # ---- Aggregate track recommendations ----
    scores = {}
    for sim, nbr in top_neighbors:
        for track in playlist_dict[nbr]:
            if track not in query_tracks:
                scores[track] = scores.get(track, 0) + sim

    # Return top-K highest scoring tracks
    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:top_k]



recommendations = recommend(query_pid)
recommendations

['2lfPecqFbH8X4lHSpTxt8l',
 '2D6VbgcGFSBhhZJHbVAN1Q',
 '4gnvnnm2oYy8IsUZ104wOS',
 '68pL3hokJXKJ1vJSaEhKr7',
 '6Ehy8wRUYjAZcMMRCxMZVM',
 '2NvyKQtbrTYyiCHnmYrBRa',
 '7vGuf3Y35N4wmASOKLUVVU']

In [97]:
def print_playlist_and_recommendations(pid, top_k=10):
    # --- 1. Print current playlist tracks ---
    print(f"\n=== Tracks in Playlist {pid} ===")
    
    playlist_tracks = playlist_dict[pid]
    
    playlist_df = (
        df[df['track_id'].isin(playlist_tracks)]
        [['track_id', 'track_name', 'artist_name']]
        .drop_duplicates()
    )

    for _, row in playlist_df.iterrows():
        print(f"• {row['track_name']} — {row['artist_name']}")

    # --- 2. Compute recommendations ---
    rec_ids = recommend(pid, top_k=top_k)

    print(f"\n=== Recommended Tracks for Playlist {pid} ===")

    if len(rec_ids) == 0:
        print("No recommendations found.")
        return
    
    rec_df = (
        df[df['track_id'].isin(rec_ids)]
        [['track_id', 'track_name', 'artist_name']]
        .drop_duplicates()
    )

    # --- 3. Print recommended tracks ---
    for _, row in rec_df.iterrows():
        print(f"• {row['track_name']} — {row['artist_name']}")

query_pid = list(playlist_dict.keys())[2]
print_playlist_and_recommendations(query_pid, top_k=10)



=== Tracks in Playlist 101861 ===
• Summer — Marshmello
• Where Are Ü Now (with Justin Bieber) - Marshmello Remix — Jack Ü
• Alone — Marshmello

=== Recommended Tracks for Playlist 101861 ===
• Moving On — Marshmello
• Silence — Marshmello
• Home — Marshmello
• Alarm - Marshmello Remix — Anne-Marie
• Fade — Alan Walker
• Waiting For Love - Marshmello Remix — Avicii
• Take It Back — Marshmello


### Evaluation 

To evaluate the effectiveness of the recommendation system, we use a common methodology:  
For each test playlist, we hide a portion of the tracks (treated as ground truth) and let the model predict them based only on the visible tracks.

We compute:
- **Precision@k:** How many recommended tracks are correct.
- **Recall@k:** How many of the hidden tracks were recovered.
- **MAP@k:** How well the model ranks relevant tracks.
- **NDCG@k:** Whether relevant tracks appear near the top of the recommendation list.

These metrics allow us to assess both retrieval quality and ranking performance.  
This evaluation setup is consistent with standard recommender system benchmarks, including the Spotify Million Playlist Dataset Challenge.


In [98]:


# ----------------------------
# 1. LOAD TEST DATA
# ----------------------------
test_df = pd.read_csv("test.csv")

# Group tracks for each test playlist
playlist_tracks_test = (
    test_df.groupby("playlist_id")["track_id"]
    .apply(list)
    .to_dict()
)

# ----------------------------
# 2. SPLIT INTO VISIBLE + HIDDEN
# ----------------------------
visible_test = {}
hidden_test = {}

hide_ratio = 0.25   # hide 25% of tracks

for pid, tracks in playlist_tracks_test.items():
    if len(tracks) < 3:
        continue
    n_hide = max(1, int(len(tracks) * hide_ratio))

    hidden = set(random.sample(tracks, n_hide))
    visible = set(tracks) - hidden

    hidden_test[pid] = hidden
    visible_test[pid] = visible

test_pids = list(visible_test.keys())

# ----------------------------
# 3. CREATE MINHASH SIGNATURE FOR TEST PLAYLISTS
# ----------------------------

def create_minhash_from_tracks(track_set):
    m = MinHash(num_perm=512)
    for t in track_set:
        m.update(str(t).encode("utf8"))
    return m

minhash_test = {
    pid: create_minhash_from_tracks(visible)
    for pid, visible in visible_test.items()
}

# ----------------------------
# 4. RECOMMENDER USING TEST SIGNATURE
# ----------------------------
def recommend_from_visible(pid, top_k=10, top_neighbors=200):
    mh = minhash_test[pid]
    candidates = lsh.query(mh)          # Use TRAINING LSH index only

    visible = visible_test[pid]

    # ---- Score neighbors using exact Jaccard similarity ----
    neighbor_scores = []
    for c in candidates:
        base = playlist_dict[c]         # training playlist
        sim = jaccard_similarity(visible, base)
        neighbor_scores.append((sim, c))

    # sort by Jaccard similarity (descending)
    neighbor_scores.sort(reverse=True)

    # use top-N neighbors for recommendations
    top_neighbors = neighbor_scores[:top_neighbors]

    # ---- Aggregate track scores ----
    scores = {}
    for sim, c in top_neighbors:
        for track in playlist_dict[c]:
            if track not in visible:
                scores[track] = scores.get(track, 0) + sim

    # return top-k non-seen tracks
    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:top_k]

# ----------------------------
# 5. METRIC FUNCTIONS (SAFE)
# ----------------------------
def precision_at_k(rec, gt, k):
    return len(set(rec[:k]) & gt) / k

def recall_at_k(rec, gt, k):
    return len(set(rec[:k]) & gt) / len(gt)

def average_precision(rec, gt, k):
    hits = 0
    score = 0
    for i, track in enumerate(rec[:k], start=1):
        if track in gt:
            hits += 1
            score += hits / i
    return score / min(len(gt), k)

def ndcg_at_k(rec, gt, k):
    dcg = 0
    for i, track in enumerate(rec[:k], start=1):
        if track in gt:
            dcg += 1 / log2(i + 1)
    ideal = min(len(gt), k)
    idcg = sum(1 / log2(i + 1) for i in range(1, ideal + 1))
    return dcg / idcg

# ----------------------------
# 6. EVALUATE ALL TEST PLAYLISTS
# ----------------------------
K = 10

precisions = []
recalls = []
maps = []
ndcgs = []

for pid in test_pids:
    recs = recommend_from_visible(pid, top_k=K)
    gt = hidden_test[pid]

    precisions.append(precision_at_k(recs, gt, K))
    recalls.append(recall_at_k(recs, gt, K))
    maps.append(average_precision(recs, gt, K))
    ndcgs.append(ndcg_at_k(recs, gt, K))

# ----------------------------
# 7. PRINT RESULTS
# ----------------------------
print("\n=== Evaluation Results (k=10) ===")
print(f"Precision@10: {np.mean(precisions):.4f}")
print(f"Recall@10:    {np.mean(recalls):.4f}")
print(f"MAP@10:       {np.mean(maps):.4f}")
print(f"NDCG@10:      {np.mean(ndcgs):.4f}")



=== Evaluation Results (k=10) ===
Precision@10: 0.0047
Recall@10:    0.0372
MAP@10:       0.0136
NDCG@10:      0.0200
