<a href="https://colab.research.google.com/github/DevpriyaSaini/ML-100/blob/main/Nexus_ml_comp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import time


In [None]:
# Shared Spotify Web API client. The helper functions defined in the following
# cells reference this module-level `sp` directly instead of taking a client
# argument, so this cell must run before any of them is called.
# The credential strings are blank in this cell and have to be supplied before
# running. NOTE(review): spotipy presumably falls back to the SPOTIPY_CLIENT_ID /
# SPOTIPY_CLIENT_SECRET environment variables when these are empty - confirm.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
       client_id = "",
      client_secret = ""
    )
)


In [None]:
def get_playlist_tracks(playlist_id):
    """Return the ids of every track in a playlist, following pagination.

    Uses the module-level ``sp`` client. Playlist entries whose ``track`` value
    is falsy, or whose track has a falsy ``id``, are left out.

    Args:
        playlist_id: Identifier passed straight to ``sp.playlist_items``.

    Returns:
        list: Track ids in the order the pages returned them.
    """
    track_ids = []
    page = sp.playlist_items(playlist_id, limit=100)

    while page:
        track_ids.extend(
            entry["track"]["id"]
            for entry in page["items"]
            if entry["track"] and entry["track"]["id"]
        )
        # A falsy "next" marks the last page; otherwise ask the client for it.
        if not page["next"]:
            break
        page = sp.next(page)

    return track_ids


In [None]:
def get_track_features(track_ids):
    """Fetch raw per-track metadata plus primary-artist metadata.

    Uses the module-level ``sp`` client. Tracks are requested in batches of 50
    via ``sp.tracks``. For each track the first listed artist is looked up with
    ``sp.artist``; lookups are memoised per artist id for the duration of the
    call so an artist that appears on many tracks is fetched only once.

    Tracks are skipped (best effort) when the API returns ``None`` for them,
    when they have no artists, or when the artist lookup fails.

    Args:
        track_ids: Sequence of Spotify track ids.

    Returns:
        pandas.DataFrame: One row per usable track with the columns
        ``track_id``, ``track_popularity``, ``duration_ms``, ``explicit``
        (0/1), ``artist_popularity`` and ``genres`` (the artist's genre list).
        The columns are present even when no rows were collected.
    """
    columns = [
        "track_id",
        "track_popularity",
        "duration_ms",
        "explicit",
        "artist_popularity",
        "genres",
    ]
    rows = []
    artist_cache = {}  # artist id -> artist payload, successful lookups only

    for start in range(0, len(track_ids), 50):
        batch = track_ids[start:start + 50]

        for track in sp.tracks(batch)["tracks"]:
            if track is None or not track.get("artists"):
                continue

            artist_id = track["artists"][0]["id"]
            artist = artist_cache.get(artist_id)
            if artist is None:
                try:
                    artist = sp.artist(artist_id)
                except Exception:
                    # Best effort: drop this track. `Exception` (not a bare
                    # except) keeps Ctrl-C / kernel interrupt working.
                    continue
                if artist is None:
                    continue
                artist_cache[artist_id] = artist

            rows.append({
                "track_id": track["id"],
                "track_popularity": track["popularity"],
                "duration_ms": track["duration_ms"],
                "explicit": int(track["explicit"]),
                "artist_popularity": artist["popularity"],
                "genres": artist["genres"],
            })

    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)



In [None]:
def build_features(df, top_genres=None):
    """Turn raw track metadata into a numeric feature frame.

    Works on a copy of ``df``. Adds one 0/1 ``genre_<name>`` column per entry
    of ``top_genres``, derives ``duration_min`` and ``popularity_gap``,
    standardises the four numeric columns with a ``StandardScaler`` that is
    fitted on this very frame, and drops ``genres`` and ``duration_ms``.

    Args:
        df: Frame with ``genres``, ``duration_ms``, ``track_popularity`` and
            ``artist_popularity`` columns.
        top_genres: Genre names to encode. When ``None`` the 25 most frequent
            genres found in ``df`` are used.

    Returns:
        tuple: ``(feature_frame, top_genres)``.
    """
    features = df.copy()

    if top_genres is None:
        tally = Counter()
        for genre_list in features["genres"]:
            tally.update(genre_list)
        top_genres = [name for name, _count in tally.most_common(25)]

    # One indicator column per selected genre.
    for genre in top_genres:
        has_genre = features["genres"].apply(
            lambda track_genres, wanted=genre: int(wanted in track_genres)
        )
        features[f"genre_{genre}"] = has_genre

    features["duration_min"] = features["duration_ms"] / 60000
    features["popularity_gap"] = (
        features["artist_popularity"] - features["track_popularity"]
    )

    scaled_cols = [
        "track_popularity",
        "artist_popularity",
        "duration_min",
        "popularity_gap",
    ]
    features[scaled_cols] = StandardScaler().fit_transform(features[scaled_cols])

    return features.drop(columns=["genres", "duration_ms"]), top_genres


In [None]:
def build_playlist_vector(df):
    """Collapse a feature frame into a single profile vector.

    Args:
        df: Feature frame containing a ``track_id`` column plus feature columns.

    Returns:
        The column-wise mean of every column except ``track_id``, as the
        ``.values`` of the resulting Series.
    """
    feature_columns = df.drop(columns=["track_id"])
    centroid = feature_columns.mean()
    return centroid.values


In [None]:
def get_candidates(seed_tracks, limit=500, max_seeds=10, related_per_artist=3):
    """Collect candidate track ids from the seeds' artists and related artists.

    Uses the module-level ``sp`` client. For each of the first ``max_seeds``
    seed tracks, the primary artist's top tracks are added, followed by the top
    tracks of up to ``related_per_artist`` related artists. Every API call is
    best effort: a failing call is skipped and collection continues. (In the
    recorded run of this notebook the related-artists request returned 404 for
    every artist, so that source may contribute nothing.)

    Each artist is expanded at most once per call, so seeds that share an
    artist do not trigger repeated requests.

    Args:
        seed_tracks: Sequence of seed track ids.
        limit: Soft cap. Collection stops after the first seed that brings the
            candidate count to ``limit`` or more; the result may exceed it.
        max_seeds: Number of leading seed tracks to expand.
        related_per_artist: Number of related artists to expand per seed artist.

    Returns:
        list: Unique candidate track ids, sorted so the result is reproducible.
    """
    candidates = set()
    attempted_top = set()  # artists whose top tracks were already requested
    expanded = set()       # seed artists already fully expanded

    def _collect_top_tracks(artist_id):
        # Add one artist's top-track ids to `candidates`, at most once.
        if artist_id in attempted_top:
            return
        attempted_top.add(artist_id)
        try:
            top = sp.artist_top_tracks(artist_id)["tracks"]
        except Exception:
            return
        candidates.update(t["id"] for t in top if t and t.get("id"))

    for tid in seed_tracks[:max_seeds]:
        try:
            artist_id = sp.track(tid)["artists"][0]["id"]
        except Exception:
            # Seed could not be resolved to an artist; try the next seed.
            continue

        if artist_id not in expanded:
            expanded.add(artist_id)
            _collect_top_tracks(artist_id)

            try:
                related = sp.artist_related_artists(artist_id)["artists"]
            except Exception:
                related = []

            for artist in related[:related_per_artist]:
                if artist and artist.get("id"):
                    _collect_top_tracks(artist["id"])

        if len(candidates) >= limit:
            break

    return sorted(candidates)


In [None]:
def generate_explanation(seed_df, row):
    """Build a short, human-readable justification for one recommended track.

    Rules, each contributing at most one reason:
      * genres the candidate has (``genre_*`` == 1) that also appear in more
        than 30% of seed rows; up to two are listed, in alphabetical order so
        the text is the same on every run;
      * ``abs(row["track_popularity"]) < 0.6``;
      * ``abs(row["duration_min"]) < 0.6``;
      * the candidate's ``explicit`` flag agreeing with a clear seed tendency
        (seed mean above 0.6 or below 0.4).

    Args:
        seed_df: Seed feature frame; its ``genre_*`` and ``explicit`` columns
            are read.
        row: One candidate row (a Series) with ``genre_*``,
            ``track_popularity``, ``duration_min`` and ``explicit`` entries.

    Returns:
        str: Reasons joined by ``" | "``, or a generic fallback sentence when no
        rule applies.
    """
    prefix = "genre_"
    reasons = []

    # Slice the prefix off instead of str.replace so a genre whose own name
    # contains "genre_" is not mangled.
    seed_genres = {
        col[len(prefix):]
        for col in seed_df.columns
        if col.startswith(prefix) and seed_df[col].mean() > 0.3
    }
    cand_genres = {
        col[len(prefix):]
        for col in row.index
        if col.startswith(prefix) and row[col] == 1
    }

    # Sorted: set iteration order would make the chosen genres vary per run.
    overlap = sorted(seed_genres & cand_genres)
    if overlap:
        reasons.append(f"Shares genres: {', '.join(overlap[:2])}")

    if abs(row["track_popularity"]) < 0.6:
        reasons.append("Popularity matches your usual taste")

    if abs(row["duration_min"]) < 0.6:
        reasons.append("Similar track length to your playlist")

    explicit_rate = seed_df["explicit"].mean()
    if explicit_rate > 0.6 and row["explicit"] == 1:
        reasons.append("Matches explicit-content preference")
    elif explicit_rate < 0.4 and row["explicit"] == 0:
        reasons.append("Matches clean-content preference")

    if not reasons:
        reasons.append("High overall similarity to playlist profile")

    return " | ".join(reasons)


In [None]:
def rank_candidates(playlist_vector, cand_df):
    """Score candidates by cosine similarity to the playlist vector.

    The input frame is left untouched: the scores go into a new frame, so the
    function can be called again on the same ``cand_df`` without the ``score``
    column leaking into the feature matrix.

    Args:
        playlist_vector: 1-D feature vector whose entries line up with the
            columns of ``cand_df`` other than ``track_id``.
        cand_df: Candidate feature frame with a ``track_id`` column.

    Returns:
        pandas.DataFrame: Copy of ``cand_df`` with an added ``score`` column,
        sorted by ``score`` in descending order. An empty input yields an empty
        frame that still carries the ``score`` column.
    """
    features = cand_df.drop(columns=["track_id"]).values

    if len(cand_df) == 0:
        # cosine_similarity is not called with zero candidate rows.
        sims = np.zeros(0)
    else:
        sims = cosine_similarity([playlist_vector], features)[0]

    return cand_df.assign(score=sims).sort_values("score", ascending=False)


In [None]:
def diversify(ranked_df, max_per_artist=2, top_n=10):
    """Pick the best-ranked rows while capping how many share an artist.

    Rows are visited in the order given. For each one the track is looked up
    with the module-level ``sp`` client and the name of its first artist is
    used as the grouping key. A row whose lookup fails is skipped instead of
    aborting the whole selection.

    Args:
        ranked_df: Frame with a ``track_id`` column, already sorted best-first.
        max_per_artist: Maximum number of selected rows per artist name.
        top_n: Maximum number of rows to return.

    Returns:
        pandas.DataFrame: The selected rows of ``ranked_df`` in their original
        order, with the original index labels and column dtypes.
    """
    picked = []      # positional indices of the selected rows
    per_artist = {}  # artist name -> number of rows selected so far

    for pos, track_id in enumerate(ranked_df["track_id"]):
        if len(picked) >= top_n:
            break

        try:
            artist = sp.track(track_id)["artists"][0]["name"]
        except Exception:
            # Best effort: one unavailable track must not sink the result.
            continue

        taken = per_artist.get(artist, 0)
        if taken < max_per_artist:
            picked.append(pos)
            per_artist[artist] = taken + 1

    # Positional selection keeps dtypes, unlike rebuilding from row Series.
    return ranked_df.iloc[picked]


In [None]:
class SpotifyRecommender:
    """End-to-end playlist recommender built from the notebook's helper functions.

    NOTE(review): the client passed to ``__init__`` is stored on ``self.sp`` but
    the helper functions called by ``recommend`` use the module-level ``sp``
    instead, so the stored client currently has no effect on the requests made.
    """

    def __init__(self, sp):
        """Store the Spotify client on the instance."""
        self.sp = sp

    def recommend(self, playlist_id):
        """Recommend tracks for a playlist and print the elapsed time.

        Args:
            playlist_id: Spotify playlist id used as the seed.

        Returns:
            list[dict]: One dict per recommended track with the keys
            ``track_id``, ``score`` (a Python float rounded to 4 decimals) and
            ``explanation``. Empty when the playlist yields no usable seed
            tracks or no candidates could be gathered.
        """
        start = time.time()
        results = self._recommend(playlist_id)
        print(f"Inference time: {round(time.time()-start,2)} seconds")
        return results

    def _recommend(self, playlist_id):
        # Run the pipeline; return [] early when a stage has nothing to work on.

        # Step 1-2: seed tracks and their features
        seed_tracks = get_playlist_tracks(playlist_id)
        seed_raw = get_track_features(seed_tracks)
        if seed_raw.empty:
            return []
        seed_df, top_genres = build_features(seed_raw)

        # Step 3: playlist profile vector
        playlist_vector = build_playlist_vector(seed_df)

        # Step 4: candidates, excluding tracks already in the playlist.
        # A set makes each membership test O(1) instead of scanning the list.
        seed_ids = set(seed_tracks)
        candidates = [c for c in get_candidates(seed_tracks) if c not in seed_ids]
        if not candidates:
            return []

        cand_raw = get_track_features(candidates)
        if cand_raw.empty:
            return []
        cand_df, _ = build_features(cand_raw, top_genres)

        # Step 5-6: rank, then cap the number of tracks per artist
        ranked = rank_candidates(playlist_vector, cand_df)
        top10 = diversify(ranked, max_per_artist=2)

        # Step 7: plain-Python result records
        return [
            {
                "track_id": row["track_id"],
                # float(): avoid leaking a numpy scalar into the output
                "score": float(round(row["score"], 4)),
                "explanation": generate_explanation(seed_df, row),
            }
            for _, row in top10.iterrows()
        ]


In [None]:
# Demo run: build a recommender around the shared client and request
# recommendations for one playlist id.
model = SpotifyRecommender(sp)
recs = model.recommend("4oYjxsCeGITOx5zsHaK4ny")

# Each entry is a dict with "track_id", "score" and "explanation".
for r in recs:
    print(r)


ERROR:spotipy.client:HTTP Error for GET to https://api.spotify.com/v1/artists/6eUKZXaKkcviH0Ku9w2n3V/related-artists with Params: {} returned 404 due to Not Found
ERROR:spotipy.client:HTTP Error for GET to https://api.spotify.com/v1/artists/05etL4pzWd6TSv1x5WrlG3/related-artists with Params: {} returned 404 due to Not Found
ERROR:spotipy.client:HTTP Error for GET to https://api.spotify.com/v1/artists/540vIaP2JwjQb9dm3aArA4/related-artists with Params: {} returned 404 due to Not Found
ERROR:spotipy.client:HTTP Error for GET to https://api.spotify.com/v1/artists/2L16nDKTxhFGaDriR2AHTB/related-artists with Params: {} returned 404 due to Not Found
ERROR:spotipy.client:HTTP Error for GET to https://api.spotify.com/v1/artists/3RPQOyEedGc3ULmDb6cNbT/related-artists with Params: {} returned 404 due to Not Found
ERROR:spotipy.client:HTTP Error for GET to https://api.spotify.com/v1/artists/09UmIX92EUH9hAK4bxvHx6/related-artists with Params: {} returned 404 due to Not Found
ERROR:spotipy.client:H

Inference time: 15.28 seconds
{'track_id': '5pvXcMiqTDzUimGj2Vi8uG', 'score': 0.7207, 'explanation': 'Shares genres: desi, hindi pop | Popularity matches your usual taste | Matches clean-content preference'}
{'track_id': '1fcCPXmH4vUzG6UQTTjZp5', 'score': 0.7095, 'explanation': 'Shares genres: desi, hindi pop | Popularity matches your usual taste | Matches clean-content preference'}
{'track_id': '0dUbhFM18NyBDDpiktEQLk', 'score': 0.6908, 'explanation': 'Shares genres: desi, hindi pop | Popularity matches your usual taste | Matches clean-content preference'}
{'track_id': '1wqdo6IYGnS4NYx98cWk1d', 'score': 0.6706, 'explanation': 'Shares genres: desi, hindi pop | Popularity matches your usual taste | Matches clean-content preference'}
{'track_id': '6yrIAJ4yMPzq8Vsu1d3CXg', 'score': 0.6633, 'explanation': 'Shares genres: desi, hindi pop | Popularity matches your usual taste | Matches clean-content preference'}
{'track_id': '6FjbAnaPRPwiP3sciEYctO', 'score': 0.66, 'explanation': 'Shares gen

In [None]:
import pickle
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os

# Create the Spotify client from credentials supplied through the environment.
# SECURITY: the client id/secret must never be written into the notebook. The
# pair that was previously hard-coded in this cell should be treated as leaked
# and rotated in the Spotify developer dashboard.
# A missing variable raises KeyError here, before any request is made.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=os.environ["SPOTIPY_CLIENT_ID"],
        client_secret=os.environ["SPOTIPY_CLIENT_SECRET"],
    )
)

# Create model object
model = SpotifyRecommender(sp)

# Save (dump) the model.
# NOTE(review): the pickle serialises the client held in `model.sp`, which
# presumably includes the credentials above - keep the file private, and only
# unpickle files from a trusted source (pickle.load can execute arbitrary code).
with open("spotify_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("Model saved successfully as spotify_model.pkl")


Model saved successfully as spotify_model.pkl
