# **Using ML to Recommend Songs on Spotify**

# Imports

*Modules*

In [1]:
# For data handling
import pandas as pd

# ML stuff
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import neighbors

# To create and upload the playlist
import spotipy
from spotipy.oauth2 import SpotifyOAuth

*Reading in the data*

In [2]:
artists = pd.read_pickle("./top_artists.pkl")
tracks = pd.read_pickle("./top_tracks.pkl")
playlists = pd.read_pickle("./playlists.pkl")
recs = pd.read_pickle("./recommendations.pkl")

# Tidying the data

*First, let's remove any duplicate tracks within any of the data frames*


In [3]:
tracks = tracks.drop_duplicates(subset = "id", keep = "last")
tracks = tracks.reset_index()

playlists = playlists.drop_duplicates(subset = "id", keep = "last")
playlists = playlists.reset_index()

recs = recs.drop_duplicates(subset = "id", keep = "last")
recs = recs.reset_index()

*I noticed that some recommendations are duplicates of songs I already have in my playlists or top tracks, so let's remove those*

In [4]:
recs = recs[-recs["id"].isin([playlists["id"], tracks["id"]])]

*I also noticed that some of the playlists were kind of outdated, so I'll just filter them to include the ones I current listen to*

In [5]:
important_playlists = [
    "Tour Dates", "Biting the Bottleneck", "Don't forget the bouquet!", "15mg", "Market Watch",
    "The Iceman !", "Sm99thie king", "Driving down eldridge", "Frank Ocean Mix", "Tyler, The Creator Mix"
]
playlists = playlists[playlists["playlist_name"].isin(important_playlists)]

# Creating a rating system

*Because tracks are ambiguous by themselves, I am going to rate them by how similar they are to my top tracks and whether or not my top artists made them*

In [6]:
LABELS = ["disc_number", "track_number", "album_tracks", "popularity", "index", "duration_ms", "time_signature", "key"]

In [7]:
# Finding the averages for the important audio features of my top tracks
avg = tracks.select_dtypes(include = ["int64", "float64"])
avg = avg.drop(labels = LABELS, axis = 1)

# Normalize tempo and loudness to knot blow up size
avg["tempo"] = avg["tempo"].apply(lambda x: x / max(avg["tempo"]))
avg["loudness"] = avg["loudness"].apply(lambda x: x / min(avg["loudness"]))

avg_mean = avg.mean()
avg_mean

danceability        0.585700
energy              0.515117
loudness            0.424968
mode                0.650000
speechiness         0.124242
acousticness        0.406250
instrumentalness    0.082119
liveness            0.164120
valence             0.430477
tempo               0.630910
dtype: float64

In [8]:
# Comparing the above averages to each song in my playlist tracks
compare = playlists.select_dtypes(include = ["int64", "float64"])
compare = compare.drop(labels = LABELS + ["playlist_tracks"], axis = 1)

# Normalize tempo and loudness to knot blow up size
compare["tempo"] = compare["tempo"].apply(lambda x: x / max(compare["tempo"]))
compare["loudness"] = compare["loudness"].apply(lambda x: x / min(compare["loudness"]))

# Create the rating column
compare["rating"] = (
    (compare["danceability"] - avg_mean["danceability"]) ** 2 + \
    (compare["energy"] - avg_mean["energy"]) ** 2 + \
    (compare["loudness"] - avg_mean["loudness"]) ** 2 + \
    (compare["mode"] - avg_mean["mode"]) ** 2 + \
    (compare["speechiness"] - avg_mean["speechiness"]) ** 2 + \
    (compare["acousticness"] - avg_mean["acousticness"]) ** 2 + \
    (compare["instrumentalness"] - avg_mean["instrumentalness"]) ** 2 + \
    (compare["liveness"] - avg_mean["liveness"]) ** 2 + \
    (compare["valence"] - avg_mean["valence"]) ** 2 + \
    (compare["tempo"] - avg_mean["tempo"]) ** 2
)

playlists["rating"] = compare["rating"]

# Add a constant (0.2) if song is made by an artist that is in my top artists
for idx, row in playlists.iterrows():

    if artists["id"].eq(row["artist_id"]).any():
        row["rating"] += 0.2


# Normalize the ratings
hi = playlists["rating"].max()
playlists["rating"] = playlists["rating"].apply(lambda x: x / hi)

playlists.head()

Unnamed: 0,index,id,name,popularity,type,is_local,explicit,duration_ms,disc_number,track_number,...,acousticness,instrumentalness,liveness,valence,tempo,uri,track_href,analysis_url,time_signature,rating
0,2,2eJOf3EuOi7GueeHfUnkvg,Genesis 1:1,52,audio_features,False,True,180010,1,1,...,0.368,0.0,0.286,0.207,193.968,spotify:track:2eJOf3EuOi7GueeHfUnkvg,https://api.spotify.com/v1/tracks/2eJOf3EuOi7G...,https://api.spotify.com/v1/audio-analysis/2eJO...,4,0.21444
1,4,5BsigwhOMHxCkF6ntEhUQ5,how u feel?,72,audio_features,False,True,233795,1,1,...,0.000687,0.0,0.0973,0.208,132.076,spotify:track:5BsigwhOMHxCkF6ntEhUQ5,https://api.spotify.com/v1/tracks/5BsigwhOMHxC...,https://api.spotify.com/v1/audio-analysis/5Bsi...,4,0.434191
2,5,2TmqHjg7uhizGndzXQdFuf,Be Nice 2 Me,64,audio_features,False,True,127792,1,4,...,0.0732,9e-06,0.482,0.331,139.017,spotify:track:2TmqHjg7uhizGndzXQdFuf,https://api.spotify.com/v1/tracks/2TmqHjg7uhiz...,https://api.spotify.com/v1/audio-analysis/2Tmq...,4,0.251835
3,6,59U7bktlujuNRW7fBq1VDt,JETLGGD,63,audio_features,False,True,116527,1,1,...,0.000516,0.0,0.315,0.346,139.996,spotify:track:59U7bktlujuNRW7fBq1VDt,https://api.spotify.com/v1/tracks/59U7bktlujuN...,https://api.spotify.com/v1/audio-analysis/59U7...,4,0.458261
4,13,610gzNqwaSz89u6YIpDlyZ,NEVEREVER,69,audio_features,False,True,144000,1,24,...,0.0452,0.0,0.112,0.326,140.115,spotify:track:610gzNqwaSz89u6YIpDlyZ,https://api.spotify.com/v1/tracks/610gzNqwaSz8...,https://api.spotify.com/v1/audio-analysis/610g...,4,0.221075


# Final preprocessing

*Next, let's ensure consistency in the data*

In [9]:
# Create (x, y) pairs
X = playlists[[
        "popularity", "duration_ms", "energy", "acousticness", "speechiness",
        "instrumentalness", "valence", "danceability", "loudness",
        "key", "liveness", "tempo", "time_signature"
    ]]
y = playlists["rating"]
 
# Handle NA values
X = X.dropna()
recs = recs.dropna()

X_recs = recs.copy()
X_recs = X_recs[[
        "popularity", "duration_ms", "energy", "acousticness", "speechiness",
        "instrumentalness", "valence", "danceability", "loudness",
        "key", "liveness", "tempo", "time_signature"
]]

*Now, let's create the training/testing split*

In [10]:
TEST_SIZE = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = TEST_SIZE, random_state = 42)
X_train.head()

Unnamed: 0,popularity,duration_ms,energy,acousticness,speechiness,instrumentalness,valence,danceability,loudness,key,liveness,tempo,time_signature
148,75,354692,0.608,0.169,0.0321,0.916,0.18,0.576,-13.537,9,0.106,150.9,4
206,33,168182,0.229,0.816,0.0853,0.0035,0.346,0.759,-14.292,2,0.11,136.912,4
231,66,151286,0.726,0.00893,0.0655,4e-06,0.731,0.73,-4.865,2,0.255,139.938,4
327,70,262680,0.558,0.227,0.179,0.0,0.8,0.479,-6.074,11,0.0808,83.465,4
157,80,175961,0.0167,0.979,0.115,2.7e-05,0.229,0.509,-24.639,9,0.387,207.97,3


# Nearest Neighbors Approach

In [11]:
# Determine the best k to use
knn = neighbors.KNeighborsRegressor()

param_grid = {"n_neighbors": list(range(1, 10))}
gcv = GridSearchCV(knn, param_grid, cv = 5)
gcv.fit(X_train, y_train)
gcv.best_params_


{'n_neighbors': 7}

In [12]:
# Predict for recommendation data
pred = gcv.predict(X_recs)
recs["ratings"] = pred
recs.head()

Unnamed: 0,index,id,name,popularity,type,is_local,explicit,duration_ms,disc_number,track_number,...,acousticness,instrumentalness,liveness,valence,tempo,uri,track_href,analysis_url,time_signature,ratings
0,1,5m67AzS4PT5ETzdvul8JSu,LIGHTSPEED,48,audio_features,False,True,138792,1,1,...,0.489,2e-06,0.0905,0.649,76.551,spotify:track:5m67AzS4PT5ETzdvul8JSu,https://api.spotify.com/v1/tracks/5m67AzS4PT5E...,https://api.spotify.com/v1/audio-analysis/5m67...,4,0.425991
1,3,27o0xlsmaKZvyLFPZQRu99,By Myself (feat. Destroy Lonely),53,audio_features,False,True,204719,1,4,...,0.393,0.0,0.252,0.398,160.06,spotify:track:27o0xlsmaKZvyLFPZQRu99,https://api.spotify.com/v1/tracks/27o0xlsmaKZv...,https://api.spotify.com/v1/audio-analysis/27o0...,4,0.5258
2,4,7DIr9eyABIIBG3JvW5YSJw,Yes,36,audio_features,False,True,138436,1,1,...,0.00456,2e-06,0.186,0.0691,124.934,spotify:track:7DIr9eyABIIBG3JvW5YSJw,https://api.spotify.com/v1/tracks/7DIr9eyABIIB...,https://api.spotify.com/v1/audio-analysis/7DIr...,4,0.425991
3,5,7KZATGZ8t92wb4aixhN9hY,Pancake,59,audio_features,False,True,205760,1,2,...,0.204,0.0,0.109,0.67,140.006,spotify:track:7KZATGZ8t92wb4aixhN9hY,https://api.spotify.com/v1/tracks/7KZATGZ8t92w...,https://api.spotify.com/v1/audio-analysis/7KZA...,4,0.533441
4,6,4BFNUONcAch7fTSQXvGLNq,Bounce Back,59,audio_features,False,True,95478,1,1,...,0.0849,2e-06,0.134,0.363,90.478,spotify:track:4BFNUONcAch7fTSQXvGLNq,https://api.spotify.com/v1/tracks/4BFNUONcAch7...,https://api.spotify.com/v1/audio-analysis/4BFN...,4,0.538513


# Generating the playlist

In [13]:
# Rating above 0.55 leads to 31 tracks, which I think is a good amount
# Keep in mind, outliers were punished hard earlier
add = recs[recs["ratings"] > 0.55]["id"]

In [14]:
# Connecting to spotify and making the playlist

# Authorization step
CLIENT_ID = "..."
CLIENT_SECRET = "..."
REDIRECT_URL = "http://localhost:9001/callback"
SCOPE = "playlist-modify-private"

sp = spotipy.Spotify(auth_manager = SpotifyOAuth(
  client_id = CLIENT_ID,
  client_secret = CLIENT_SECRET,
  redirect_uri = REDIRECT_URL,
  scope = SCOPE 
))

# Initialize the playlist
pl = sp.user_playlist_create(
    user = "dooooooonut",
    name = "AI Recommendations",
    public = False,
    collaborative = False,
    description = "Hopefully this is good"
)

# Add the tracks
for id in add:
    sp.user_playlist_add_tracks(
        user = "dooooooonut",
        playlist_id = pl["id"],
        tracks = [id]
    )