In [1]:
# Importing libraries 

import pandas as pd
import numpy as np
import json
import re 
import sys
import itertools
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.neighbors import KDTree
from sklearn.neighbors import NearestNeighbors

import warnings
warnings.filterwarnings("ignore")

In [6]:
# Reading in and slimming down the scv file

df = pd.read_csv('app_data\spotify_track_features.csv')

#df = df.sample(n = 500000)

## Doing some EDA

In [7]:
df.shape

(1758, 18)

In [8]:
df.isnull().sum().sum()

0

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,track_name,track_id,artists,artist_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,M' Manc (con Geolier & Sfera Ebbasta),0MLu1cFYL4ikdRTdasCQT9,"['Shablo', 'Geolier', 'Sfera Ebbasta']","['6hkKbkZGvAXuvle2FhCnxy', '27LlKWxS3KXW7RRAxN...",0.703,0.704,6,-6.117,0,0.0641,0.048,0.0,0.41,0.725,139.933,180000,4
1,1,Indispensabile,6StZfpZWT9IwyfdqqBj5k2,['Davide Rossi'],['0wG1NVcTvpOjGKUnSVqetO'],0.659,0.718,1,-6.075,0,0.0372,0.089,0.0,0.134,0.61,120.018,183843,4
2,2,Scorpione,0vkFcZZXyV8gnZGuCJ442S,"['Donzell*', 'Giuggi Situazione']","['4H866sUs33uCqVx5cIimGy', '6OPbhiNiFbkC2Hy9WN...",0.631,0.643,11,-7.604,1,0.0454,0.755,0.000108,0.0906,0.285,160.029,174000,4
3,3,Nebula,4mfm0COGySDHbPKCJr9jzJ,['Dial'],['0qZWeHRw63u15y7L0u7fyB'],0.457,0.513,10,-12.164,0,0.271,0.266,0.000323,0.108,0.461,124.968,209816,4
4,4,Dreamer nos vies,39ypweDsKFtqN2iTcmtuG5,"['RCZ', 'Dreamy BOY']","['6rR1ycLMVWsVuyms0ZwVjj', '7xjNlyVktCjfAwfLd5...",0.843,0.602,5,-7.643,0,0.36,0.766,0.0,0.108,0.696,120.114,162494,4


In [10]:
df['time_signature'].value_counts()

4    1580
3     140
5      24
1      14
Name: time_signature, dtype: int64

## NLP and creating new columns

In [11]:
df['artists_upd_v1'] = df['artists'].apply(lambda x: re.findall(r"'([^']*)'", x))
df['artists_upd_v2'] = df['artists'].apply(lambda x: re.findall('\"(.*?)\"',x))
df['artists_upd'] = np.where(df['artists_upd_v1'].apply(lambda x: not x), df['artists_upd_v2'], df['artists_upd_v1'] )


In [13]:
#need to create my own song identifier because there are duplicates of the same song with different ids
df['artists_song'] = df.apply(lambda row: row['track_name'] + ' ' + 'by'+ ' ' + row['artists_upd'][0],axis = 1)

In [24]:
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,artist_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artists_song
0,0,0MLu1cFYL4ikdRTdasCQT9,"['Shablo', 'Geolier', 'Sfera Ebbasta']","['6hkKbkZGvAXuvle2FhCnxy', '27LlKWxS3KXW7RRAxN...",0.703,0.704,6,-6.117,0,0.0641,0.048,0.0,0.41,0.725,139.933,180000,4,M' Manc (con Geolier & Sfera Ebbasta) by Shablo
1,1,6StZfpZWT9IwyfdqqBj5k2,['Davide Rossi'],['0wG1NVcTvpOjGKUnSVqetO'],0.659,0.718,1,-6.075,0,0.0372,0.089,0.0,0.134,0.61,120.018,183843,4,Indispensabile by Davide Rossi
2,2,0vkFcZZXyV8gnZGuCJ442S,"['Donzell*', 'Giuggi Situazione']","['4H866sUs33uCqVx5cIimGy', '6OPbhiNiFbkC2Hy9WN...",0.631,0.643,11,-7.604,1,0.0454,0.755,0.000108,0.0906,0.285,160.029,174000,4,Scorpione by Donzell*
3,3,4mfm0COGySDHbPKCJr9jzJ,['Dial'],['0qZWeHRw63u15y7L0u7fyB'],0.457,0.513,10,-12.164,0,0.271,0.266,0.000323,0.108,0.461,124.968,209816,4,Nebula by Dial
4,4,39ypweDsKFtqN2iTcmtuG5,"['RCZ', 'Dreamy BOY']","['6rR1ycLMVWsVuyms0ZwVjj', '7xjNlyVktCjfAwfLd5...",0.843,0.602,5,-7.643,0,0.36,0.766,0.0,0.108,0.696,120.114,162494,4,Dreamer nos vies by RCZ


In [18]:
df['artists_song'][0]

"M' Manc (con Geolier & Sfera Ebbasta) by Shablo"

In [27]:
# Reduce columns to 13 to match the song features that Spotify API will return to us plus column with Song and Artist name 
cols_to_drop = [
        "Unnamed:",
        "track_id",
        "artists",
        "artist_id",
        "track_name",
        "artists_upd",
        "artists_upd_v2",
        "artists_upd_v1",
        
    ]
df = df.drop(cols_to_drop, axis=1)

KeyError: "['Unnamed:' 'track_name' 'artists_upd' 'artists_upd_v2' 'artists_upd_v1'] not found in axis"

In [23]:
df.shape

(1758, 18)

In [31]:
# DF with song features that Spotify API will return
features = df.drop(columns='artists_song')

In [32]:
features

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
879643,0.653,0.7430,8,-11.351,1,0.2080,0.55500,0.000036,0.118,0.8160,78.343,319093,4.0
384519,0.396,0.1670,7,-16.559,1,0.0436,0.94600,0.262000,0.267,0.4870,90.502,306600,4.0
185840,0.490,0.9890,0,-2.207,1,0.1130,0.05810,0.000000,0.253,0.6580,103.550,121240,4.0
804809,0.545,0.5100,0,-9.776,1,0.0269,0.70400,0.041500,0.165,0.0818,97.026,200550,4.0
901611,0.497,0.5410,0,-11.856,0,0.0592,0.73300,0.043200,0.148,0.2820,160.863,202497,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
489437,0.645,0.6380,0,-6.711,1,0.0249,0.05770,0.002710,0.176,0.5010,128.911,311231,4.0
790945,0.762,0.8620,0,-7.458,1,0.0337,0.42000,0.037500,0.180,0.9200,128.182,220880,4.0
978858,0.423,0.9900,10,-5.843,1,0.1530,0.00785,0.000375,0.104,0.2320,97.171,422880,4.0
224852,0.149,0.0378,1,-26.347,1,0.0435,0.87900,0.880000,0.104,0.0328,86.375,601560,4.0


In [33]:
# creating df with indexes and song by artist columns 
names = df[['artists_song']]

In [34]:
# Creating and fitting NearestNeighbors model
model = NearestNeighbors(n_neighbors=10, algorithm='brute')
model.fit(features)


NearestNeighbors(algorithm='brute', n_neighbors=10)

In [38]:
# indexes of recommended songs basen on features of the first song in our dataset
n_dist, n_ind = model.kneighbors(features.head(1))
ind = list(n_ind[0])

In [41]:
# 5 recommended songs
id_list = [1, 2, 3, 4, 5]
artist_song = []

for each in ind:
    artist_song.append(names.iloc[each]["artists_song"])

recommendations = list(zip(id_list, artist_song))

recommendations

[(1, 'Bailando by Los Sabrosos Del Merengue'),
 (2, 'White Boots Marching In A Yellow Land by Eric Andersen'),
 (3, 'White Boots Marching In A Yellow Land by Eric Andersen'),
 (4, 'El Soul by Sean Jones'),
 (5, 'UmbraStory by Longing For Orpheus')]

In [42]:
# Creating a csv
names.to_csv(r'song_artist.csv')

In [43]:
# Saving our trained model
pickle.dump(model, open('Spotify_model_new', 'wb'))