### Load Libraries

In [56]:
import pandas as pd
import numpy as np
import sys
import sqlite3

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KDTree
from sklearn.neighbors import NearestNeighbors

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util

from dotenv import load_dotenv
import os

### Load databased

In [3]:
df = pd.read_csv('data/df_prepare.csv')

In [5]:
print(df.shape)
df.head()

(180223, 17)


Unnamed: 0,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,2,0.346,-1.828,1,0.0525,166.969,4,0.814
1,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,5,0.151,-5.559,0,0.0868,174.003,4,0.816
2,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,2,0.103,-13.879,0,0.0362,99.488,5,0.368
3,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,2,0.0985,-12.178,1,0.0395,171.758,4,0.227
4,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,5,0.202,-21.15,1,0.0456,140.576,4,0.39


### Look at the 5 nearest songs of all dataset

In [20]:
"""
Make a Function to find nearest song to existing database.
"""
def find_nearest_songs(df, number_of_songs):
    # use number of desired songs
    songs = 5
    # remove categoricals
    df_numerics =  df.drop(columns=['track_id', 'track_name', 'artist_name'])
    
    # Scale Data To Cluster More Accurately, and fit clustering model
    df_scaled = StandardScaler().fit_transform(df_numerics)
    df_modeled = KDTree(df_scaled)
    
    # Querying the model for the 5 Nearest Neighbors
    dist, ind = df_modeled.query(df_scaled, k=(songs+1)) # can make a bigger or smaller number
    
    # Putting the Results into a Dataframe
    dist_df = pd.DataFrame(dist)
    
    # Calculating the Distances
    scores = (1 - ((dist - dist.min()) / (dist.max() - dist.min()))) * 100
    
    # Creating A New Dataframe for the Distances
    columns = ['Searched_Song', 'Nearest_Song1', 'Nearest_Song2', 'Nearest_Song3', 'Nearest_Song4',
               'Nearest_Song5']
    
    dist_score = pd.DataFrame(scores.tolist(), columns = columns)
    
    # An Array of all indices of the nearest neighbors
    ind[:(songs+1)]
    
    # Making an array of the Track IDs
    song_ids = np.array(df.track_id)
    
    # A function that creates list of the each song with its nearest neighbors
    def find_similars(song_ids, ind):
        similars = []
        for row in ind:
            ids = [song_ids[i] for i in row]
            similars.append(ids)

        return similars 
    
    # using the above function
    nearest_neighbors = find_similars(song_ids, ind)
    
    # putting the results into a dataframe
    nearest_neighbors_df = pd.DataFrame(nearest_neighbors, columns=columns)
    
    return nearest_neighbors_df

In [21]:
# this takes a while to process
predicted = find_nearest_songs(df, 5)

In [26]:
predicted.head()

Unnamed: 0,Searched_Song,Nearest_Song1,Nearest_Song2,Nearest_Song3,Nearest_Song4,Nearest_Song5
0,0BRjO6ga9RKCKjfDqeFgWV,4qYvjzDgJQF7KphwFAlKlZ,4TCwm3oy4zIMFtBZguvF7J,3lEipzxuebyFlWe0OIbuR0,2VUFGKhLhfvqfWBgybcAnw,629Zo1tJvHVOFi8zvcmNqG
1,0BjC1NfoEOOusryehmNudP,1ntTfPXSUtzgPTSkLzxOFj,2EuLA7E5JCDxIpzdetGNik,341VzaGw3ldJwt7772pCwc,1rAFp6zSKI0sOnaEyzA4iY,1e5iT6ObKpbYb60lcrU6Lm
2,0CoSDzoNIKCRs124s9uTVy,4AKtZbuhxQ619802YUp302,3tILEcYwnSGGfhxgM2ShjV,3fxFMiYZumjDSwbPNUzjEn,3KNFjZZ2VjCA2YNYpkKZp0,2E7PbddVqiEuv1wEpFDjeh
3,0Gc6TVm52BwZD07Ki6tIvf,2Amnak31JcaMHfctAcQ80q,3gLXQYeRyFfsuvdCf1S1kw,0mA0xdshrfu1Tjq9dBjcjH,4UlpQlYSKB8zxpSm8XgSZW,3gIdeoFGQ5DRC8rPTXHBU5
4,0IuslXpMROHdEPvSl1fTQK,1XCd9bMw7DDpQCEVzCXpHF,6dyU17xMUusa3osHJTiipU,3EUjJmrfhUyDKC6PxAoXcT,2qcCtLs9NjDMElmkB0xQ0A,0bz6RRExXd1l6kCHGXMuBn


#### Save predicted dataset for later use

### save CSV

In [27]:
# save to csv
predicted.to_csv('predicted.csv', index_label=False)

### save JSON

In [50]:
# save to json
def save_data_frame_as_json(df=None, filename=None, orient="records"):
    """
    Saves data frame to JSON format
    Parameters
    ----------
    df: Pandas DataFrame
    filename: File path or name
    Returns
    -------
    JSON file
    """
    try:
        if not filename.endswith('.json'):
            filename += '.json'
        df.to_json(filename, orient=orient)
        print(f"Data Frame saved @:{filename}")
    except Exception as e:
        print("Data Frame couldn't be saved: ", sys.exc_info()[0])
        raise

In [51]:
save_data_frame_as_json(predicted, 'data/predicted_df.json', orient="records")

Data Frame saved @:data/predicted_df.json


In [53]:
# check the successful data record
json_df = pd.read_json('data/predicted_df.json')
json_df.head()

Unnamed: 0,Searched_Song,Nearest_Song1,Nearest_Song2,Nearest_Song3,Nearest_Song4,Nearest_Song5
0,0BRjO6ga9RKCKjfDqeFgWV,4qYvjzDgJQF7KphwFAlKlZ,4TCwm3oy4zIMFtBZguvF7J,3lEipzxuebyFlWe0OIbuR0,2VUFGKhLhfvqfWBgybcAnw,629Zo1tJvHVOFi8zvcmNqG
1,0BjC1NfoEOOusryehmNudP,1ntTfPXSUtzgPTSkLzxOFj,2EuLA7E5JCDxIpzdetGNik,341VzaGw3ldJwt7772pCwc,1rAFp6zSKI0sOnaEyzA4iY,1e5iT6ObKpbYb60lcrU6Lm
2,0CoSDzoNIKCRs124s9uTVy,4AKtZbuhxQ619802YUp302,3tILEcYwnSGGfhxgM2ShjV,3fxFMiYZumjDSwbPNUzjEn,3KNFjZZ2VjCA2YNYpkKZp0,2E7PbddVqiEuv1wEpFDjeh
3,0Gc6TVm52BwZD07Ki6tIvf,2Amnak31JcaMHfctAcQ80q,3gLXQYeRyFfsuvdCf1S1kw,0mA0xdshrfu1Tjq9dBjcjH,4UlpQlYSKB8zxpSm8XgSZW,3gIdeoFGQ5DRC8rPTXHBU5
4,0IuslXpMROHdEPvSl1fTQK,1XCd9bMw7DDpQCEVzCXpHF,6dyU17xMUusa3osHJTiipU,3EUjJmrfhUyDKC6PxAoXcT,2qcCtLs9NjDMElmkB0xQ0A,0bz6RRExXd1l6kCHGXMuBn


### save SQL db

In [57]:
# Prepare for the df to sql transfer
connection = sqlite3.connect('data/predicted_db.sqlite3')
curs = connection.cursor()
table_name = 'recommendations'

In [58]:
json_df.to_sql(table_name, con=connection, if_exists = 'replace')

In [62]:
# check the successful data record
curs.execute(f"SELECT count(distinct Searched_Song) as review_count FROM {table_name};")
results = curs.fetchone()
print(results, "RECORDS")

(180135,) RECORDS


## Function for finding similar songs using Spotify API

In [31]:
load_dotenv()

True

### Load spotipy credentials

In [32]:
client_credentials_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### Run the function

In [85]:
# Function for finding similar songs using Spotify API

def dj_rec(track_id, max_distance=6.5, neighbors=3):
    """
    Prints the ids of relevant songs, along with their distance from the input song.

    Parameters:
    track_id (string): Spotify track id.
    max_distance (float): maximum euclidean distance a song can be 
                          from the input song for it to be returned.
    neighbors (int): number of song recommendations returned.
    """

    rel_artists = sp.artist_related_artists(sp.track(track_id=track_id)['artists'][0]['id'])['artists']
    artist_log = []
    for a in rel_artists:
        artist_log.append(a['id'])
    feat_log = []
    for artist in artist_log:
        for track in sp.artist_top_tracks(artist)['tracks']:
            feat_log.append(sp.audio_features(track['id'])[0])
    
    catalog = pd.DataFrame.from_dict(feat_log)
    
    root = pd.DataFrame.from_dict(sp.audio_features(tracks=[track_id]))

    merged_df = root.append(catalog, ignore_index=True)
    
    dropped_df = merged_df.drop(columns=['uri', 'track_href', 'id', 'duration_ms', 'time_signature', 'mode', 'loudness', 'type', 'analysis_url'])
    scaled_df = StandardScaler().fit_transform(dropped_df)
    trans_array = scaled_df.copy()
    trans_array[:,0] = [u*2.4 for u in trans_array[:,0]] # acousticness
    trans_array[:,1] = [((u*u)**0.5)*u for u in trans_array[:,1]] # danceability
    trans_array[:,2] = [u*1.7 for u in trans_array[:,2]] # energy
    trans_array[:,3] = [u*1.4 for u in trans_array[:,3]] # instrumentalness
    trans_array[:,4] = [u*0.9 for u in trans_array[:,4]] # key
    trans_array[:,5] = [u*1.0 for u in trans_array[:,5]] # liveness
    trans_array[:,6] = [u*1.0 for u in trans_array[:,6]] # speechiness
    trans_array[:,7] = [u*1.1 for u in trans_array[:,7]] # tempo
    trans_array[:,8] = [u*2.5 for u in trans_array[:,8]] # valence

    knn = NearestNeighbors()
    knn.fit(trans_array)

    rec = knn.kneighbors(trans_array[[0]], n_neighbors=neighbors+1)

    print('Seed')
    print('ID:     ', root.loc[0,'id'], '\n')
    print('Energy: ', root.loc[0, 'energy']) # add/change/remove
    print('Danceability: ', root.loc[0, 'danceability']) # ? add/change/remove
    print('\nRecommendations')

    for n in range(1,neighbors+1):
        if rec[0][0][n] <= max_distance:
            print('ID:      ', merged_df.loc[rec[1][0][n],'id'])
            print('Distance:', rec[0][0][n], '\n')
            print('Energy:', merged_df.loc[rec[1][0][n],'energy']) # add/change/remove
            print('Danceability:', merged_df.loc[rec[1][0][n],'danceability']) # add/remove/change
            print('\n')
            
    if rec[0][0][1] > max_distance:
        print('No matches in catalog')

In [86]:
# Example

dj_rec('0Gc6TVm52BwZD07Ki6tIvf', max_distance=5.0, neighbors=3)

Seed
ID:      0Gc6TVm52BwZD07Ki6tIvf 

Energy:  0.326
Danceability:  0.24

Recommendations
ID:       758VqyHhAhzX6vmQ8h0exw
Distance: 3.486360493216998 

Energy: 0.303
Danceability: 0.273


ID:       0PnSFrAr48J5cF6Sf0ZkQa
Distance: 3.6727053102290093 

Energy: 0.186
Danceability: 0.209


ID:       4yKTDPH6iRBHmA44AipmIk
Distance: 3.6871322332566416 

Energy: 0.0743
Danceability: 0.353




In [87]:
# Find artist name from track id

sp.track('0Gc6TVm52BwZD07Ki6tIvf')['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/0TQUgpZqEnfluYEfKQBYB6'},
  'href': 'https://api.spotify.com/v1/artists/0TQUgpZqEnfluYEfKQBYB6',
  'id': '0TQUgpZqEnfluYEfKQBYB6',
  'name': 'Henri Salvador',
  'type': 'artist',
  'uri': 'spotify:artist:0TQUgpZqEnfluYEfKQBYB6'}]

In [88]:
# Find song name from track id

sp.track('0Gc6TVm52BwZD07Ki6tIvf')['name']

'Dis-moi monsieur Gordon Cooper'