In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from yellowbrick.target import FeatureCorrelation
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Track-level table and the by-year table (the latter feeds the yearly trend plot).
spotify_df = pd.read_csv("data/data.csv")
year_data = pd.read_csv("data/data_by_year.csv")

# Only the last expression of a cell is rendered, so a bare `spotify_df.columns`
# in the middle of the cell is silently discarded; display it explicitly.
display(spotify_df.columns)
spotify_df.head()

In [None]:
# Ensure `year` has a numeric dtype before it is used in arithmetic.
spotify_df["year"] = spotify_df["year"].pipe(pd.to_numeric)

In [None]:
# Keep the first row of every (name, artists) pair and renumber rows 0..n-1.
# reset_index(drop=True) discards the old index directly instead of first
# materialising it as an 'index' column and then dropping that column.
spotify_df = (
    spotify_df
    .drop_duplicates(subset=["name", "artists"])
    .reset_index(drop=True)
)
spotify_df.head()

### Data Exploration

##### Music Over Time

In [None]:
def get_decade(year):
    """Return the decade label for ``year``, e.g. ``1995`` -> ``'1990s'``."""
    # int() truncates the quotient, which snaps the year to the decade start.
    decade_start = 10 * int(year / 10)
    return f'{decade_start}s'

# Label every track with the decade of its release year.
spotify_df['decade'] = spotify_df['year'].apply(get_decade)

sns.set(rc={'figure.figsize': (11, 6)})
# Name the column via x=/data= keywords: seaborn 0.11 warns on a positional
# vector and newer releases interpret the first positional argument as `data`.
sns.countplot(x='decade', data=spotify_df)

In [None]:
# Tempo plotted against release year.
sns.lineplot(data=spotify_df, x='year', y='tempo')

Check how strongly each of a selection of track features correlates with popularity.

In [None]:
# Columns whose correlation with popularity is visualised.
feature_names = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
    'duration_ms', 'explicit', 'key', 'mode', 'year',
]

# Predictors and target.
X = spotify_df[feature_names]
y = spotify_df['popularity']

# The visualizer takes its bar labels as an array.
features = np.array(feature_names)

plt.rcParams['figure.figsize'] = (10, 5)

# Build the visualizer, fit it on the data, and render the chart.
visualizer = FeatureCorrelation(labels=features)
visualizer.fit(X, y)
visualizer.show()

In [None]:
# Audio features whose values are traced over the years.
sound_features = [
    'acousticness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'valence',
]

# NOTE(review): this only changes matplotlib's default figure size; it is not
# passed to the Plotly figure below - confirm whether it is still needed.
plt.rcParams['figure.figsize'] = (8, 5)

fig = px.line(year_data, x='year', y=sound_features)
fig.show()

### Model Building

In [None]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [None]:
# Track id plus the numeric columns that feed the nearest-neighbour models.
# .copy() detaches the selection from spotify_df: this frame is modified in a
# later cell, and mutating a slice raises SettingWithCopyWarning.
df_features = spotify_df[["id", "acousticness", "danceability", "energy", "instrumentalness", "liveness", "loudness", "speechiness", "tempo", "valence", "year"]].copy()
df_features.head()

In [None]:
from sklearn.preprocessing import StandardScaler
# NOTE(review): `scaler` is not used by any live code visible in this notebook;
# loudness and tempo are rescaled by hand instead. Confirm whether it can go.
scaler = StandardScaler()

In [None]:
# Constants of the manual rescaling: loudness -> (loudness + offset) / scale,
# tempo -> tempo / scale.
# NOTE(review): presumably derived from the dataset's loudness and tempo ranges
# so that both columns land in roughly [0, 1] - confirm.
LOUDNESS_OFFSET = 60
LOUDNESS_SCALE = 63.855
TEMPO_SCALE = 244.091

# Guarded on the presence of 'id' so that re-running this cell neither raises
# a KeyError nor rescales the columns a second time.
if "id" in df_features.columns:
    # set_index/assign return new frames, which avoids inplace=True and the
    # SettingWithCopyWarning caused by writing into a slice of spotify_df.
    df_features = (
        df_features
        .set_index("id")
        .assign(
            loudness=lambda frame: (frame["loudness"] + LOUDNESS_OFFSET) / LOUDNESS_SCALE,
            tempo=lambda frame: frame["tempo"] / TEMPO_SCALE,
        )
    )

# NOTE(review): `year` is left on its raw scale while the other columns are
# small numbers, so it likely dominates neighbour distances - confirm intent.
# (StandardScaler-based scaling, `scaler.fit_transform(df_features)`, was tried
# here and left disabled.)

In [None]:
df_features.head()

In [None]:
# Two neighbour searches that differ only in the lookup algorithm, so their
# speed can be compared on the same data.
model = NearestNeighbors(n_neighbors=20, algorithm='kd_tree')
model2 = NearestNeighbors(n_neighbors=20, algorithm='brute')

We set n_neighbors = 20 as the default number of neighbours each model returns per query. The recommendation functions below pass their own `n_neighbors`, so the number of recommendations is chosen per request.

In [None]:
# Dense copy of the feature values, used to fit the models.
mat_songs = np.array(df_features.to_numpy())
# The same values in CSR form.
sparse_mat_songs = csr_matrix(df_features.to_numpy())

In [None]:
import timeit
# Time how long each algorithm takes to fit on the same feature matrix.
# NOTE(review): no other visible cell calls fit(), so the recommend functions
# appear to rely on these benchmark runs having fitted both models - confirm.
%timeit model.fit(mat_songs)
%timeit model2.fit(mat_songs)

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
from data.spotify_keys import get_credentials
# Credentials are read through the local helper rather than written in the notebook.
cid, secret = get_credentials()

# Credentials manager and the Spotify client that the lookup functions use.
client_credentials_manager = SpotifyClientCredentials(
    client_id=cid,
    client_secret=secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
def find_song(name, year):
    """Look up a track on Spotify and return its features as a one-row DataFrame.

    Parameters
    ----------
    name : str
        Track title to search for.
    year : int
        Release year used to narrow the search.

    Returns
    -------
    pandas.DataFrame or None
        A single row with ``artists`` (first listed artist), the audio-feature
        columns and ``year``. ``None`` when the search has no hits or no audio
        features are available for the matched track.
    """
    results = sp.search(q= 'track: {} year: {}'.format(name,year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]
    audio_features = sp.audio_features(track['id'])[0]
    # The audio-features endpoint reports an unavailable track as a null entry;
    # without this guard the dict merge below would raise on None.
    if audio_features is None:
        return None

    # Plain dict: the original defaultdict had no default factory, so it
    # behaved exactly like one. `name`/`year` echo the caller's input.
    song_data = {
        'name': [name],
        'year': [year],
        'artists': [track['artists'][0]['name']],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }
    # Scalar audio-feature values are broadcast against the one-element lists;
    # keys present in both (e.g. duration_ms) take the audio-feature value.
    song_data.update(audio_features)

    df_ret = pd.DataFrame(song_data)
    return df_ret[["artists", "acousticness", "danceability", "energy", "instrumentalness", "liveness", "loudness", "speechiness", "tempo", "valence", "year"]]

In [None]:
def recommend(name, year, model, number_of_recommendations):
    #query = df_features.loc[id].to_numpy().reshape(1,-1)
    query = find_song(name, year).drop('artists', axis=1).to_numpy().reshape(1,-1)
    print('Searching for recommendations, please wait...')
    print('                            Recommendations                           ')
    print('----------------------------------------------------------------------')
    distances, indices = model.kneighbors(query,n_neighbors = number_of_recommendations)
    # print(distances)
    print(indices)
    try:
        for i in indices:
            print(spotify_df[['name','artists']].loc[i].where(spotify_df['name']!=name).dropna())
        %timeit model.kneighbors(query,n_neighbors = number_of_recommendations)
    except:
        print("Some songs not found in DB, will be updated soon")
        

In [None]:
def recommend2(name, year, model, number_of_recommendations):
    """Print recommendations exactly as ``recommend`` does.

    This function was a line-for-line copy of ``recommend``; it now delegates
    so the two cannot drift apart. The name is kept for existing callers,
    which pass the brute-force model.

    Parameters are forwarded unchanged: ``name`` and ``year`` identify the seed
    song, ``model`` is the fitted neighbour search, and
    ``number_of_recommendations`` is the number of neighbours to request.
    """
    return recommend(name, year, model, number_of_recommendations)

In [None]:
def test_recs():
    """Prompt for a song and print recommendations from the kd-tree model.

    Reads the title, release year and number of recommendations from
    ``input()``, confirms the match found on Spotify, then calls ``recommend``
    with the module-level ``model``. Returns ``None``.
    """
    name = input('Enter song title: ')
    year = int(input("Enter the year the song was released"))

    song = find_song(name, year)
    if song is None:
        # find_song returns None when the search has no hits; subscripting
        # that result used to raise TypeError.
        print('No Spotify match for', name, 'released in', year)
        return

    # `artists` is a one-row column: take the scalar so the artist name is
    # printed instead of the Series repr.
    artist = song["artists"].iloc[0]
    print('The inputted song selected is ', name, 'by', artist)

    num_recs = int(input('Enter number of recommendations: '))

    recommend(name, year, model, num_recs)

In [None]:
def test_recs2():
    """Prompt for a song and print recommendations from the brute-force model.

    Reads the title, release year and number of recommendations from
    ``input()``, confirms the match found on Spotify, then calls
    ``recommend2`` with the module-level ``model2``. Returns ``None``.
    """
    name = input('Enter song title: ')
    year = int(input("Enter the year the song was released"))

    song = find_song(name, year)
    if song is None:
        # find_song returns None when the search has no hits; subscripting
        # that result used to raise TypeError.
        print('No Spotify match for', name, 'released in', year)
        return

    # `artists` is a one-row column: take the scalar so the artist name is
    # printed instead of the Series repr.
    artist = song["artists"].iloc[0]
    print('The inputted song selected is ', name, 'by', artist)

    num_recs = int(input('Enter number of recommendations: '))

    recommend2(name, year, model2, num_recs)

In [None]:
# Interactive: waits on input() for title, year and count, then prints
# recommendations from `model` (kd_tree). Blocks an unattended Run All.
test_recs()

In [None]:
# Interactive: waits on input() for title, year and count, then prints
# recommendations from `model2` (brute force). Blocks an unattended Run All.
test_recs2()