In [1]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
import copy
import seaborn as sns
import time
import heapq

sys.path.append(os.path.abspath('../util'))
from IPython.display import display,HTML,clear_output

In [2]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans 
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import joblib

In [3]:
# Project-relative folders: input data and persisted models.
data_path, model_path = Path('../data/'), Path('../model/')

In [4]:
# Load the raw track table and discard the columns that are not used as
# numeric features further down.
df_f_path = data_path / 'data.csv'
non_feature_cols = ['artists', 'id', 'name', 'release_date', 'year', 'explicit']
df_f = pd.read_csv(df_f_path).drop(columns=non_feature_cols)

In [5]:
# Column-wise extremes of the raw features; they are the bounds used later to
# min-max scale the features of a queried artist.
df_min, df_max = df_f.min(), df_f.max()

In [6]:
# Inspect the per-column minimum and maximum side by side.
(df_min, df_max)

(acousticness           0.0
 danceability           0.0
 duration_ms         5108.0
 energy                 0.0
 instrumentalness       0.0
 key                    0.0
 liveness               0.0
 loudness             -60.0
 mode                   0.0
 popularity             0.0
 speechiness            0.0
 tempo                  0.0
 valence                0.0
 dtype: float64,
 acousticness              0.996
 danceability              0.988
 duration_ms         5403500.000
 energy                    1.000
 instrumentalness          1.000
 key                      11.000
 liveness                  1.000
 loudness                  3.855
 mode                      1.000
 popularity              100.000
 speechiness               0.969
 tempo                   244.091
 valence                   1.000
 dtype: float64)

In [7]:
# Load the cleaned by-artist table, then strip leftover "Unnamed..." index
# columns and the 'explicit' column before previewing the first rows.
path = '../data/df_cleaned__by_artist.csv'
df = pd.read_csv(path)
unnamed_mask = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~unnamed_mask].drop(columns=['explicit'])
df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,artists
0,0.998996,0.716599,0.028442,0.195,0.563,0.909091,0.151,0.745,1.0,0.0,0.052219,0.485348,0.779,Carl Woitschach
1,0.997992,0.383603,0.051316,0.0135,0.901,0.727273,0.0763,0.494026,1.0,0.0,0.047678,0.344019,0.0767,Robert Schumann
2,0.997992,0.383603,0.051316,0.0135,0.901,0.727273,0.0763,0.494026,1.0,0.0,0.047678,0.344019,0.0767,Vladimir Horowitz
3,0.606426,0.758097,0.018374,0.22,0.0,0.454545,0.119,0.627609,0.0,0.0,0.95872,0.439086,0.88,Seweryn Goszczyński
4,0.998996,0.790486,0.032538,0.13,0.887,0.090909,0.111,0.708887,0.0,0.0,0.095562,0.44247,0.72,Francisco Canaro


# Data Cleaning

Features Selection
Looking closely at the features, there are some features to consider whether to include in the final model or not:

id: The id is unique for each track, therefore it cannot assist a model and will be dropped.

name: There are 132,940 unique values. In our opinion, this is a problematic categorical feature to insert in a model, and will be dropped.

artists: There are 33,375 unique values. Since we anticipate that the artist will influence the popularity of the track, this feature will remain and be handled separately.

release_date: The 'year' feature is derived from the 'release_date' feature, therefore one of them should be excluded. Since release_date contains some rows with a full date and others with the year only, release_date will be dropped.

In [8]:
# Remove exact duplicate rows (the first occurrence of each is kept) and
# report how many were found.
dup_mask = df.duplicated()
print(dup_mask.sum(), 'duplicated data in the current data frame')
df = df.loc[~dup_mask]
df.shape

1971 duplicated data in the current data frame


(224842, 14)

# Preprocessing

In [9]:
# Feature matrix: every column except the artist label, as an independent copy
# so later edits to X cannot touch df.
X = df.drop(columns=['artists']).copy(deep=True)
# maxvalues = np.max(X.to_numpy(),axis=0)
X.shape

(224842, 13)

In [10]:
# Artist labels as a plain array, in the same row order as X (both come from df).
arti_names = df['artists'].to_numpy()

In [11]:
# song_features = pd.DataFrame()
# # normalizer instance
# scaler = MinMaxScaler()
# for col in X.columns:      # excluding year col i.e, of int64 type
#     scaler.fit(X[[col]])
#     song_features[col] = scaler.transform(X[col].values.reshape(-1,1)).ravel() 

In [12]:
# featuremap = song_features.to_numpy()

In [13]:
# featuremap.shape

# Acquiring features based on a single name

In [14]:
from spotify_interface import *

In [15]:
# Read the Spotify client id, client secret and OAuth token from a local file.
# NOTE(review): `pkl` is not imported in any visible cell — presumably it is the
# pickle module leaking in through `from spotify_interface import *`; confirm and
# import it explicitly.
# NOTE(review): if `pkl` is pickle, loading can execute arbitrary code, so this
# file must come from a trusted source only.
with open('../util/spotify_client.txt', 'rb') as f:
    (client_id, client_secret, oauth_token) = pkl.load(f)

In [16]:
# Settings passed to get_spotify_token() in the next cell.
scope, username = "user-library-read", "Arth"
redirect_url = "http://127.0.0.1:9090"

In [17]:
# Obtain the Spotify client handle used by every API helper below.
sp = get_spotify_token(
    username,
    scope,
    redirect_url,
    client_id,
    client_secret,
)

Token acquired for:  Arth


In [18]:
# Names of the features requested for an artist via get_features().
# Note: 'explicit' is requested here although that column is dropped from both
# data frames loaded above.
features = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'explicit', 'instrumentalness', 'key', 'liveness',
    'loudness', 'mode', 'popularity', 'speechiness',
    'tempo', 'valence',
]

In [19]:
# Sanity check: number of feature names in the list.
len(features)

14

In [20]:
# Query settings: the artist to look up, and the value passed as `lim` to
# get_recommendations() and used as the number of tracks displayed at the end.
name, lim = "Kanye West", 5

In [21]:
# Display the client object to confirm it was created.
sp

<spotipy.client.Spotify at 0x7f76e69c9f50>

In [22]:
# Resolve the artist name through the Spotify helper, then fetch the requested
# features for the first entry of the result.
artist = get_artist([name], sp)
first_match = artist[0]
artist_dict = get_features(first_match, sp, features)

In [23]:
# Wrap the feature dict in a Series so it aligns by label with df_min / df_max.
artist_series = pd.Series(data=artist_dict)

In [24]:
# Min-max scale the queried artist's features with the bounds of the raw data.
# `features` also contains 'explicit', which df_min / df_max do not have; without
# alignment that label became a silent NaN entry in the scaled vector. Restrict
# the vector to the labels that actually have bounds before scaling.
artist_aligned = artist_series.reindex(df_min.index)
artist_scaled = (artist_aligned - df_min) / (df_max - df_min)

In [25]:
# Squared Euclidean distance between every row of X and the scaled query vector
# (smaller value = more similar).
squared_diff = (X - artist_scaled) ** 2
rank = squared_diff.sum(axis=1)

In [26]:
# Order all rows by their distance to the query, nearest first.
# The previous np.argpartition(rank, 6) only guaranteed that the six smallest
# distances came first, in arbitrary order, so slicing the result with [:5] or
# [:2] did not necessarily return the closest artists. A full argsort keeps
# `idx` a complete permutation of row positions (as before) but makes every
# prefix of it the true nearest neighbours in order.
distances = np.asarray(rank, dtype=float)
idx = np.argsort(distances, kind='stable')
arti_names[idx[:5]]

array(['Nick Jonas', ' Tinashe', 'blackbear', 'Kanye West',
       ' Chelsea Cutler'], dtype=object)

In [27]:
# First two artists in the `idx` ordering — the same selection that is passed
# to the recommender in the next cell.
arti_names[idx][:2]

array(['Nick Jonas', ' Tinashe'], dtype=object)

In [28]:
# Ask the Spotify helper for recommendations, passing the first two artists in
# the `idx` ordering.
seed_artists = arti_names[idx][:2]
rec = get_recommendations(seed_artists, sp, lim=lim)

In [29]:
from html import escape  # stdlib; imported here so the cell is self-contained

# Render one card per recommended track: artist, album, cover art and an audio
# preview. Only rec[0] is shown (presumably the result for the first seed
# artist — confirm against get_recommendations).
#
# Changes from the earlier version:
# * iterate over a slice instead of range(lim), so receiving fewer than `lim`
#   tracks no longer raises IndexError;
# * escape API-provided text and URLs before interpolating them into HTML;
# * show a placeholder instead of a broken player when a track has no
#   preview_url (assumed to be None/empty in that case — TODO confirm).
display(HTML("<hr></hr>"))
for track in rec[0]['tracks'][:lim]:
    artist_name = escape(track['artists'][0]['name'])
    album_name = escape(track['album']['name'])
    cover_url = escape(track['album']['images'][1]['url'])
    preview_url = track['preview_url']
    if preview_url:
        audio_html = f"<audio controls><source src='{escape(preview_url)}' type='audio/ogg'> </audio>"
    else:
        audio_html = "<i>No preview available</i>"
    display(HTML(f"<h1> <b>Artist:</b> {artist_name}\t </h1> <h2> <b>Album:</b> {album_name} </h2> <br> <img src='{cover_url}'> <br> {audio_html}"))
    display(HTML("<hr></hr>"))