In [56]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
import copy
import seaborn as sns
import time
import heapq

sys.path.append(os.path.abspath('../util'))


In [14]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans 
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import joblib

In [15]:
data_path = Path('../data/')
model_path = Path('../model/')

In [21]:
df_f_path= data_path / Path('data.csv')
df_f = pd.read_csv(df_f_path)
df_f = df_f.drop(['artists', 'id', 'name', 'release_date', 'year', 'explicit'], axis=1)

In [22]:
df_min = df_f.min()
df_max = df_f.max()

In [23]:
path = '../data/df_cleaned__by_artist.csv'
df = pd.read_csv(path)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,artists
0,0.998996,0.716599,0.028442,0.195,0.0,0.563,0.909091,0.151,0.745,1.0,0.0,0.052219,0.485348,0.779,Carl Woitschach
1,0.997992,0.383603,0.051316,0.0135,0.0,0.901,0.727273,0.0763,0.494026,1.0,0.0,0.047678,0.344019,0.0767,Robert Schumann
2,0.997992,0.383603,0.051316,0.0135,0.0,0.901,0.727273,0.0763,0.494026,1.0,0.0,0.047678,0.344019,0.0767,Vladimir Horowitz
3,0.606426,0.758097,0.018374,0.22,0.0,0.0,0.454545,0.119,0.627609,0.0,0.0,0.95872,0.439086,0.88,Seweryn Goszczyński
4,0.998996,0.790486,0.032538,0.13,0.0,0.887,0.090909,0.111,0.708887,0.0,0.0,0.095562,0.44247,0.72,Francisco Canaro


# Data Cleaning

Features Selection
Looking closely at the features, there are some features to consider whether to include in the final model or not:

id: id is unique for each track, therfore cannot assist a model and will be dropped.

name: There are 132,940 unique values. In our opinion, this is a problematic categorical feature to insert in a model, and will be dropped.

artists: there are 33,375 unique features. Since we enticipate the artist will influence the popularity of the track, this feature will remain and be handled seperatley.

release_date: The 'year' feature is a derivative to the 'release_date' feature, therfore one of them should be excluded. Since the release date contains rows with full date and others with year only, the release_date will be dropped.

In [24]:
# df = df.drop(labels=['id', 'name', 'release_date','year','explicit'], axis=1)
df = df.drop(labels=['explicit'], axis=1)
df.shape

(226813, 14)

In [25]:
df

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,artists
0,0.998996,0.716599,0.028442,0.1950,0.56300,0.909091,0.1510,0.745000,1.0,0.00,0.052219,0.485348,0.7790,Carl Woitschach
1,0.997992,0.383603,0.051316,0.0135,0.90100,0.727273,0.0763,0.494026,1.0,0.00,0.047678,0.344019,0.0767,Robert Schumann
2,0.997992,0.383603,0.051316,0.0135,0.90100,0.727273,0.0763,0.494026,1.0,0.00,0.047678,0.344019,0.0767,Vladimir Horowitz
3,0.606426,0.758097,0.018374,0.2200,0.00000,0.454545,0.1190,0.627609,0.0,0.00,0.958720,0.439086,0.8800,Seweryn Goszczyński
4,0.998996,0.790486,0.032538,0.1300,0.88700,0.090909,0.1110,0.708887,0.0,0.00,0.095562,0.442470,0.7200,Francisco Canaro
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226808,0.540161,0.520243,0.032527,0.5390,0.00233,0.636364,0.1080,0.793485,1.0,0.70,0.108359,0.506778,0.1530,Kygo
226809,0.540161,0.520243,0.032527,0.5390,0.00233,0.636364,0.1080,0.793485,1.0,0.70,0.108359,0.506778,0.1530,Oh Wonder
226810,0.071687,0.653846,0.030046,0.7610,0.00000,0.090909,0.2220,0.899585,1.0,0.70,0.039732,0.532244,0.4720,Cash Cash
226811,0.071687,0.653846,0.030046,0.7610,0.00000,0.090909,0.2220,0.899585,1.0,0.70,0.039732,0.532244,0.4720,Andy Grammer


In [26]:
# Cleaning duplicated data
print(df.duplicated().sum(),'duplicated data in the current data frame')
df = df[~df.duplicated()==1]
df.shape

1971 duplicated data in the current data frame


(224842, 14)

# Preprocessing

In [43]:
X = copy.deepcopy(df.drop(labels = ['artists'],axis = 1))
# maxvalues = np.max(X.to_numpy(),axis=0)

In [47]:
arti_names = df.artists.to_numpy()

In [27]:
# song_features = pd.DataFrame()
# # normalizer instance
# scaler = MinMaxScaler()
# for col in X.columns:      # excluding year col i.e, of int64 type
#     scaler.fit(X[[col]])
#     song_features[col] = scaler.transform(X[col].values.reshape(-1,1)).ravel() 

In [45]:
# featuremap = song_features.to_numpy()

In [28]:
# featuremap.shape

# Acquiring features based on a single name

In [57]:
from spotify_interface import *

In [58]:
with open('../util/spotify_client.txt', 'rb') as f:
    (client_id, client_secret, oauth_token) = pkl.load(f)

In [59]:
scope = "user-library-read"
username = "Arth"
redirect_url = 'http://127.0.0.1:9090'

In [60]:
sp = get_spotify_token(username, scope, redirect_url, client_id,client_secret)

Couldn't read cache at: .cache-Arth
Couldn't read cache at: .cache-Arth


Token acquired for:  Arth


In [62]:
features = ['acousticness',
 'danceability',
 'duration_ms',
 'energy',
 'explicit',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'mode',
 'popularity',
 'speechiness',
 'tempo',
 'valence']

In [78]:
name = "Cardi-B"

In [79]:
artist = get_artist([name], sp)
artist_dict = get_features(artist[0], sp, features)

In [80]:
artist_series = pd.Series(artist_dict)

In [81]:
artist_scaled = (artist_series-df_min) / (df_max - df_min)

In [82]:
rank = np.sum((X - artist_scaled)**2,axis=1)

In [83]:
idx = np.argpartition(rank, 6)
arti_names [idx][:15]

array(['6ix9ine', 'Lil Baby', ' Murda Beatz', ' Nicki Minaj', ' Gunna',
       'Future', 'Gunna', 'Eminem', ' Travis Scott', ' Quavo', 'Drake',
       'Lil Baby', 'Kanye West', 'Cardi B', ' 42 Dugg'], dtype=object)