In [None]:
# import libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
path_name = "filtered_data.csv"

# drop_duplicates() returns a new frame rather than modifying in place, so the
# result has to be assigned back for the de-duplication to take effect.
# The index is reset so rows remain addressable by position 0..n-1.
data = pd.read_csv(path_name).drop_duplicates().reset_index(drop=True)

# Preview only the first rows instead of rendering the whole frame.
data.head()

In [None]:
# Summarize columns, dtypes and non-null counts of the freshly loaded frame.
data.info()

In [None]:
# Cast playlist_pid, key and mode to the pandas 'category' dtype, one column
# at a time, writing each converted column back into `data`.
for column_name in ('playlist_pid', 'key', 'mode'):
    data[column_name] = data[column_name].astype('category')

In [None]:
# Re-inspect the dtypes now that playlist_pid, key and mode have been cast to category.
data.info()

In [None]:
# Model inputs: the audio-feature columns only.
features_df = data[[
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms',
]]

# Model target: playlist_pid, kept as a single-column DataFrame.
target_df = data['playlist_pid'].to_frame()

In [None]:
# Display the feature-only frame as this cell's output.
features_df

# FEATURE SELECTION

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# One-hot encode the two categorical columns so the tree ensemble receives a
# purely numeric feature matrix.
X = features_df
X = pd.get_dummies(X, columns=['key', 'mode'])

y = target_df   # target column, i.e. the playlist (playlist_pid) of each track

# ExtraTrees is randomized; a fixed seed makes the importances reproducible
# across "Restart & Run All".
model = ExtraTreesClassifier(random_state=7)

# `y` is a single-column DataFrame; flatten it to 1-D, the shape scikit-learn
# expects for a single target (a column vector triggers DataConversionWarning).
model.fit(X, y.to_numpy().ravel())
print(model.feature_importances_)  # impurity-based importances, one per column of X

# Horizontal bar chart of the importances, smallest at the bottom.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
ax = feat_importances.sort_values().plot(kind='barh')
ax.set(title='ExtraTrees feature importances', xlabel='importance')
plt.show()

In [None]:
# Append the playlist_pid target column to the one-hot encoded features
# (index-aligned join) so both can be inspected in a single frame.
features_with_target = X.join(y)

In [None]:
# Pairwise correlations between all columns, rendered as an annotated
# heatmap on a wide (20 x 10) canvas.
corr_values = features_with_target.corr()
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr_values, annot=True, fmt='.2f', cmap="Blues", ax=ax)

In [None]:
# Columns carried forward into model training.
# NOTE(review): 'key' is not in this list although it is part of features_df;
# confirm that dropping it is intentional.
X_features = [
    'duration_ms', 'tempo', 'valence', 'liveness', 'instrumentalness',
    'acousticness', 'speechiness', 'loudness', 'energy', 'danceability',
    'mode',
]

X = data.loc[:, X_features]

# Training the Model

`key` and `mode` are categorical and need to be encoded.

All other features are numeric and need to be normalized.

In [None]:
# Hold out 20% of the rows as the test set. The split is seeded and
# stratified on the target so it is repeatable and class-balanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=7,
    stratify=y,
)

# One-hot encode the categorical columns of the training split.
X_train = pd.get_dummies(data=X_train)

In [None]:
# Pull out the numeric columns of the training split and learn their
# mean / standard deviation for standardization.
numericals = X_train[[
    'duration_ms', 'tempo', 'valence', 'liveness', 'instrumentalness',
    'acousticness', 'speechiness', 'loudness', 'energy', 'danceability',
]]
scaler = StandardScaler().fit(numericals)

# NOTE(review): X_train is replaced by the scaled numeric columns only, so the
# one-hot 'mode' columns created by get_dummies are not part of the training
# matrix. Confirm whether that is intended.
X_train = scaler.transform(numericals)

## Hyperparameter Tuning

In [None]:
import warnings

# Suppress every warning from this point on (process-wide filter).
warnings.filterwarnings('ignore')

# K-Nearest Neighbors search grid: k from 20 to 39 and both weighting schemes.
# Further knobs that could be added to the grid but are currently left out:
# algorithm, leaf_size, p, metric.
param = dict(
    n_neighbors=np.arange(20, 40),
    weights=('uniform', 'distance'),
)

# Base estimator with default settings; n_neighbors and weights are supplied
# by the grid during the 5-fold cross-validated search.
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(estimator=knn, param_grid=param, cv=5, verbose=3)
knn_grid.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination and its mean cross-validated score,
# printed on separate lines.
print(knn_grid.best_params_, knn_grid.best_score_, sep="\n")

In [None]:
# Regular training
# knn = KNeighborsClassifier(n_neighbors = 5) #initialize the classifier and set the number of neighbors
# knn.fit(X_train, y_train)

In [None]:
# One-hot encode the categorical columns of the test split.
X_test = pd.get_dummies(data=X_test)

# Standardize the numeric test columns with the already-fitted `scaler`
# (transform only, no re-fit on test data).
numericals_test = X_test[[
    'duration_ms', 'tempo', 'valence', 'liveness', 'instrumentalness',
    'acousticness', 'speechiness', 'loudness', 'energy', 'danceability',
]]
X_test = scaler.transform(numericals_test)

**Predictions and Accuracy**

In [None]:
# Predict all test rows in one batched call. predict() accepts the full 2-D
# matrix and returns a 1-D array of labels, so there is no need to loop over
# rows and wrap each one in a transposed DataFrame.
y_pred = knn_grid.predict(X_test)

In [None]:
# Fraction of test rows whose predicted playlist matches the true one.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Numeric feature columns for every track in the dataset.
numericals_all = X[['duration_ms', 'tempo', 'valence', 'liveness', 'instrumentalness', 'acousticness', 'speechiness', 'loudness', 'energy', 'danceability']]

# Reuse the scaler that was fitted on the training split. The model was
# trained on features standardized with those statistics, so re-fitting a new
# scaler here would feed it inputs on a different scale, and would also
# overwrite `scaler` for any cell that is re-run afterwards.
X_all = scaler.transform(numericals_all)

In [None]:
# Track URI of every row of `data`, in row order.
track_uris = data['track_uri']

In [None]:
# Sorted, de-duplicated playlist ids from the target, as a plain Python list.
playlist_ids = np.unique(y.to_numpy()).tolist()

In [None]:
# results[playlist_id][track_uri] -> predicted probability that the track
# belongs to that playlist.
results = {playlist_id: {} for playlist_id in playlist_ids}

# Score every track in one batched call instead of one predict_proba call per row.
all_probas = knn_grid.predict_proba(X_all)

# Plain array of URIs so tracks are matched to probability rows by position,
# independent of the Series index labels.
uris = track_uris.to_numpy()

# Column j of predict_proba corresponds to knn_grid.classes_[j], so the columns
# are mapped through classes_ rather than assumed to line up with playlist_ids.
for column, playlist_id in enumerate(knn_grid.classes_.tolist()):
    # If a URI occurs more than once, the last occurrence wins.
    results[playlist_id] = dict(zip(uris, all_probas[:, column].tolist()))

In [None]:
def get_best_song(playlist_id):
    """Return name and artist of the highest-scoring track for a playlist.

    Looks up the per-track scores stored in the module-level ``results``
    mapping for ``playlist_id``, picks the track URI with the largest score
    (the first one encountered on ties) and returns the matching rows of the
    module-level ``data`` frame.

    Parameters
    ----------
    playlist_id
        Key into ``results``.

    Returns
    -------
    pandas.DataFrame
        Columns ``track_name`` and ``artist_name`` for every row of ``data``
        whose ``track_uri`` equals the best URI, re-indexed from 0.

    Raises
    ------
    KeyError
        If ``playlist_id`` is not a key of ``results``.
    ValueError
        If the score table for ``playlist_id`` is empty.
    """
    scores = results[playlist_id]
    top_uri = max(scores, key=scores.get)

    is_top_track = data['track_uri'] == top_uri
    matches = data.loc[is_top_track, ['track_name', 'artist_name']]
    return matches.reset_index(drop=True)

In [None]:
# Example lookup: highest-scoring track for playlist 115006.
get_best_song(115006)

In [None]:
# Tabulate the scores: one column per playlist id, one row per track URI.
pd.DataFrame(results)