In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Read the CSV file
df = pd.read_csv('../songdata.csv')

features = df.drop(columns=[col for col in df.columns if col.startswith('label_')])
features = features.drop(columns=['track_id', 'track_name', 'track_artist'])

labels_main_only = df[[col for col in df.columns if col.startswith('label_')]]
labels_main_only = labels_main_only.idxmax(axis=1).apply(lambda x: x.replace('label_', '').split('_')[0])

labels_main_and_sub = df[[col for col in df.columns if col.startswith('label_')]]
labels_main_and_sub = labels_main_and_sub.idxmax(axis=1).apply(lambda x: x.replace('label_', ''))



First, attempting a basic KNN model with only main genre labels, at K = 5:

In [13]:
X_train, X_test, y_train, y_test = train_test_split(features, labels_main_only, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='euclidean')
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("\nClassification Report for each label:\n", classification_report(y_test, y_pred))

Accuracy Score:  0.26907263590680675

Classification Report for each label:
               precision    recall  f1-score   support

         edm       0.31      0.49      0.38      1218
       latin       0.22      0.24      0.23      1033
         pop       0.23      0.22      0.22      1081
         r&b       0.24      0.22      0.23      1031
         rap       0.29      0.20      0.24      1168
        rock       0.30      0.22      0.25      1036

    accuracy                           0.27      6567
   macro avg       0.27      0.26      0.26      6567
weighted avg       0.27      0.27      0.26      6567



this gives a relatively low accuracy score,
```
Accuracy Score:  0.26907263590680675
```

Next, trying the same model with genre+subgenre labels:

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels_main_and_sub, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='euclidean')
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("\nClassification Report for each label:\n", classification_report(y_test, y_pred))

Unsurprisingly, the accuracy score is even lower,
```
Accuracy Score:  0.06624029237094564
```

Next, trying to tune the hyperparameters with RandomizedSearchCV, starting with main genres only:

In [14]:
X_train, X_test, y_train, y_test = train_test_split(features, labels_main_only, test_size=0.2, random_state=42)

param_dist = {
    'n_neighbors': [int(x) for x in np.linspace(1, 40, num=10)],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

knn_tune = KNeighborsClassifier()
grid_search = GridSearchCV(estimator=knn_tune, param_grid=param_dist, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

best_knn = grid_search.best_estimator_
y_pred_tuned = best_knn.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print("Tuned Accuracy Score: ", accuracy_score(y_test, y_pred_tuned))
print("\nTuned Classification Report:\n", classification_report(y_test, y_pred_tuned))

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Parameters: {'metric': 'manhattan', 'n_neighbors': 40, 'weights': 'distance'}
Tuned Accuracy Score:  0.32724227196589006

Tuned Classification Report:
               precision    recall  f1-score   support

         edm       0.38      0.50      0.43      1218
       latin       0.30      0.27      0.29      1033
         pop       0.23      0.25      0.24      1081
         r&b       0.28      0.27      0.28      1031
         rap       0.35      0.28      0.31      1168
        rock       0.42      0.36      0.39      1036

    accuracy                           0.33      6567
   macro avg       0.33      0.32      0.32      6567
weighted avg       0.33      0.33      0.32      6567



The accuracy score is noticably improved from the previous 0.269,
```
Tuned Accuracy Score:  0.327089995431704
```

However, this is still low, and it's expected for Random Forest and gradient boosting to perform significantly better. Next, trying with genre+subgenre labels:

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels_main_and_sub, test_size=0.2, random_state=42)

param_dist = {
    'n_neighbors': [int(x) for x in np.linspace(1, 40, num=10)],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

knn_tune = KNeighborsClassifier()
random_search = RandomizedSearchCV(estimator=knn_tune, param_distributions=param_dist,
                                   n_iter=20, cv=5, n_jobs=-1, random_state=42, verbose=1)
random_search.fit(X_train, y_train)

best_knn = random_search.best_estimator_
y_pred_tuned = best_knn.predict(X_test)

print("Best Parameters:", random_search.best_params_)
print("Tuned Accuracy Score: ", accuracy_score(y_test, y_pred_tuned))
print("\nTuned Classification Report:\n", classification_report(y_test, y_pred_tuned))

A slight improvement from the previous 0.066,
```
Tuned Accuracy Score:  0.08420892340490331
```