In [17]:
# load data
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Read the song table; the path is relative to the notebook's working directory.
df = pd.read_csv('../songdata.csv')

# Columns prefixed with 'label_' hold the targets, so they must not be model inputs.
label_columns = [name for name in df.columns if name.startswith('label_')]
# Track identifier / free-text columns are not useful as-is.
# Idea: track_name / track_artist might become usable through NLP features.
identifier_columns = ['track_id', 'track_name', 'track_artist']

features = df.drop(columns=label_columns).drop(columns=identifier_columns)
features

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,keymode_8_1,keymode_8_0,keymode_6_1,keymode_6_0,keymode_3_1,keymode_3_0,keymode_4_1,keymode_4_0,keymode_10_1,keymode_10_0
0,0.748,0.916,-2.634,0.0583,0.102000,0.000000,0.0653,0.5180,122.036,194754,...,0,0,1,0,0,0,0,0,0,0
1,0.726,0.815,-4.969,0.0373,0.072400,0.004210,0.3570,0.6930,99.972,162600,...,0,0,0,0,0,0,0,0,0,0
2,0.675,0.931,-3.432,0.0742,0.079400,0.000023,0.1100,0.6130,124.008,176616,...,0,0,0,0,0,0,0,0,0,0
3,0.718,0.930,-3.778,0.1020,0.028700,0.000009,0.2040,0.2770,121.956,169093,...,0,0,0,0,0,0,0,0,0,0
4,0.650,0.833,-4.672,0.0359,0.080300,0.000000,0.0833,0.7250,123.976,189052,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32828,0.428,0.922,-1.814,0.0936,0.076600,0.000000,0.0668,0.2100,128.170,204375,...,0,0,0,0,0,0,0,0,0,0
32829,0.522,0.786,-4.462,0.0420,0.001710,0.004270,0.3750,0.4000,128.041,353120,...,0,0,0,0,0,0,0,0,0,0
32830,0.529,0.821,-4.899,0.0481,0.108000,0.000001,0.1500,0.4360,127.989,210112,...,0,0,0,1,0,0,0,0,0,0
32831,0.626,0.888,-3.361,0.1090,0.007920,0.127000,0.3430,0.3080,128.008,367432,...,0,0,0,0,0,0,0,0,0,0


First, train using only the main genre as the label, ignoring the subgenre.

In [18]:
def _main_genre(column_name):
    """Strip 'label_' from a label column name and keep the text before the first '_'.

    For example, 'label_pop_dance pop' becomes 'pop'.
    """
    return column_name.replace('label_', '').partition('_')[0]


label_columns = [name for name in df.columns if name.startswith('label_')]
# idxmax(axis=1) gives, for every row, the name of the label column holding the
# largest value; that column name is then reduced to its main-genre part.
labels_main_only = df[label_columns].idxmax(axis=1).map(_main_genre)
labels_main_only

0        pop
1        pop
2        pop
3        pop
4        pop
        ... 
32828    edm
32829    edm
32830    edm
32831    edm
32832    edm
Length: 32833, dtype: object

In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_main_only,
    test_size=0.2,
    random_state=42,
)

# Baseline forest: 100 trees, every other setting left at the library default.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the main genre for the held-out rows.
y_pred = rf.predict(X_test)

# Overall accuracy first, then the per-genre precision/recall/F1 breakdown.
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report for each label:\n",
    classification_report(y_test, y_pred),
)

Accuracy Score:  0.5523069894929191

Classification Report for each label:
               precision    recall  f1-score   support

         edm       0.65      0.70      0.67      1218
       latin       0.50      0.40      0.45      1033
         pop       0.36      0.32      0.34      1081
         r&b       0.47      0.47      0.47      1031
         rap       0.59      0.64      0.61      1168
        rock       0.68      0.76      0.72      1036

    accuracy                           0.55      6567
   macro avg       0.54      0.55      0.54      6567
weighted avg       0.54      0.55      0.55      6567



Next, train the Random Forest model on the genre and subgenre combined into a single label.

In [19]:
label_columns = [name for name in df.columns if name.startswith('label_')]
# Per row, take the name of the label column with the largest value and strip the
# 'label_' marker, leaving the combined '<genre>_<subgenre>' string as the target.
labels_main_and_sub = (
    df[label_columns]
    .idxmax(axis=1)
    .map(lambda column_name: column_name.replace('label_', ''))
)
labels_main_and_sub

0                        pop_dance pop
1                        pop_dance pop
2                        pop_dance pop
3                        pop_dance pop
4                        pop_dance pop
                     ...              
32828    edm_progressive electro house
32829    edm_progressive electro house
32830    edm_progressive electro house
32831    edm_progressive electro house
32832    edm_progressive electro house
Length: 32833, dtype: object

In [12]:
# Same 80/20 split and seed as the main-genre run, but targeting the combined
# genre+subgenre label.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_main_and_sub,
    test_size=0.2,
    random_state=42,
)

# Baseline forest: 100 trees, every other setting left at the library default.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the combined label for the held-out rows.
y_pred = rf.predict(X_test)

# Overall accuracy first, then the per-label precision/recall/F1 breakdown.
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report for each label:\n",
    classification_report(y_test, y_pred),
)

Accuracy Score:  0.25140855794122124

Classification Report for each label:
                                precision    recall  f1-score   support

                 edm_big room       0.32      0.32      0.32       248
            edm_electro house       0.31      0.36      0.33       309
                  edm_pop edm       0.10      0.07      0.08       325
edm_progressive electro house       0.29      0.41      0.34       336
          latin_latin hip hop       0.15      0.13      0.14       315
              latin_latin pop       0.17      0.11      0.14       281
              latin_reggaeton       0.29      0.34      0.31       187
               latin_tropical       0.22      0.23      0.23       250
                pop_dance pop       0.12      0.12      0.12       236
               pop_electropop       0.14      0.08      0.10       299
          pop_indie poptimism       0.20      0.20      0.20       330
            pop_post-teen pop       0.11      0.09      0.10       216

Tuning:
Search for better hyperparameters with RandomizedSearchCV, first using only the main-genre labels:

In [23]:
# Re-split against the main-genre labels only (same 80/20 split and seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_main_only,
    test_size=0.2,
    random_state=42,
)

# Candidate values sampled by the randomized search.
param_dist = dict(
    n_estimators=list(map(int, np.linspace(5, 700, num=20))),  # 20 tree counts from 5 to 700
    max_depth=[None, 20, 30, 50, 75],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
)

# NOTE: this search is slow; n_jobs=-1 asks for all available cores.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,        # number of sampled parameter combinations
    cv=5,             # 5 folds per combination
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Evaluate the best candidate found by the search on the held-out rows.
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("Best Parameters:", random_search.best_params_)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 15 candidates, totalling 75 fits


KeyboardInterrupt: 

Took 14m54s to run, producing:
```
Best Parameters: {'n_estimators': 553, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 30}
Accuracy Score:  0.5550479671082686
```
Not much of an increase over the untuned baseline (0.5523). Next, the same search with the combined genre+subgenre labels:



In [24]:
# Re-split against the combined genre+subgenre labels (same 80/20 split and seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_main_and_sub,
    test_size=0.2,
    random_state=42,
)

# Smaller candidate grid than the main-genre search.
param_dist = dict(
    n_estimators=list(map(int, np.linspace(5, 700, num=5))),  # 5 tree counts from 5 to 700
    max_depth=[None, 30, 75],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestClassifier(random_state=42, class_weight='balanced')
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=15,        # number of sampled parameter combinations
    cv=5,             # 5 folds per combination
    n_jobs=-1,        # use all available cores
    random_state=42,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Evaluate the best candidate found by the search on the held-out rows.
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("Best Parameters:", random_search.best_params_)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best Parameters: {'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None}
Accuracy Score:  0.26237246840261913

Classification Report:
                                precision    recall  f1-score   support

                 edm_big room       0.32      0.44      0.37       248
            edm_electro house       0.31      0.39      0.34       309
                  edm_pop edm       0.16      0.11      0.13       325
edm_progressive electro house       0.34      0.35      0.35       336
          latin_latin hip hop       0.15      0.05      0.08       315
              latin_latin pop       0.16      0.12      0.14       281
              latin_reggaeton       0.26      0.48      0.34       187
               latin_tropical       0.24      0.28      0.26       250
                pop_dance pop       0.13      0.14      0.13       236
               pop_electropop       0

```
Best Parameters: {'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None}
Accuracy Score:  0.26237246840261913
```
Again, not a huge increase (0.2514 → 0.2624 accuracy). This seems to be about the limit for Random Forest models on these features.