In [1]:
import os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold, KFold, cross_validate
from sklearn.preprocessing import LabelEncoder
from functions import *
from functions_plots import *
from collections import Counter

In [2]:
# Load data
data_folder_path = Path('../data/')
data = pd.read_csv(data_folder_path / 'data_train_all.csv')
data_columns = pd.read_csv(data_folder_path / 'data_train_all_columns.csv')

print("Training data length:", len(data))

predictor_columns = data_columns[data_columns['column_type'].isin(['predictor_n_grams', 'predictor_continuous'])]['column_name'].tolist()
print(f"Number of predictor columns: {len(predictor_columns)}")

target_columns = data_columns[data_columns['column_type'] == 'target']['column_name'].tolist()
print(f"Target columns: {target_columns}")

# Merge back with the raw data so that we can use the extra column to filter later
data_raw = pd.read_csv(data_folder_path / 'chordonomicon_raw.csv', low_memory=False)

print("All columns in the raw data:",list(data_raw.columns),'\n')
print("All columns in the training data:",list(data.columns))

keys = ['spotify_song_id', 'decade', 'main_genre']
data = data.merge(data_raw, on=keys, how='left', suffixes=('', '_raw'))

Training data length: 255606
Number of predictor columns: 119
Target columns: ['spotify_popularity', 'on_hot100', 'decade', 'main_genre']
All columns in the raw data: ['id', 'chords', 'release_date', 'genres', 'decade', 'rock_genre', 'artist_id', 'main_genre', 'spotify_song_id', 'spotify_artist_id'] 

All columns in the training data: ['chords_x', 'simplified_chords_x', 'spotify_song_id', 'chords_y', 'missing_notes', 'simplified_chords_y', 'drone_ratio', 'average_overlap', 'average_2overlap', 'average_3overlap', 'average_4overlap', 'average_5overlap', 'maj_triad_ratio', 'min_triad_ratio', 'chords', 'unique_5gram_density', 'unique_chord_density', 'simplified_chords', 'decade', 'main_genre', 'spotify_track_id', 'spotify_artist_id', 'spotify_success', 'spotify_track_name', 'spotify_artists', 'spotify_album_name', 'spotify_release_date', 'spotify_popularity', 'spotify_duration_ms', 'spotify_artist_name', 'spotify_genres', 'spotify_artist_popularity', 'spotify_followers', 'contains_G,C,G', 

In [3]:
# Select target variable and type

# target_variable = 'decade'
# target_type = 'multiclass'

target_variable = 'main_genre'
target_type = 'multiclass'

# target_variable = 'spotify_popularity'
# target_type = 'regression'

# target_variable = 'on_hot100'
# target_type = 'binary'

# Drop NaN values in target variable from entire dataset
if data[target_variable].isna().any():
    data = data[data[target_variable].notna()].reset_index(drop=True)
    print(f"Rows after dropping NaN in {target_variable}: {len(data)}")
else:
    print(f"No rows dropped, no missing values")

No rows dropped, no missing values


In [29]:
# select features here

#path_feature_selection = Path('../feature_selection/feature_selection')
#feature_selection = pd.read_csv(path_feature_selection / f'lasso_feature_importance_{target_variable}.csv')
#print(feature_selection.columns)

# # Plot feature importance for all features
# ax = plot_feature_importance(feature_selection)

# Plot feature importance for top N features
#ax = plot_feature_importance_df(feature_selection, top_n=30)

# Select top N features or comment out to use all features
#predictor_columns = feature_selection.head(20)['feature'].astype(str).tolist()

# extract n-gram features
feature_3_grams = [x for x in data.columns if 'contains' in x and x.count(',') == 2]
feature_4_grams = [x for x in data.columns if 'contains' in x and x.count(',') == 3]
feature_5_grams = [x for x in data.columns if 'contains' in x and x.count(',') == 4]

print("3-gram features:",feature_3_grams,'\n')
print("4-gram features:",feature_4_grams,'\n')
print("5-gram features:",feature_5_grams,'\n')

predictor_columns = feature_3_grams

3-gram features: ['contains_G,C,G', 'contains_C,G,C', 'contains_C,G,D', 'contains_C,G,Amin', 'contains_C,D,G', 'contains_D,G,C', 'contains_Emin,C,G', 'contains_D,C,G', 'contains_G,Amin,F', 'contains_G,D,C', 'contains_Amin,G,F', 'contains_G,C,D', 'contains_Amin,F,G', 'contains_F,G,Amin', 'contains_G,F,G', 'contains_Amin,C,G', 'contains_G,Amin,G', 'contains_G,C,Amin', 'contains_Amin,G,C', 'contains_C,G,Emin', 'contains_G,Amin,C', 'contains_G,Emin,C', 'contains_F,Amin,G', 'contains_C,D,C', 'contains_C,Amin,G', 'contains_G,C,Emin', 'contains_Amin,G,Amin', 'contains_C,Amin,C', 'contains_Emin,G,C', 'contains_G,F,Amin', 'contains_Amin,D,G', 'contains_G,D,Amin'] 

4-gram features: ['contains_C,G,C,G', 'contains_G,C,G,C', 'contains_F,C,G,Amin', 'contains_C,G,Amin,F', 'contains_Amin,F,C,G', 'contains_G,D,C,G', 'contains_C,G,D,C', 'contains_G,Amin,F,C', 'contains_D,C,G,D', 'contains_D,G,C,G', 'contains_C,D,G,C', 'contains_G,C,D,G', 'contains_G,C,G,D', 'contains_C,G,D,G', 'contains_D,G,C,D', 'cont

In [30]:
# Filter data here
data_filtered = data[pd.to_numeric(data['decade'], errors='coerce') >= 1990].reset_index(drop=True)

unique_decades = pd.to_numeric(data_filtered['decade'], errors='coerce').dropna().astype(int).unique()
unique_decades = sorted(unique_decades)
print("Unique decades after filter:", unique_decades)

# Make a dataframe with predictor features
data_X = data_filtered[predictor_columns]

Unique decades after filter: [1990, 2000, 2010, 2020]


In [31]:
# 5-fold cross validation (stratified for classification to keep class ratios per fold)
if target_variable == 'spotify_popularity':
    # Use regular KFold for regression
    cv = KFold(n_splits=5, shuffle=True, random_state=145)
else:
    # Use StratifiedKFold for classification tasks
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=145)

# When modeling genre, we need to encode the labels (all other target variables are numeric already)
if target_variable == 'main_genre':
    le = LabelEncoder()
    data_y = le.fit_transform(data_filtered[target_variable])
else:
    data_y = data_filtered[target_variable]

In [32]:
genre_counter = Counter(data.main_genre)
genre_counter_total = sum(genre_counter.values())
genre_counter_relative = Counter({k : v/genre_counter_total for k, v in genre_counter.items()})
genre_counter_relative

Counter({'pop': 0.2414301698708168,
         'rock': 0.19097752008951277,
         'country': 0.1486467453815638,
         'alternative': 0.14038794081516084,
         'pop rock': 0.11276730593178563,
         'punk': 0.04784316487093417,
         'metal': 0.03363379576379271,
         'rap': 0.028770842624977505,
         'soul': 0.02039075765044639,
         'jazz': 0.017570010093659775,
         'reggae': 0.010887850832922544,
         'electronic': 0.006693896074427048})

In [33]:
print("Target variable:",target_variable)
print("Predictors:",predictor_columns,'\n')
results_dummy = evaluate_dummy_baseline(data_X, data_y, cv=cv, target_type=target_type, random_state=0)

Target variable: main_genre
Predictors: ['contains_G,C,G', 'contains_C,G,C', 'contains_C,G,D', 'contains_C,G,Amin', 'contains_C,D,G', 'contains_D,G,C', 'contains_Emin,C,G', 'contains_D,C,G', 'contains_G,Amin,F', 'contains_G,D,C', 'contains_Amin,G,F', 'contains_G,C,D', 'contains_Amin,F,G', 'contains_F,G,Amin', 'contains_G,F,G', 'contains_Amin,C,G', 'contains_G,Amin,G', 'contains_G,C,Amin', 'contains_Amin,G,C', 'contains_C,G,Emin', 'contains_G,Amin,C', 'contains_G,Emin,C', 'contains_F,Amin,G', 'contains_C,D,C', 'contains_C,Amin,G', 'contains_G,C,Emin', 'contains_Amin,G,Amin', 'contains_C,Amin,C', 'contains_Emin,G,C', 'contains_G,F,Amin', 'contains_Amin,D,G', 'contains_G,D,Amin'] 


Evaluating Dummy Baseline...
Cross-validation folds: 5

Dummy Baseline - Multiclass Target
Parameters: strategy=most_frequent

Test Performance (Out-of-Sample):
------------------------------------------------------------
Accuracy                      :  0.2552 (+/- 0.0000)
Precision Micro               :  0.2

In [34]:
# Train and fit logistic regression model here (uses ridge regression for regression variables)
print("Target variable:",target_variable)
print("Predictors:",predictor_columns)
results_lr = train_logistic_regression(
    data_X,
    data_y,
    cv=cv,
    target_type=target_type,
    C=1.0,
    penalty='l2',
    solver='lbfgs',
    random_state=42
)

Target variable: main_genre
Predictors: ['contains_G,C,G', 'contains_C,G,C', 'contains_C,G,D', 'contains_C,G,Amin', 'contains_C,D,G', 'contains_D,G,C', 'contains_Emin,C,G', 'contains_D,C,G', 'contains_G,Amin,F', 'contains_G,D,C', 'contains_Amin,G,F', 'contains_G,C,D', 'contains_Amin,F,G', 'contains_F,G,Amin', 'contains_G,F,G', 'contains_Amin,C,G', 'contains_G,Amin,G', 'contains_G,C,Amin', 'contains_Amin,G,C', 'contains_C,G,Emin', 'contains_G,Amin,C', 'contains_G,Emin,C', 'contains_F,Amin,G', 'contains_C,D,C', 'contains_C,Amin,G', 'contains_G,C,Emin', 'contains_Amin,G,Amin', 'contains_C,Amin,C', 'contains_Emin,G,C', 'contains_G,F,Amin', 'contains_Amin,D,G', 'contains_G,D,Amin']

Training Logistic Regression/Ridge...
Cross-validation folds: 5

Logistic Regression - Multiclass Target
Parameters: C=1.0, penalty=l2, solver=lbfgs

Test Performance (Out-of-Sample):
------------------------------------------------------------
Accuracy                      :  0.2743 (+/- 0.0005)
F1 Micro       

In [35]:
# Train and fit Lasso model here
print("Target variable:",target_variable)
print("Predictors:",predictor_columns,'\n')
results_lasso = train_lasso(
    data_X,
    data_y,
    cv=cv,
    target_type=target_type,
    alpha=0.1,
    max_iter=5000,
    random_state=42
)


Target variable: main_genre
Predictors: ['contains_G,C,G', 'contains_C,G,C', 'contains_C,G,D', 'contains_C,G,Amin', 'contains_C,D,G', 'contains_D,G,C', 'contains_Emin,C,G', 'contains_D,C,G', 'contains_G,Amin,F', 'contains_G,D,C', 'contains_Amin,G,F', 'contains_G,C,D', 'contains_Amin,F,G', 'contains_F,G,Amin', 'contains_G,F,G', 'contains_Amin,C,G', 'contains_G,Amin,G', 'contains_G,C,Amin', 'contains_Amin,G,C', 'contains_C,G,Emin', 'contains_G,Amin,C', 'contains_G,Emin,C', 'contains_F,Amin,G', 'contains_C,D,C', 'contains_C,Amin,G', 'contains_G,C,Emin', 'contains_Amin,G,Amin', 'contains_C,Amin,C', 'contains_Emin,G,C', 'contains_G,F,Amin', 'contains_Amin,D,G', 'contains_G,D,Amin'] 


üìê Training Lasso...
Cross-validation folds: 5

Lasso - Multiclass Target
Parameters: alpha=0.1, max_iter=5000

Test Performance (Out-of-Sample):
------------------------------------------------------------
Accuracy                      :  0.2743 (+/- 0.0005)
F1 Micro                      :  0.2743 (+/- 0.00

In [45]:
# Train and fit Random Forests model here
print("Target variable:",target_variable)
print("Predictors:",predictor_columns,'\n')
results_rf = train_random_forest(
    data_X,
    data_y,
    cv=cv,
    target_type=target_type,
    n_estimators=100,
    max_depth=5,
    random_state=42
)

Target variable: main_genre
Predictors: ['contains_G,C,G', 'contains_C,G,C', 'contains_C,G,D', 'contains_C,G,Amin', 'contains_C,D,G', 'contains_D,G,C', 'contains_Emin,C,G', 'contains_D,C,G', 'contains_G,Amin,F', 'contains_G,D,C', 'contains_Amin,G,F', 'contains_G,C,D', 'contains_Amin,F,G', 'contains_F,G,Amin', 'contains_G,F,G', 'contains_Amin,C,G', 'contains_G,Amin,G', 'contains_G,C,Amin', 'contains_Amin,G,C', 'contains_C,G,Emin', 'contains_G,Amin,C', 'contains_G,Emin,C', 'contains_F,Amin,G', 'contains_C,D,C', 'contains_C,Amin,G', 'contains_G,C,Emin', 'contains_Amin,G,Amin', 'contains_C,Amin,C', 'contains_Emin,G,C', 'contains_G,F,Amin', 'contains_Amin,D,G', 'contains_G,D,Amin'] 


Training Random Forest...
Cross-validation folds: 5

Random Forest - Multiclass Target
Parameters: n_estimators=100, max_depth=5, min_samples_leaf=1

Test Performance (Out-of-Sample):
------------------------------------------------------------
Accuracy                      :  0.2723 (+/- 0.0006)
F1 Micro     