In [1]:
import os
from collections import Counter
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold, cross_validate
from sklearn.preprocessing import LabelEncoder

from modeling.functions import *
from modeling.functions_plots import *

In [2]:
# Read the training table and the companion table that classifies each column
data_folder_path = Path('../data/')
data = pd.read_csv(data_folder_path.joinpath('data_train_all.csv'))
data_columns = pd.read_csv(data_folder_path.joinpath('data_train_all_columns.csv'))

print("Training data length:", len(data))

# Predictors: every column tagged as an n-gram or a continuous predictor
is_predictor = data_columns['column_type'].isin(['predictor_n_grams', 'predictor_continuous'])
predictor_columns = data_columns.loc[is_predictor, 'column_name'].tolist()
print(f"Number of predictor columns: {len(predictor_columns)}")

# Targets: every column tagged as a target
is_target = data_columns['column_type'] == 'target'
target_columns = data_columns.loc[is_target, 'column_name'].tolist()
print(f"Target columns: {target_columns}")

Training data length: 255606
Number of predictor columns: 119
Target columns: ['spotify_popularity', 'on_hot100', 'decade', 'main_genre']


In [3]:
# Target to predict and the kind of task it defines
target_variable = 'main_genre'
target_type = 'multiclass'

# Rows without a target label cannot be used for training: remove them from the
# whole dataset (and renumber the index) if there are any
target_is_missing = data[target_variable].isna()
if not target_is_missing.any():
    print(f"No rows dropped, no missing values")
else:
    data = data[~target_is_missing].reset_index(drop=True)
    print(f"Rows after dropping NaN in {target_variable}: {len(data)}")

No rows dropped, no missing values


In [4]:
# Choose the predictors for this model: the 4-gram indicator features listed in
# the feature-selection CSV that belongs to the selected target variable.
path_feature_selection = Path('../feature_engineering/feature_selection/feature_selection_csv')
feature_selection = pd.read_csv(path_feature_selection / f'lasso_feature_importance_{target_variable}.csv')

# n-gram indicator features are the ones whose name includes 'contains'
feature_n_grams = feature_selection[feature_selection['feature'].str.contains('contains')]

# Narrow the n-gram subset (not the full table) to names with exactly three commas,
# i.e. four comma-separated items, so a non-n-gram feature whose name happens to
# hold three commas can never slip into the predictor list.
feature_4_grams = feature_n_grams[feature_n_grams['feature'].str.count(',') == 3]
predictor_columns = feature_4_grams['feature'].tolist()

print(len(predictor_columns))
print(predictor_columns[0:10])

39
['contains_G,C,G,D', 'contains_G,C,G,C', 'contains_C,G,Amin,F', 'contains_G,D,G,C', 'contains_G,F,G,C', 'contains_C,D,G,D', 'contains_F,C,G,Amin', 'contains_F,G,Amin,F', 'contains_D,G,C,G', 'contains_G,C,D,G']


In [5]:
# Predictor matrix: all training rows, restricted to the selected feature columns
data_X = data.loc[:, predictor_columns]
data_X.head(3)

Unnamed: 0,"contains_G,C,G,D","contains_G,C,G,C","contains_C,G,Amin,F","contains_G,D,G,C","contains_G,F,G,C","contains_C,D,G,D","contains_F,C,G,Amin","contains_F,G,Amin,F","contains_D,G,C,G","contains_G,C,D,G",...,"contains_C,G,D,C","contains_F,G,C,Amin","contains_C,G,D,G","contains_G,Amin,F,G","contains_C,Amin,C,Amin","contains_C,G,Emin,C","contains_C,G,C,G","contains_Amin,G,Amin,G","contains_G,F,Amin,G","contains_G,Amin,G,Amin"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,1,0,0,1,0,1,0,...,0,0,1,0,0,0,1,0,0,0


In [6]:
# Integer-encode the class labels of the target column; `le` keeps the
# label <-> integer mapping learned from the training data
le = LabelEncoder()
data_y = le.fit(data[target_variable]).transform(data[target_variable])

# Stratified, shuffled 5-fold splitter with a fixed seed (classification task)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=145)

In [7]:
# Relative frequency of every class of the selected target variable.
# The column is looked up through `target_variable` (rather than a hard-coded
# attribute) so this summary follows the target chosen earlier in the notebook.
# The largest share equals the accuracy a "always predict the most frequent
# class" baseline reaches on this data.
genre_counter = Counter(data[target_variable])
genre_counter_total = sum(genre_counter.values())
genre_counter_relative = Counter({k: v / genre_counter_total for k, v in genre_counter.items()})
genre_counter_relative

Counter({'pop': 0.2414301698708168,
         'rock': 0.19097752008951277,
         'country': 0.1486467453815638,
         'alternative': 0.14038794081516084,
         'pop rock': 0.11276730593178563,
         'punk': 0.04784316487093417,
         'metal': 0.03363379576379271,
         'rap': 0.028770842624977505,
         'soul': 0.02039075765044639,
         'jazz': 0.017570010093659775,
         'reggae': 0.010887850832922544,
         'electronic': 0.006693896074427048})

In [8]:
print("Target variable:", target_variable)

# Baseline reference, evaluated on the cross-validation splitter `cv`
dummy_settings = dict(cv=cv, target_type=target_type, random_state=0)
results_dummy = evaluate_dummy_baseline(data_X, data_y, **dummy_settings)

Target variable: main_genre

Evaluating Dummy Baseline...
Cross-validation folds: 5

Dummy Baseline - Multiclass Target
Parameters: strategy=most_frequent

Test Performance (Out-of-Sample):
------------------------------------------------------------
Accuracy                      :  0.2414 (+/- 0.0000)
Precision Micro               :  0.2414 (+/- 0.0000)




In [9]:
# Logistic regression on the selected features
# (per the original note, the helper uses ridge regression for regression targets)
print("Predicting:", target_variable)
print("Using the features:", predictor_columns)
print("Using a logistic regression.")

# L2-penalised model settings, evaluated on the cross-validation splitter `cv`
logreg_settings = dict(
    cv=cv,
    target_type=target_type,
    C=1.0,
    penalty='l2',
    solver='lbfgs',
    random_state=42,
    print_cv=True,
)
results_lr = train_logistic_regression(data_X, data_y, **logreg_settings)

Predicting: main_genre
Using the features: ['contains_G,C,G,D', 'contains_G,C,G,C', 'contains_C,G,Amin,F', 'contains_G,D,G,C', 'contains_G,F,G,C', 'contains_C,D,G,D', 'contains_F,C,G,Amin', 'contains_F,G,Amin,F', 'contains_D,G,C,G', 'contains_G,C,D,G', 'contains_G,Emin,C,G', 'contains_D,C,D,C', 'contains_Amin,F,G,C', 'contains_Amin,C,G,Amin', 'contains_G,D,C,G', 'contains_G,C,G,Amin', 'contains_C,Amin,F,G', 'contains_G,Amin,C,G', 'contains_F,G,F,G', 'contains_Amin,F,C,G', 'contains_G,Amin,F,C', 'contains_Amin,G,F,C', 'contains_D,G,C,D', 'contains_C,G,Amin,C', 'contains_Amin,F,G,Amin', 'contains_G,Amin,G,F', 'contains_D,C,G,D', 'contains_F,Amin,G,F', 'contains_C,D,G,C', 'contains_C,G,D,C', 'contains_F,G,C,Amin', 'contains_C,G,D,G', 'contains_G,Amin,F,G', 'contains_C,Amin,C,Amin', 'contains_C,G,Emin,C', 'contains_C,G,C,G', 'contains_Amin,G,Amin,G', 'contains_G,F,Amin,G', 'contains_G,Amin,G,Amin']
Using a logistic regression.

Training Logistic Regression/Ridge...
Cross-validation folds: 

In [10]:
# Lasso model on the selected features
print("Predicting:", target_variable)
print("Using the features:", predictor_columns)
print("Using a lasso regression.")

# Regularisation strength and iteration cap, evaluated on the cross-validation splitter `cv`
lasso_settings = dict(
    cv=cv,
    target_type=target_type,
    alpha=0.1,
    max_iter=5000,
    random_state=42,
    print_cv=True,
)
results_lasso = train_lasso(data_X, data_y, **lasso_settings)


Predicting: main_genre
Using the features: ['contains_G,C,G,D', 'contains_G,C,G,C', 'contains_C,G,Amin,F', 'contains_G,D,G,C', 'contains_G,F,G,C', 'contains_C,D,G,D', 'contains_F,C,G,Amin', 'contains_F,G,Amin,F', 'contains_D,G,C,G', 'contains_G,C,D,G', 'contains_G,Emin,C,G', 'contains_D,C,D,C', 'contains_Amin,F,G,C', 'contains_Amin,C,G,Amin', 'contains_G,D,C,G', 'contains_G,C,G,Amin', 'contains_C,Amin,F,G', 'contains_G,Amin,C,G', 'contains_F,G,F,G', 'contains_Amin,F,C,G', 'contains_G,Amin,F,C', 'contains_Amin,G,F,C', 'contains_D,G,C,D', 'contains_C,G,Amin,C', 'contains_Amin,F,G,Amin', 'contains_G,Amin,G,F', 'contains_D,C,G,D', 'contains_F,Amin,G,F', 'contains_C,D,G,C', 'contains_C,G,D,C', 'contains_F,G,C,Amin', 'contains_C,G,D,G', 'contains_G,Amin,F,G', 'contains_C,Amin,C,Amin', 'contains_C,G,Emin,C', 'contains_C,G,C,G', 'contains_Amin,G,Amin,G', 'contains_G,F,Amin,G', 'contains_G,Amin,G,Amin']
Using a lasso regression.

📐 Training Lasso...
Cross-validation folds: 5

Lasso - Multicl

In [11]:
# Random forest model on the selected features
print("Predicting:", target_variable)
print("Using the features:", predictor_columns)
print("Using a random forest model.")

# 100 shallow trees (depth <= 5), evaluated on the cross-validation splitter `cv`
forest_settings = dict(
    cv=cv,
    target_type=target_type,
    n_estimators=100,
    max_depth=5,
    random_state=42,
    print_cv=True,
)
results_rf = train_random_forest(data_X, data_y, **forest_settings)

Predicting: main_genre
Using the features: ['contains_G,C,G,D', 'contains_G,C,G,C', 'contains_C,G,Amin,F', 'contains_G,D,G,C', 'contains_G,F,G,C', 'contains_C,D,G,D', 'contains_F,C,G,Amin', 'contains_F,G,Amin,F', 'contains_D,G,C,G', 'contains_G,C,D,G', 'contains_G,Emin,C,G', 'contains_D,C,D,C', 'contains_Amin,F,G,C', 'contains_Amin,C,G,Amin', 'contains_G,D,C,G', 'contains_G,C,G,Amin', 'contains_C,Amin,F,G', 'contains_G,Amin,C,G', 'contains_F,G,F,G', 'contains_Amin,F,C,G', 'contains_G,Amin,F,C', 'contains_Amin,G,F,C', 'contains_D,G,C,D', 'contains_C,G,Amin,C', 'contains_Amin,F,G,Amin', 'contains_G,Amin,G,F', 'contains_D,C,G,D', 'contains_F,Amin,G,F', 'contains_C,D,G,C', 'contains_C,G,D,C', 'contains_F,G,C,Amin', 'contains_C,G,D,G', 'contains_G,Amin,F,G', 'contains_C,Amin,C,Amin', 'contains_C,G,Emin,C', 'contains_C,G,C,G', 'contains_Amin,G,Amin,G', 'contains_G,F,Amin,G', 'contains_G,Amin,G,Amin']
Using a random forest model.

Training Random Forest...
Cross-validation folds: 5

Random Fo

In [12]:
# Load the final held-out test data
data_test = pd.read_csv(data_folder_path / 'data_test_all.csv')

# Mirror the training preparation: rows without a target label cannot be scored
data_test = data_test[data_test[target_variable].notna()].reset_index(drop=True)

X_test = data_test[predictor_columns]

# Encode the test labels with the encoder `le` that was fitted on the TRAINING
# labels. Fitting a fresh encoder on the test labels would renumber the classes
# whenever the two label sets differ, silently misaligning y_test with the
# model's predictions (and it would overwrite the training encoder).
# transform() raises a ValueError for a label that was never seen in training.
y_test = le.transform(data_test[target_variable])

In [13]:
# Final model choice: random forest on the selected features, fitted on the
# complete training set
random_forest_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,  # fixed seed so the fitted forest is reproducible
    n_jobs=-1,        # use all available cores
)
random_forest_classifier.fit(data_X, data_y)

# Predict on the prepared test matrix X_test (instead of re-slicing data_test)
# so the predictions and y_test are guaranteed to come from the same rows
y_pred = random_forest_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [14]:
# Show the accuracy of the final random forest on the held-out test data
print(accuracy)

0.26277517901877756
