In [1]:
import pandas as pd
data = pd.read_csv('dataset.csv')

# Preprocessing

In [2]:
# Remove columns that are irrelevant to the model
data = data.drop(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name', 'popularity','Unnamed: 0'], axis=1)
# remove time_signature, it's mostly 4/4 in this dataset
data = data.drop(['time_signature'], axis=1)

In [3]:
# encode class targets
from sklearn.preprocessing import LabelEncoder
labEnc = LabelEncoder()
y = data['track_genre']
y = labEnc.fit_transform(y)

In [4]:
# set up traing and test sets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X = data.drop('track_genre', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
# Use Standard Scaler to standardize the numerical features. It removes the mean and scales each feature/variable to unit variance.
# z-score normalization
# transfroms data to have mean of 0 and standard deviation of 1
from sklearn.preprocessing import StandardScaler
continuous_features = ['duration_ms', 'danceability', 'energy', 'loudness', 
                       'speechiness', 'acousticness', 'instrumentalness', 
                       'liveness', 'valence', 'tempo' ]

scaler = StandardScaler()
scaler.fit(X_train[continuous_features])
X_train[continuous_features] = scaler.transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])

The categorical features are already encoded as integers. Explicit and mode are already coded to 0 and 1. Key is an integer between 1 and 11

# Baselines

In [6]:
# random guess accuracy
genre_counts = 114
from sklearn.dummy import DummyClassifier
random_guessing_accuracy = 1 / genre_counts
dummy_clf = DummyClassifier(strategy="stratified", random_state=42)
dummy_clf.fit(X_train, y_train)
dummy_predictions = dummy_clf.predict(X_test)
dummy_accuracy = accuracy_score(y_test, dummy_predictions)
random_guessing_accuracy, dummy_accuracy

(0.008771929824561403, 0.008640350877192983)

Random guess accruacy of 0.88% is very low, which is expected due to the large number of genres.

In [8]:
# Decision Tree Classifier Baseline Model
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.1676315789473684

Decision Tree Classifier baseline model yielded an accuracy of 16.5%. Significantly better than random guess accuracy. Decent baseline to start with considering the simplicity of the model and large number of classes.

# Tuned Decision Tree

In [9]:
# search for best parameters with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
param_grid = {
    'max_depth': list(range(10, 51, 10)),
    'min_samples_split': list(range(10, 101, 20)),
}
random_search = RandomizedSearchCV(dt, param_grid, n_iter=25, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)
best_params = random_search.best_params_
best_params

{'min_samples_split': 70, 'max_depth': 40}

In [11]:
# Train and test with tuned parameters from RandomizedSearchCV
dt = DecisionTreeClassifier(**best_params, random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred)
tuned_accuracy

0.18140350877192982

Decision Tree Model with tuned paramters yielded an accuracy of 18.1%. This is small but I think significant increase in the accuracy over the baseline Decicion Tree model.

# MLP CLassifier

In [14]:
# Train a NN classifier for X to classify genre (y)
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=100, random_state=42, early_stopping=True)
# mlp = MLPClassifier(hidden_layer_sizes=(26, 26), max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_train)

y_pred = mlp.predict(X_test_scaled)
mlp_accuracy = accuracy_score(y_test, y_pred)
mlp_accuracy

0.21653508771929825

Nueral Network Model yielded an accuracy of 22.36%. This is an improvement over the tuned Decistion Tree model.  
Next step is to tune the Nueral Network model.

In [None]:
# Don't execute this block, takes too long
# Search for best parameters for NN model with GridSearchCV
from sklearn.model_selection import GridSearchCV
param_grid = {
    'hidden_layer_sizes': [(13,), (26,), (13, 13), (100, 100)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.01, 1],
    'learning_rate': ['constant', 'adaptive'],
}
grid_search = GridSearchCV(mlp, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
best_params = grid_search.best_params_

GridSearchCV was stopped after an hour. Next we'll try RandomizedSearchCV with

In [31]:
# number of features in training data
n_features = X_train.shape[1]
n_features

13

In [12]:
# Tune paramters with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
param_grid = {
    'hidden_layer_sizes': [(13,), (26,), (13, 13), (26, 26), (100, 100)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.01, 1],
}

# subset_size = 10000
# X_train_subset = X_train_scaled[:subset_size]
# y_train_subset = y_train[:subset_size]

random_search = RandomizedSearchCV(mlp, param_grid, n_iter=24, cv=3, scoring='accuracy', n_jobs=-1)
# random_search.fit(X_train_subset, y_train_subset)
random_search.fit(X_train, y_train)
best_params = random_search.best_params_
best_params

NameError: name 'mlp' is not defined

In [39]:
# Train and evaluate the classifier with the best parameters
mlp_best = MLPClassifier(**best_params, max_iter=10000, random_state=42)
mlp_best.fit(X_train_scaled, y_train)
y_pred_best = mlp_best.predict(X_test_scaled)
tuned_mlp_accuracy = accuracy_score(y_test, y_pred_best)
tuned_mlp_accuracy

0.18350877192982457

# Random Forest Classifier

In [21]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_accuracy

0.25049122807017543

In [22]:
# Tune parameters with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
param_grid_rf = {
    'n_estimators': [5, 10, , 13, 100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}
random_search_rf = RandomizedSearchCV(rf_clf, param_grid_rf, n_iter=20, cv=3, scoring='accuracy', n_jobs=-1, random_state=42, verbose=2)
random_search_rf.fit(X_train, y_train)
best_params_rf = random_search_rf.best_params_
best_params_rf

Fitting 3 folds for each of 20 candidates, totalling 60 fits


33 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Alec\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Alec\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\Alec\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Alec\AppData\Local\Programs\Python\Python311\Lib\site

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20}

In [23]:
# Tuned Random Forest Model
rf_clf_best = RandomForestClassifier(**best_params_rf, random_state=42)
rf_clf_best.fit(X_train, y_train)
y_pred_rf_best = rf_clf_best.predict(X_test)
tuned_rf_accuracy = accuracy_score(y_test, y_pred_rf_best)
tuned_rf_accuracy

0.25652631578947366

# Gradient Boosting Classifier

In [14]:
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(X_train, y_train)
y_pred_gb = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_accuracy

0.21618421052631578

# SVM Model

In [36]:
# SVM Model
from sklearn import svm
from sklearn.preprocessing import StandardScaler
svm_clf = svm.LinearSVC(random_state=42)
svm_clf.fit(X_train_scaled, y_train)
y_pred_svm = svm_clf.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_accuracy



0.14619298245614035

# Logisitic Regression

In [37]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression
log_reg_clf = LogisticRegression(random_state=42)
log_reg_clf.fit(X_train, y_train)
y_pred_log_reg = log_reg_clf.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
log_reg_accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.16791228070175437

The Logistic Regression model achieved an accuracy of approximately 16.03%, which is an improvement over the SVM model but still lower than the Random Forest model.

In [38]:
# LogisticRegression with higher max_iter
log_reg_clf = LogisticRegression(max_iter=1000, random_state=42)
log_reg_clf.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg_clf.predict(X_test_scaled)
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
log_reg_accuracy

0.16785964912280701

# Ensemble Stacking Model

In [40]:
from sklearn.ensemble import StackingClassifier

# base models
level0 = list()
level0.append(('lr', LogisticRegression(max_iter=100000)))
level0.append(('rf', RandomForestClassifier(n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=30)))
level0.append(('svm', svm.LinearSVC()))

# meta learner model
level1 = LogisticRegression()
stacking_model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
stacking_model.fit(X_train_scaled, y_train)
y_pred_stacking = stacking_model.predict(X_test_scaled)
stacking_accuracy = accuracy_score(y_test, y_pred_stacking)
stacking_accuracy


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.2718421052631579

In [None]:
# GridSearch parameter tuning for gradient boosting classifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 10]
}
gb_clf = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(gb_clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_params

Stopped gradient parameter tuning after 30+ minutes. 

In [1]:
# Tune the hyperparameters of the models
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier

param_dist_svm = {
    'C': [0.1, 1, 10],
    'loss': ['hinge', 'squared_hinge']
}
param_dist_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 10]
}

svm = LinearSVC(max_iter=10000, random_state=42)
gb = GradientBoostingClassifier(random_state=42)

random_search_svm = RandomizedSearchCV(svm, param_dist_svm, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search_gb = RandomizedSearchCV(gb, param_dist_gb, n_iter=10, cv=5, scoring='accuracy', random_state=42)

random_search_svm.fit(X_train_scaled, y_train)
random_search_gb.fit(X_train, y_train)

best_params_svm = random_search_svm.best_params_
best_params_gb = random_search_gb.best_params_

best_params_svm, best_params_gb

: 

: 

In [None]:
from sklearn.ensemble import StackingClassifier

# base models
level0 = list()
level0.append(('lr', LogisticRegression(**best_params_lr)))   # assuming best_params_lr contains the best parameters for Logistic Regression
level0.append(('rf', RandomForestClassifier(**best_params_rf)))  # assuming best_params_rf contains the best parameters for Random Forest
level0.append(('svm', svm.LinearSVC(random_state=42)))  # assuming best_params_svm contains the best parameters for SVM
level0.append(('gb', GradientBoostingClassifier(**best_params_gb)))  # assuming best_params_gb contains the best parameters for Gradient Boosting

# meta learner model
level1 = LogisticRegression()  # you can also tune the parameters for the meta learner
stacking_model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
stacking_model.fit(X_train_scaled, y_train)

# Predict the test set results
y_pred_stacking = stacking_model.predict(X_test_scaled)

# Calculate the accuracy of the model
stacking_accuracy = accuracy_score(y_test, y_pred_stacking)
stacking_accuracy
