In [1]:
import pandas as pd
data = pd.read_csv('dataset.csv')

# Preprocessing

In [2]:
# Remove columns that are irrelevant to the model
data = data.drop(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name', 'popularity','Unnamed: 0'], axis=1)
# remove time_signature, it's mostly 4/4 in this dataset
data = data.drop(['time_signature'], axis=1)

In [3]:
# encode class targets
from sklearn.preprocessing import LabelEncoder
labEnc = LabelEncoder()
y = data['track_genre']
y = labEnc.fit_transform(y)

In [4]:
# set up traing and test sets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score
X = data.drop('track_genre', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
# Use Standard Scaler to standardize the numerical features. It removes the mean and scales each feature/variable to unit variance.
# z-score normalization
# transfroms data to have mean of 0 and standard deviation of 1
from sklearn.preprocessing import StandardScaler
continuous_features = ['duration_ms', 'danceability', 'energy', 'loudness', 
                       'speechiness', 'acousticness', 'instrumentalness', 
                       'liveness', 'valence', 'tempo' ]

scaler = StandardScaler()
scaler.fit(X_train[continuous_features])
X_train[continuous_features] = scaler.transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

The categorical features are already encoded as integers. Explicit and mode are already coded to 0 and 1. Key is an integer between 1 and 11

# Baselines

Random Guess baseline:

In [30]:
genre_counts = 114
random_guessing_accuracy = 1 / genre_counts
random_guessing_accuracy

0.008771929824561403

Dummy Classifier baseline:

In [36]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy="stratified", random_state=42)
dummy_clf.fit(X_train, y_train)
y_pred = dummy_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
accuracy, f1

(0.008640350877192983, 0.008640350877192983)

Random guess accruacy of 0.88% is very low, which is expected due to the large number of genres.

Decision Tree baseline:

In [40]:
# Decision Tree Classifier Baseline Model
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy, f1

(0.1676315789473684, 0.16952590372637405)

Baseline accuracies:
- Random Guess: 0.88%
- Dummy Classifier: 0.86%
- Decision Tree: 16.8%

# Tuned Decision Tree

In [13]:
# search for best parameters with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
param_grid = {
    'max_depth': list(range(10, 51, 10)),
    'min_samples_split': list(range(10, 101, 20)),
}
random_search = RandomizedSearchCV(dt, param_grid, n_iter=25, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)
best_params_dt = random_search.best_params_
best_params_dt

{'min_samples_split': 70, 'max_depth': 40}

In [39]:
# Train and test with tuned parameters from RandomizedSearchCV
best_params_dt = {'min_samples_split': 70, 'max_depth': 40}
dt = DecisionTreeClassifier(**best_params_dt, random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy, f1

(0.18140350877192982, 0.17315301822343562)

Decision Tree Classifier model accuracy: 18.1%

# MLP CLassifier

In [33]:
# Train a NN classifier for X to classify genre (y)
# one hidden layer with # of neurons = # of features
from sklearn.neural_network import MLPClassifier

# mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=100, random_state=42, early_stopping=True)
# mlp = MLPClassifier(hidden_layer_sizes=(26, 26), max_iter=1000, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(13,), max_iter=100, random_state=42, early_stopping=True)
mlp.fit(X_train_scaled, y_train)

y_pred = mlp.predict(X_test_scaled)
mlp_accuracy = accuracy_score(y_test, y_pred)
mlp_accuracy

0.1612719298245614

In [16]:
# two hidden layers, each with # of neurons = # of features
mlp = MLPClassifier(hidden_layer_sizes=(13,13), max_iter=100, random_state=42, early_stopping=True)
mlp.fit(X_train_scaled, y_train)
y_pred = mlp.predict(X_test_scaled)
mlp_accuracy = accuracy_score(y_test, y_pred)
mlp_accuracy

0.16912280701754387

In [20]:
# two hidden layers, 26 and 13 nodes each
mlp = MLPClassifier(hidden_layer_sizes=(26,13), max_iter=100, random_state=42, early_stopping=True)
mlp.fit(X_train_scaled, y_train)
y_pred = mlp.predict(X_test_scaled)
mlp_accuracy = accuracy_score(y_test, y_pred)
mlp_accuracy

0.17456140350877192

In [22]:
# two hidden layers, 26 and 13 nodes each
# larger # of iterations
mlp = MLPClassifier(hidden_layer_sizes=(26,13), max_iter=500, random_state=42)
mlp.fit(X_train_scaled, y_train)
y_pred = mlp.predict(X_test_scaled)
mlp_accuracy = accuracy_score(y_test, y_pred)
mlp_accuracy

0.19478070175438597

In [24]:
# two hidden layers, 26 and 13 nodes each
# larger # of iterations
mlp = MLPClassifier(hidden_layer_sizes=(26,13), max_iter=750, random_state=42)
mlp.fit(X_train_scaled, y_train)
y_pred = mlp.predict(X_test_scaled)
mlp_accuracy = accuracy_score(y_test, y_pred)
mlp_accuracy

0.19478070175438597

In [34]:
# two hidden layers of 100 neurons each, and train for 500 epochs
mlp = MLPClassifier(hidden_layer_sizes=(100,100), max_iter=500, random_state=42, early_stopping=True)
mlp.fit(X_train_scaled, y_train)
y_pred = mlp.predict(X_test_scaled)
mlp_accuracy = accuracy_score(y_test, y_pred)
mlp_accuracy

0.21653508771929825

Nueral Network Model yielded an accuracy of 22.36%. This is an improvement over the tuned Decistion Tree model.  
Next step is to tune the Nueral Network model.

In [None]:
# Don't execute this block, takes too long
# Search for best parameters for NN model with GridSearchCV
from sklearn.model_selection import GridSearchCV
param_grid = {
    'hidden_layer_sizes': [(13,), (13, 13), (26,13), (100, 100)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.01, 1],
    'learning_rate': ['constant', 'adaptive'],
}
grid_search = GridSearchCV(mlp, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
best_params = grid_search.best_params_

GridSearchCV was stopped after an hour. Next we'll try RandomizedSearchCV with

In [31]:
# number of features in training data
n_features = X_train.shape[1]
n_features

13

In [26]:
# Tune paramters with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
param_grid = {
    'hidden_layer_sizes': [(13,), (13, 13), (26,13), (100, 100)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.01, 1],
}

mlp = MLPClassifier(max_iter=500, random_state=42)
random_search = RandomizedSearchCV(mlp, param_grid, n_iter=24, cv=3, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train, y_train)
best_params_mlp = random_search.best_params_
best_params_mlp

{'solver': 'adam',
 'hidden_layer_sizes': (100, 100),
 'alpha': 0.01,
 'activation': 'logistic'}

In [41]:
# Train and evaluate the classifier with the best parameters
best_params_mlp = {'solver': 'adam',
    'hidden_layer_sizes': (100, 100),
    'alpha': 0.01,
    'activation': 'logistic'}
mlp_best = MLPClassifier(**best_params_mlp, max_iter=500, random_state=42)
mlp_best.fit(X_train_scaled, y_train)
y_pred_best = mlp_best.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred_best)
f1 = f1_score(y_test, y_pred_best, average='weighted')
accuracy, f1

(0.22416666666666665, 0.2055199126405463)

MLP model accuracy: 22.4%

# Random Forest Classifier

In [43]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train_scaled, y_train)
y_pred_rf = rf_clf.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_accuracy

0.2567982456140351

In [None]:
# Tune parameters with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
param_grid_rf = {
    'n_estimators': [5, 10, 13, 100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}
random_search_rf = RandomizedSearchCV(rf_clf, param_grid_rf, n_iter=20, cv=3, scoring='accuracy', n_jobs=-1, random_state=42, verbose=2)
random_search_rf.fit(X_test_scaled, y_train)
best_params_rf = random_search_rf.best_params_
best_params_rf
# 9 min 49s runtime

In [19]:
best_params_rf = {'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None}
best_params_rf


{'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None}

In [46]:
# Tuned Random Forest Model
best_params_rf = {'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None}
rf_clf_best = RandomForestClassifier(**best_params_rf, random_state=42)
rf_clf_best.fit(X_train, y_train)
y_pred_rf_best = rf_clf_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rf_best)
f1 = f1_score(y_test, y_pred_rf_best, average='weighted')
accuracy, f1

(0.2604385964912281, 0.24891698518745176)

Random Forest model accuracy: 26%

# Gradient Boosting Classifier

In [None]:
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(X_train_scaled, y_train)
y_pred_gb = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_accuracy
# 0.216

Gradient Boosting model accuracy: 21.6%

# SVM Model

In [36]:
# SVM Model
from sklearn import svm
from sklearn.preprocessing import StandardScaler
svm_clf = svm.LinearSVC(random_state=42)
svm_clf.fit(X_train_scaled, y_train)
y_pred_svm = svm_clf.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_accuracy



0.14619298245614035

SVM model accuracy: 14.6%

# Logisitic Regression

In [None]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression
log_reg_clf = LogisticRegression(random_state=42)
log_reg_clf.fit(X_train, y_train)
y_pred_log_reg = log_reg_clf.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
log_reg_accuracy
# 0.1679

In [38]:
# LogisticRegression with higher max_iter
log_reg_clf = LogisticRegression(max_iter=1000, random_state=42)
log_reg_clf.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg_clf.predict(X_test_scaled)
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
log_reg_accuracy

0.16785964912280701

Logistic Regression model accuracy: 16.8%

# Ensemble Stacking Model

In [None]:
from sklearn.ensemble import StackingClassifier

# base models
level0 = list()
level0.append(('lr', LogisticRegression(max_iter=100000)))
level0.append(('rf', RandomForestClassifier(n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=30)))
level0.append(('svm', svm.LinearSVC()))

# meta learner model
level1 = LogisticRegression()
stacking_model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
stacking_model.fit(X_train_scaled, y_train)
y_pred_stacking = stacking_model.predict(X_test_scaled)
stacking_accuracy = accuracy_score(y_test, y_pred_stacking)
stacking_accuracy
# accuracy: 0.2718

27.18% accuracy for Ensemble stack of Logistic Regression, Random Forest, and SVM models.

In [47]:
# Stacking Model Random Forest, Decision Tree, and MLP

# imports and paramters defined here for convenience
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
best_params_rf = {'n_estimators': 200,
    'min_samples_split': 10,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth': None}
best_params_dt = {'min_samples_split': 70, 'max_depth': 40}
best_params_mlp = {'solver': 'adam',
    'hidden_layer_sizes': (100, 100),
    'alpha': 0.01,
    'activation': 'logistic'}

# base models
level0 = list()
level0.append(('rf', RandomForestClassifier(**best_params_rf, random_state=42)))
level0.append(('dt', DecisionTreeClassifier(**best_params_dt, random_state=42)))
level0.append(('mlp', MLPClassifier(**best_params_mlp, max_iter=500, random_state=42)))

# meta learner model
level1 = LogisticRegression()

# stacking model
stacking_model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
stacking_model.fit(X_train_scaled, y_train)
y_pred_stacking = stacking_model.predict(X_test_scaled)
stacking_accuracy = accuracy_score(y_test, y_pred_stacking)
f1 = f1_score(y_test, y_pred_stacking, average='weighted')
stacking_accuracy, f1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.3377631578947368, 0.33274350568541206)

33.8% accuracy for Ensemble stack of DecisionTreeClassifier, RandomForestClassifier, MLPclassifier.