# SML Pratical

Music Genre Classification


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load dataset

In [3]:
# Load the training data and the test inputs
X_train = pd.read_csv('data/X_train.csv', index_col = 0, header=[0, 1, 2]) # inputs of the training set
y_train = pd.read_csv('data/y_train.csv', index_col = 0).squeeze('columns') # outputs of the training set
X_test = pd.read_csv('data/X_test.csv', index_col = 0, header=[0, 1, 2]) # inputs of the test set

In [4]:
class_label_mapping = {}

def transform_labels_to_numbers(labels):
    unique_labels = set(labels)
    
    for i, label in enumerate(unique_labels):
        class_label_mapping[label] = i

    transformed_labels = [class_label_mapping[label] for label in labels]
    
    return transformed_labels, class_label_mapping

y_train, label_mapping = transform_labels_to_numbers(y_train)
y_train = pd.Series(y_train)

In [5]:
# total number of rows and columns(attributes)
n, p = np.shape(X_train)
# Entries (i,j) correspond to the j'th dimension of the observation i
X_train

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
Id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
0,-0.266585,-0.984668,-0.729823,-0.895122,2.138628,0.935209,0.104089,-0.698659,-0.736408,-0.334376,...,0.065003,0.016522,0.015776,5.743597,0.307617,0.051370,0.042480,0.002441,1.976972,0.034533
1,-0.180061,0.260884,-0.069373,0.208734,-0.078855,-0.577818,0.583788,0.143781,0.291556,0.007314,...,0.087692,0.016355,0.016605,64.870987,0.812988,0.082784,0.069824,0.003906,7.374503,0.074870
2,-0.692900,0.356662,0.062617,0.248280,3.470037,0.166613,0.823874,0.181112,0.551939,0.357985,...,0.132387,0.025847,0.023922,34.251705,0.850098,0.058200,0.036621,0.010254,5.927942,0.117603
3,0.243339,0.214182,-0.049026,1.456255,-0.360826,-0.875256,-0.770200,0.315500,0.789956,0.448319,...,0.071478,0.019166,0.025535,1.364990,0.342285,0.081713,0.075195,0.000000,1.100437,0.041754
4,-0.968576,0.309255,0.223164,0.160960,0.919838,-0.111985,-1.012521,-0.665692,-0.316646,-0.264381,...,0.106220,0.023536,0.019742,3.589230,0.322266,0.073736,0.069336,0.004395,1.210593,0.036459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0.503490,-0.540720,-0.690117,-0.107338,-0.647856,-0.681969,-0.246245,-0.546552,0.062783,0.070393,...,0.084929,0.017250,0.020335,4.868783,0.668945,0.076452,0.044434,0.001465,2.045856,0.084214
5996,-0.600597,0.406386,-0.748409,-0.316157,-0.507428,-0.054214,-0.476804,-0.373120,-0.930158,-1.080690,...,0.075407,0.014998,0.020683,7.893681,0.584961,0.076210,0.048340,0.000000,2.561808,0.073010
5997,-1.014298,-0.950744,0.618304,0.204298,-0.788411,-0.794254,-0.586847,0.099172,-0.313476,-0.523417,...,0.138591,0.024969,0.023658,27.257378,0.373047,0.042598,0.037598,0.000000,3.778109,0.027813
5998,-0.002938,0.646034,-0.732819,1.205990,-0.898733,-0.684953,0.134642,-0.374792,-0.019524,-1.016032,...,0.137695,0.030371,0.029970,431.200500,0.384277,0.025731,0.025391,0.008301,10.260160,0.006870


In [6]:
print(f"unique features: {X_train.columns.get_level_values('feature').unique().tolist()}")
print(f"statistics used: {X_train.columns.get_level_values('statistics').unique().tolist()}")

unique features: ['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse', 'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast', 'spectral_rolloff', 'tonnetz', 'zcr']
statistics used: ['kurtosis', 'max', 'mean', 'median', 'min', 'skew', 'std']


In [None]:
#  (may not be useful) plot correlations for each set of statistics

statistics = X_train.columns.get_level_values('statistics').unique()

for statistic in statistics:
    # obtain the columns for each feature
    cols = [col for col in X_train if col[1] == statistic]
    # find the correlation matrix
    corr = X_train[cols].corr()

    # Generate a mask for the upper triangle
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(16, 11))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(0, 25, as_cmap=True, s = 90, l = 45, n = 5)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

    plt.title(f'Correlation Heatmap (for {statistic})', fontsize = 25)
    plt.xticks(fontsize = 10)
    plt.yticks(fontsize = 10)

In [7]:
#y_train contains the true class:  Electronic, Experimental, Folk, Hip-Hop, Instrumental, International, Pop or Rock
classes = np.unique(y_train)
classes

array([0, 1, 2, 3, 4, 5, 6, 7])

In [8]:
# X_test is the array of test inputs, of the same format as X_train. The objective is to predict the class (Electronic, Experimental, Folk, Hip-Hop, Instrumental, International, Pop or Rock) of the output
np.shape(X_test)

(2000, 518)

In [9]:
from sklearn import preprocessing

### normalise the training set and test set together ###
X = pd.concat([X_train, X_test], ignore_index=True)
scaler = preprocessing.MinMaxScaler()
X_scaled = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns)
X_train_scaled = X_scaled.iloc[:6000,:]
X_test_scaled = X_scaled.iloc[6000:, :]

## PCA for dimension reduction

In [None]:
### use PCA to reduce the dimension ###
from sklearn.decomposition import PCA

pca = PCA(n_components=p)
# find the principal compoennts
pc = pd.DataFrame(data = pca.fit_transform(X_train_scaled), columns = [f'PC {i}' for i in range(1, p+1)])

# concatenate labels 
Df_PCA = pd.concat([pc, y_train], axis=1)

explained_variances = pca.explained_variance_ratio_


In [None]:
plt.plot(range(1, 101), explained_variances[:100])
plt.title('explained variances by principal components')
plt.xlabel('PC index')
plt.ylabel('ratio of explained variance')

In [None]:
n_PCA = 20

elbow method: take around 20 PCs as features

In [None]:
# plot the first two principal components (useless plot, messy)

plt.figure(figsize = (16, 9))
sns.scatterplot(x='PC 1', y='PC 2', hue=Df_PCA['Genre'], data=Df_PCA.iloc[:, :2], alpha=0.5)


plt.title('Plot of first two components, with the genre represented by colour', fontsize=17)
plt.xlabel('first principal component', fontsize=14)
plt.xlabel('second principal component', fontsize=14)

## Classical Training Models
Naive Bayes, two-layer perceptron, linear SVM, kernel SVM, random forests (and with gradient boosting) 

In [10]:
### Try Various Machien Learning Algorithms ###
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from xgboost import XGBClassifier, XGBRFClassifier
from xgboost import plot_tree, plot_importance

In [None]:
### casual trainings with no tuning ###
nb =  GaussianNB()
sgd = SGDClassifier(max_iter=4000)
tree = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators=500, max_depth=20)
svm = SVC(decision_function_shape="ovo")
lg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(200, 10), random_state=1)
xgb = XGBClassifier(n_estimators=500, learning_rate=0.04)
xgbrf = XGBRFClassifier(objective= 'multi:softmax')

algorithms = {
    'naive_Bayes': nb,
    'SGD' : sgd,
    'Decision_tree': tree,
    'random_forest': rf,
    'SVM': svm,
    'logistic_regression': lg,
    'neural network': nn,
    'cross-gradient boosting tree': xgb,
    'cross-gradient boosting': xgbrf
}

### PCA for dimension-reduction

In [None]:
X_t, X_val, y_t, y_val = train_test_split(X_train_scaled, y_train, test_size=0.3, random_state=1)

## use PCA to reduce dimension. n = 20
pca = PCA(n_components=n_PCA)
X_t_PC = pd.DataFrame(data = pca.fit_transform(X_t), columns = [f'PC {i}' for i in range(1, n_PCA+1)])
pca = PCA(n_components=n_PCA)
X_val_PC = pd.DataFrame(data = pca.fit_transform(X_val), columns = [f'PC {i}' for i in range(1, n_PCA+1)])

In [None]:
def model_PCA(model, name):
    model.fit(X_t_PC, y_t)
    y_pred = model.predict(X_val_PC)
    print('Validation Accuracy', name, ':', round(accuracy_score(y_val, y_pred), 5), '\n')

In [None]:
for name, algorithm in algorithms.items():
    model_PCA(algorithm, name)

Validation Accuracy naive_Bayes : 0.26778 

Validation Accuracy SGD : 0.18611 

Validation Accuracy Decision_tree : 0.16611 

Validation Accuracy random_forest : 0.22778 

Validation Accuracy SVM : 0.23167 

Validation Accuracy logistic_regression : 0.20778 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Validation Accuracy neural network : 0.19889 

Validation Accuracy cross-gradient boosting tree : 0.23222 

Validation Accuracy cross-gradient boosting : 0.24167 



### Fisher's LDA for dimension reduction

In [18]:
### Use LDA classifier ###
LDAclassifier = LinearDiscriminantAnalysis(n_components=7) # 7 classes in total 
LDAclassifier.fit(X_t, y_t)
y_pred = LDAclassifier.predict(X_val)
print('Validation Accuracy', ':', round(accuracy_score(y_val, y_pred), 5), '\n')

NameError: name 'X_t' is not defined

In [None]:
### Use LDA to reduce dimension, and apply other algorithms ###

def model_LDA(model, name):
    """training based on LDA for dimension-reduction"""
    # obtain LDA components for other algorithms
    LDA = LinearDiscriminantAnalysis()
    LDA.fit(X_t, y_t)
    X_t_LDA = LDA.transform(X_t)
    X_val_LDA = LDA.transform(X_val)      
    model.fit(X_t_LDA, y_t)
    y_pred = model.predict(X_val_LDA)
    print('Validation Accuracy', name, ':', round(accuracy_score(y_val, y_pred), 5), '\n')

In [None]:
for name, algorithm in algorithms.items():
    model_LDA(algorithm, name)

Validation Accuracy naive_Bayes : 0.55833 

Validation Accuracy SGD : 0.53278 

Validation Accuracy Decision_tree : 0.44833 

Validation Accuracy random_forest : 0.55167 

Validation Accuracy SVM : 0.55722 

Validation Accuracy logistic_regression : 0.54611 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Validation Accuracy neural network : 0.50556 

Validation Accuracy cross-gradient boosting tree : 0.54556 

Validation Accuracy cross-gradient boosting : 0.54333 



Conclusion: LDA is better than PCA for dimension reduction

## Cross-validation training

In [19]:
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [20]:
# use LDA to reduce dimensions
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train_scaled, y_train)
X_train_LDA = LDA.transform(X_train_scaled)

In [None]:
# SGD tuning: regularisation strength, penalty and loss 
sgd = SGDClassifier(max_iter=5000)

param_grid = {
    'alpha': [0.0001, 0.001, 0.01],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'loss': list(sgd.loss_functions)
}

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)

    grid_SGD = GridSearchCV(sgd, param_grid, cv=5, scoring='accuracy')
    grid_SGD.fit(X_train_LDA, y_train)

{'alpha': 0.001, 'loss': 'log_loss', 'penalty': 'l2'}

In [None]:
print(grid_SGD.best_params_)
grid_SGD.best_score_

0.6898333333333333

In [None]:
# Random Forest tuning:  
param_grid = {
    'n_estimators': [50, 100], # number of trees
    'max_depth': [None, 5, 10],
    'min_samples_split': [8, 10, 20], # minimum size of node for splitting
    'min_samples_leaf': [2, 4, 6],  # minimum leaf size 
    'max_features': ['auto', 'sqrt', 'log2', None] # number of features used in each tree
}

rf_classifier = RandomForestClassifier()

# Perform grid search with cross-validation
grid_RF = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')
grid_RF.fit(X_train_LDA, y_train)

In [None]:
print(grid_RF.best_params_)
grid_RF.best_score_

In [None]:
## tuning logistic regression

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # regularisation strength
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs', 'newton-cg']  # solver for optimisation
}

# Create the Logistic Regression model
logreg_model = LogisticRegression()

# Perform grid search with cross-validation
grid_logit = GridSearchCV(logreg_model, param_grid, cv=5, scoring='accuracy')
grid_logit.fit(X_train_LDA, y_train)

In [None]:
print(grid_logit.best_params_)
grid_logit.best_score_

In [None]:
# tunig gradient boosting random forest

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2], # step size shrinkage used in each boosting iteration
    'n_estimators': [50, 100], # number of trees
    'max_depth': [5, 7, 10], # max tree depth
    'subsample': [0.6, 0.8], # fraction of samples used for fitting each tree
    'colsample_bynode': [0.6, 0.8], # fraction of features used for fitting each tree
    'gamma': [0, 0.1, 0.2], # Minimum loss reduction required to make a further partition on a leaf node of the tree.
}

# Create the XGBRFClassifier
xgbrf_classifier = XGBRFClassifier(random_state=42)

# Perform grid search with cross-validation
grid_xgb = GridSearchCV(xgbrf_classifier, param_grid, cv=5, scoring='accuracy')
grid_xgb.fit(X_train_LDA, y_train)

In [None]:
print(grid_xgb.best_params_)
grid_xgb.best_score_

In [None]:
## tuning Adaboost
from sklearn.ensemble import AdaBoostClassifier

base_estimator = DecisionTreeClassifier()

# Define the parameter grid for AdaBoost
param_grid = {
    'n_estimators': [50, 100, 200], # number of trees
    'learning_rate': [0.01, 0.1, 0.5],
    'base_estimator__max_depth': [2, 4, 6] 
}

# Create the AdaBoostClassifier
adaboost = AdaBoostClassifier(base_estimator=base_estimator)

# grid search
grid_ada = GridSearchCV(adaboost, param_grid, cv=5, scoring='accuracy')
grid_ada.fit(X_train, y_train)

In [None]:
print(grid_ada.best_params_)
grid_ada.best_score_

## MLP training using tensorflow

In [12]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [87]:
# use LDA to reduce dimensions
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train_scaled, y_train)
X_train_LDA = LDA.transform(X_train_scaled)
X_t, X_val, y_t, y_val = train_test_split(X_train_LDA, y_train, test_size=0.3, random_state=1)

In [89]:
# define architecture 
model = models.Sequential([
    layers.InputLayer(input_shape=(X_t.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(8, activation='softmax')  # Output layer with 8 neurons for classification
])

# specify the way of training
model.compile(optimizer='adam', # 'adam', 'sgd', 'rmsprop'
              loss='sparse_categorical_crossentropy',  #  'sparse_categorical_crossentropy', 
              metrics=['accuracy'])

model.fit(X_t, y_t, epochs=100, batch_size=32, validation_data=(X_val, y_val))

test_loss, test_accuracy = model.evaluate(X_val, y_val)
print(f'approximate Test Accuracy: {test_accuracy * 100:.2f}%')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Superlearner
Ensemble method of the learners described above

In [11]:
from sklearn.model_selection import KFold

In [48]:
# create a list of base-models
classifiers = {}
classifiers['SGD_LM'] = SGDClassifier(max_iter=5000, alpha=0.001, loss='log_loss', penalty='l2')
classifiers['logistic'] = LogisticRegression(C=1, penalty='l1', solver='liblinear')
classifiers['Gradient_Boost'] = XGBRFClassifier(colsample_bynode=0.6, gamma=0, learning_rate=0.01, max_depth=7, n_estimators=100, subsample=0.6)

In [52]:
# collect out of fold predictions form k-fold cross validation
def out_of_fold_predictions(X, y, models):
	"""
	param X: input data
	param y: class labels
	models: based models for the super-learner
	"""
	X_out, y_out = list(), list()
	# define partition of data into 10 folds
	kfold = KFold(n_splits=10, shuffle=True)
	# enumerate splits
	for train_ids, test_ids in kfold.split(X):
		y_preds = list()
		# split data
		X = np.array(X)
		X_train, X_test = X[train_ids], X[test_ids]
		y = np.array(y)
		y_train, y_test = y[train_ids], y[test_ids]
		y_out.extend(y_test)
		# fit and make predictions with each base model
		for _, model in models.items():
			model.fit(X_train, y_train)
			y_probs = model.predict_proba(X_test)
			# store columns
			y_preds.append(y_probs)
		# store fold yhats as columns
		X_out.append(np.hstack(y_preds))
	return np.vstack(X_out), np.asarray(y_out)

In [50]:
def fit_base(X, y, models):
	for name, model in models.items():
		if name != 'MLP':
			model.fit(X, y)

## used on out-of-fold predictions
def fit_meta(X, y):
    model = LogisticRegression(solver='liblinear')
    model.fit(X, y)
    return model

def superlearner_predict(X, models, meta_model):
	meta_X = list()
	for _, model in models.items():
		y_probs = model.predict_proba(X)
		meta_X.append(y_probs)
	meta_X = np.hstack(meta_X)
	return meta_model.predict(meta_X)

In [53]:
# use LDA to reduce dimensions
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train_scaled, y_train)
X_train_LDA = LDA.transform(X_train_scaled)
# train-test split
X_t, X_val, y_t, y_val = train_test_split(X_train_LDA, y_train, test_size=0.3, random_state=1)

meta_X, meta_y = out_of_fold_predictions(X_t, y_t, classifiers)

fit_base(X_t, y_t, classifiers)
meta_model = fit_meta(meta_X, meta_y)

y_pred = superlearner_predict(X_val, classifiers, meta_model)
print('Super Learner test accuracy: %.3f' % (accuracy_score(y_val, y_pred) * 100))

Super Learner test accuracy: 69.222


## Export in csv format 

In [None]:
# Export the predictions on the test data in csv format
prediction = pd.DataFrame(y_pred, columns=['Genre'])
prediction.index.name='Id'
prediction.to_csv('myprediction.csv') # export to csv file

# The csv file should be of the form
#Id, Genre
#0, Folk
#1, Hip-Hop
#2, International
#...
#1998, Experimental
#1999, Pop