# SML Pratical

Music Genre Classification


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load dataset

In [None]:
# Load the training data and the test inputs
X_train = pd.read_csv('X_train.csv', index_col = 0, header=[0, 1, 2]) # inputs of the training set
y_train = pd.read_csv('y_train.csv', index_col = 0).squeeze('columns') # outputs of the training set
X_test = pd.read_csv('X_test.csv', index_col = 0, header=[0, 1, 2]) # inputs of the test set

In [42]:
class_label_mapping = {}

def transform_labels_to_numbers(labels):
    unique_labels = set(labels)
    
    for i, label in enumerate(unique_labels):
        class_label_mapping[label] = i

    transformed_labels = [class_label_mapping[label] for label in labels]
    
    return transformed_labels, class_label_mapping

y_train, label_mapping = transform_labels_to_numbers(y_train)

In [None]:
# total number of rows and columns(attributes)
n, p = np.shape(X_train)
# Entries (i,j) correspond to the j'th dimension of the observation i
X_train

In [None]:
print(f"unique features: {X_train.columns.get_level_values('feature').unique().tolist()}")
print(f"statistics used: {X_train.columns.get_level_values('statistics').unique().tolist()}")

In [None]:
#  (may not be useful) plot correlations for each set of statistics

statistics = X_train.columns.get_level_values('statistics').unique()

for statistic in statistics:
    # obtain the columns for each feature
    cols = [col for col in X_train if col[1] == statistic]
    # find the correlation matrix
    corr = X_train[cols].corr()

    # Generate a mask for the upper triangle
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(16, 11))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(0, 25, as_cmap=True, s = 90, l = 45, n = 5)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

    plt.title(f'Correlation Heatmap (for {statistic})', fontsize = 25)
    plt.xticks(fontsize = 10)
    plt.yticks(fontsize = 10)

In [None]:
#y_train contains the true class:  Electronic, Experimental, Folk, Hip-Hop, Instrumental, International, Pop or Rock
classes = np.unique(y_train)
classes

In [None]:
# X_test is the array of test inputs, of the same format as X_train. The objective is to predict the class (Electronic, Experimental, Folk, Hip-Hop, Instrumental, International, Pop or Rock) of the output
np.shape(X_test)

## Pre-processing and dimension reduction

In [None]:
from sklearn import preprocessing

### normalise the training set and test set together ###
X = pd.concat([X_train, X_test], ignore_index=True)
scaler = preprocessing.MinMaxScaler()
X_scaled = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns)
X_train_scaled = X_scaled.iloc[:6000,:]
X_test_scaled = X_scaled.iloc[6000:, :]

In [None]:
### use PCA to reduce the dimension ###
from sklearn.decomposition import PCA

pca = PCA(n_components=p)
# find the principal compoennts
pc = pd.DataFrame(data = pca.fit_transform(X_train_scaled), columns = [f'PC {i}' for i in range(1, p+1)])

# concatenate labels 
Df_PCA = pd.concat([pc, y_train], axis=1)

explained_variances = pca.explained_variance_ratio_


In [None]:
plt.plot(range(1, 101), explained_variances[:100])
plt.title('explained variances by principal components')
plt.xlabel('PC index')
plt.ylabel('ratio of explained variance')


In [None]:
n_PCA = 20

elbow method: take around 20 PCs as features

In [None]:
# plot the first two principal components (useless plot, messy)

plt.figure(figsize = (16, 9))
sns.scatterplot(x='PC 1', y='PC 2', hue=Df_PCA['Genre'], data=Df_PCA.iloc[:, :2], alpha=0.5)


plt.title('Plot of first two components, with the genre represented by colour', fontsize=17)
plt.xlabel('first principal component', fontsize=14)
plt.xlabel('second principal component', fontsize=14)

## Training Models

In [40]:
### Try Various Machien Learning Algorithms ###
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from xgboost import XGBClassifier, XGBRFClassifier
from xgboost import plot_tree, plot_importance

In [46]:
X_t, X_val, y_t, y_val = train_test_split(X_train_scaled, y_train, test_size=0.3, random_state=1)

## use PCA to reduce dimension. n = 20
pca = PCA(n_components=n_PCA)
X_t_PC = pd.DataFrame(data = pca.fit_transform(X_t), columns = [f'PC {i}' for i in range(1, n_PCA+1)])
pca = PCA(n_components=n_PCA)
X_val_PC = pd.DataFrame(data = pca.fit_transform(X_val), columns = [f'PC {i}' for i in range(1, n_PCA+1)])

In [47]:
def model_PCA(model, name):
    model.fit(X_t_PC, y_t)
    y_pred = model.predict(X_val_PC)
    print('Validation Accuracy', name, ':', round(accuracy_score(y_val, y_pred), 5), '\n')

In [48]:
## casual trainings with no tuning
nb =  GaussianNB()
sgd = SGDClassifier(max_iter=4000)
tree = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators=500, max_depth=20)
svm = SVC(decision_function_shape="ovo")
lg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(200, 10), random_state=1)
xgb = XGBClassifier(n_estimators=500, learning_rate=0.04)
xgbrf = XGBRFClassifier(objective= 'multi:softmax')

algorithms = {
    'naive_Bayes': nb,
    'SGD' : sgd,
    'Decision_tree': tree,
    'random_forest': rf,
    'SVM': svm,
    'logistic_regression': lg,
    'neural network': nn,
    'cross-gradient boosting tree': xgb,
    'cross-gradient boosting': xgbrf
}

for name, algorithm in algorithms.items():
    model_PCA(algorithm, name)

Validation Accuracy naive_Bayes : 0.26778 

Validation Accuracy SGD : 0.18611 

Validation Accuracy Decision_tree : 0.16611 

Validation Accuracy random_forest : 0.22778 

Validation Accuracy SVM : 0.23167 

Validation Accuracy logistic_regression : 0.20778 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Validation Accuracy neural network : 0.19889 

Validation Accuracy cross-gradient boosting tree : 0.23222 

Validation Accuracy cross-gradient boosting : 0.24167 



## LDA

In [49]:
### Use LDA to reduce dimension instead ###
LDAclassifier = LinearDiscriminantAnalysis(n_components=7) # 7 classes in total 
LDAclassifier.fit(X_t, y_t)
y_pred = LDAclassifier.predict(X_val)
print('Validation Accuracy', ':', round(accuracy_score(y_val, y_pred), 5), '\n')

Validation Accuracy : 0.54833 



In [50]:
def model_LDA(model, name):
    """training based on LDA for dimension-reduction"""
    # obtain LDA components for other algorithms
    LDA = LinearDiscriminantAnalysis()
    LDA.fit(X_t, y_t)
    X_t_LDA = LDA.transform(X_t)
    X_val_LDA = LDA.transform(X_val)      
    model.fit(X_t_LDA, y_t)
    y_pred = model.predict(X_val_LDA)
    print('Validation Accuracy', name, ':', round(accuracy_score(y_val, y_pred), 5), '\n')

In [51]:
for name, algorithm in algorithms.items():
    model_LDA(algorithm, name)

Validation Accuracy naive_Bayes : 0.55833 

Validation Accuracy SGD : 0.53278 

Validation Accuracy Decision_tree : 0.44833 

Validation Accuracy random_forest : 0.55167 

Validation Accuracy SVM : 0.55722 

Validation Accuracy logistic_regression : 0.54611 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Validation Accuracy neural network : 0.50556 

Validation Accuracy cross-gradient boosting tree : 0.54556 

Validation Accuracy cross-gradient boosting : 0.54333 



Conclusion: LDA is better than PCA for dimension reduction

## Cross-validation training

In [53]:
import warnings

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning

# use LDA to reduce dimensions
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train_scaled, y_train)
X_train_LDA = LDA.transform(X_train_scaled)

In [58]:
# SGD tuning: regularisation strength, penalty and loss 
param_grid = {
    'alpha': [0.0001, 0.001, 0.01],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'loss': ['huber', 'squared_error', 'hinge', 'perceptron', 'epsilon_insensitive', 'log_loss', 'squared_hinge', 'squared_epsilon_insensitive', 'modified_huber']
}

sgd = SGDClassifier(max_iter=5000)

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)

    grid_SGD = GridSearchCV(sgd, param_grid, cv=5, scoring='accuracy')
    grid_SGD.fit(X_train_LDA, y_train)

grid_SGD.best_params_

{'alpha': 0.001, 'loss': 'log_loss', 'penalty': 'l2'}

In [59]:
grid_SGD.best_score_

0.6898333333333333

In [61]:
# Random Forest tuning: max depth, minimum leaf size, minimum node split size, number of features used in each tree
param_grid = {
    'n_estimators': [100, 250, 500, 1000],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

rf_classifier = RandomForestClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_LDA, y_train)

## Export in csv format 

In [None]:
# Export the predictions on the test data in csv format
prediction = pd.DataFrame(y_pred, columns=['Genre'])
prediction.index.name='Id'
prediction.to_csv('myprediction.csv') # export to csv file

# The csv file should be of the form
#Id, Genre
#0, Folk
#1, Hip-Hop
#2, International
#...
#1998, Experimental
#1999, Pop