# Music Genre Classification using Machine Learning

 Use machine learning to train a model to classify audio tracks into genres. According to their “research” a machine learning model should be able to label the tracks into genres more accurately.

In [1]:
# import visualisation libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import os



In [2]:
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Colab Notebooks/Machine_learning')
print(os.getcwd())

ModuleNotFoundError: No module named 'google'

ModuleNotFoundError: No module named 'google'

# Read and Explore the Data

In [None]:
# Read and preview data
songs_data = pd.read_csv('Data/songs_data.csv')
songs_data.head()

In [None]:
# shape of the data
songs_data.shape

In [None]:
songs_data.info()

In [None]:
# summary statistics for all the data

songs_data.describe()

In [None]:
def identify_low_variance_features(df, std_threshold):
    """
    This definition can be used to identify features with low varaince

    @param df pandas DataFrame
    @param std_threshold int

    @return a list of features that have low variance
    """

    std_df = pd.DataFrame(df.std()).rename(columns = {0: 'standard_deviation'})

    low_var_features = list(std_df[std_df['standard_deviation'] < std_threshold].index)

    print("number of low variance features:", len(low_var_features))
    print("low variance features:", low_var_features)

    return low_var_features

In [None]:
# identify features with low std
identify_low_variance_features(songs_data, std_threshold=0.01)


## Handling Missing Data

In [None]:
# Identify missing data
songs_data.isnull().sum()

In [None]:
# Check which rows are missing in 'track_artist' column
songs_data[songs_data.track_artist.isnull()]

In [None]:
# Check which rows are missing in 'track_name' column
songs_data[songs_data.track_name.isnull()]

In [None]:
# drop rows with missing data

songs_data = songs_data.dropna(inplace=False)

## Handling Duplicates

In [None]:
# check if the data has any duplicates

songs_data[songs_data.duplicated(keep=False)]

# this shows two rows that are duplicated

In [None]:
# going to drop the first appearance of duplicated data

print("shape of data before dropping, with the duplicates:", songs_data.shape)

songs_data_cleaned = songs_data[~songs_data.duplicated()]

print("shape of data after dropping the duplictaes:", songs_data_cleaned.shape)
# we dropped one row of data

In [None]:
# Check if it worked
songs_data_cleaned[songs_data_cleaned.duplicated(keep=False)]


# Feature Engineering

We need to encode the types of genres to numbers

In [None]:
# Check which are the different genres we have
songs_data_cleaned.genre.unique()

In [None]:
# label_encoder object knows how to understand word labels.

label_encoder = preprocessing.LabelEncoder()

songs_data_cleaned['genre_encode']= label_encoder.fit_transform(songs_data_cleaned['genre'])

songs_data_cleaned['genre_encode'].unique()

In [None]:
# Check if it worked
songs_data_cleaned

In [None]:
# Check the distribution of the target

sns.distplot(songs_data_cleaned.genre_encode);


### Exploring the data after cleaned

In [None]:
# Correlation matrix
df = songs_data_cleaned
plt.figure(figsize=(20,12))
sns.heatmap(songs_data_cleaned.corr(numeric_only=True),annot=True, linewidth=2)
plt.tight_layout()

In [None]:
def identify_highly_correlated_features(df, correlation_threshold):
    """
    This definition can be used to identify highly correlated features

    @param df pandas DataFrame
    @param correlation_threshold int

    @return a DataFrame with highly correlated features
    """

    corr_matrix = df.corr(numeric_only = True).abs() # calculate the correlation matrix with
    high_corr_var = np.where(corr_matrix >= correlation_threshold) # identify variables that have correlations above defined threshold
    high_corr_var = [(corr_matrix.index[x], corr_matrix.columns[y], round(corr_matrix.iloc[x, y], 2))
                         for x, y in zip(*high_corr_var) if x != y and x < y] # identify pairs of highly correlated variables

    high_corr_var_df = pd.DataFrame(high_corr_var).rename(columns = {0: 'corr_feature',
                                                                     1: 'drop_feature',
                                                                     2: 'correlation_values'})


    if high_corr_var_df.empty:
        high_corr_var_df
    else:
        high_corr_var_df = high_corr_var_df.sort_values(by = 'correlation_values', ascending = False)

    return high_corr_var_df


In [None]:
identify_highly_correlated_features(songs_data_cleaned, correlation_threshold=0.9)


### @@@@@Selecting the feature for the ML model

In [None]:
# Drop the columns which are least correlated
df_cleaned = df.copy()

df_cleaned = df_cleaned.drop(['key', 'mode', 'liveness'], axis=1)


In [None]:
df_cleaned

In [None]:
# Check the distribution of the features & the target
df_cleaned.hist(figsize=(10, 10));

### Looking for outliers

In [None]:
# Specify the columns for box plots
columns = ['track_popularity','danceability', 'energy', 'loudness', 'speechiness',
           'acousticness', 'instrumentalness', 'valence', 'tempo',
           'duration_ms']

# Create subplots for each column
fig, axs = plt.subplots(nrows=len(columns), figsize=(10, 50))

# Generate box plots for each column
for i, column in enumerate(columns):
    sns.boxplot(x=df_cleaned[column], color='red', ax=axs[i])
    axs[i].set_title(column)

plt.tight_layout()
plt.show()


In [None]:
lista=['track_popularity','danceability', 'energy', 'loudness', 'speechiness',
           'acousticness', 'instrumentalness', 'valence','tempo',
           'duration_ms', 'genre_encode']
for column in lista:
    q1 = np.nanpercentile(df_cleaned[column], 25)
    q3 = np.nanpercentile(df_cleaned[column], 75)

    iqr = q3 - q1
    UB = q3 + 1.5 * iqr
    LB = q1 - 1.5 * iqr

    print("Column:", column)
    print("25th percentile:", q1)
    print("75th percentile:", q3)
    print("IQR:", iqr)
    print("Upper Bound (UB):", UB)
    print("Lower Bound (LB):", LB)

    outliers = df_cleaned[(df_cleaned[column] > UB) | (df_cleaned[column] < LB)]
    num_outliers = outliers.shape[0]

    print("Values above", UB, "and values below", LB, "are outliers")
    print("Number of outliers:", num_outliers)
    print()  # Adding an empty line for readability

## Building the ML Model

In [None]:
# define our feature and target
target = df_cleaned.genre_encode
features = df_cleaned.drop(['genre','genre_encode','track_id','track_name','track_artist'], axis = 1)

#### Scale of the features

In [None]:
scaler = MinMaxScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)

features_scaled

In [None]:
df_scaled = pd.DataFrame(features_scaled)
df_scaled

In [None]:
# the shape of the features data frame and the scaled features are the same
print(features.shape)
print(features_scaled.shape)

# Random Forest Model

In [None]:
# split the song data into training and test data

X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(features_scaled,
                                                                  target,
                                                                  test_size = 0.3,
                                                                  random_state = 8)

In [None]:
# print the shape of the training data

print("Training Data")
print("Shape of X_train", X_train_scaled.shape)
print("Shape of y_train", y_train.shape)

In [None]:
type(X_train_scaled)

In [None]:
# Import and Fit the Random Forest Model

clf_rf = RandomForestClassifier()

clf_rf.fit(X_train_scaled, y_train)

In [None]:
# Make predictions on unseen data

y_pred_rf = clf_rf.predict(X_test_scaled)
y_pred_rf

In [None]:
print("first five predicted values:", y_pred_rf[0:5])
print("first five actual values:", list(y_test[0:5]))

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print("accuracy:", round(accuracy_score(y_test, y_pred_rf), 2))
print("recall:", round(recall_score(y_test, y_pred_rf, average='macro'), 2))
print("precision:", round(precision_score(y_test, y_pred_rf, average='macro'), 2))
print("f1-score:", round(f1_score(y_test, y_pred_rf, average='macro'), 2))


In [None]:
# plot confusion matrix

cm = confusion_matrix(y_test, y_pred_rf)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=clf_rf.classes_)

disp.plot()
plt.show()

In [None]:
# print classification report

print(classification_report(y_test, y_pred_rf))


In [None]:
# Get feature importances
importances = clf_rf.feature_importances_

# Create a DataFrame to store feature names and importances
feature_importance_df = pd.DataFrame({'Feature': features.columns, 'Importance': importances})

# Sort the features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot the feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.show()


In [None]:
from sklearn.metrics import mean_squared_error
from matplotlib.legend_handler import HandlerLine2D

train_results = []
test_results = []
list_nb_estimators = [5, 10, 15, 30, 45, 60, 80, 100]

for nb_estimators in list_nb_estimators:
    clf_rf = RandomForestClassifier(n_estimators=nb_estimators)
    clf_rf.fit(X_train_scaled, y_train)

    train_results.append(mean_squared_error(y_train, clf_rf.predict(X_train_scaled)))
    test_results.append(mean_squared_error(y_test, clf_rf.predict(X_test_scaled)))

line1, = plt.plot(list_nb_estimators, train_results, color="r", label="Training Score")
line2, = plt.plot(list_nb_estimators, test_results, color="g", label="Testing Score")

plt.legend(handler_map={line1: HandlerLine2D(numpoints=1)})
plt.ylabel('MSE')
plt.xlabel('n_estimators')
plt.show()


# XGBoost Model

In [None]:
# Import the required libraries
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

# Create an instance of the XGBoost classifier
clf_xgb = XGBClassifier()

# Fit the classifier on the training data
clf_xgb.fit(X_train_scaled, y_train)

# Make predictions on unseen data
y_pred_xgb = clf_xgb.predict(X_test_scaled)

# Print the predicted values and actual values for the first five samples
print("First five predicted values:", y_pred_xgb[:5])
print("First five actual values:", list(y_test[:5]))

# Evaluate the performance of the model
print("Accuracy:", round(accuracy_score(y_test, y_pred_xgb), 2))
print("Recall:", round(recall_score(y_test, y_pred_xgb, average='macro'), 2))
print("Precision:", round(precision_score(y_test, y_pred_xgb, average='macro'), 2))
print("F1-Score:", round(f1_score(y_test, y_pred_xgb, average='macro'), 2))

# Plot the confusion matrix
cm = confusion_matrix(y_test, y_pred_xgb)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf_xgb.classes_)
disp.plot()
plt.show()

# Print the classification report
print(classification_report(y_test, y_pred_xgb))




# Random forest with tuning

In [None]:
RandomForestClassifier(n_estimators=500)


In [None]:
params = {'n_estimators':np.arange(100,500,100),
          'max_features':np.arange(0.1,1.0,0.2)
         }