# Classification Machine Learning Methods
The aim is to classify songs by their corresponding musical genre.

### Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Dataset path and needed parameters
Update the paths below to match your environment before executing.

In [ ]:
# Input data
# CSV file containing the dataset used for classification
cdata_path: str = 'df_marks.csv'

# Run-time switches
plots: bool = True    # When True, a plot of the data inside the dataset is produced

### Data loading

In [ ]:
# Read the CSV: fields are separated by ';' and decimals are written with ','
data = pd.read_csv(
    cdata_path,
    sep=';',
    decimal=",",
    index_col=None,
)

# Column 0 is the target; columns 1-5 are the features.
# Feature cells that cannot be parsed as numbers become NaN (errors='coerce').
y = data.iloc[:, 0]
X = data.iloc[:, 1:6].apply(pd.to_numeric, errors='coerce')

# Hold out 20 % of the rows as the test split; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
)

# Represent data

In [ ]:
if plots:
    # One fixed colour and marker per genre so all feature plots share a legend
    color_map = {'Alternative': 'red', 'Pop': 'blue', 'Classical': 'green', 'Rock': 'purple', 'Dance': 'orange',
                 'Techno': 'brown'}
    marker_map = {'Alternative': 'o', 'Pop': 's', 'Classical': '^', 'Rock': 'v', 'Dance': 'p', 'Techno': '*'}

    # savefig does not create missing directories, so make sure the target exists
    os.makedirs('new_data', exist_ok=True)

    # One scatter plot per feature: feature value on x, genre on y
    for col in X.columns:
        plt.figure(figsize=(10, 8))
        for label in y.unique():
            mask = y == label
            subset = X.loc[mask, col].round(3)
            # .get() returns None for a genre missing from the maps, which lets
            # matplotlib fall back to its defaults instead of raising KeyError
            plt.scatter(subset, y[mask], label=f'{label}',
                        color=color_map.get(label), marker=marker_map.get(label))
        plt.title(f'{col}')
        plt.xlabel('Values')
        plt.ylabel('Genres')
        plt.legend()
        plt.savefig(f'new_data/{col}.png')

## Models
The classification models available in this project are:
1. Quadratic Discriminant Analysis (QDA)
2. Random Forest
3. Classification Tree

### Quadratic Discriminant Analysis (QDA)
In this model, the steps are as follows:
1. One-hot encoding. The classification variable is categorical and must be encoded as numbers
2. The QDA model is called from the sklearn.discriminant_analysis library
3. The model is trained

In [ ]:
# Container for the metric dictionaries produced by the evaluation step
results = []

# The encoder needs 2-D input, so view each label vector as a single column
y_train_reshaped = y_train.values.reshape(-1, 1)
y_test_reshaped = y_test.values.reshape(-1, 1)

# One-hot encode the labels (fitted on the training labels only), then collapse
# every one-hot row to the index of its active column: one integer id per sample
encoder = OneHotEncoder()
train_one_hot = encoder.fit_transform(y_train_reshaped).toarray()
test_one_hot = encoder.transform(y_test_reshaped).toarray()
y_train_encoded = train_one_hot.argmax(axis=1)
y_test_encoded = test_one_hot.argmax(axis=1)

# Build the QDA classifier and fit it on the training split
model = QuadraticDiscriminantAnalysis()
model.fit(X_train, y_train_encoded)

### Quadratic Discriminant Analysis (QDA)
Continuing with the process: once the model is trained, it is time to evaluate it
4. Make predictions on the test set
5. Evaluate the model and store the result
6. Plot the results

In [ ]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model and store the result
# Macro averaging weights every class equally; zero_division=1 scores a class
# with no predicted/true samples as 1 instead of emitting a warning
acc = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, average='macro', zero_division=1)
recall = recall_score(y_test_encoded, y_pred, average='macro', zero_division=1)
f1 = f1_score(y_test_encoded, y_pred, average='macro', zero_division=1)

results.append({'Model': 'QDA', 'Accuracy': acc, 'Precision': precision, 'Recall': recall, 'F1_score': f1})
print(f'>acc={acc:.3f}, precision={precision:.3f}, rll={recall:.3f}, f1={f1:.3f}')

# Plot the confusion matrix
# The integer class ids are column indices of the one-hot encoding, so the genre
# name for id i is encoder.categories_[0][i]. y_train.unique() is in order of
# appearance and would attach the wrong names to the rows/columns.
class_labels = encoder.categories_[0]
# Passing labels keeps the matrix square over all known classes even when a
# class is absent from both the test targets and the predictions
cm = confusion_matrix(y_test_encoded, y_pred, labels=np.arange(len(class_labels)))
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
# savefig does not create missing directories
os.makedirs("QDA", exist_ok=True)
plt.savefig("QDA/Confusion_Matrix.png")

### Random Forest
In this model, the steps are as follows:
1. Load the data
2. Execute the cross validation method over the data
3. Define the model and its space
4. Apply the search definition for the cross validation
5. Train the models and return the best one
6. Test the model and save the results

In [ ]:
# Nested cross-validation of a Random Forest classifier.
# The outer 10-fold loop estimates generalisation; inside each outer fold a
# 3-fold grid search picks the hyper-parameters on the training part only.

# Result list
outer_results = list()

# Other variables
num_forests = 0     # Index of the current outer fold, used as the output sub-directory name

# Search Variables definition (the same for every fold, so it is built once)
space = {
    'n_estimators': [50, 70, 110],
    'max_depth': [5, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
export_name = 'RF'

# Apply the plot style before any figure is created so every figure uses it
sns.set_style("whitegrid")

# Enumerate splits
cv_outer = KFold(n_splits=10, shuffle=True, random_state=2)
for train, test in cv_outer.split(data.iloc[:, 0:6]):
    # Split data
    X_train, X_test = X.iloc[train, 0:6], X.iloc[test, 0:6]
    y_train, y_test = y.iloc[train], y.iloc[test]

    # Configure the cross-validation procedure
    cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)

    # Model selection
    model = RandomForestClassifier(random_state=1)

    # Search definition (refit=True retrains the best configuration on the whole training fold)
    search = GridSearchCV(model, space, scoring='accuracy', cv=cv_inner, refit=True)

    # Search
    result = search.fit(X_train, y_train)

    # Save the best model
    best_model = result.best_estimator_

    # Model Evaluation
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=1)

    # Store result for the outer one
    outer_results.append({'Model': 'Random_Forest', 'Accuracy': acc, 'Precision': precision, 'Recall': recall, 'F1_score': f1})

    # Plot the results and the most important info of the model
    # Feature Importance
    feature_importance = best_model.feature_importances_
    feature_names = best_model.feature_names_in_

    # Plot Feature Importance
    plt.figure(figsize=(20, 12))
    ax = sns.barplot(x=feature_importance, y=feature_names, palette="viridis")
    plt.title('Feature Importance in Random Forest', fontweight='bold')
    plt.xlabel('Importance', fontweight='bold')
    plt.ylabel('Features', fontweight='bold')
    # Annotate every bar with its feature name and importance value
    for i, (name, importance) in enumerate(zip(feature_names, feature_importance)):
        ax.text(importance, i, f'{name}: {importance:.4f}', ha='left',
                va='center')
    plt.tight_layout()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(True)
    ax.spines['left'].set_visible(True)
    ax.set_xlim(0, max(feature_importance) * 1.1)
    os.makedirs(f"Forest/{num_forests}", exist_ok=True)
    plt.savefig(f"Forest/{num_forests}/feature_importance.png")

    # Forest plot
    # plot_tree expects class names in the model's own class order, so take them
    # from classes_ rather than from a hand-written list
    class_names = [str(c) for c in best_model.classes_]
    for tree_index, tree_estimator in enumerate(best_model.estimators_):
        plt.figure(figsize=(192, 108))
        plot_tree(tree_estimator, feature_names=best_model.feature_names_in_,
                  class_names=class_names,
                  filled=True, rounded=True)
        plt.title(f"Decision Tree {tree_index}", fontweight='bold')
        plt.savefig(f"Forest/{num_forests}/{tree_index}.png")
        plt.close()

    # Report progress once per outer fold. The counter must advance only after
    # every tree of this fold has been saved, otherwise later trees would be
    # written to a directory that was never created.
    print(f'>acc={acc:.3f}, est={result.best_score_:.3f}, cfg={result.best_params_}')
    print(f'>precision={precision:.3f}, rll={recall:.3f}, f1={f1:.3f}')
    num_forests += 1

### Classification Tree
In this model, the steps are as follows:
1. Load the data
2. Execute the cross validation method over the data
3. Define the model and its space
4. Apply the search definition for the cross validation
5. Train the models and return the best one
6. Test the model and save the results

In [ ]:
# Nested cross-validation of a single decision tree classifier.
# The outer 10-fold loop estimates generalisation; inside each outer fold a
# 3-fold grid search picks the hyper-parameters on the training part only.

# Result list
# NOTE(review): this rebinds the same `outer_results` name that the Random Forest
# cell fills, so those entries are no longer reachable through it afterwards.
outer_results = list()

# Other variables
num_trees = 0       # Number of outer folds processed so far, used in the output file name

# Search Variables definition (the same for every fold, so it is built once)
# NOTE(review): a list of dicts makes GridSearchCV search each dict as a separate
# grid, i.e. one parameter is varied at a time with the others at their defaults.
# Merge them into a single dict if a joint search over all combinations is wanted.
space = [
    {'criterion': ['gini', 'entropy', 'log_loss']},
    {'splitter': ['best', 'random']},
    {'max_leaf_nodes': list(range(2, 100))},
    {'min_samples_split': [2, 3, 4]}
]
export_name = 'DT'

# savefig does not create missing directories
os.makedirs("ClassifTree", exist_ok=True)

# Enumerate splits
cv_outer = KFold(n_splits=10, shuffle=True, random_state=2)

for train, test in cv_outer.split(data.iloc[:, 0:6]):
    # Split data
    X_train, X_test = X.iloc[train, 0:6], X.iloc[test, 0:6]
    y_train, y_test = y.iloc[train], y.iloc[test]

    # Configure the cross-validation procedure
    cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)

    # Model selection
    model = DecisionTreeClassifier(random_state=1)

    # Search definition (refit=True retrains the best configuration on the whole training fold)
    search = GridSearchCV(model, space, scoring='accuracy', cv=cv_inner, refit=True)

    # Search
    result = search.fit(X_train, y_train)

    # Save the best model
    best_model = result.best_estimator_

    # Model Evaluation
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=1)

    # Store result for the outer one
    outer_results.append({'Model': 'Classification', 'Accuracy': acc, 'Precision': precision, 'Recall': recall, 'F1_score': f1})

    # Plot and print the results
    num_trees += 1
    fig = plt.figure(figsize=(192, 108))
    # class_names must follow the model's own class order, so take them from
    # classes_ rather than from a hand-written list.
    # NOTE(review): the feature display names are assumed to match the order of
    # the columns of X - confirm against the dataset header.
    _ = tree.plot_tree(best_model,
                       feature_names=['Beats per song', 'Danceability', 'Loudness (dB)',
                                      'Spectral Rolloff', 'Spectral Centroid'],
                       class_names=[str(c) for c in best_model.classes_],
                       filled=True)
    # NOTE(review): the file name embeds the estimator's repr; `export_name` looks
    # like the intended prefix but is left unused here to keep output paths stable.
    plt.savefig(f"ClassifTree/{best_model}_{num_trees}.png")
    # Release the very large figure, as the Random Forest cell does for its trees
    plt.close(fig)

    # Report progress
    print(f'>acc={acc:.3f}, est={result.best_score_:.3f}, cfg={result.best_params_}')
    print(f'>precision={precision:.3f}, rll={recall:.3f}, f1={f1:.3f}')