In [1]:
import pandas as pd

## (a) Load the Data Set and Divide It Into Training (and Test) Set.

In [2]:
# Load the data file.
df = pd.read_csv('Frogs_MFCCs.csv')

In [3]:
# Devide the dataset into training and test set.
df_train = df.sample(frac=0.7, random_state=42)
df_test = df.drop(df_train.index)

In [4]:
# Extract the features and labels.
X_train = df_train.iloc[:, 0:22]
y_train_family = df_train.iloc[:, 22]
y_train_genus = df_train.iloc[:, 23]
y_train_species = df_train.iloc[:, 24]

X_test = df_test.iloc[:, 0:22]
y_test_family = df_test.iloc[:, 22]
y_test_genus = df_test.iloc[:, 23]
y_test_species = df_test.iloc[:, 24]

## (b) Training and Testing the SVM models.

### (i) Exact Match and Hamming Score / Loss methods.

In [5]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import hamming_loss, accuracy_score

In [6]:
# Encode the labels into integer.
enc_family = LabelEncoder()
enc_genus = LabelEncoder()
enc_species = LabelEncoder()

y_test_family_enc = enc_family.fit_transform(y_test_family)
y_test_genus_enc = enc_genus.fit_transform(y_test_genus)
y_test_species_enc = enc_species.fit_transform(y_test_species)

# Cancatenate into a multi-label matrix.
Y_true = np.vstack([y_test_family_enc, y_test_genus_enc, y_test_species_enc]).T

# Functions used for exact match and hamming score / loss.
def exact_match(y_true, y_pred):
    return np.mean(np.all(y_true == y_pred, axis=1))

# Define the hamming score / loss function for multiclass.
def hamming_score(y_true, y_pred):
    total_labels = np.prod(y_true.shape)
    incorrect = np.sum(y_true != y_pred)
    return 1 - incorrect / total_labels

def hamming_loss_multi(y_true, y_pred):
    total_labels = np.prod(y_true.shape)
    incorrect = np.sum(y_true != y_pred)
    return incorrect / total_labels

### (ii) SVM with Gaussian kernels

In [7]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [8]:
# Label preparing
label_encoders = {}
Y_train = []
Y_test = []

for i, label in enumerate(['Family', 'Genus', 'Species']):
    l_enc = LabelEncoder()
    y_train_enc = l_enc.fit_transform(df_train.iloc[:, 22 + i])
    y_test_enc = l_enc.transform(df_test.iloc[:, 22 + i])
    Y_train.append(y_train_enc)
    Y_test.append(y_test_enc)
    label_encoders[label] = l_enc


In [9]:
# Training the SVM model.
svm_models = []
Y_pred = []

for i, y_train_enc in enumerate(Y_train):
    print(f"\nTraining SVM for label: {['Family', 'Genus', 'Species'][i]}")

    clf = OneVsRestClassifier(SVC(kernel='rbf', gamma='scale', C=1))

    # Cross validation
    scores = cross_val_score(clf, X_train, y_train_enc, cv=10)
    print("10-fold Cross Validation Accuracy: {:.4f} ± {:.4f}".format(scores.mean(), scores.std()))

    # Use training set fitting and predicting the data.
    clf.fit(X_train, y_train_enc)
    y_pred = clf.predict(X_test)
    Y_pred.append(y_pred)
    svm_models.append(clf)

# Splicing labels and evaluating the model.
Y_true = np.vstack(Y_test).T
Y_pred = np.vstack(Y_pred).T

print(" === Multi-Label Evaluation === ")
print("Exact Match Score: ", exact_match(Y_true, Y_pred))
print("Hamming Score: ", hamming_score(Y_true, Y_pred))
print("Hamming Loss: ", hamming_loss_multi(Y_true, Y_pred))



Training SVM for label: Family
10-fold Cross Validation Accuracy: 0.9674 ± 0.0044

Training SVM for label: Genus
10-fold Cross Validation Accuracy: 0.9684 ± 0.0046

Training SVM for label: Species
10-fold Cross Validation Accuracy: 0.9710 ± 0.0060
 === Multi-Label Evaluation === 
Exact Match Score:  0.9634089856415007
Hamming Score:  0.9762235602902578
Hamming Loss:  0.023776439709742164


### (iii) L1-Penalized SVM.

In [10]:
from sklearn.svm import LinearSVC

In [28]:
svm_l1_models = []
Y_pred_l1 = []

for i, y_train_enc in enumerate(Y_train):
    print(f"\nTraining L1-penalized SVM for label: {['Family', 'Genus', 'Species'][i]}")

    clf = OneVsRestClassifier(LinearSVC(penalty='l1', dual=False, C=1, max_iter=20000))

    # 10 fold CV
    scores = cross_val_score(clf, X_train, y_train_enc, cv=10)
    print("10-fold Cross Validation Accuracy: {:.4f} ± {:.4f}".format(scores.mean(), scores.std()))

    clf.fit(X_train, y_train_enc)
    y_pred = clf.predict(X_test)
    Y_pred_l1.append(y_pred)
    svm_l1_models.append(clf)

# Splicing labels.
Y_pred_l1 = np.vstack(Y_pred_l1).T

# Evaluating the model.
print(" === Multi-Label Evaluation for L1-Penalized SVM === ")
print("Exact Match Score: ", exact_match(Y_true, Y_pred_l1))
print("Hamming Score: ", hamming_score(Y_true, Y_pred_l1))
print("Hamming Loss: ", hamming_loss_multi(Y_true, Y_pred_l1))


Training L1-penalized SVM for label: Family
10-fold Cross Validation Accuracy: 0.9321 ± 0.0109

Training L1-penalized SVM for label: Genus
10-fold Cross Validation Accuracy: 0.9398 ± 0.0080

Training L1-penalized SVM for label: Species
10-fold Cross Validation Accuracy: 0.9523 ± 0.0082
 === Multi-Label Evaluation for L1-Penalized SVM === 
Exact Match Score:  0.9138490041685966
Hamming Score:  0.9479697390767331
Hamming Loss:  0.052030260923266944


### (iv) L1-Penalized SVM with SMOTE.

In [20]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from collections import Counter

In [21]:
# Using SMOTE on each label
X_train_smote_list = []
y_train_smote_list = []

for i, y_train_enc in enumerate(Y_train):
    print(f"\nTraining L1-penalized SVM with SMOTE for label: {['Family', 'Genus', 'Species'][i]}")

    # Count the number of classes in current label
    counter = Counter(y_train_enc)
    # Find the maximum number of samples
    max_samples = max(counter.values())

    # Create new sampling_strategy:
    target_samples = int(max_samples * 0.5)
    sampling_strategy = {cls: min(target_samples, n_samples) if n_samples < target_samples else n_samples 
                         for cls, n_samples in counter.items()}

    print(f"Sampling strategy: {sampling_strategy}")

    # Create dynamic smote strategy for each column.
    smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train_enc)
    X_train_smote_list.append(X_resampled)
    y_train_smote_list.append(y_resampled)

# Training SMOTE L1-SVM
svm_l1_smote_models = []
Y_pred_l1_smote = []

for i in range(3):
    print(f"\nTraining L1-SVM after SMOTE for label: {['Family', 'Genus', 'Species'][i]}")

    clf = OneVsRestClassifier(LogisticRegression(penalty='l1', solver='saga', C=1, max_iter=5000))

    # Cross validation by using data after SMOTE
    scores = cross_val_score(clf, X_train_smote_list[i], y_train_smote_list[i], cv=10)
    print("10-fold Cross Validation Accuracy after SMOTE: {:.4f} ± {:.4f}".format(scores.mean(), scores.std()))

    clf.fit(X_train_smote_list[i], y_train_smote_list[i])
    y_pred = clf.predict(X_test)
    Y_pred_l1_smote.append(y_pred)
    svm_l1_smote_models.append(clf)

Y_pred_l1_smote = np.vstack(Y_pred_l1_smote).T

print(" === Multi-Label Evaluation for L1-Penalized SVM === ")
print("Exact Match Score: ", exact_match(Y_true, Y_pred_l1_smote))
print("Hamming Score: ", hamming_score(Y_true, Y_pred_l1_smote))
print("Hamming Loss: ", hamming_loss_multi(Y_true, Y_pred_l1_smote))


Training L1-penalized SVM with SMOTE for label: Family
Sampling strategy: {3: 3097, 2: 1507, 1: 379, 0: 53}

Training L1-penalized SVM with SMOTE for label: Genus
Sampling strategy: {0: 2898, 3: 1114, 1: 379, 2: 218, 7: 95, 4: 199, 6: 53, 5: 80}

Training L1-penalized SVM with SMOTE for label: Species
Sampling strategy: {1: 2420, 4: 327, 5: 787, 0: 478, 2: 379, 3: 218, 9: 95, 6: 199, 8: 53, 7: 80}

Training L1-SVM after SMOTE for label: Family
10-fold Cross Validation Accuracy after SMOTE: 0.9279 ± 0.0126

Training L1-SVM after SMOTE for label: Genus
10-fold Cross Validation Accuracy after SMOTE: 0.9343 ± 0.0058

Training L1-SVM after SMOTE for label: Species
10-fold Cross Validation Accuracy after SMOTE: 0.9460 ± 0.0071
 === Multi-Label Evaluation for L1-Penalized SVM === 
Exact Match Score:  0.9096804075961094
Hamming Score:  0.9450362822294271
Hamming Loss:  0.054963717770572795


### (v) Try to solve the problem with Classifier Chain method.

In [23]:
from sklearn.multioutput import ClassifierChain

In [25]:
base_lr = LogisticRegression(penalty='l1', solver='saga', C=1, max_iter=5000)

# Build the Classifier Chain
chain = ClassifierChain(base_lr, order='random', random_state=42)

# Make all labels a matrix
Y_train_matrix = np.vstack(Y_train).T
Y_test_matrix = np.vstack(Y_test).T

# Model training
chain.fit(X_train, Y_train_matrix)

# Predicting
Y_pred_chain = chain.predict(X_test)

# Evaluating
print("\n== Multi-Label Evaluation for Classifier Chain ==")
print("Exact Match Score:", exact_match(Y_test_matrix, Y_pred_chain))
print("Hamming Score:", hamming_score(Y_test_matrix, Y_pred_chain))
print("Hamming Loss:", hamming_loss_multi(Y_test_matrix, Y_pred_chain))


== Multi-Label Evaluation for Classifier Chain ==
Exact Match Score: 0.9231125521074571
Hamming Score: 0.9288250733364212
Hamming Loss: 0.07117492666357882


### (vi) Confusion Matrices, Precision, Recall, ROC and AUC

In [46]:
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt

In [67]:
def evaluate_multilabel_model(Y_true, Y_pred, Y_score, model_name, label_names=['Family', 'Genus', 'Species']):
    print(f"\n==== {model_name} ===\n")

    for i, label_name in enumerate(label_names):
        print(f"--- {label_name} ---")

        # Confusion Matrix
        cm = confusion_matrix(Y_true[:, i], Y_pred[:, i])
        print("Confusion Matrix:\n", cm)

        # Classification Report (Precision, Recall, F1)
        print(classification_report(Y_true[:, i], Y_pred[:, i], zero_division=0))

        # AUC and ROC
        if Y_score is not None:
            n_classes = len(np.unique(Y_true[:, i]))
            if n_classes == 2:
                # Binary class
                auc_value = roc_auc_score(Y_true[:, i], Y_score[:, i])
                print(f"{label_name} AUC: {auc_value:.4f}")

                fpr, tpr, thresholds = roc_curve(Y_true[:, i], Y_score[:, i])

                plt.figure()
                plt.plot(fpr, tpr, label=f'ROC curve (AUC = {auc_value:.4f})')
                plt.plot([0, 1], [0, 1], 'k--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title(f'ROC Curve for {label_name} - {model_name}')
                plt.legend(loc="lower right")
                plt.show()

            else:
                # Multiclass
                print(f"{label_name} is multiclass ({n_classes} classes), skip AUC and ROC curve (need full predict_proba matrix)")

In [68]:
Y_score_smote = []

for model in svm_l1_smote_models:
    score = model.predict_proba(X_test)
    if score.ndim == 3:
        score = score[:, :, 1]
    Y_score_smote.append(score[:, 1])  

Y_score_smote = np.vstack(Y_score_smote).T

In [69]:
Y_score_chain = []

X_aug = X_test.to_numpy().copy()

for idx, estimator in enumerate(chain.estimators_):
    score = estimator.predict_proba(X_aug)
    
    if score.shape[1] == 2:
        Y_score_chain.append(score[:, 1])
    else:
        Y_score_chain.append(np.max(score, axis=1))
    
    X_aug = np.hstack((X_aug, np.argmax(score, axis=1).reshape(-1, 1)))

# Stack
Y_score_chain = np.vstack(Y_score_chain).T

In [70]:
evaluate_multilabel_model(Y_true, Y_pred, None, "Gaussian Kernel SVM")
evaluate_multilabel_model(Y_true, Y_pred_l1, None, "L1-penalized Linear SVM")
evaluate_multilabel_model(Y_true, Y_pred_l1_smote, Y_score_smote, "L1-penalized Logistic Regression with SMOTE")
evaluate_multilabel_model(Y_true, Y_pred_chain, Y_score_chain, "Classifier Chain Logistic Regression")


==== Gaussian Kernel SVM ===

--- Family ---
Confusion Matrix:
 [[   0    0   13    2]
 [   0  159    4    0]
 [   0    1  640   17]
 [   0    2   10 1311]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        15
           1       0.98      0.98      0.98       163
           2       0.96      0.97      0.97       658
           3       0.99      0.99      0.99      1323

    accuracy                           0.98      2159
   macro avg       0.73      0.73      0.73      2159
weighted avg       0.97      0.98      0.97      2159

--- Genus ---
Confusion Matrix:
 [[1245    2    2    3    0    0    0    0]
 [   0  161    2    0    0    0    0    0]
 [  12    1   78    0    0    0    0    1]
 [   3    0    1  472    2    1    0    0]
 [   1    0    1    3   66    0    0    0]
 [   1    0    2    9    0   22    0    0]
 [   1    0    0    0    1    0   13    0]
 [   0    0    0    3    0    0    0   50]]
              precision    re