In [None]:
import zipfile
import pandas as pd
import numpy as np
import os
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE

In [None]:
# Load the data
# zip_path = '/content/sample_data/Anuran Calls (MFCCs).zip'
# extract_dir = '/content/sample_data/anuran_calls'
zip_path = 'Anuran Calls (MFCCs).zip'
extract_dir = 'anuran_calls'
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
csv_file = None
for file in os.listdir(extract_dir):
    if file.endswith('.csv'):
        csv_file = os.path.join(extract_dir, file)
        break
data = pd.read_csv(csv_file)

In [7]:
# i)Research exact match and hamming score/ loss methods for evaluating multi-
# label classification and use them in evaluating the classifiers in this problem

# Select features and labels
X = data.select_dtypes(include=[np.number])
Y_family = data['Family']
Y_genus = data['Genus']
Y_species = data['Species']
enc_family = LabelEncoder()
enc_genus = LabelEncoder()
enc_species = LabelEncoder()
Y_family_encoded = enc_family.fit_transform(Y_family)
Y_genus_encoded = enc_genus.fit_transform(Y_genus)
Y_species_encoded = enc_species.fit_transform(Y_species)
# Spilt the data int train and test
X_train, X_test, y_family_train, y_family_test, y_genus_train, y_genus_test, y_species_train, y_species_test = train_test_split(
    X, Y_family_encoded, Y_genus_encoded, Y_species_encoded, test_size=0.2, random_state=42
)
# Train the SVM classifier
clf_family = SVC(kernel='linear', probability=True)
clf_genus = SVC(kernel='linear', probability=True)
clf_species = SVC(kernel='linear', probability=True)
clf_family.fit(X_train, y_family_train)
clf_genus.fit(X_train, y_genus_train)
clf_species.fit(X_train, y_species_train)
# Prediction
y_family_pred = clf_family.predict(X_test)
y_genus_pred = clf_genus.predict(X_test)
y_species_pred = clf_species.predict(X_test)
y_true = np.vstack([y_family_test, y_genus_test, y_species_test]).T
y_pred = np.vstack([y_family_pred, y_genus_pred, y_species_pred]).T
# Evalutate and report the  results
exact_match = np.mean(np.all(y_true == y_pred, axis=1))
hamming = np.mean(y_true !=y_pred)
hamming_score = 1 -hamming
print(f"Exact Match Rate: {exact_match:.2f}")
print(f"Hamming Loss: {hamming:.2f}")
print(f"Hamming Score: {hamming_score:.2f}")

Loaded file: /content/sample_data/anuran_calls/Frogs_MFCCs.csv
=== Evaluation Metrics ===
Exact Match Ratio: 0.9625
Hamming Loss: 0.0139
Hamming Score: 0.9861


In [8]:
# i)Train a SVM for each of the labels, using Gaussian kernels and one versus all
# classifiers. Determine the weight of the SVM penalty and the width of the
# Gaussian Kernel using 10 fold cross validation.

# Define the grid
param_grid = {'C': [0.1, 1, 10, 100],'gamma': [0.01, 0.1, 1, 10],'kernel': ['rbf']}
# Determine the weight of the SVM penalty and the width of the
# Gaussian Kernel using 10 fold cross validation
svc = SVC()
grid_search_family = GridSearchCV(svc, param_grid, cv=10, n_jobs=-1)
grid_search_genus = GridSearchCV(svc, param_grid, cv=10, n_jobs=-1)
grid_search_species = GridSearchCV(svc, param_grid, cv=10, n_jobs=-1)
# Fit the model with train data
grid_search_family.fit(X_train, y_family_train)
grid_search_genus.fit(X_train, y_genus_train)
grid_search_species.fit(X_train, y_species_train)
# Retrieve the best model
clf_family = grid_search_family.best_estimator_
clf_genus = grid_search_genus.best_estimator_
clf_species = grid_search_species.best_estimator_
# Predict the test set label
y_family_pred = clf_family.predict(X_test)
y_genus_pred = clf_genus.predict(X_test)
y_species_pred = clf_species.predict(X_test)
y_true = np.vstack([y_family_test, y_genus_test, y_species_test]).T
y_pred = np.vstack([y_family_pred, y_genus_pred, y_species_pred]).T
# Report the  results
exact_match = np.mean(np.all(y_true == y_pred, axis=1))
hamming = np.mean(y_true != y_pred)
hamming_score = 1 - hamming
print(f"Exact Match Rate: {exact_match:.2f}")
print(f"Hamming Loss: {hamming:.2f}")
print(f"Hamming Score: {hamming_score:.2f}")

Family best params: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
Genus best params: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
Species best params: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
=== Evaluation Metrics ===
Exact Match Ratio: 0.9993
Hamming Loss: 0.0007
Hamming Score: 0.9993


In [10]:
# iii) Repeat 6(b)ii with L1-penalized SVMs.R
# Normalization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Define the grid
param_grid = {'C': [0.01, 0.1, 1, 10, 100],'penalty': ['l1'],'dual': [False]}
# Determine the weight of the SVM penalty and the width of the
# Gaussian Kernel using 10 fold cross validation
svc_l1 = LinearSVC(max_iter=5000)
grid_search_family = GridSearchCV(svc_l1, param_grid, cv=10, n_jobs=-1)
grid_search_genus = GridSearchCV(svc_l1, param_grid, cv=10, n_jobs=-1)
grid_search_species = GridSearchCV(svc_l1, param_grid, cv=10, n_jobs=-1)
# Fit the model with train data
grid_search_family.fit(X_train, y_family_train)
grid_search_genus.fit(X_train, y_genus_train)
grid_search_species.fit(X_train, y_species_train)
# Retrieve the best model
clf_family = grid_search_family.best_estimator_
clf_genus = grid_search_genus.best_estimator_
clf_species = grid_search_species.best_estimator_
# Predict the test set label
y_family_pred = clf_family.predict(X_test)
y_genus_pred = clf_genus.predict(X_test)
y_species_pred = clf_species.predict(X_test)
y_true = np.vstack([y_family_test, y_genus_test, y_species_test]).T
y_pred = np.vstack([y_family_pred, y_genus_pred, y_species_pred]).T
# Evaluate and report the results
exact_match = np.mean(np.all(y_true == y_pred, axis=1))
hamming = np.mean(y_true != y_pred)
hamming_score = 1 - hamming
print(f"Exact Match Rate: {exact_match:.2f}")
print(f"Hamming Loss: {hamming:.2f}")
print(f"Hamming Score: {hamming_score:.2f}")



=== Evaluation Metrics (L1-penalized SVMs) ===
Exact Match Ratio: 0.9521
Hamming Loss: 0.0220
Hamming Score: 0.9780




In [11]:
# iv) Repeat 6(b)iii by using SMOTE or any other method you know to remedy
# class imbalance. Report your conclusions about the classifiers you trained
# Normalization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Spilt the data int train and test set
X_train, X_test, y_family_train, y_family_test, y_genus_train, y_genus_test, y_species_train, y_species_test = train_test_split(
    X_scaled, Y_family_encoded, Y_genus_encoded, Y_species_encoded, test_size = 0.2, random_state = 42
)
# Apply SMOTE
smote = SMOTE(random_state=42)
X_family_smote, y_family_smote = smote.fit_resample(X_train, y_family_train)
X_genus_smote, y_genus_smote = smote.fit_resample(X_train, y_genus_train)
X_species_smote, y_species_smote = smote.fit_resample(X_train, y_species_train)
# Set up the L1-penalized LinearSVC and grid
svc_l1 = LinearSVC(penalty='l1', dual=False, max_iter=5000)
# Do 10-fold Grid Search Cross-Validation
grid_search_family = GridSearchCV(svc_l1, {'C': [0.01, 0.1, 1, 10, 100]}, cv=10, n_jobs=-1)
grid_search_family.fit(X_family_smote, y_family_smote)
clf_family = grid_search_family.best_estimator_
grid_search_genus = GridSearchCV(svc_l1, {'C': [0.01, 0.1, 1, 10, 100]}, cv=10, n_jobs=-1)
grid_search_genus.fit(X_genus_smote, y_genus_smote)
clf_genus = grid_search_genus.best_estimator_
grid_search_species = GridSearchCV(svc_l1, {'C': [0.01, 0.1, 1, 10, 100]}, cv=10, n_jobs=-1)
grid_search_species.fit(X_species_smote, y_species_smote)
clf_species = grid_search_species.best_estimator_
# Predict
y_family_pred = clf_family.predict(X_test)
y_genus_pred = clf_genus.predict(X_test)
y_species_pred = clf_species.predict(X_test)
y_true = np.vstack([y_family_test, y_genus_test, y_species_test]).T
y_pred = np.vstack([y_family_pred, y_genus_pred, y_species_pred]).T
# Evaluate and report the  results
exact_match = np.mean(np.all(y_true == y_pred, axis=1))
hamming = np.mean(y_true != y_pred)
hamming_score = 1 - hamming
print(f"Exact Match Rate: {exact_match:.4f}")
print(f"Hamming Loss: {hamming:.4f}")
print(f"Hamming Score: {hamming_score:.4f}")



=== Evaluation Metrics (L1-penalized SVMs + SMOTE) ===
Exact Match Ratio: 0.9423
Hamming Loss: 0.0269
Hamming Score: 0.9731


