# Music Genre Classification
## SVD + MFCC + Spectral Features with SVM

**Objective:** Achieve 75–80% classification accuracy on the GTZAN dataset

**Method:** Enhanced SVD from STFT + MFCC + Spectral features + SVM classifier

## 1. Setup and Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm.notebook import tqdm
import time
import os

import librosa
from scipy.linalg import svd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
import joblib

# Seed NumPy's global RNG so code that draws from it behaves the same on each run.
np.random.seed(42)
# Apply a single matplotlib style sheet to every figure produced below.
plt.style.use('seaborn-v0_8-darkgrid')

print("All libraries imported successfully!")

## 2. Load Dataset

In [None]:
# Root folder that holds one sub-directory of .wav clips per genre.
DATASET_PATH = '/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original'

print("="*70)
print("STEP 1: Loading Dataset")
print("="*70)

# Three parallel lists: entry i of each describes the same clip.
audio_data = []   # waveform returned by librosa.load
labels = []       # genre name (the sub-directory the clip came from)
file_paths = []   # path of the source file, as a string

genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 
          'jazz', 'metal', 'pop', 'reggae', 'rock']

for genre in genres:
    genre_path = Path(DATASET_PATH) / genre
    # glob() yields files in filesystem order, which varies between machines.
    # Sorting fixes the sample order so the seeded train/test split below
    # selects the same clips on every run.
    audio_files = sorted(genre_path.glob('*.wav'))

    print(f"Loading {genre}...", end=' ')
    if not audio_files:
        # A wrong DATASET_PATH would otherwise only show up as "0 files".
        print(f"\nWarning: no .wav files found in {genre_path}")

    loaded = 0
    for audio_file in audio_files:
        try:
            # Resample to 22050 Hz and read at most the first 30 seconds.
            audio, sr = librosa.load(audio_file, sr=22050, duration=30)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"\nError loading {audio_file.name}: {e}")
            continue
        audio_data.append(audio)
        labels.append(genre)
        file_paths.append(str(audio_file))
        loaded += 1

    print(f"Done - {loaded} files")

print(f"\nTotal files loaded: {len(audio_data)}")
print(f"Genres: {len(genres)}")

## 3. Feature Extraction Functions

In [None]:
def extract_svd_enhanced(audio, n_components=50, sr=22050):
    """Extract singular-value features from the STFT of a clip.

    The real and imaginary parts of the STFT are stacked vertically and the
    singular values of that matrix are computed.  The result always has
    ``n_components + 8`` entries:

    * the ``n_components`` largest singular values, zero-padded when the clip
      is too short to produce that many;
    * eight summary statistics of the retained singular values: mean, std,
      max, min, median, 25th and 75th percentile, and the ratio of their sum
      to the sum of the first 100 singular values (0.0 when that sum is zero,
      e.g. for digital silence).

    Args:
        audio: 1-D waveform as accepted by ``librosa.stft``.
        n_components: number of leading singular values to keep.
        sr: sampling rate; accepted for signature parity with the other
            extractors but not used by this function.

    Returns:
        1-D ``numpy.ndarray`` of length ``n_components + 8``.  If the STFT or
        the decomposition fails, an all-zero vector of that length is
        returned and the error is printed.
    """
    n_stats = 8
    try:
        stft = librosa.stft(audio, n_fft=2048, hop_length=512, window='hann')
        combined = np.vstack([stft.real, stft.imag])
        # Only the singular values are needed; skipping U and V is much cheaper.
        s = svd(combined, compute_uv=False)
    except Exception as exc:
        # Best-effort fallback.  `Exception` (not a bare except) lets
        # KeyboardInterrupt stop a long extraction run.
        print(f"SVD feature extraction failed: {exc}")
        return np.zeros(n_components + n_stats)

    svd_feat = s[:n_components]
    if svd_feat.size == 0:
        return np.zeros(n_components + n_stats)

    # Guard the ratio: silent audio has all-zero singular values (0/0 -> NaN).
    head_energy = np.sum(s[:100])
    energy_ratio = np.sum(svd_feat) / head_energy if head_energy > 0 else 0.0

    svd_stats = [
        np.mean(svd_feat),
        np.std(svd_feat),
        np.max(svd_feat),
        np.min(svd_feat),
        np.median(svd_feat),
        np.percentile(svd_feat, 25),
        np.percentile(svd_feat, 75),
        energy_ratio,
    ]

    # Pad so every clip yields the same vector length, however few STFT
    # frames (and therefore singular values) it produced.
    padded = np.zeros(n_components, dtype=svd_feat.dtype)
    padded[:svd_feat.size] = svd_feat
    return np.concatenate([padded, svd_stats])


def extract_mfcc_enhanced(audio, sr=22050):
    """Extract MFCC summary features from a clip.

    Computes 20 MFCCs plus their first- and second-order deltas and returns,
    concatenated in this order: per-coefficient mean of the MFCCs, their
    standard deviation, mean of the deltas, and mean of the delta-deltas
    (4 x 20 = 80 values).

    Args:
        audio: 1-D waveform as accepted by ``librosa.feature.mfcc``.
        sr: sampling rate of ``audio`` in Hz.

    Returns:
        1-D ``numpy.ndarray`` of length 80.  If any librosa call fails, an
        all-zero vector of length 80 is returned and the error is printed.
    """
    n_mfcc = 20
    try:
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    except Exception as exc:
        # Best-effort fallback.  `Exception` (not a bare except) lets
        # KeyboardInterrupt stop a long extraction run.
        print(f"MFCC feature extraction failed: {exc}")
        return np.zeros(4 * n_mfcc)

    return np.concatenate([
        np.mean(mfcc, axis=1),
        np.std(mfcc, axis=1),
        np.mean(mfcc_delta, axis=1),
        np.mean(mfcc_delta2, axis=1),
    ])


def extract_spectral_enhanced(audio, sr=22050):
    """Extract spectral summary features from a clip.

    The returned vector holds, in order: mean and std of the spectral
    centroid, bandwidth and roll-off; the per-band means followed by the
    per-band stds of the spectral contrast; then mean and std of the spectral
    flatness, RMS energy and zero-crossing rate.

    Args:
        audio: 1-D waveform as accepted by the ``librosa.feature`` functions.
        sr: sampling rate of ``audio`` in Hz.

    Returns:
        1-D ``numpy.ndarray``.  The failure fallback is an all-zero vector of
        length 26, which matches the normal output as long as
        ``spectral_contrast`` yields 7 rows (presumably its default band
        count -- confirm against the installed librosa version).  Errors are
        printed before the fallback is returned.
    """
    try:
        spec_cent = librosa.feature.spectral_centroid(y=audio, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=audio, sr=sr)
        spec_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)
        spec_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
        spec_flat = librosa.feature.spectral_flatness(y=audio)
        rms = librosa.feature.rms(y=audio)
        zcr = librosa.feature.zero_crossing_rate(audio)
    except Exception as exc:
        # Best-effort fallback.  `Exception` (not a bare except) lets
        # KeyboardInterrupt stop a long extraction run.
        print(f"Spectral feature extraction failed: {exc}")
        return np.zeros(26)

    features = [
        np.mean(spec_cent), np.std(spec_cent),
        np.mean(spec_bw), np.std(spec_bw),
        np.mean(spec_rolloff), np.std(spec_rolloff),
    ]
    features.extend(np.mean(spec_contrast, axis=1))
    features.extend(np.std(spec_contrast, axis=1))
    features.extend([
        np.mean(spec_flat), np.std(spec_flat),
        np.mean(rms), np.std(rms),
        np.mean(zcr), np.std(zcr),
    ])
    return np.array(features)


def extract_all_features(audio, sr=22050):
    """Build the complete feature vector for one clip.

    Concatenates, in this order, the outputs of ``extract_svd_enhanced``
    (called with ``n_components=50``), ``extract_mfcc_enhanced`` and
    ``extract_spectral_enhanced``.
    """
    feature_groups = (
        extract_svd_enhanced(audio, n_components=50, sr=sr),
        extract_mfcc_enhanced(audio, sr=sr),
        extract_spectral_enhanced(audio, sr=sr),
    )
    return np.concatenate(feature_groups)


# Summarise the layout of the combined feature vector for the reader.
print("Feature extraction functions defined!")
print("\nFeature Configuration:")
print("  - SVD: 50 singular values + 8 statistics = 58 features")
# The multiplication sign was mis-encoded ("Ã—") in the original string.
print("  - MFCC: 20 coefficients × 4 (mean, std, delta, delta2) = 80 features")
print("  - Spectral: Various spectral features with mean + std = 26 features")
print("  - Total: 164 features")

## 4. Extract Features from All Audio

In [None]:
print("="*70)
print("STEP 2: Feature Extraction")
print("="*70)
print("\nExtracting features from all audio files...")
print("(This will take approximately 15-20 minutes)\n")

# Time the whole pass; tqdm wraps the iterable to show a progress bar.
start_time = time.time()
features_list = [
    extract_all_features(clip)
    for clip in tqdm(audio_data, desc="Extracting")
]

# Stack the per-clip vectors into one (n_clips, n_features) matrix.
features_array = np.array(features_list)
extraction_time = time.time() - start_time

print("\nFeature extraction completed!")
print(f"Time taken: {extraction_time/60:.1f} minutes")
print(f"Feature matrix shape: {features_array.shape}")

## 5. Train-Test Split

In [None]:
print("="*70)
print("STEP 3: Train-Test Split")
print("="*70)

# 80/20 split, stratified on the genre label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features_array,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Per-genre sample counts for each side of the split.
train_dist = pd.Series(y_train).value_counts().sort_index()
test_dist = pd.Series(y_test).value_counts().sort_index()

table_header = f"{'Genre':<12} {'Train':<8} {'Test':<8}"
print("\nDistribution per genre:")
print(table_header)
print("-" * 30)
for genre in genres:
    n_train = train_dist[genre]
    n_test = test_dist[genre]
    print(f"{genre:<12} {n_train:<8} {n_test:<8}")

## 6. Preprocessing (Scaling & Feature Selection)

In [None]:
from functools import partial

print("="*70)
print("STEP 4: Preprocessing")
print("="*70)

# Fit the scaler on the training split only, then apply it to both splits,
# so no test-set statistics leak into preprocessing.
print("\nScaling features with RobustScaler...")
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Scaling completed")

# Never ask for more features than exist (SelectKBest rejects k > n_features).
n_selected = min(100, X_train_scaled.shape[1])
print(f"\nSelecting top {n_selected} features using mutual information...")
# mutual_info_classif is randomized; without an explicit random_state the
# chosen features depend on the global RNG state and thus on which cells ran
# before this one.  functools.partial (unlike a lambda) keeps the fitted
# selector picklable for the joblib.dump call later in the notebook.
selector = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=42),
    k=n_selected,
)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

print("Feature selection completed")
print(f"Final feature shape: {X_train_selected.shape}")

## 7. Model Training (Multiple Configurations)

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

print("="*70)
print("STEP 5: Model Training")
print("="*70)

print("\nTraining SVM with multiple configurations...\n")

# Candidate RBF-SVM hyperparameters to compare.
configs = [
    {'C': 100, 'gamma': 0.01, 'name': 'Config 1'},
    {'C': 50, 'gamma': 0.1, 'name': 'Config 2'},
    {'C': 200, 'gamma': 0.001, 'name': 'Config 3'},
    {'C': 10, 'gamma': 'scale', 'name': 'Config 4'},
]


def _make_svm(config):
    """Build an unfitted RBF SVC for one hyperparameter configuration."""
    return SVC(
        kernel='rbf',
        C=config['C'],
        gamma=config['gamma'],
        class_weight='balanced',
        random_state=42,
    )


# Choose the configuration by cross-validation on the TRAINING split only.
# Picking the winner by test accuracy (as before) tunes on the test set and
# makes the reported test accuracy optimistically biased.
# NOTE: the scaler and selector were fitted on the whole training split, so
# the CV scores are themselves slightly optimistic; the test split is untouched.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_cv_acc = -1.0  # below any real accuracy, so the first config always wins
best_config = None

for config in configs:
    print(f"Testing {config['name']}: C={config['C']}, gamma={config['gamma']}...", end=' ')

    cv_acc = cross_val_score(
        _make_svm(config), X_train_selected, y_train, cv=cv, scoring='accuracy'
    ).mean()

    print(f"CV accuracy: {cv_acc:.2%}")

    if cv_acc > best_cv_acc:
        best_cv_acc = cv_acc
        best_config = config

# Refit the winner on the full training split and score the test split once.
best_model = _make_svm(best_config)
best_model.fit(X_train_selected, y_train)
best_acc = accuracy_score(y_test, best_model.predict(X_test_selected))

print("\n" + "="*70)
print(f"Best Model: {best_config['name']}")
print(f"  C = {best_config['C']}")
print(f"  gamma = {best_config['gamma']}")
print(f"  CV accuracy = {best_cv_acc:.2%}")
print(f"  Test accuracy = {best_acc:.2%}")
print("="*70)

## 8. Model Evaluation

In [None]:
print("="*70)
print("STEP 6: Model Evaluation")
print("="*70)

# Predictions of the selected model on the held-out split.
y_pred_final = best_model.predict(X_test_selected)

print("\nOverall Performance:")
print(f"  Test Accuracy: {best_acc:.2%}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_final))

# Accuracy restricted to the test clips whose true label is each genre.
true_labels = np.array(y_test)
predicted_labels = np.array(y_pred_final)

print("\nPer-Genre Accuracy:")
for genre in genres:
    is_genre = true_labels == genre
    if not np.sum(is_genre) > 0:
        continue  # no test clips of this genre: nothing to report
    genre_acc = accuracy_score(true_labels[is_genre], predicted_labels[is_genre])
    print(f"  {genre:<12}: {genre_acc:.1%}")

## 9. Visualizations

In [None]:
# Confusion matrix of the selected model on the test split.
# Passing labels=genres fixes the row/column order to the `genres` list,
# so the tick labels given to the heatmap below line up with the matrix.
cm = confusion_matrix(y_test, y_pred_final, labels=genres)

plt.figure(figsize=(12, 10))
# Annotate each cell with its integer count (fmt='d').
sns.heatmap(
    cm, 
    annot=True, 
    fmt='d', 
    cmap='Blues',
    xticklabels=genres,
    yticklabels=genres,
    cbar_kws={'label': 'Count'}
)
plt.title(f'Confusion Matrix\nTest Accuracy: {best_acc:.1%}', 
          fontsize=16, fontweight='bold', pad=20)
plt.ylabel('True Label', fontsize=12, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=12, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
# Write the figure to disk before plt.show() displays it.
plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
print("Saved: confusion_matrix.png")
plt.show()

In [None]:
# Bar chart of per-genre accuracy on the test split.
true_labels = np.array(y_test)
predicted_labels = np.array(y_pred_final)

# One entry per genre, in `genres` order; 0 when a genre has no test clips.
genre_accuracies = []
for genre in genres:
    is_genre = true_labels == genre
    genre_accuracies.append(
        accuracy_score(true_labels[is_genre], predicted_labels[is_genre])
        if np.sum(is_genre) > 0
        else 0
    )

plt.figure(figsize=(14, 6))
bars = plt.bar(genres, genre_accuracies, color='steelblue', alpha=0.8, edgecolor='black')
# Reference lines: overall test accuracy and the 75% target.
plt.axhline(y=best_acc, color='red', linestyle='--', linewidth=2, label=f'Overall: {best_acc:.1%}')
plt.axhline(y=0.75, color='green', linestyle='--', linewidth=2, label='Target: 75%')

# Write each bar's value just above its top edge, centred horizontally.
for bar, value in zip(bars, genre_accuracies):
    x_center = bar.get_x() + bar.get_width()/2.
    plt.text(x_center, bar.get_height() + 0.02,
             f'{value:.1%}', ha='center', va='bottom', fontweight='bold', fontsize=10)

plt.xlabel('Genre', fontsize=12, fontweight='bold')
plt.ylabel('Accuracy', fontsize=12, fontweight='bold')
plt.title('Per-Genre Classification Accuracy', fontsize=16, fontweight='bold', pad=20)
plt.ylim(0, 1.1)
plt.legend(fontsize=11, loc='lower right')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('genre_accuracy.png', dpi=300, bbox_inches='tight')
print("Saved: genre_accuracy.png")
plt.show()

## 10. Save Model and Results

In [None]:
print("="*70)
print("STEP 8: Saving Results")
print("="*70)

# Bundle everything needed to reproduce predictions: the fitted scaler and
# feature selector, the chosen classifier, and its metadata.
model_package = dict(
    scaler=scaler,
    feature_selector=selector,
    model=best_model,
    config=best_config,
    accuracy=best_acc,
    genres=genres,
)
joblib.dump(model_package, 'music_genre_classifier.pkl')
print("Saved: music_genre_classifier.pkl")

# Per-genre metrics, one array entry per genre in `genres` order.
precision, recall, f1, support = precision_recall_fscore_support(
    y_test, y_pred_final, labels=genres, average=None
)

# Assemble the results table column by column; every column is truncated
# to the number of genres.
n_genres = len(genres)
results_data = {
    'Genre': list(genres),
    'Precision': [precision[k] for k in range(n_genres)],
    'Recall': [recall[k] for k in range(n_genres)],
    'F1-Score': [f1[k] for k in range(n_genres)],
    'Support': [support[k] for k in range(n_genres)],
    'Accuracy': [genre_accuracies[k] for k in range(n_genres)],
}

results_df = pd.DataFrame(results_data)
results_df.to_csv('classification_results.csv', index=False)
print("Saved: classification_results.csv")

print("\nResults table:")
display(results_df)

## 11. Final Summary

In [None]:
print("="*70)
print("FINAL SUMMARY")
print("="*70)

print("\nModel Performance:")
print(f"  Test Accuracy: {best_acc:.2%}")
print("  Target: 75-80%")

# Compare against the lower bound of the target range.
status_line = (
    "  Status: TARGET ACHIEVED!"
    if best_acc >= 0.75
    else f"  Status: Need {(0.75 - best_acc)*100:.1f} more percentage points"
)
print(status_line)

# (genre, accuracy) pairs, ranked separately in each direction.
genre_scores = list(zip(genres, genre_accuracies))

print("\nBest Performing Genres:")
top_3 = sorted(genre_scores, key=lambda pair: pair[1], reverse=True)[:3]
for rank, (name, score) in enumerate(top_3, 1):
    print(f"  {rank}. {name.capitalize()}: {score:.1%}")

print("\nChallenging Genres:")
bottom_3 = sorted(genre_scores, key=lambda pair: pair[1])[:3]
for rank, (name, score) in enumerate(bottom_3, 1):
    print(f"  {rank}. {name.capitalize()}: {score:.1%}")

# Artifacts written by the earlier cells.
print("\nSaved Files:")
for artifact in (
    "music_genre_classifier.pkl",
    "classification_results.csv",
    "confusion_matrix.png",
    "genre_accuracy.png",
):
    print(f"  - {artifact}")

print("\n" + "="*70)
print("PIPELINE COMPLETED SUCCESSFULLY!")
print("="*70)