In [13]:
# ML libs
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# audio / image / torch
import librosa, librosa.display
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms, models, datasets

# plotting & utils
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np, pandas as pd, os, io, random
from PIL import Image
from tqdm import tqdm

# Gradio
import gradio as gr

# ---------- Paths ----------
AUDIO_DIR = "/kaggle/input/gtzan-dataset-music-genre-classification/Data/genres_original"
IMG_DIR   = "/kaggle/input/gtzan-dataset-music-genre-classification/Data/images_original"
CSV_FILE  = "/kaggle/input/gtzan-dataset-music-genre-classification/Data/features_30_sec.csv"
WORK_DIR  = "/kaggle/working"
os.makedirs(WORK_DIR, exist_ok=True)

print("Working dir:", WORK_DIR)

Working dir: /kaggle/working


In [14]:
SAMPLE_RATE = 22050
DURATION = 30
RANDOM_STATE = 42

def discover_genres_from_audio_dir(audio_dir=AUDIO_DIR):
    return sorted([d for d in os.listdir(audio_dir) if os.path.isdir(os.path.join(audio_dir, d))]) if os.path.exists(audio_dir) else []

def list_image_classes(img_dir=IMG_DIR):
    return sorted([d for d in os.listdir(img_dir) if os.path.isdir(os.path.join(img_dir, d))]) if os.path.exists(img_dir) else []

def load_audio(path, sr=SAMPLE_RATE, duration=DURATION):
    y, _ = librosa.load(path, sr=sr, mono=True, duration=duration)
    if len(y) < sr * duration:
        y = np.pad(y, (0, max(0, sr * duration - len(y))))
    return y

def make_mel_spectrogram_image(y, sr=SAMPLE_RATE, n_mels=128, fmax=8000):
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, fmax=fmax)
    S_db = librosa.power_to_db(S, ref=np.max)
    fig = plt.Figure(figsize=(3,3), dpi=100)
    ax = fig.add_subplot(111); ax.axis('off')
    librosa.display.specshow(S_db, sr=sr, fmax=fmax, ax=ax)
    buf = io.BytesIO()
    fig.savefig(buf, bbox_inches='tight', pad_inches=0)
    plt.close(fig); buf.seek(0)
    return Image.open(buf).convert('RGB')

def extract_features_from_audio(path, sr=SAMPLE_RATE, n_mfcc=20):
    y = load_audio(path, sr=sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    tonnetz = librosa.feat

In [15]:
df = pd.read_csv(CSV_FILE)
print("CSV shape:", df.shape)
print("Columns:", df.columns[:20])

label_col = next((c for c in df.columns if c.lower() in ("label","genre")), None)
filename_col = next((c for c in df.columns if "file" in c.lower()), None)

print("Detected label column:", label_col)
print(df[label_col].value_counts())
df.head()

CSV shape: (1000, 60)
Columns: Index(['filename', 'length', 'chroma_stft_mean', 'chroma_stft_var', 'rms_mean',
       'rms_var', 'spectral_centroid_mean', 'spectral_centroid_var',
       'spectral_bandwidth_mean', 'spectral_bandwidth_var', 'rolloff_mean',
       'rolloff_var', 'zero_crossing_rate_mean', 'zero_crossing_rate_var',
       'harmony_mean', 'harmony_var', 'perceptr_mean', 'perceptr_var', 'tempo',
       'mfcc1_mean'],
      dtype='object')
Detected label column: label
label
blues        100
classical    100
country      100
disco        100
hiphop       100
jazz         100
metal        100
pop          100
reggae       100
rock         100
Name: count, dtype: int64


Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.wav,661794,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,...,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,blues
1,blues.00001.wav,661794,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,...,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282,blues
2,blues.00002.wav,661794,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,...,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025,blues
3,blues.00003.wav,661794,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,...,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339,blues
4,blues.00004.wav,661794,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,...,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516,blues


In [16]:
SUBSET_PER_CLASS = 100

X_df = df.drop(columns=[c for c in (label_col, filename_col) if c in df.columns])
X, y = X_df.values, df[label_col].values

# Subsample
tmp = pd.DataFrame(X); tmp[label_col] = y
sampled = tmp.groupby(label_col).apply(lambda g: g.sample(min(len(g), SUBSET_PER_CLASS), random_state=RANDOM_STATE)).reset_index(drop=True)
y = sampled[label_col].values
X = sampled.drop(columns=[label_col]).values

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE)
clf = Pipeline([("scaler", StandardScaler()), ("rf", RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1))])
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
print("Tabular accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))
joblib.dump(clf, os.path.join(WORK_DIR, "tabular_rf.joblib"))

Tabular accuracy: 0.795
              precision    recall  f1-score   support

       blues       0.65      0.65      0.65        20
   classical       0.86      0.90      0.88        20
     country       0.62      0.65      0.63        20
       disco       0.88      0.75      0.81        20
      hiphop       0.89      0.80      0.84        20
        jazz       0.80      0.80      0.80        20
       metal       0.90      0.95      0.93        20
         pop       0.95      1.00      0.98        20
      reggae       0.72      0.90      0.80        20
        rock       0.69      0.55      0.61        20

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.79       200
weighted avg       0.80      0.80      0.79       200



['/kaggle/working/tabular_rf.joblib']

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_transforms = {
    "train": transforms.Compose([transforms.Resize((224,224)), transforms.RandomHorizontalFlip(), transforms.ToTensor(),
                                 transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])]),
    "val": transforms.Compose([transforms.Resize((224,224)), transforms.ToTensor(),
                               transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])])
}

full_ds = datasets.ImageFolder(IMG_DIR, transform=data_transforms["train"])
print("Classes:", full_ds.classes)

n = len(full_ds)
train_size = int(0.8 * n)
val_size = n - train_size
train_ds, val_ds = torch.utils.data.random_split(full_ds, [train_size, val_size])
val_ds.dataset.transform = data_transforms["val"]

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, num_workers=2)
val_loader   = DataLoader(val_ds, batch_size=16, shuffle=False, num_workers=2)

# Model
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model.fc = nn.Linear(model.fc.in_features, len(full_ds.classes))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

EPOCHS = 3
for epoch in range(EPOCHS):
    model.train()
    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(Xb)
        loss = criterion(out, yb)
        loss.backward(); optimizer.step()
    # Validation
    correct, total = 0, 0
    model.eval()
    with torch.no_grad():
        for Xb, yb in val_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            out = model(Xb)
            _, pred = torch.max(out, 1)
            correct += (pred == yb).sum().item()
            total += yb.size(0)
    print(f"Epoch {epoch+1}: Val Acc = {correct/total:.4f}")

torch.save({"model": model.state_dict(), "classes": full_ds.classes}, os.path.join(WORK_DIR, "transfer_resnet18.pth"))

Classes: ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
Epoch 1: Val Acc = 0.6950
Epoch 2: Val Acc = 0.6900
Epoch 3: Val Acc = 0.7200


In [18]:
def predict_tabular_from_audio(audio_path):
    clf = joblib.load(os.path.join(WORK_DIR, "tabular_rf.joblib"))
    feats = extract_features_from_audio(audio_path).reshape(1,-1)
    probs = clf.predict_proba(feats)[0]
    return clf.classes_[np.argmax(probs)], dict(zip(clf.classes_, probs))

def predict_transfer_from_audio(audio_path):
    ckpt = torch.load(os.path.join(WORK_DIR, "transfer_resnet18.pth"), map_location="cpu")
    classes = ckpt["classes"]
    model = models.resnet18(weights=None)
    model.fc = nn.Linear(model.fc.in_features, len(classes))
    model.load_state_dict(ckpt["model"]); model.eval()

    y = load_audio(audio_path)
    img = make_mel_spectrogram_image(y)
    transform = transforms.Compose([transforms.Resize((224,224)), transforms.ToTensor(),
                                    transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])])
    X = transform(img).unsqueeze(0)
    with torch.no_grad():
        out = model(X); probs = torch.softmax(out, 1)[0].numpy()
    return classes[int(np.argmax(probs))], dict(zip(classes, probs))

In [20]:
def gr_wrapper(audio, model_choice):
    if model_choice == "Tabular":
        return predict_tabular_from_audio(audio)
    else:
        return predict_transfer_from_audio(audio)

iface = gr.Interface(
    fn=gr_wrapper,
    inputs=[gr.Audio(type="filepath"), gr.Radio(["Tabular","Transfer"], value="Transfer")],
    outputs=[gr.Textbox(label="Predicted Genre"), gr.Label(label="Probabilities")],
    title="GTZAN Music Genre Classifier 🎶"
)

iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7862
* Running on public URL: https://892a02f9d3b66f0ae0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


