<a href="https://colab.research.google.com/github/DARKINOO/Pytorch_practice/blob/main/music_sys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torchaudio torchvision librosa numpy matplotlib scikit-learn seaborn tqdm soundfile optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m400.9/400.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"beingzyash","key":"87a0df50b4d9f47039488b4149fa02da"}'}

In [3]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [4]:
!kaggle datasets download -d carlthome/gtzan-genre-collection
!unzip -q gtzan-genre-collection.zip -d /content/GTZAN


Dataset URL: https://www.kaggle.com/datasets/carlthome/gtzan-genre-collection
License(s): copyright-authors
Downloading gtzan-genre-collection.zip to /content
 98% 1.12G/1.14G [00:10<00:00, 234MB/s]
100% 1.14G/1.14G [00:10<00:00, 120MB/s]


In [5]:
import os

BASE_DIR = "/content/GTZAN/genres"
print("Genres:", os.listdir(BASE_DIR))


Genres: ['hiphop', 'country', 'disco', 'rock', 'metal', 'blues', 'jazz', 'pop', 'reggae', 'classical']


In [20]:
import librosa
import numpy as np
import os
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt



In [19]:
SAMPLE_RATE = 22050
DURATION = 30
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION

genres = os.listdir(BASE_DIR)
data = {"mfcc": [], "label":[]}

for genre in genres:
  folder = os.path.join(BASE_DIR,genre)
  for filename in os.listdir(folder):
    file_path = os.path.join(folder,filename)
    signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    if len(signal) >= SAMPLES_PER_TRACK:
      signal = signal[:SAMPLES_PER_TRACK]
      mfcc = librosa.feature.mfcc(y=signal,sr=sr, n_mfcc=40,n_fft=2048,hop_length=512)
      mfcc = np.mean(mfcc.T, axis=0)
      data["mfcc"].append(mfcc)
      data["label"].append(genre)

print(f"Extracted {len(data['mfcc'])} feature samples")

Extracted 991 feature samples


In [9]:
X = np.array(data["mfcc"])
y = np.array(data["label"])

encoder = LabelEncoder()
y = encoder.fit_transform(y)

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.long)

dataset = torch.utils.data.TensorDataset(X,y)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [10]:
class GenreClassifier(nn.Module):
  def __init__(self,input_size,hidden_size,num_classes):
    super(GenreClassifier,self).__init__()
    self.fc1 = nn.Linear(input_size,hidden_size)
    self.bn1 = nn.BatchNorm1d(hidden_size)
    self.fc2 = nn.Linear(hidden_size,hidden_size // 2)
    self.bn2 = nn.BatchNorm1d(hidden_size // 2)
    self.fc3 = nn.Linear(hidden_size // 2,num_classes)

  def forward(self,x):
    x = F.relu(self.bn1(self.fc1(x)))
    x = F.dropout(x, 0.3)
    x = F.relu(self.bn2(self.fc2(x)))
    x = self.fc3(x)
    return x

In [12]:
input_size = X.shape[1]
num_classes = len(encoder.classes_)
param_grid = [
    {"hidden_size": 256, "lr": 0.001},
    {"hidden_size": 512, "lr": 0.0005},
    {"hidden_size": 128, "lr": 0.01},
]

best_acc = 0
best_model = None

for params in param_grid:
  model = GenreClassifier(input_size,params["hidden_size"],num_classes)
  optimizer = optim.Adam(model.parameters(),lr=params["lr"])
  criterion = nn.CrossEntropyLoss()

  for epoch in range(20):
    model.train()
    for xb, yb in train_loader:
      optimizer.zero_grad()
      out = model(xb)
      loss = criterion(out,yb)
      loss.backward()
      optimizer.step()


  model.eval()
  preds,actual = [], []
  with torch.no_grad():
    for xb, yb in test_loader:
      out = model(xb)
      preds += torch.argmax(out, dim=1).tolist()
      actual += yb.tolist()

  acc = accuracy_score(actual, preds)
  print(f"Params {params} : Accuracy: {acc:.4f}")

  if acc > best_acc:
        best_acc = acc
        best_model = model.state_dict()
        best_params = params

Params {'hidden_size': 256, 'lr': 0.001} : Accuracy: 0.6131
Params {'hidden_size': 512, 'lr': 0.0005} : Accuracy: 0.6131
Params {'hidden_size': 128, 'lr': 0.01} : Accuracy: 0.5829


In [13]:
torch.save(best_model, "best_genre_model.pth")
print("Best model saved with accuracy:", best_acc)
print("Best parameters:", best_params)


Best model saved with accuracy: 0.6130653266331658
Best parameters: {'hidden_size': 256, 'lr': 0.001}


In [23]:
# ‚úÖ Re-import to avoid shadowing errors
import librosa
import torch
import numpy as np
import torch.nn.functional as F

def predict_genre(file_path, model, encoder, sample_rate=22050, n_mfcc=40):
    """
    Predicts the genre of a given audio file using the trained PyTorch model.
    Supports .wav, .mp3, .flac etc.
    Prints top-3 genres with confidence scores.
    """

    # 1Ô∏è‚É£ Load & Resample audio
    signal, sr = librosa.load(file_path, sr=sample_rate, mono=True)

    # 2Ô∏è‚É£ Handle short clips by padding
    if len(signal) < sample_rate * 5:  # at least 5 seconds
        pad_len = (sample_rate * 5) - len(signal)
        signal = np.pad(signal, (0, pad_len), mode='constant')

    # 3Ô∏è‚É£ Extract MFCC features
    mfccs = librosa.feature.mfcc(
        y=signal, sr=sr, n_mfcc=n_mfcc, n_fft=2048, hop_length=512
    )
    mfccs = np.mean(mfccs.T, axis=0)
    x = torch.tensor(mfccs, dtype=torch.float32).unsqueeze(0)

    # 4Ô∏è‚É£ Model inference
    model.eval()
    with torch.no_grad():
        logits = model(x)
        probs = F.softmax(logits, dim=1)
        top3 = torch.topk(probs, 3)

    # 5Ô∏è‚É£ Decode labels
    top3_indices = top3.indices[0].tolist()
    top3_scores = top3.values[0].tolist()
    top3_genres = encoder.inverse_transform(top3_indices)

    # 6Ô∏è‚É£ Print top-3 predictions
    print("üéµ Predicted Genres (Top-3):")
    for genre, score in zip(top3_genres, top3_scores):
        print(f"   {genre:<10}  ‚Üí  {score*100:.2f}% confidence")

    return top3_genres[0]

# üîπ Load your best model (make sure it matches best_params)
model = GenreClassifier(input_size, best_params["hidden_size"], num_classes)
model.load_state_dict(torch.load("best_genre_model.pth", map_location='cpu'))

# üîπ Example usage
test_file = os.path.join(BASE_DIR, "rock", os.listdir(os.path.join(BASE_DIR, "rock"))[0])
predicted_genre = predict_genre(test_file, model, encoder)
print("\nüé∂ Final Prediction:", predicted_genre)


üéµ Predicted Genres (Top-3):
   rock        ‚Üí  59.81% confidence
   country     ‚Üí  31.40% confidence
   metal       ‚Üí  2.70% confidence

üé∂ Final Prediction: rock


In [25]:
from google.colab import files
uploaded = files.upload()

test_file = list(uploaded.keys())[0]
predict_genre(test_file, model, encoder)


Saving dream.mp3 to dream.mp3
üéµ Predicted Genres (Top-3):
   hiphop      ‚Üí  56.94% confidence
   pop         ‚Üí  24.45% confidence
   reggae      ‚Üí  13.60% confidence


np.str_('hiphop')