In [1]:
!pip install pypdf2 pdf2image

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os
import tarfile
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
import pandas as pd
import numpy as np
import cv2
from torch.utils.data import Dataset, DataLoader
from pdf2image import convert_from_path
from tqdm import tqdm
import matplotlib.pyplot as plt

In [3]:
def find_pdf_path(pdf_root, pdf_filename):
    """ Recursively search for a PDF in all subdirectories. """
    for root, _, files in os.walk(pdf_root):
        if pdf_filename in files:
            return os.path.join(root, pdf_filename)  # Return full path
    return None  # PDF not found

Convert PDFs to images

In [4]:
def pdf_to_image(pdf_path, output_folder="images", dpi=200):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    if not os.path.isfile(pdf_path):
        return None  # Skip missing PDFs

    try:
        images = convert_from_path(pdf_path, dpi=dpi)
        image_path = os.path.join(output_folder, os.path.basename(pdf_path).replace(".pdf", ".jpg"))
        images[0].save(image_path, "JPEG")  # Save first page only
        return image_path
    except:
        return None  # Conversion failed

Custon PyTorch dataset

In [11]:
class SheetMusicDataset(Dataset):
    def __init__(self, csv_file, pdf_root, transform=None, sample_size=500):
        self.data = pd.read_csv(csv_file)
        self.pdf_root = pdf_root
        self.transform = transform
        self.genre_mapping = {genre: idx for idx, genre in enumerate(self.data['genres'].dropna().unique())}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        if pd.isna(row['pdf']) or not isinstance(row['pdf'], str):
            return None

        # Fix the path by removing "./pdf/" and joining with the correct root folder
        relative_path = row['pdf'].strip().replace("./pdf/", "")  # Remove "./pdf/"
        pdf_path = os.path.join(self.pdf_root, relative_path)  # Join with the root path
        
        # Ensure the path uses the correct format for the OS
        pdf_path = os.path.normpath(pdf_path)

        if not os.path.isfile(pdf_path):
            print(f"🚨 PDF not found: {pdf_path}")
            return None  # Skip missing PDFs

        image_path = pdf_to_image(pdf_path)
        if image_path is None:
            print(f"🚨 Failed to convert: {pdf_path}")
            return None  # Skip if conversion failed

        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            print(f"🚨 Image file missing: {image_path}")
            return None

        image = cv2.resize(image, (224, 224))
        image = np.expand_dims(image, axis=0)

        if self.transform:
            image = self.transform(image)

        label = self.genre_mapping.get(row['genres'], 0)
        return torch.tensor(image, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

In [12]:
def collate_fn(batch):
    batch = [b for b in batch if b is not None]
    return torch.utils.data.dataloader.default_collate(batch) if batch else None

transform = transforms.Compose([
    transforms.RandomRotation(10),
    transforms.RandomAffine(0, shear=10, scale=(0.8,1.2)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

CNN model

In [13]:
class GenreCNN(nn.Module):
    def __init__(self, num_classes):
        super(GenreCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(128 * 28 * 28, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)


Training pipeline

In [14]:
# Define transformations
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Load dataset
dataset = SheetMusicDataset(csv_file="dataset.csv", pdf_root="archives/pdf", transform=None)

for i in range(10):  
    result = dataset[i]
    if result is None:
        print(f"⚠️ Sample {i} skipped due to missing data")
    else:
        print(f"✅ Sample {i} loaded successfully!")

✅ Sample 0 loaded successfully!
✅ Sample 1 loaded successfully!
✅ Sample 2 loaded successfully!
✅ Sample 3 loaded successfully!
✅ Sample 4 loaded successfully!
✅ Sample 5 loaded successfully!
✅ Sample 6 loaded successfully!
✅ Sample 7 loaded successfully!
✅ Sample 8 loaded successfully!
✅ Sample 9 loaded successfully!


In [15]:
len(dataset)

82786

In [16]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

Training of the model

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# model = GenreCNN(num_classes=len(dataset.genre_mapping)).to(device)
# model = model.to(device)

model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(dataset.genre_mapping))  # Adjust output layer for genre classification
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\alvga/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:01<00:00, 27.6MB/s]

cuda





In [22]:
num_epochs = 10
train_losses = []

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for images, labels in tqdm(train_loader):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    avg_train_loss = running_loss / len(train_loader)
    train_losses.append(avg_train_loss)

  0%|          | 0/25 [00:11<?, ?it/s]


RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[16, 1, 224, 224] to have 3 channels, but got 1 channels instead

In [None]:
# Plot training & validation loss
plt.figure(figsize=(12, 5))
plt.plot(range(1, num_epochs + 1), train_losses, label='Train Loss', marker='o')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

Evaluation of the model

In [None]:
model.eval()
all_labels = []
all_predictions = []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)

        # Store labels and predictions
        all_labels.extend(labels.cpu().numpy())  
        all_predictions.extend(predicted.cpu().numpy())

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Compute Metrics
accuracy = accuracy_score(all_labels, all_predictions)
precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=0)
recall = recall_score(all_labels, all_predictions, average='weighted', zero_division=0)
f1 = f1_score(all_labels, all_predictions, average='weighted')

# Print Results
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")