In [5]:
# ResNet3Small adaptado a discriminador
%pip install tensorflow[and-cuda]
%pip install torch

Note: you may need to restart the kernel to use updated packages.




Collecting torch
  Downloading torch-2.3.0-cp39-cp39-win_amd64.whl.metadata (26 kB)
Collecting filelock (from torch)
  Downloading filelock-3.14.0-py3-none-any.whl.metadata (2.8 kB)
Collecting jinja2 (from torch)
  Downloading Jinja2-3.1.3-py3-none-any.whl.metadata (3.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)
  Downloading mkl-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting intel-openmp==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Downloading intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.2 kB)
Collecting tbb==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Downloading tbb-2021.12.0-py3-none-win_amd64.whl.metadata (1.1 kB)
Downloading torch-2.3.0-cp39-cp39-win_amd64.whl (159.7 MB)
   ---------------------------------------- 0.0/159.7 MB ? eta -:--:--
   ---------------------------------------- 0.2/159.7 MB 5.9 MB/s eta 0:00:27
   --------------

In [6]:
%pip install torchvision

Collecting torchvision
  Downloading torchvision-0.18.0-cp39-cp39-win_amd64.whl.metadata (6.6 kB)
Downloading torchvision-0.18.0-cp39-cp39-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
    --------------------------------------- 0.0/1.2 MB 320.0 kB/s eta 0:00:04
   ------- -------------------------------- 0.2/1.2 MB 2.3 MB/s eta 0:00:01
   ---------------------------------------- 1.2/1.2 MB 9.3 MB/s eta 0:00:00
Installing collected packages: torchvision
Successfully installed torchvision-0.18.0
Note: you may need to restart the kernel to use updated packages.


# Discriminator

In [171]:
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out += self.shortcut(residual)
        out = self.relu(out)
        return out

class ResNetDiscriminator(nn.Module):
    def __init__(self, in_channels=1, num_classes=1):
        super(ResNetDiscriminator, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(16, 16, blocks=1, stride=1)
        self.layer2 = self.make_layer(16, 32, blocks=2, stride=2)
        self.layer3 = self.make_layer(32, 64, blocks=2, stride=2)
        self.layer4 = self.make_layer(64, 128, blocks=2, stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(128, num_classes)

    def make_layer(self, in_channels, out_channels, blocks, stride):
        strides = [stride] + [1] * (blocks - 1)
        layers = []
        for stride in strides:
            layers.append(ResidualBlock(in_channels, out_channels, stride))
            in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return torch.sigmoid(x)

# Test the model
if __name__ == "__main__":
    # Create an instance of the ResNet discriminator
    discriminator = ResNetDiscriminator(in_channels=1, num_classes=1)
    # Generate random input tensor with shape (batch_size, channels, height, width)
    input_tensor = torch.randn(16, 1, 96, 96)
    # Forward pass
    output = discriminator(input_tensor)
    # Print output shape
    print("Output shape:", output.shape)


Output shape: torch.Size([16, 1])


# Generator

In [172]:
# Generador
import torch
from torch import nn
import numpy as np

class Generator(nn.Module):
    def __init__(self, img_shape, latent_dim, n_classes):
        super(Generator, self).__init__()
        self.img_shape = img_shape
        self.label_emb = nn.Embedding(n_classes, n_classes)

        def block(in_feat, out_feat, normalize=True):
            layers = [nn.Linear(in_feat, out_feat)]
            if normalize:
                layers.append(nn.BatchNorm1d(out_feat, 0.8))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
            return layers

        self.model = nn.Sequential(
            *block(latent_dim + n_classes, 128, normalize=False),
            *block(128, 256),
            *block(256, 512),
            *block(512, 1024),
            nn.Linear(1024, int(np.prod(self.img_shape))),
            nn.Tanh()
        )

    def forward(self, noise, labels):
        # Concatenate label embedding and noise to produce input
        gen_input = torch.cat((self.label_emb(labels), noise), -1)
        img = self.model(gen_input)
        img = img.view(img.size(0), *self.img_shape)
        return img




In [173]:
import os
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, random_split
import torch

class MusicSymbolsDataset(Dataset):
    def __init__(self, directories, transform=None):
        self.transform = transform
        self.image_labels = []
        self.class_names = []  # Lista para almacenar los nombres de las clases
        for directory in directories:
            self.load_images_and_labels(directory)

    def load_images_and_labels(self, root_dir):
        for subdir in sorted(os.listdir(root_dir)):
            subdir_path = os.path.join(root_dir, subdir)
            if os.path.isdir(subdir_path):
                if subdir.lower() not in self.class_names:
                    self.class_names.append(subdir.lower())
                class_index = self.class_names.index(subdir.lower())
                for image_filename in os.listdir(subdir_path):
                    image_path = os.path.join(subdir_path, image_filename)
                    if image_path.lower().endswith(('.png', '.jpg', '.jpeg')):
                        self.image_labels.append((image_path, class_index))

    def __len__(self):
        return len(self.image_labels)

    def __getitem__(self, idx):
        image_path, class_index = self.image_labels[idx]
        image = Image.open(image_path).convert('L')
        if self.transform:
            image = self.transform(image)
        return image, class_index

# Transformation
transform = transforms.Compose([
    transforms.Resize((96, 96)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

# Directorios de datasets
directories = ['./data/images', './dataset1', './dataset2', './data/open_omr_raw']
dataset = MusicSymbolsDataset(directories=directories, transform=transform)

# Dividir los datos
train_size = int(0.7 * len(dataset))
valid_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - valid_size
train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, valid_size, test_size])

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

print(f"Total images: {len(dataset)}")
print(f"Train images: {len(train_dataset)}, Validation images: {len(valid_dataset)}, Test images: {len(test_dataset)}")
print(f"Class Names: {dataset.class_names}")
print(f"Number of classes: {len(dataset.class_names)}")


Total images: 41374
Train images: 28961, Validation images: 6206, Test images: 6207
Class Names: ['other', 'quarter-note', 'quarter-rest', 'repeat-measure', 'segno', 'sharp', 'sixteenth-note', 'sixteenth-rest', 'sixty-four-note', 'sixty-four-rest', 'staccatissimo', 'stopped', 'tenuto', 'thirty-two-note', 'thirty-two-rest', 'tie-slur', 'trill', 'trill-wobble', 'tuplet', 'turn', 'volta', 'whole-half-rest', 'whole-note', 'test', 'training', 'validation', 'accent', 'barline', 'beam', 'c-clef', 'common-time', 'cut-time', 'dot', 'eighth-grace-note', 'eighth-note', 'eighth-rest', 'f-clef', 'flat', 'g-clef', 'half-note', 'multiple-eighth-notes', 'multiple-half-notes', 'multiple-quarter-notes', 'multiple-sixteenth-notes', 'natural', '1-8-time', '12-8-time', '2-4-time', '2-8-time', '3-4-time', '3-8-time', '4-2-time', '4-4-time', '4-8-time', '5-4-time', '5-8-time', '6-4-time', '6-8-time', '7-4-time', '8-8-time', '9-8-time', 'breve', 'chord', 'double-whole-rest', 'fermata', 'glissando', 'marcato',

In [174]:
import torch

# Define el dispositivo como GPU si CUDA está disponible, de lo contrario usa CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [180]:
import torch.optim as optim

# Creamos el generador y el discriminador
generator = Generator(img_shape=(1, 96, 96), latent_dim=100, n_classes=80).to(device)
discriminator = ResNetDiscriminator(in_channels=1, num_classes=1).to(device)

# Definimos los optimizadores
optimizer_G = optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

# Definimos la función de pérdida
criterion = torch.nn.BCELoss()

# Definimos los hiperparámetros
epochs = 5
batch_size = 16
latent_dim = 100

# Entrenamiento
for epoch in range(epochs):
    for i, (real_images, _) in enumerate(train_loader):
        real_images = real_images.to(device)  # Movemos las imágenes reales al dispositivo
        real_labels = torch.ones(batch_size, 1, device=device)

        # Entrenamiento del discriminador
        discriminator.zero_grad()
        output_real = discriminator(real_images)
        d_loss_real = criterion(output_real, real_labels)

        noise = torch.randn(batch_size, latent_dim, device=device)
        fake_classes = torch.randint(0, 10, (batch_size,), device=device)
        fake_images = generator(noise, fake_classes).detach()  # Detach para no actualizar el generador
        fake_labels = torch.zeros(batch_size, 1, device=device)
        output_fake = discriminator(fake_images)
        d_loss_fake = criterion(output_fake, fake_labels)

        d_loss = (d_loss_real + d_loss_fake) / 2
        d_loss.backward()
        optimizer_D.step()

        # Entrenamiento del generador
        generator.zero_grad()
        trick_labels = torch.ones(batch_size, 1, device=device)
        output = discriminator(fake_images)
        g_loss = criterion(output, trick_labels)
        g_loss.backward()
        optimizer_G.step()

        if i % 100 == 0:
            print(f"Epoch [{epoch}/{epochs}] Batch {i}/{len(train_loader)//batch_size} "
                  f"Loss D: {d_loss.item()}, Loss G: {g_loss.item()}")

# Guardamos los modelos entrenados
torch.save(generator.state_dict(), 'generator.pth')
torch.save(discriminator.state_dict(), 'discriminator.pth')


Epoch [0/5] Batch 0/113 Loss D: 0.6998176574707031, Loss G: 0.6569993495941162


KeyboardInterrupt: 