In [12]:
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class GenreTransferTransformer(nn.Module):
    """Transformer encoder that re-synthesises a spectrogram conditioned on a genre.

    Every time frame is linearly projected to ``embed_dim``, a learned per-genre
    vector is added to each frame, the sequence runs through a stack of
    Transformer encoder layers, and the result is projected back to
    ``input_dim`` features per frame.

    NOTE(review): no positional encoding is added to the frame embeddings, so
    the encoder receives no explicit information about frame order.

    Args:
        input_dim: Number of features per time frame.
        embed_dim: Width of the Transformer representation.
        num_heads: Attention heads in each encoder layer.
        num_layers: Number of stacked encoder layers.
        num_genres: Number of distinct genre labels.
    """

    def __init__(self, input_dim=128, embed_dim=256, num_heads=8, num_layers=6, num_genres=10):
        super().__init__()

        # Frame-wise projection: input_dim -> embed_dim.
        self.embedding = nn.Linear(input_dim, embed_dim)
        # One learned conditioning vector per genre label.
        self.genre_embedding = nn.Embedding(num_genres, embed_dim)

        # Encoder stack; the layers use the default sequence-first layout.
        layer = TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=512)
        self.transformer = TransformerEncoder(layer, num_layers=num_layers)

        # Projection back to the spectrogram feature size: embed_dim -> input_dim.
        self.output_layer = nn.Linear(embed_dim, input_dim)

    def forward(self, x, genre_label):
        """Run the genre-conditioned encoder.

        Args:
            x: Tensor laid out as [Batch, Features, Time].
            genre_label: Integer tensor of genre indices, one per batch item.

        Returns:
            Tensor laid out as [Batch, Features, Time].
        """
        # [Batch, Features, Time] -> [Time, Batch, Features]
        frames = x.permute(2, 0, 1)
        hidden = self.embedding(frames)

        # Broadcast the genre vector over the time axis: [1, Batch, Embedding].
        genre_vec = self.genre_embedding(genre_label)
        hidden = hidden + genre_vec.unsqueeze(0)

        hidden = self.transformer(hidden)
        restored = self.output_layer(hidden)

        # [Time, Batch, Features] -> [Batch, Features, Time]
        return restored.permute(1, 2, 0)

# Initialize model
# Only num_genres is passed explicitly; every other hyperparameter keeps the
# default declared on GenreTransferTransformer.__init__.
num_genres = 10  # GTZAN has 10 genres
model = GenreTransferTransformer(num_genres=num_genres)




In [13]:
import torch.optim as optim

# Loss function and optimizer
# Training objective: mean-squared error between the model output and the very
# spectrogram that was fed in, conditioned on that sample's own genre label.
criterion = nn.MSELoss()

num_epochs = 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to its target device BEFORE constructing the optimizer, so the
# optimizer is built over parameters that already live where training happens.
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# NOTE(review): the recorded run flattens out at a loss of about 157.6 from
# roughly epoch 12 onward; input scaling / learning rate are worth checking —
# TODO confirm against the data produced by the loader.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs, labels)  # Condition on each sample's genre label
        loss = criterion(outputs, inputs)  # Reconstruction error against the input itself

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")


Epoch [1/20], Loss: 1809.5393
Epoch [2/20], Loss: 1335.6341
Epoch [3/20], Loss: 899.9405
Epoch [4/20], Loss: 568.0085
Epoch [5/20], Loss: 355.1287
Epoch [6/20], Loss: 241.3066
Epoch [7/20], Loss: 192.7127
Epoch [8/20], Loss: 171.0039
Epoch [9/20], Loss: 162.9325
Epoch [10/20], Loss: 159.7325
Epoch [11/20], Loss: 158.5963
Epoch [12/20], Loss: 157.9837
Epoch [13/20], Loss: 157.9271
Epoch [14/20], Loss: 157.6981
Epoch [15/20], Loss: 157.7058
Epoch [16/20], Loss: 157.6502
Epoch [17/20], Loss: 157.6650
Epoch [18/20], Loss: 157.8492
Epoch [19/20], Loss: 157.6281
Epoch [20/20], Loss: 157.6652


In [17]:
import librosa
import soundfile as sf

def transform_audio(input_file, target_genre, output_file="transformed_audio.wav", sr=22050):
    """Run an audio file through the model with a target genre and save the result.

    The model output is treated as a dB-scaled mel spectrogram: it is converted
    to power with ``librosa.db_to_power`` and inverted to a waveform with
    ``librosa.feature.inverse.mel_to_audio``.

    Args:
        input_file: Path handed to ``load_audio``.
        target_genre: Key looked up in ``genre_to_idx`` to obtain the genre index.
        output_file: Destination path of the generated WAV file.
        sr: Sample rate used both for the mel inversion and for the written
            file. NOTE(review): must match the rate ``load_audio`` used to build
            the spectrogram — confirm against its implementation.

    Returns:
        Tuple ``(transformed_spec, output_file)``: the model output as a NumPy
        array (batch dimension removed) and the path that was written.

    Raises:
        KeyError: If ``target_genre`` is missing from ``genre_to_idx``
            (assuming it is a dict).
    """
    model.eval()

    # Load and process input audio
    input_spec = load_audio(input_file)
    input_spec = torch.tensor(input_spec, dtype=torch.float32).unsqueeze(0).to(device)  # Add batch dim

    # Convert genre name to index (batch of one)
    genre_index = torch.tensor([genre_to_idx[target_genre]], dtype=torch.long).to(device)

    # Transform the spectrogram; no gradients are needed for inference
    with torch.no_grad():
        transformed_spec = model(input_spec, genre_index).squeeze(0).cpu().numpy()

    # Convert spectrogram back to audio, using the same sample rate that is
    # written to disk below so the two cannot drift apart.
    generated_audio = librosa.feature.inverse.mel_to_audio(
        librosa.db_to_power(transformed_spec), sr=sr
    )

    # Save output as WAV
    sf.write(output_file, generated_audio, sr)

    print(f"Generated file saved: {output_file}")
    return transformed_spec, output_file

# Example Usage
# `sample_file` is defined outside this cell. To run on a specific track
# instead, assign a path such as
#   "gtzan_dataset/genres_original/blues/blues.00000.wav"
input_audio = sample_file
target_genre = "jazz"  # Example target genre
transformed_spec, output_audio = transform_audio(input_audio, target_genre)


Generated file saved: transformed_audio.wav


In [18]:
from sklearn.metrics import accuracy_score

def classify_audio(file_path):
    """Derive a genre label for an audio file from the transfer model's output.

    FIXME(review): this is not a real classifier. The model returns a
    spectrogram laid out as [Batch, Features, Time]; the code averages it over
    time and takes the argmax over the feature axis, so the "genre index" is
    actually the index of the strongest output feature bin. The lookup into
    ``GENRES`` only succeeds when that bin index happens to be small enough. A
    dedicated genre classifier is needed for a meaningful check.

    Args:
        file_path: Path handed to ``load_audio``.

    Returns:
        The entry of ``GENRES`` at the computed index.

    Raises:
        IndexError: If the computed index is outside ``GENRES``.
    """
    # Put the model in inference mode here instead of relying on an earlier
    # call (e.g. transform_audio) having done so; otherwise dropout stays active.
    model.eval()

    spec = load_audio(file_path)
    spec = torch.tensor(spec, dtype=torch.float32).unsqueeze(0).to(device)

    with torch.no_grad():
        # Genre index 0 is passed, so the embedding of genre 0 IS added; this is
        # not an unconditioned forward pass.
        outputs = model(spec, torch.zeros(1, dtype=torch.long).to(device))
        # Mean over time, then argmax over the feature axis (see FIXME above).
        predicted_genre_idx = torch.argmax(torch.mean(outputs, dim=-1), dim=-1).cpu().item()

    try:
        predicted_genre = GENRES[predicted_genre_idx]
    except IndexError as err:
        raise IndexError(
            f"Computed index {predicted_genre_idx} is outside GENRES; the model output "
            "is a spectrogram, not genre logits"
        ) from err
    print(f"Predicted Genre: {predicted_genre}")
    return predicted_genre

# Verify transformed output
# NOTE(review): classify_audio derives its label from the same transfer model's
# output rather than from a separately trained genre classifier, so treat the
# printed genre as a smoke test, not as evidence that the transfer worked.
predicted_genre = classify_audio(output_audio)


Predicted Genre: country
