In [1]:
# Switch the kernel's working directory to the parent folder; the recorded
# output below shows it landing in the project checkout.
# NOTE(review): not idempotent - re-running this cell moves up one more level,
# which presumably breaks the relative paths used by later cells.
%cd ..

/home/philipp/ai-audio-enhancer


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio as T
import torchaudio.transforms as TT
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn
from moviepy.editor import *
import wandb
import seaborn as sns
import librosa
import pandas as pd
from tqdm import tqdm
import subprocess
from glob import glob
import random
from scipy.io import wavfile
from pydub import AudioSegment
import shutil
from dataset import build_dataloader
from model import DiffusionUNetModel, DiffusionEmbedding
from config import Config

# Seed every RNG the notebook relies on so a fresh "Restart & Run All" is
# repeatable. torch is seeded as well because the model cells below draw
# their initial weights from torch's generator.
# torch.manual_seed goes first: it returns a Generator object, and keeping it
# off the last line avoids a stray repr in the cell output.
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

Convert mp4 to aac (stream copy, no re-encoding)

In [None]:
def mp4_to_aac(in_path, out_path):
    """Stream-copy ``in_path`` into ``out_path`` with ffmpeg (no re-encoding).

    Runs ``ffmpeg -y -i in_path -c copy out_path``. ``-y`` overwrites an
    existing ``out_path`` without prompting; ``-c copy`` copies the encoded
    streams as-is instead of transcoding them.

    Args:
        in_path: Path of the input media file.
        out_path: Path of the file ffmpeg should write.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
            ffmpeg's diagnostics are available on the exception's ``stderr``.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    subprocess.run(
        ['ffmpeg', '-y', '-i', in_path, '-c', 'copy', out_path],
        check=True,
        stdout=subprocess.DEVNULL,
        # Capture stderr rather than discarding it: ffmpeg reports errors
        # there, and without it a failure is impossible to diagnose.
        stderr=subprocess.PIPE,
        text=True,
        # File names echoed by ffmpeg may not be valid in the locale encoding.
        errors='replace',
    )

In [None]:
# NOTE(review): a later cell defines its own `receptive_field` function, which
# shadows this import once that cell has run. Re-running this cell restores
# the imported version.
from torch_receptive_field import receptive_field


def _build_conv_stack(num_stages=12, kernel_size=7, dilation=2):
    """Build the probe network used to measure the receptive field.

    Each stage is a stride-2 convolution followed by a dilated stride-1
    convolution, all single-channel. With the defaults this yields the same
    24-layer stack that was previously written out layer by layer.

    Args:
        num_stages: Number of (strided conv, dilated conv) pairs.
        kernel_size: Kernel size shared by all convolutions.
        dilation: Dilation of the second convolution in each pair.

    Returns:
        An ``nn.Sequential`` holding ``2 * num_stages`` ``nn.Conv2d`` layers.
    """
    half = kernel_size // 2  # 3 for the default 7-tap kernel
    layers = []
    for _ in range(num_stages):
        layers.append(
            nn.Conv2d(1, 1, kernel_size=kernel_size, stride=2, padding=half))
        layers.append(
            nn.Conv2d(1, 1, kernel_size=kernel_size, stride=1,
                      padding=dilation * half, dilation=dilation))
    return nn.Sequential(*layers)


model = _build_conv_stack()

# 5 seconds at 44100 Hz, matching the prototype requirements in the notes below.
size = 44100 * 5

# Fall back to the CPU when no GPU is present instead of crashing in `.cuda()`.
# NOTE(review): assumes torch_receptive_field copes with a CPU model when CUDA
# is unavailable - confirm against the installed version.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
receptive_field(model.to(device), input_size=(1, size, 1))

In [None]:
def receptive_field(kernel_size, num_layers, dilation_cycle):
    """Return the receptive field of a stack of dilated convolutions.

    Layer ``i`` uses dilation ``dilation_cycle[i % len(dilation_cycle)]``, so
    the cycle repeats as often as needed to cover ``num_layers`` layers. Every
    layer widens the field by ``(kernel_size - 1) * dilation`` samples on top
    of the single starting sample.

    Args:
        kernel_size: Kernel size shared by all layers.
        num_layers: Total number of layers in the stack.
        dilation_cycle: Sequence of dilation factors, applied cyclically.

    Returns:
        The receptive field size in samples.
    """
    dilation_total = 0
    for layer_idx in range(num_layers):
        dilation_total += dilation_cycle[layer_idx % len(dilation_cycle)]
    return (kernel_size - 1) * dilation_total + 1


# 30 layers with 7-tap kernels, cycling through dilations 1, 2, 4, ..., 512.
receptive_field(7, 30, [2 ** exponent for exponent in range(10)])

Diffusion Models:
Forward process: x -> x1 -> x2 -> ... -> xT ~ N(0, I)
Train to reverse the noise: x(t) -> x(t-1)

Idea:
Given an input clip x, instead of reversing the process directly, add noise to it and then try to reverse the noise.
Hypothesis: The output will not sound exactly like the original but rather like an improved version of it, at the cost of deviating further from the original.
When we add a lot of noise, the output will sound completely different from the original.
In order to keep the original information, use features from another encoder (for example, one trained on contrasting cover songs).

Diffusion Model TODOs:
- Dataset
- DataLoader
- Collate function
- Model
- Training loop
- Inference

Memory consumption:
- perform checkpointing

Prototype Requirements:
    - Denoise 5s segments
    - Sampling rate: 44100

Model architecture:
- DiffWave:
    - Each layer has the full output resolution
    - Uses exponential dilation factors to have a receptive field that spans the entire input
    - Problem: consumes a lot of memory
    - Possible solution: trade compute for memory by using checkpointing -> too slow
- U-Net WaveNet:
    - Downsamples the sequence to reduce memory footprint and increase performance
    - Problem: the ear is very sensitive to errors in the high frequencies which are troublesome during the upscaling operations
-> Final decision: 
    - Use U-Net because WaveNet is either way too memory demanding or way too slow when using checkpointing
    - Also, we can try to optimize the hell out of the U-Net architecture (skip connections, attention, etc.)

In [None]:
# Load the song index CSV into a DataFrame. Later cells read the `path` and
# `id` columns from rows sampled out of it.
songs_df = pd.read_csv('scraper/songs/dataset.csv')

In [None]:
# Draw up to 1000 songs for the denoising prototype.
# random_state pins the draw so re-running this cell (or running cells out of
# order) always selects the same rows, rather than depending on how much of
# the global NumPy RNG has been consumed so far.
# min(...) keeps the cell from raising when the index has fewer than 1000 rows.
sample = songs_df.sample(n=min(1000, len(songs_df)), random_state=0)
sample

In [None]:
# `os` is not imported explicitly in the imports cell (presumably it only
# leaked in through a star import), so import it here to keep this cell
# self-contained.
import os

root = 'scraper/songs'
dst_dir = 'denoising/data'

# shutil.copyfile does not create missing parent directories.
os.makedirs(dst_dir, exist_ok=True)

# Copy each sampled song next to the denoising code, named by its `id`.
# Passing `total` gives tqdm a proper progress bar without first
# materialising every row tuple in a list.
for row in tqdm(sample.itertuples(), total=len(sample)):
    src = os.path.join(root, row.path)
    dst = f'{dst_dir}/{row.id}.mp4'
    shutil.copyfile(src, dst)

In [3]:
# Build the loader over the mp4 files copied into denoising/data.
# audio_length is 5 * 44100, i.e. 5 s at the 44100 Hz sampling rate listed in
# the prototype requirements - presumably counted in samples; confirm against
# build_dataloader.
# shuffle=False keeps the batch order fixed, so the peek in the next cell is
# repeatable.
data_loader = build_dataloader(
    directory='denoising/data',
    audio_format='mp4',
    batch_size=8,
    audio_length=5 * 44100,
    shuffle=False)

In [4]:
# Smoke test: pull the first batch from the loader, print it, and stop.
# The `break` keeps the cell from iterating over the whole dataset.
for batch in data_loader:
    print(batch)
    break

tensor([[ 0.0041, -0.0708, -0.0289,  ...,  0.0068, -0.0589,  0.0546],
        [ 0.3062,  0.3439,  0.3755,  ...,  0.2264, -0.2304,  0.3136],
        [ 0.1016,  0.0852,  0.0952,  ...,  0.3542,  0.2962,  0.3777],
        ...,
        [ 0.0127,  0.0119,  0.0119,  ...,  0.0449,  0.0449,  0.0479],
        [ 0.0644,  0.1693,  0.1052,  ...,  0.1355,  0.2810,  0.0739],
        [ 0.1137,  0.1390,  0.1248,  ...,  0.0182,  0.0182, -0.0102]])


In [33]:
# Load the U-Net config through the project's Config helper and write it back
# out to tmp/config.py.
# NOTE(review): presumably a round-trip check of fromfile/dump; confirm that
# the tmp/ directory exists before running.
config = Config.fromfile('denoising/configs/unet.py')
config.dump('tmp/config.py')

In [9]:
# Sanity-check the diffusion step embedding: feed three step indices through
# the module and inspect the shape of what comes back.
diffusion_embedding = DiffusionEmbedding(num_diffusion_steps=50, num_channels=32)
embedding = diffusion_embedding(torch.tensor([0, 1, 2]))
embedding.shape

torch.Size([3, 32])