In [1]:
# Step up one directory (presumably from the notebook folder to the project
# root) so the relative paths used below (scraper/, denoising/, tmp/) resolve.
# NOTE(review): not idempotent - re-running this cell moves up one more level.
%cd ..

/home/philipp/ai-audio-enhancer


In [11]:
import os
import random
import shutil
import subprocess
from glob import glob

import librosa
import numpy as np
import pandas as pd
import seaborn
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio as T
import torchaudio.transforms as TT
import wandb
from matplotlib import pyplot as plt
from moviepy.editor import *
from pydub import AudioSegment
from scipy.io import wavfile
from tqdm import tqdm

from dataset import build_dataloader
from model import DiffusionModel

# Seed every RNG this notebook draws from so that Restart & Run All is
# reproducible. torch is seeded too: the nn.Conv2d layers built further down
# initialise their weights from torch's generator.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

Convert mp4 to aac

In [None]:
def mp4_to_aac(in_path, out_path):
    """Run ffmpeg to write ``in_path`` to ``out_path`` using stream copy.

    The command passes ``-c copy`` (no re-encoding) and ``-y`` (overwrite an
    existing output file without prompting).

    Args:
        in_path: Path of the input file handed to ffmpeg via ``-i``.
        out_path: Path of the output file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    command = ['ffmpeg', '-y', '-i', in_path, '-c', 'copy', out_path]
    # Discard ffmpeg's console output: stdout goes to the null device and
    # stderr is redirected into stdout.
    silenced = dict(stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    subprocess.run(command, check=True, **silenced)

In [None]:
# Decode the test clip with librosa, requesting mono output at 22050 Hz.
audio, sr = librosa.load('tmp/test.aac', sr=22050, mono=True)
audio

In [None]:
from torch_receptive_field import receptive_field

# Twelve repetitions of a two-layer pattern, all with 7x7 kernels and a single
# channel: a stride-2 convolution (padding 3) followed by a stride-1
# convolution with dilation 2 (padding 6).
model = nn.Sequential(*(
    nn.Conv2d(1, 1, kernel_size=7, stride=stride, padding=padding, dilation=dilation)
    for _ in range(12)
    for stride, padding, dilation in ((2, 3, 1), (1, 6, 2))
))

# Probe the stack with an input of 5 * 44100 samples (5 s at 44.1 kHz), laid
# out as a single-channel input of shape (1, size, 1). The model is moved to
# the GPU first, so this cell needs CUDA.
size = 5 * 44100
receptive_field(model.cuda(), input_size=(1, size, 1))

In [None]:
def receptive_field(kernel_size, num_layers, dilation_cycle):
    """Closed-form receptive field of a stack of dilated convolutions.

    Layer ``i`` uses dilation ``dilation_cycle[i % len(dilation_cycle)]``; the
    result is ``(kernel_size - 1) * (sum of all layer dilations) + 1``.

    NOTE(review): this definition shadows the ``receptive_field`` imported
    from ``torch_receptive_field`` in the previous cell.

    Args:
        kernel_size: Kernel size shared by every layer.
        num_layers: Number of layers in the stack.
        dilation_cycle: Sequence of dilation factors, repeated cyclically.

    Returns:
        The receptive field size.
    """
    cycle_length = len(dilation_cycle)
    total_dilation = 0
    for layer in range(num_layers):
        total_dilation += dilation_cycle[layer % cycle_length]
    return (kernel_size - 1) * total_dilation + 1

# Kernel size 7, 30 layers, dilations cycling through 1, 2, 4, ..., 512.
receptive_field(kernel_size=7, num_layers=30, dilation_cycle=[2 ** exponent for exponent in range(10)])

Diffusion Models:
Forward process: $x_0 \to x_1 \to x_2 \to \dots \to x_T \sim \mathcal{N}(0, I)$
Train to reverse the noise: $x_t \to x_{t-1}$

Idea:
Given input clip x, instead of reversing the process directly, add noise to it and then try to reverse the noise.
Hypothesis: The output will not be a reproduction of the original but rather an improved version of it, at the cost of deviating from the original.
When we add a lot of noise, the output will sound completely different from the original.
In order to keep the original information, use features from another encoder (for example trained on contrasting cover songs).

Diffusion Model TODOs:
- Dataset
- DataLoader
- Collate function
- Model
- Training loop
- Inference

Memory consumption:
- perform checkpointing

Prototype Requirements:
    - Denoise 5s segments
    - Sampling rate: 44100

Model architecture:
- DiffWave:
    - Each layer has the full output resolution
    - Uses exponential dilation factors to have a receptive field that spans the entire input
    - Problem: consumes a lot of memory
    - Possible solution: trade compute for memory by using checkpointing -> too slow
- U-Net WaveNet:
    - Downsamples the sequence to reduce memory footprint and increase performance
    - Problem: the ear is very sensitive to errors in the high frequencies which are troublesome during the upscaling operations
-> Final decision: 
    - Use the U-Net variant, because the full-resolution WaveNet (DiffWave) is either far too memory-demanding or, with checkpointing, far too slow
    - Also, we can try to optimize the hell out of the U-Net architecture (skip connections, attention, etc.)

In [13]:
# Load the scraper's song metadata table; later cells read its `id` and `path`
# columns to locate the downloaded files.
songs_df = pd.read_csv('scraper/songs/dataset.csv')

In [14]:
# Draw a fixed 1000-song subset. `random_state` pins the draw so that
# re-running this cell (or running cells out of order) always selects the same
# songs instead of depending on how far the global NumPy RNG has advanced.
sample = songs_df.sample(1000, random_state=0)
sample

Unnamed: 0,bitrate,audio_codec,filesize,id,title,artist,video_title,url,length,views,result_index,path
19805,50409,mp4a.40.5,1530732,a870d462-928b-4edc-8c2a-26a9245df71a,bold as love,john mayer,"Bold As Love (Jimi Hendrix, John Mayer Ver.) G...",https://youtube.com/watch?v=Ve80E111wpA,257,13693,6,john mayer/bold as love/a870d462-928b-4edc-8c2...
46732,49874,mp4a.40.5,1429373,d99b4de5-3ff9-40e6-81b9-0f7df3f5be56,ese momento,luis miguel,Robert Sarkozi - Ese Momento - Luis Miguel (co...,https://youtube.com/watch?v=HHLOupF7vuA,234,344,2,luis miguel/ese momento/d99b4de5-3ff9-40e6-81b...
7529,50144,mp4a.40.5,2456195,db84417c-39a6-46d0-a67a-6e632dcbb08b,animal magnetism,scorpions,Wynjara - Animal Magnetism (Scorpions cover),https://youtube.com/watch?v=vwYaZr_eIEE,403,3341,3,scorpions/animal magnetism/db84417c-39a6-46d0-...
114946,50004,mp4a.40.5,999796,7861a049-286f-4a85-a461-205ebb456b1c,normal girl,bryson tiller,Bryson Tiller - Normal Girl (Slowed + Reverb) ...,https://youtube.com/watch?v=mhzxopBReaQ,164,7668,4,bryson tiller/normal girl/7861a049-286f-4a85-a...
59988,50024,mp4a.40.5,1302801,f0c15db3-76d4-4a46-96a0-99a458907ecd,glued,melanie martinez,Melanie Martinez - Glued - Cover & Music Video,https://youtube.com/watch?v=fjSuyNmjYvM,213,3203,3,melanie martinez/glued/f0c15db3-76d4-4a46-96a0...
...,...,...,...,...,...,...,...,...,...,...,...,...
88855,49301,mp4a.40.5,1277094,9ccaca63-003a-4cf1-9e3f-80f2e1c1f5d4,latch,disclosure,Latch - Disclosure ft. Sam Smith (Hannah Trigw...,https://youtube.com/watch?v=3Y4SpglXbBE,211,757046,3,disclosure/latch/9ccaca63-003a-4cf1-9e3f-80f2e...
100363,49962,mp4a.40.5,1538703,27eff101-a73c-4a43-ac49-e7c5c53deac5,man on the edge,iron maiden,"Iron Maiden - ""Man On The Edge"" cover",https://youtube.com/watch?v=wuyh5wcJeJA,252,8131,1,iron maiden/man on the edge/27eff101-a73c-4a43...
49370,49990,mp4a.40.5,1322613,32d67110-4eaf-4930-8813-bb4b9f68be60,fall again,michael jackson,GABI - Fall Again Michael Jackson (cover),https://youtube.com/watch?v=AHjyIap8yYY,217,5838,1,michael jackson/fall again/32d67110-4eaf-4930-...
52113,49858,mp4a.40.5,551903,852ddc9f-ea7e-4e18-b41c-8a78afa8e835,fire flies,gorillaz,Fire Flies - Gorillaz (Cover by Eggo),https://youtube.com/watch?v=-WjRPRU5Prc,92,479,6,gorillaz/fire flies/852ddc9f-ea7e-4e18-b41c-8a...


In [15]:
# Copy each sampled file into one flat working directory, named by song id.
root = 'scraper/songs'
dst_dir = 'denoising/data'
# shutil.copyfile does not create missing parent directories.
os.makedirs(dst_dir, exist_ok=True)
# Passing `total` keeps the progress bar without materialising all rows first.
for row in tqdm(sample.itertuples(), total=len(sample)):
    src = os.path.join(root, row.path)
    dst = f'{dst_dir}/{row.id}.mp4'
    shutil.copyfile(src, dst)

100%|██████████| 1000/1000 [00:02<00:00, 426.28it/s]


In [None]:
# Collect the frame rate pydub reports for every .aac file in the working
# directory.
# NOTE(review): the copy cell above writes .mp4 files; presumably an
# mp4 -> aac conversion was run in between - confirm.
paths = glob('denoising/data/*.aac')
sampling_rates = []
for path in tqdm(paths):
    audio = AudioSegment.from_file(path, 'aac')
    sampling_rates.append(audio.frame_rate)
# `audio` stays bound to the last file loaded; a later cell inspects it.
sampling_rates

In [10]:
# Number of files whose sampling rate is not 44100 Hz.
sum(1 for rate in sampling_rates if rate != 44100)

0

In [6]:
# Inspect `audio` - presumably the AudioSegment left over from the last
# iteration of the sampling-rate loop (confirm execution order).
print(audio.frame_rate)
# Copy the decoded samples into a NumPy array to check how many there are.
samples = np.array(audio.get_array_of_samples())
samples.shape

44100


(16785408,)

In [29]:
# Build the project's data loader over the copied clips.
# NOTE(review): audio_length = 5 * 44100 is presumably a length in samples
# (5 s at 44.1 kHz) - confirm against dataset.build_dataloader.
data_loader = build_dataloader(
    directory='denoising/data',
    audio_format='mp4',
    batch_size=8,
    audio_length=5 * 44100,
    shuffle=False)

In [30]:
# Fetch a single batch as a smoke test of the loader. An empty loader now
# fails loudly instead of the cell silently producing no output.
batch = next(iter(data_loader), None)
assert batch is not None, 'data_loader yielded no batches'
batch

tensor([[-0.2237, -0.1564, -0.2192,  ..., -0.1639, -0.3018, -0.2074],
        [-0.3581, -0.3445, -0.3942,  ...,  0.1373,  0.0517,  0.0770],
        [ 0.3277,  0.2013,  0.2853,  ..., -0.1835, -0.3482, -0.1794],
        ...,
        [ 0.0188,  0.0188,  0.0204,  ...,  0.0843,  0.0656,  0.0656],
        [-0.2343, -0.1724, -0.1772,  ..., -0.3191, -0.1817, -0.3513],
        [-0.0471, -0.0508, -0.0480,  ...,  0.0244,  0.0244,  0.0200]])
