## VOCANO: Transcribing singing vocal notes in polyphonic music using semi-supervised learning
Transcribe vocal WAV files into MIDI.

## Requirements
Please run the following cells in order, from top to bottom, to prepare the environment for transcription.

In [None]:
#@title Cloning from github

# Fetch the VOCANO sources into ./VOCANO under the current working directory.
# git refuses to clone into an existing non-empty directory, so re-running this
# cell after a successful clone only prints an error and changes nothing.
!git clone https://github.com/B05901022/VOCANO.git

In [None]:
#@title Moving to directory

# Switch the kernel's working directory into the cloned repository; the later
# cells use paths relative to it (requirements.txt, ./checkpoint/model.pt,
# ./generated/...). Keep comments on their own lines: text after %cd is treated
# as part of the magic's argument.
%cd VOCANO

In [None]:
#@title Script for downloading apex

%%writefile setup.sh

export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
pip install -v --disable-pip-version-check --no-cache-dir ./apex

In [None]:
#@title Download apex (May take a couple minutes)

# Execute the setup.sh script written by the previous cell: it clones NVIDIA apex
# and pip-installs it from the local checkout.
!sh setup.sh

In [None]:
#@title Download other prerequisites

!pip install -r requirements.txt -f https://download.pytorch.org/whl/torch_stable.html

## Uploading Files for Transcription
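Please run exactly one of the two cells below to upload your vocal file for transcription: the first converts the uploaded file (e.g. an MP3) to WAV with ffmpeg, the second uses an uploaded WAV file as is. If you run both, the transcription uses the file from whichever cell ran last.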

In [None]:
#@title Upload MP3 (Please allow all cookies to avoid errors)

import subprocess

from google.colab import files

# Open the Colab upload dialog. The result is keyed by uploaded file name.
ff = files.upload()
if not ff:
    # A cancelled dialog yields an empty result; fail with a readable message
    # instead of an IndexError.
    raise RuntimeError('No file was uploaded. Re-run this cell and choose an audio file.')

# Only the first uploaded file is transcribed.
source_audio = next(iter(ff))
converted_audio = f"{source_audio}.wav"

# Convert to WAV. Arguments are passed as a list (no shell involved), so file
# names containing spaces, quotes or '$' are handled safely. '-y' overwrites a
# WAV left over from a previous run.
conversion = subprocess.run(
    ['ffmpeg', '-y', '-i', source_audio, converted_audio],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
)
# Show the ffmpeg log in the cell output, as the shell invocation used to do.
print(conversion.stdout.decode('utf-8', errors='replace'))
if conversion.returncode != 0:
    raise RuntimeError(f'ffmpeg failed to convert {source_audio!r} to WAV (see the log above).')

# Path consumed by the transcription cell; only set once the WAV really exists.
uploaded_audio = converted_audio

In [None]:
#@title Upload WAV (Please allow all cookies to avoid errors)

from google.colab import files

# Open the Colab upload dialog. The result is keyed by uploaded file name.
ff = files.upload()
if not ff:
    # A cancelled dialog yields an empty result; fail with a readable message
    # instead of an IndexError.
    raise RuntimeError('No file was uploaded. Re-run this cell and choose a WAV file.')

# Only the first uploaded file is transcribed; the transcription cell reads this name.
uploaded_audio = next(iter(ff))

## Transcription
Main script for singing voice transcription. When run, the cell prompts for the output file name.

In [None]:
#@title Transcription

import argparse
from pathlib import Path
from vocano.core import SingingVoiceTranscription

# Ask the user how the output should be named.
file_name = input('Output file name:')

# Settings object handed to SingingVoiceTranscription. The meaning of each option
# is defined by the vocano package; the notes below are assumptions drawn from the
# option names -- TODO confirm against vocano's documentation.
args = argparse.Namespace(
    name=file_name,
    # Locations under ./generated (presumably for intermediate and output files).
    feat_dir=Path('./generated/feat'),
    pitch_dir=Path('./generated/pitch'),
    midi_dir=Path('./generated/midi'),
    output_wav_dir=Path('./generated/wav'),
    # Audio file chosen in one of the upload cells above.
    wavfile_dir=Path(uploaded_audio),
    pitch_gt_dir=Path('groundtruth/pitch.npy'),
    checkpoint_file='./checkpoint/model.pt',
    save_extracted=False,
    use_pre_extracted=False,
    use_groundtruth=False,
    device='0',        # presumably a GPU index -- TODO confirm
    use_cp=True,
    batch_size=64,
    num_workers=0,
    pin_memory=True,
    amp_level='O0',    # presumably an apex AMP optimisation level -- TODO confirm
)

# Build the transcriber from the settings and run it.
solver = SingingVoiceTranscription(args)
solver.transcription()