# This notebook scans all directories in an input bucket and converts all audio files to the chosen output format (e.g. .wav)

This occurs in place: each original file is deleted and replaced with a file in the output format (e.g. a .wav file). It overwrites any existing file with the same name.

It expects input of a "bucket" folder, consisting of subfolders named as species, and the audio files within.
If converting a single folder, create a parent folder to put it into.

This utilises a multi-threaded approach to complete the conversion as fast as possible, but it can cause your computer to slow down while running, as there is no memory limit/check built in.

The number of threads is equal to the number of folders in the source_folder.

### Currently it requires the source directory to have folders in it

### *Use the split into folders notebook to organise sub-folders. It will create many, which speeds up the multi-threaded conversion* 

Audio file formats are
- .mp3
- .flac
- .wav
- .ogg

Video file inputs are
- .mp4
- .m4a

In [None]:
import warnings
warnings.filterwarnings('ignore')
import os
import pathlib
import glob
import subprocess
import threading 
import time
from IPython.display import clear_output


### Enter the source folder containing subfolders

### This source folder must be a parent directory containing at least 1 folder of audio files to convert

In [None]:
# input parameters
# parent ("bucket") directory whose sub-folders hold the audio files to convert
source_folder = "/Volumes/UGREEN Samsung SSD/project echo audio/eBird/Large files"

# options are currently ".mp3" or ".wav" (there is scope to add more)
# keep the leading dot: the value is compared against each file's suffix and
# appended to the file stem to build the converted file's name
output_file_type = ".mp3"

In [None]:
# TODO:
# - check if there are multiple folders in source_folder
# - build in a MAX amount of memory allocated


--- 
## Generate list of all folders in the source path

In [None]:
# Collect only the immediate sub-folders of source_folder.
# Each sub-folder is searched recursively by its own worker thread, so listing
# nested folders here as well (as a recursive "**/*" glob does) would hand the
# same files to several threads at once: duplicate conversions and races on
# deleting the originals.
sub_folders = pathlib.Path(source_folder).glob("*")  # generator object
sub_folder_paths = [x for x in sub_folders if x.is_dir()]
print(len(sub_folder_paths))
# notebook display: preview the first few folders
sub_folder_paths[:4]

--- 
## Generate list of all audio files paths in the sub-folder

In [None]:
# this creates a new version of the input file, converted to the output format
def convert_file(input_file_path):
    """Convert one media file to ``output_file_type`` using ffmpeg.

    The converted file is written next to the input, with the same stem and
    the ``output_file_type`` suffix. An existing file of that name is
    overwritten (ffmpeg ``-y``). The input file itself is not removed here.

    Args:
        input_file_path: path (``str`` or ``pathlib.Path``) of the file to
            convert.

    Returns:
        ``True`` if ffmpeg exited with status 0, ``False`` otherwise, so that
        callers can tell whether the conversion worked.
    """
    source_path = pathlib.Path(input_file_path)
    # build the destination with pathlib instead of a hard-coded "/" separator
    output_file_path = source_path.parent / (source_path.stem + output_file_type)
    # ffmpeg's console output is discarded, so the exit status is the only
    # failure signal available; report it instead of dropping it
    exit_status = subprocess.call(
        ['ffmpeg', '-y', '-i', str(source_path), str(output_file_path)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )
    return exit_status == 0

In [None]:
# check each file type and convert it if necessary
# it creates a new version of the file, then deletes the old
# it skips over the file if it is already in the output format
# or if it is not one of the supported input formats

# lower-case suffixes that may be converted; any other file is left untouched
SUPPORTED_INPUT_SUFFIXES = frozenset({
    ".wav", ".mp3", ".flac", ".ogg",  # audio file types
    ".m4a", ".mp4",                   # video file types
})


def check_file_type(audio_file_path):
    """Convert one file to ``output_file_type`` and delete the original.

    Nothing happens (no conversion, no deletion) when the file's suffix is
    not in ``SUPPORTED_INPUT_SUFFIXES`` or when the file is already in the
    output format. Suffixes are compared case-insensitively, so ``.MP3`` is
    treated like ``.mp3``.

    The original is deleted only after the converted file is found on disk
    and is non-empty; otherwise the original is kept and a message is
    printed.

    Args:
        audio_file_path: path (``str`` or ``pathlib.Path``) of the file to
            check.
    """
    source_path = pathlib.Path(audio_file_path)
    suffix = source_path.suffix.lower()

    # never touch files we do not know how to convert (notes, images, ...)
    if suffix not in SUPPORTED_INPUT_SUFFIXES:
        return
    # already in the requested format
    if suffix == output_file_type.lower():
        return

    convert_file(audio_file_path)

    # same naming rule as the conversion step: same folder, same stem, new suffix
    converted_path = source_path.parent / (source_path.stem + output_file_type)
    if not converted_path.is_file() or converted_path.stat().st_size == 0:
        # conversion produced nothing usable: keep the original
        print("conversion failed, keeping {}".format(source_path))
        return

    # delete the old file
    os.remove(source_path)
    return

In [None]:
# worker body run by each thread
# it walks one subfolder of the input directory (including anything nested in it)
# and passes every file it finds to check_file_type for conversion

def threaded_function(sub_folder_path):
    """Convert every file found beneath ``sub_folder_path``, then report completion.

    Args:
        sub_folder_path: ``pathlib.Path`` of the folder to process (its
            ``stem`` is used in the progress message).
    """
    # take a snapshot of the file list before converting anything
    files_to_check = list(
        filter(lambda entry: entry.is_file(),
               pathlib.Path(sub_folder_path).glob("**/*"))
    )
    for audio_file in files_to_check:
        check_file_type(audio_file)
    print("folder {} done".format(sub_folder_path.stem))
    # leave the message on screen for a moment, then clear the cell output
    time.sleep(1)
    clear_output()
    return


In [None]:
# create a list of threads
# one thread is created per folder; there is no upper limit on the thread count

def create_threads(sub_folder_paths):
    """Build one unstarted worker thread per sub-folder.

    Args:
        sub_folder_paths: sequence of folder paths; each is passed as the
            single argument to ``threaded_function``.

    Returns:
        A list of ``threading.Thread`` objects, in the same order as
        ``sub_folder_paths``. The threads are not started here.
    """
    return [
        threading.Thread(target=threaded_function, args=(folder,))
        for folder in sub_folder_paths
    ]


## Main function that converts all files to the specified format
### Multi-threaded function, so expect your computer to grind to a halt for a few minutes

In [None]:
# create a thread for every folder
thread_list = create_threads(sub_folder_paths)
# start converting files in each folder concurrently
# (every thread is started before any is joined, so the folders run in parallel)
for thread in thread_list:
    thread.start()
# wait for all threads to complete
# (join() blocks until that thread has finished its folder)
for thread in thread_list:
    thread.join()
print("done all")
# final summary; printed once all threads have finished, without checking
# whether the individual conversions succeeded
print("Conversion of {}\n to format \"{}\" complete!".format(source_folder, output_file_type))