In [1]:
import sys
import os
import zipfile
import re
import math
from tqdm import tqdm_notebook

# Allows me to import my modules
sys.path.append('./modules')
from audio_utils import *

# A common split is 60% training, 20% validation and 20% testing. It balances
# training error (too little training data) against evaluation error (too
# little test data); no sample is placed in more than one set, to prevent
# overfitting.
training, validation, test = 0.6, 0.2, 0.2

# Express each fraction as a whole percentage. round() is used instead of
# int() because products such as 0.29*100 evaluate to 28.999999999999996,
# which int() would silently truncate to 28.
training_pct, validation_pct, test_pct = (
    round(fraction * 100) for fraction in (training, validation, test)
)

# Reduce the percentages by their greatest common divisor so the split is
# expressed as the smallest whole-number ratio (60/20/20 -> 3/1/1, s = 5).
d = math.gcd(math.gcd(training_pct, validation_pct), test_pct)
training_bound, validation_bound, test_bound = (
    training_pct // d,
    validation_pct // d,
    test_pct // d,
)
# Length of one full cycle through the three sets.
s = training_bound + validation_bound + test_bound

Attempting to read settings file...
	Read successfully!


In [4]:
# Archive type prefixes to process; an archive in zipped_data is selected when
# its file name, minus the last two "_"-separated parts, matches an entry here.
datatypes = [
    "cropped_data",
    "multiclassed_data",
    "multiclassed_data_no_augs",
    "pre_multiclass_augmented_data",
    "post_multiclass_augmented_data",
    "post_multiclass_augmented_data_no_pre",
    "pre_multiclass_augmented_data_bassdrum",
]

In [5]:
# Sanity check: print every archive in ./zipped_data whose type is listed in
# `datatypes` and confirm that it can be opened as a zip file.
zip_dir = os.path.join(os.getcwd(), "zipped_data")
zip_archives = os.listdir(zip_dir)
for archive_name in zip_archives:
    # Drop the last two "_"-separated parts of the name to get the data type,
    # e.g. "cropped_data_0_2019-02-25.zip" -> "cropped_data".
    archive_type = "_".join(archive_name.split("_")[:-2])
    if archive_type in datatypes:
        print(archive_name)
        try:
            # Opening is the whole test; nothing is read from the archive here.
            with zipfile.ZipFile(os.path.join(zip_dir, archive_name), mode="r"):
                print("Openned archive.")
        except Exception:
            # Exception rather than a bare except, so that Ctrl-C
            # (KeyboardInterrupt) and SystemExit are not swallowed.
            print("Failed to open archive.")
        

cropped_data_0_2019-02-25.zip
Openned archive.
cropped_data_0_2019-03-19.zip
Openned archive.
multiclassed_data_0_2019-02-25.zip
Openned archive.
multiclassed_data_0_2019-03-19.zip
Openned archive.
multiclassed_data_no_augs_0_2019-02-25.zip
Openned archive.
multiclassed_data_no_augs_0_2019-03-19.zip
Openned archive.
post_multiclass_augmented_data_0_2019-03-07.zip
Openned archive.
post_multiclass_augmented_data_2_2019-03-19.zip
Openned archive.
post_multiclass_augmented_data_no_pre_0_2019-04-16.zip
Openned archive.
pre_multiclass_augmented_data_0_2019-02-25.zip
Openned archive.
pre_multiclass_augmented_data_0_2019-03-27.zip
Openned archive.
pre_multiclass_augmented_data_bassdrum_0_2019-04-22.zip
Openned archive.


In [6]:
# Extract every file from the selected archives into
#   ./audio_data/<data_set>/<up to three parent folders>/<uid>.<ext>
# cycling through the data sets so that, out of every `s` files written,
# the first `training_bound` go to training, the next `validation_bound` to
# validation and the rest to test.
i = 0  # Number of files written so far, across all archives; drives the cycle.
zip_dir = os.path.join(os.getcwd(), "zipped_data")
zip_archives = os.listdir(zip_dir)
for archive_name in tqdm_notebook(zip_archives, total=len(zip_archives), desc="Reading archives"):
    # Drop the last two "_"-separated parts of the name to get the data type,
    # e.g. "cropped_data_0_2019-02-25.zip" -> "cropped_data".
    archive_type = "_".join(archive_name.split("_")[:-2])
    if archive_type not in datatypes:
        continue
    try:
        with zipfile.ZipFile(os.path.join(zip_dir, archive_name), mode="r") as archive:
            print("Current archive: {}".format(archive))
            # Keep only members whose last path component contains a dot;
            # these are treated as files, everything else is skipped.
            # Zip member names always use "/" as the separator.
            archive_files = [
                info for info in archive.infolist()
                if "." in info.filename.split("/")[-1]
            ]
            for fileinfo in tqdm_notebook(archive_files, total=len(archive_files), desc="Files extracted"):
                # Choosing data_set between "training", "validation" & "test"
                # from the position of this file within the current cycle.
                position = i % s
                if position < training_bound:
                    data_set = "training_data"
                elif position < training_bound + validation_bound:
                    data_set = "validation_data"
                else:
                    data_set = "test_data"

                path_components = fileinfo.filename.split("/")
                # The (up to) three folders directly above the file are
                # reproduced under the target data set.
                parent_dirs = path_components[-4:-1]
                # Split on the LAST dot only, so names containing several
                # dots no longer raise and abort the rest of the archive.
                uid, ext = path_components[-1].rsplit(".", 1)

                subdirs = os.path.join(os.getcwd(), "audio_data", data_set, *parent_dirs)
                os.makedirs(subdirs, exist_ok=True)

                fp = os.path.join(subdirs, uid + "." + ext)
                # Never overwrite an existing file: bump the id until the
                # path is free.
                # NOTE(review): this assumes the file stem is an integer; a
                # non-numeric stem that collides raises ValueError, which the
                # handler below reports before moving on to the next archive.
                while os.path.exists(fp):
                    uid = str(int(uid) + 1)
                    fp = os.path.join(subdirs, uid + "." + ext)

                with open(fp, "wb") as write_location:
                    write_location.write(archive.read(fileinfo))
                i += 1
    except Exception:
        # Report the failure and carry on with the next archive. Exception
        # rather than a bare except, so that interrupting this long-running
        # cell (KeyboardInterrupt) actually stops it.
        print(*sys.exc_info())

HBox(children=(IntProgress(value=0, description='Reading archives', max=19, style=ProgressStyle(description_wi…

Current archive: <zipfile.ZipFile filename='D:\\Documents\\University\\Year3\\Independant Studies\\zipped_data\\cropped_data_0_2019-02-25.zip' mode='r'>


HBox(children=(IntProgress(value=0, description='Files extracted', max=922, style=ProgressStyle(description_wi…

Current archive: <zipfile.ZipFile filename='D:\\Documents\\University\\Year3\\Independant Studies\\zipped_data\\cropped_data_0_2019-03-19.zip' mode='r'>


HBox(children=(IntProgress(value=0, description='Files extracted', max=147, style=ProgressStyle(description_wi…

Current archive: <zipfile.ZipFile filename='D:\\Documents\\University\\Year3\\Independant Studies\\zipped_data\\multiclassed_data_0_2019-02-25.zip' mode='r'>


HBox(children=(IntProgress(value=0, description='Files extracted', max=5992, style=ProgressStyle(description_w…

Current archive: <zipfile.ZipFile filename='D:\\Documents\\University\\Year3\\Independant Studies\\zipped_data\\multiclassed_data_0_2019-03-19.zip' mode='r'>


HBox(children=(IntProgress(value=0, description='Files extracted', max=2697, style=ProgressStyle(description_w…

Current archive: <zipfile.ZipFile filename='D:\\Documents\\University\\Year3\\Independant Studies\\zipped_data\\multiclassed_data_no_augs_0_2019-02-25.zip' mode='r'>


HBox(children=(IntProgress(value=0, description='Files extracted', max=107, style=ProgressStyle(description_wi…

Current archive: <zipfile.ZipFile filename='D:\\Documents\\University\\Year3\\Independant Studies\\zipped_data\\multiclassed_data_no_augs_0_2019-03-19.zip' mode='r'>


HBox(children=(IntProgress(value=0, description='Files extracted', max=87, style=ProgressStyle(description_wid…

Current archive: <zipfile.ZipFile filename='D:\\Documents\\University\\Year3\\Independant Studies\\zipped_data\\post_multiclass_augmented_data_0_2019-03-07.zip' mode='r'>


HBox(children=(IntProgress(value=0, description='Files extracted', max=116760, style=ProgressStyle(description…

Current archive: <zipfile.ZipFile filename='D:\\Documents\\University\\Year3\\Independant Studies\\zipped_data\\post_multiclass_augmented_data_2_2019-03-19.zip' mode='r'>


HBox(children=(IntProgress(value=0, description='Files extracted', max=76520, style=ProgressStyle(description_…

Current archive: <zipfile.ZipFile filename='D:\\Documents\\University\\Year3\\Independant Studies\\zipped_data\\post_multiclass_augmented_data_no_pre_0_2019-04-16.zip' mode='r'>


HBox(children=(IntProgress(value=0, description='Files extracted', max=2140, style=ProgressStyle(description_w…

Current archive: <zipfile.ZipFile filename='D:\\Documents\\University\\Year3\\Independant Studies\\zipped_data\\pre_multiclass_augmented_data_0_2019-02-25.zip' mode='r'>


HBox(children=(IntProgress(value=0, description='Files extracted', max=13640, style=ProgressStyle(description_…

Current archive: <zipfile.ZipFile filename='D:\\Documents\\University\\Year3\\Independant Studies\\zipped_data\\pre_multiclass_augmented_data_0_2019-03-27.zip' mode='r'>


HBox(children=(IntProgress(value=0, description='Files extracted', max=18228, style=ProgressStyle(description_…

Current archive: <zipfile.ZipFile filename='D:\\Documents\\University\\Year3\\Independant Studies\\zipped_data\\pre_multiclass_augmented_data_bassdrum_0_2019-04-22.zip' mode='r'>


HBox(children=(IntProgress(value=0, description='Files extracted', max=43000, style=ProgressStyle(description_…


