In [85]:
import os
import pandas as pd
from shutil import copyfile
from zipfile import ZipFile
from scripts.globalPaths import ORIGINAL_DATASET_DIR, TARGET_DATASET_DIR
from scripts.auxiliar import set_image_name

In [96]:
def zip_dataset(zip_file_name, PATH, dir_list=None):
    """Pack the files found under ``PATH`` into a new zip archive.

    Every file is stored as ``<parent directory name>/<file name>``, so the
    archive is one level deep no matter how deeply the file sits under
    ``PATH``; files from identically named directories share one archive folder.

    Args:
        zip_file_name: Path of the archive to create (opened in ``'w'`` mode).
        PATH: Root directory that is walked recursively.
        dir_list: Optional collection of directory base names to include.
            When ``None`` or empty, files from every directory are added.

    Returns:
        True once the archive has been written.
    """
    # The archive file exists as soon as ZipFile opens it, i.e. before the walk
    # starts. If it lives under PATH the walk would find it and store the
    # half-written archive inside itself, so remember its location and skip it.
    archive_path = os.path.realpath(zip_file_name)
    with ZipFile(zip_file_name, 'w') as archive:
        for root, _dirs, files in os.walk(PATH):
            basename = os.path.basename(root)
            if dir_list and basename not in dir_list:
                continue
            for fn in files:
                file_path = os.path.join(root, fn)
                if os.path.realpath(file_path) == archive_path:
                    continue
                archive.write(file_path, os.path.join(basename, fn))
    return True

In [13]:
# Labels of the age ranges that make up the "bucket" subset of the data.
AGE_BUCKETS = [
    "(0-2)",
    "(4-6)",
    "(8-12)",
    "(15-20)",
    "(25-32)",
    "(38-43)",
    "(48-53)",
    "(60-100)",
]
# Ordered interval edges; each pair of neighbouring values delimits one
# interval, covering both the AGE_BUCKETS ranges and the gaps between them.
BINS = [
    0, 2,
    4, 6,
    8, 12,
    15, 20,
    25, 32,
    38, 43,
    48, 53,
    60, 100,
]

In [15]:
# One "(lo-hi)" label for every pair of neighbouring BINS edges.
LABELS = [f'({lower}-{upper})' for lower, upper in zip(BINS[:-1], BINS[1:])]

In [17]:
def remove_left_zero(target):
    """Strip leading ``'0'`` characters from a string.

    Args:
        target: String to strip, e.g. a zero-padded folder name.

    Returns:
        ``target`` without its leading zeros, or ``'0'`` when nothing is left
        (the input was empty or consisted only of zeros).
    """
    # str.lstrip does the scan; the fallback keeps "000" and "" mapping to "0".
    return target.lstrip('0') or '0'

In [22]:
# Build the table of source folders and their integer values.
# os.listdir returns every entry of ORIGINAL_DATASET_DIR, so a stray
# non-numeric one (hidden file, notes, ...) would make the int conversion
# below raise; only all-digit names are kept.
# NOTE(review): presumably each folder is named after an age — confirm.
numeric_folder_names = [name for name in os.listdir(ORIGINAL_DATASET_DIR) if name.isdigit()]
folder_transform = pd.DataFrame({'folder': numeric_folder_names})
folder_transform['int_folder'] = folder_transform.folder.apply(remove_left_zero).astype(int)
folder_transform.head()

Unnamed: 0,folder,int_folder
0,24,24
1,23,23
2,15,15
3,12,12
4,79,79


In [25]:
# Label every folder with the BINS interval its integer value falls into.
# NOTE(review): pd.cut intervals are right-closed by default, so a value equal
# to an edge lands in the interval that ends there (15 -> "(12-15)", not
# "(15-20)") — confirm this is the intended boundary handling.
bucket_labels = pd.cut(folder_transform.int_folder, bins=BINS, labels=LABELS, include_lowest=True)
folder_transform = folder_transform.assign(buckets=bucket_labels)

In [27]:
# Preview the first rows to check the bucket assigned to each folder.
folder_transform.head()

Unnamed: 0,folder,int_folder,buckets
0,24,24,(20-25)
1,23,23,(20-25)
2,15,15,(12-15)
3,12,12,(8-12)
4,79,79,(60-100)


In [36]:
# Discard folders that did not receive a bucket label.
folder_transform = folder_transform.dropna(subset=['buckets'])

In [77]:
# Create one output directory per bucket label in use.
# makedirs(exist_ok=True) keeps the cell safe to re-run and also creates
# TARGET_DATASET_DIR itself when it does not exist yet, where a plain
# os.mkdir would raise FileNotFoundError.
for bucket in folder_transform.buckets.unique():
    os.makedirs(os.path.join(TARGET_DATASET_DIR, bucket), exist_ok=True)

In [76]:
def get_target_name_from_origin(origin_folder, convertion_df):
    """Look up the bucket assigned to a source folder.

    Args:
        origin_folder: Folder name to search for in the ``folder`` column.
        convertion_df: DataFrame with ``folder`` and ``buckets`` columns.

    Returns:
        The ``buckets`` value of the first matching row, or ``None`` when no
        row has that folder name.
    """
    is_match = convertion_df.folder == origin_folder
    matches = convertion_df.loc[is_match, 'buckets'].values
    return matches[0] if len(matches) else None

In [75]:
# Spot-check the folder -> bucket lookup on the first folder of the table, so
# the cell does not rely on a `folder` variable left behind by another cell.
sample_folder = folder_transform.folder.iloc[0]
folder_transform.loc[folder_transform.folder == sample_folder, 'buckets'].values[0]

'(20-25)'

In [79]:
# Copy the files of every source folder into the directory of its bucket and
# record one (bucket, image name) row per copied file.
image_rows = []
for root, dirs, files in os.walk(ORIGINAL_DATASET_DIR):
    # Only directories holding at least one .png are processed.
    if not any(fn.endswith('.png') for fn in files):
        continue
    folder = os.path.basename(root)
    # The bucket depends on the folder alone, so it is resolved once per
    # directory instead of once per file.
    target_folder = get_target_name_from_origin(folder, folder_transform)
    if not target_folder:
        continue
    for fn in files:
        full_original_file_path = os.path.join(root, fn)
        full_target_file_path = os.path.join(TARGET_DATASET_DIR, target_folder, fn)
        # NOTE(review): every file of the directory is copied, not only the
        # .png ones, and equally named files from different source folders
        # mapped to the same bucket overwrite each other — confirm both are
        # acceptable for this dataset.
        copyfile(full_original_file_path, full_target_file_path)
        image_rows.append((target_folder, set_image_name(fn, prefix=target_folder)))
# Build the frame once from the collected rows rather than growing it one
# row at a time inside the loop.
images = pd.DataFrame(image_rows, columns=['folder', 'filename'])
images.head()

Unnamed: 0,folder,filename
0,(20-25),(20-25)/3949.png
1,(20-25),(20-25)/4767.png
2,(20-25),(20-25)/5445.png
3,(20-25),(20-25)/7520.png
4,(20-25),(20-25)/2128.png


In [84]:
# Write the complete listing first, then only the rows whose folder is one of
# the AGE_BUCKETS labels.
images.to_csv('full_labeled_set.csv', index=False)
in_age_buckets = images.folder.isin(AGE_BUCKETS)
images.loc[in_age_buckets].to_csv('bucket_labeled_set.csv', index=False)

In [97]:
# Archive only the AGE_BUCKETS directories, then every directory under the
# target root.
zip_dataset(zip_file_name='bucket_labeled_dataset.zip', PATH=TARGET_DATASET_DIR, dir_list=AGE_BUCKETS)
zip_dataset(zip_file_name='full_labeled_dataset.zip', PATH=TARGET_DATASET_DIR)

True