In [None]:
# Mount Google Drive inside the runtime at /content/gdrive.
# NOTE(review): the google.colab package is presumably only available on Colab —
# confirm before running this cell on a local machine.
from google.colab import drive
drive.mount('/content/gdrive')

## Imports

In [5]:
import numpy as np
import os
import cv2 as cv
import matplotlib.pyplot as plt

from easydict import EasyDict as edict
from tqdm import tqdm

# Marker printed once every import in this cell has resolved without error.
print('imported')

imported


## Constants

In [13]:
# Fix the NumPy seed so that any random partition of the data is identical across machines.
np.random.seed(42)

# Project layout:
#   - the extracted data are images stored inside folders,
#   - weights/ will contain the trained model weights, named weight_{backbone_name}_{optimizer_name},
#   - stats/ is for training statistics (loss, accuracy, ...),
#   - NPZ_DIR holds the same data in compressed NumPy (.npz) format, so the whole dataset can be
#     loaded into memory at once. This matters because reading files directly from Drive is much
#     slower than reading them on a local machine.

DATA_DIR = './data/segmented_data/'  # root of the extracted data: one sub-folder per class
NPZ_DIR = './npz/'  # output folder for the compressed .npz dump

# makedirs(..., exist_ok=True) avoids the check-then-create race of `exists` + `mkdir`
# and also creates missing parent folders.
os.makedirs(NPZ_DIR, exist_ok=True)

# Getting the list of the classes. Only directories count as classes: stray files
# (e.g. .DS_Store, notebook checkpoints) would otherwise inflate the class count and
# break the per-class directory listing done when loading the data.
class_list = sorted(
    entry for entry in os.listdir(DATA_DIR)
    if os.path.isdir(os.path.join(DATA_DIR, entry))
)
print('Number of classes: {}'.format(len(class_list)))

# other parameters
IMG_SIZE = (512, 512)  # 2-D shape every loaded mask is expected to have

Number of classes: 38


## Loading data (masks)

In [15]:
# Collect, for every class, the path of each sequence's 'masks' folder together with its label.
sequences, targets = [], []
for class_name in class_list:
    class_dir = os.path.join(DATA_DIR, class_name)
    # List the class folder once, in sorted order, so that `sequences` and `targets` are built
    # from the very same listing and the result is reproducible across machines/filesystems.
    mask_dirs = [os.path.join(class_dir, rep, 'masks') for rep in sorted(os.listdir(class_dir))]
    # Ignore stray entries (hidden files, incomplete extractions) that have no masks/ folder.
    mask_dirs = [mask_dir for mask_dir in mask_dirs if os.path.isdir(mask_dir)]
    sequences.extend(mask_dirs)
    targets.extend([class_name] * len(mask_dirs))

# Load every sequence as a binary (0/1) uint8 stack of shape (n_frames, *IMG_SIZE),
# keyed by the path of its masks folder.
data = edict()
for sequence, target in zip(tqdm(sequences, position=0, leave=True), targets):
    # os.listdir returns entries in arbitrary order; sort so frames are stacked deterministically.
    # NOTE(review): the sort is lexicographic — assumes frame file names are zero-padded; confirm.
    frame_names = sorted(os.listdir(sequence))
    masks = np.zeros((len(frame_names), *IMG_SIZE), dtype=np.uint8)
    for i, frame_name in enumerate(frame_names):
        frame_path = os.path.join(sequence, frame_name)
        frame = cv.imread(frame_path, cv.IMREAD_GRAYSCALE)
        # cv.imread does not raise on unreadable/non-image files, it returns None.
        if frame is None:
            raise IOError('Could not read mask image: {}'.format(frame_path))
        if frame.shape != IMG_SIZE:
            raise ValueError('Mask {} has shape {}, expected {}'.format(frame_path, frame.shape, IMG_SIZE))
        # Binarise: every non-zero pixel becomes 1.
        masks[i] = frame > 0

    data[sequence] = {'masks': masks, 'target': target}

# The dict is passed positionally, so NumPy stores it as a pickled 0-d object array under the
# key 'arr_0'; reload it with np.load(path, allow_pickle=True)['arr_0'].item().
np.savez_compressed(os.path.join(NPZ_DIR, 'data.npz'), data)
print('Data saved in npz (compressed) format at {}'.format(NPZ_DIR))

100%|██████████| 364/364 [00:57<00:00,  6.29it/s]


Data saved in npz (compressed) format at ./npz/


## Extracting features from masks

In [None]:
extracted_features = edict()

for seq_name, sequence in tqdm(data.items()):
    for idx, mask in enumerate(sequence['masks']):
        