<a href="https://colab.research.google.com/github/DataScienceAndEngineering/deep-learning-final-project-project-sidewalk/blob/main/notebooks/cityscape_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import PIL
import imageio as iio
import numpy as np
import shutil
from skimage.transform import resize
import sklearn.model_selection
import subprocess
import pandas as pd
import time

# NOTE: this notebook takes ~2 hours to run and download the processed files.

In [None]:
### Obtaining raw data from google drive

from google.colab import drive
from google.colab import files as gfiles
# Mount Google Drive so the raw dataset archives are reachable under /content/drive
drive.mount('/content/drive')

# Create the extraction target; exist_ok keeps this cell safe to re-run and
# avoids the check-then-create race of isdir() followed by mkdir()
os.makedirs('/content/rawdata/', exist_ok=True)

def unzip(src, dst, preview_console=False):
  """Extract the zip archive 'src' into the 'dst' directory.

  Runs the external ``unzip`` tool with ``-n`` (never overwrite existing
  files) and ``-q`` (quiet).

  Args:
    src: path of the zip archive.
    dst: directory to extract into.
    preview_console: if True, the command is echoed to the console first.

  Returns:
    None. A non-zero exit status of ``unzip`` is reported on the console
    but does not raise.
  """
  import shlex  # local import: only needed to render the command preview

  name = src.split('/')[-1]
  print(f'decompressing {name}')
  # Argument list instead of a shell string: paths containing spaces or shell
  # metacharacters are passed through verbatim and nothing is shell-interpreted.
  cmd = ['unzip', '-n', '-q', src, '-d', dst]
  if preview_console:
    print(shlex.join(cmd))
  result = subprocess.run(cmd)
  if result.returncode != 0:
    print(f'warning: unzip exited with status {result.returncode} for {name}')

def check_dir_path(path, gen=False):
  """Normalize a directory path so that it ends with '/'.

  Args:
    path: directory path, absolute or relative, using '/' separators.
    gen: if True, every missing directory along 'path' is created
      (one 'generating: ...' line is printed per directory created).

  Returns:
    'path' with a trailing '/' appended when it was missing.

  Raises:
    ValueError: if 'path' is empty.
  """
  if not path:
    raise ValueError('check_dir_path requires a non-empty path')
  if not path.endswith('/'):
    path += '/'
  if gen:
    parts = path.split('/')
    # Start at 1 so the first component of a relative path is created too;
    # for absolute paths the first prefix is '' (the root) and is skipped.
    for depth in range(1, len(parts)):
      prefix = '/'.join(parts[:depth])
      if prefix and not os.path.isdir(prefix):
        print(f'generating: {prefix}')
        os.mkdir(prefix)
  return path

# Decompress each raw archive from Drive, skipping any whose extracted
# directory is already present under /content/rawdata/
for _extracted_dir, _archive in (
    ('/content/rawdata/leftImg8bit/', '/content/drive/MyDrive/tensorflow_datasets/downloads/manual/leftImg8bit_trainvaltest.zip'),
    ('/content/rawdata/gtFine/', '/content/drive/MyDrive/tensorflow_datasets/downloads/manual/gtFine_trainvaltest.zip'),
    ('/content/rawdata/disparity/', '/content/drive/MyDrive/tensorflow_datasets/downloads/manual/disparity_trainvaltest.zip'),
):
  if not os.path.isdir(_extracted_dir):
    unzip(_archive, '/content/rawdata/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
from scipy.spatial import distance_matrix
### Data Extraction (finding samples with sidewalks)

def find_sidewalks(labeldir):
  """Find label images that contain at least one sidewalk pixel.

  Walks '<labeldir>/<train|val|test>/<subdir>/' and inspects every file whose
  name ends with 'labelIds.png'.

  Args:
    labeldir: root directory of the label images.

  Returns:
    List of paths relative to 'labeldir', formatted '<subset>/<subdir>/<file>',
    in sorted order within each subset.
  """
  # `import PIL` by itself does not load the Image submodule; import it
  # explicitly so this does not depend on another library having done so.
  from PIL import Image

  sidewalk_id = 8  # label id used for 'sidewalk' elsewhere in this notebook
  labeldir = check_dir_path(labeldir)
  files = []
  for group in ('train', 'val', 'test'):
    group_dir = check_dir_path(f'{labeldir}{group}')
    # sorted() makes the result order reproducible across runs/filesystems
    for k in sorted(os.listdir(group_dir)):
      sub_dir = check_dir_path(f'{group_dir}{k}')
      if not os.path.isdir(sub_dir):
        continue  # stray non-directory entry; nothing to scan
      for f in sorted(os.listdir(sub_dir)):
        if not f.endswith('labelIds.png'):
          continue
        # Context manager closes the file handle promptly for each image
        with Image.open(sub_dir + f) as label_img:
          has_sidewalk = bool(np.any(np.asarray(label_img) == sidewalk_id))
        if has_sidewalk:
          files.append(f'{group}/{k}/{f}')
  print(f'Identified {len(files)} samples containing sidewalks')
  return files

def grab_files(src, dst, files, purge=False):
  """Move the listed files from the 'src' tree into the flat 'dst' directory.

  Args:
    src: source root directory; each entry of 'files' is relative to it.
    dst: destination directory, created if necessary. Files are placed
      directly in it under their base name (sub-directories are dropped).
    files: iterable of '/'-separated paths relative to 'src'.
    purge: if True, the whole 'src' directory is deleted afterwards,
      including any files that were not moved.
  """
  src = check_dir_path(src)
  dst = check_dir_path(dst, gen=True)
  print(f'Moving {len(files)} files from {src} to {dst}')
  for f in files:
    name = f.split('/')[-1]
    # shutil.move falls back to copy+delete when 'src' and 'dst' live on
    # different filesystems, where a plain os.rename raises OSError.
    shutil.move(f'{src}{f}', f'{dst}{name}')
    # NOTE(review): per-file pause kept from the original; purpose is not
    # evident from this file -- confirm whether it is still needed.
    time.sleep(0.01)
  if purge:
    shutil.rmtree(src)

def download_dir(src, dst, name, preview_console=False):
  """Zip the 'src' directory to '<dst><name>.zip' and trigger a download.

  Args:
    src: directory to archive (recursively).
    dst: directory in which the zip file is written.
    name: archive base name, without the '.zip' extension.
    preview_console: if True, the zip command is echoed to the console first.

  Returns:
    None. A non-zero exit status of ``zip`` is reported on the console.
  """
  import shlex  # local import: only needed to render the command preview

  src = check_dir_path(src)
  dst = check_dir_path(dst)
  archive = f'{dst}{name}.zip'
  # Argument list instead of a shell string so paths containing spaces or
  # shell metacharacters are passed to zip unchanged.
  cmd = ['zip', '-r', archive, src]
  if preview_console:
    print(shlex.join(cmd))
  result = subprocess.run(cmd)
  if result.returncode != 0:
    print(f'warning: zip exited with status {result.returncode} for {archive}')
  gfiles.download(archive)

In [4]:
### Extracting Data
def main_extract(label_dir='/content/rawdata/gtFine/', image_dir='/content/rawdata/leftImg8bit/', depth_dir='/content/rawdata/disparity/', purge=False):
  """Move every sample that contains a sidewalk into /content/extracted/.

  The sample list comes from find_sidewalks(label_dir); the matching image and
  disparity file names are derived by substituting the 'gtFine_labelIds' part
  of each label file name.

  Args:
    label_dir: root of the label images.
    image_dir: root of the camera images.
    depth_dir: root of the disparity images.
    purge: forwarded to grab_files for each of the three source roots.
  """
  files_label = find_sidewalks(label_dir)
  print(f'Extracting {len(files_label)} samples...')
  # (source root, destination, replacement for 'gtFine_labelIds' or None)
  moves = (
      (label_dir, '/content/extracted/labels', None),
      (image_dir, '/content/extracted/images', 'leftImg8bit'),
      (depth_dir, '/content/extracted/disparity', 'disparity'),
  )
  for source_root, target, token in moves:
    if token is None:
      selected = files_label
    else:
      selected = [path.replace('gtFine_labelIds', token) for path in files_label]
    grab_files(source_root, target, selected, purge=purge)

In [5]:
from pandas._libs.lib import fast_unique_multiple_list_gen
## Data Processing and Exporting
def process_images(src, dst, shape=(256, 512, 3), purge=False):
  """Resize every image in 'src' to 'shape' and save it as a NumPy array.

  Args:
    src: directory holding the extracted images.
    dst: output directory, created if necessary.
    shape: target array shape passed to resize().
    purge: if True, the 'src' directory is deleted once all images have been
      processed (same meaning as in grab_files).

  Output files are written with np.save under the input name minus its last
  16 characters, plus '.png' (presumably stripping '_leftImg8bit.png' --
  confirm against the input naming). np.save appends '.npy' to that name.
  """
  print('Starting image processing...')
  src = check_dir_path(src)
  dst = check_dir_path(dst, gen=True)
  files = os.listdir(src)
  print(f'found {len(files)} images')
  for count, f in enumerate(files, start=1):
    image = resize(iio.imread(src + f), shape)
    np.save(f'{dst}{f[:-16]}.png', image)
    if count % 500 == 0:
      print(f'{count}/{len(files)}')
  print(f'Image processing completed, processed {len(files)} files')
  # Previously 'purge' was accepted but silently ignored
  if purge:
    shutil.rmtree(src)

def process_masks(src, dst, shape=(256, 512, 1), id=8, name='sidewalk', purge=False):
  """Build resized binary masks for one label id from every label image in 'src'.

  Args:
    src: directory holding the extracted label images.
    dst: parent output directory; masks are written to '<dst><name>/'.
    shape: target array shape passed to resize().
    id: label id to keep; pixels with this id become 1, all others 0.
    name: sub-directory name for this label class.
    purge: accepted for signature symmetry but not acted on here, because
      main_process calls this function repeatedly on the same 'src' and
      removes the directory itself afterwards.

  Output files are uint8 arrays written with np.save under the input name
  minus its last 20 characters, plus '.png' (presumably stripping
  '_gtFine_labelIds.png' -- confirm). np.save appends '.npy' to that name.
  """
  src = check_dir_path(src)
  dst = check_dir_path(dst)+name
  dst = check_dir_path(dst, gen=True)
  files = os.listdir(src)
  print(f'found {len(files)} masks')
  i = 0
  for f in files:
    labels = iio.imread(src+f)
    # Binarize BEFORE resizing. With its defaults resize() rescales integer
    # images to floats in [0, 1] and interpolates, so comparing the resized
    # image against the integer label id never matches (all-zero masks).
    binary = (np.asarray(labels) == id).astype('uint8')
    # Nearest-neighbour, no anti-aliasing, original value range: the result
    # stays strictly 0/1.
    mask = resize(binary, shape, order=0, preserve_range=True, anti_aliasing=False)
    mask = mask.astype('uint8')
    f = f[:-20]+'.png'
    np.save(dst+f, mask)
    # NOTE(review): per-file pause kept from the original; purpose is not
    # evident from this file -- confirm whether it is still needed.
    time.sleep(0.01)
    i += 1
  print(f'{name} mask processing completed, processed {i} files')

def process_depth(src, dst, shape=(256, 512, 1), purge=False):
  """Resize every disparity image in 'src' and scale it by its own maximum.

  Args:
    src: directory holding the extracted disparity images.
    dst: output directory, created if necessary.
    shape: target array shape passed to resize().
    purge: if True, the 'src' directory is deleted once all files have been
      processed (same meaning as in grab_files).

  Each map is divided by its own maximum value; a map whose maximum is 0 is
  saved unchanged instead of being turned into NaNs. Output files are written
  with np.save under the input name minus its last 14 characters, plus '.png'
  (presumably stripping '_disparity.png' -- confirm). np.save appends '.npy'.
  """
  print('Starting depth processing...')
  src = check_dir_path(src)
  dst = check_dir_path(dst, gen=True)
  files = os.listdir(src)
  print(f'found {len(files)} depth masks')
  i = 0
  for f in files:
    depth = iio.imread(src+f)
    depth = resize(depth, shape)
    peak = np.max(depth)
    # Guard against division by zero for an all-zero map
    if peak > 0:
      depth = depth/peak
    f = f[:-14]+'.png'
    np.save(dst+f, depth)
    # NOTE(review): per-file pause kept from the original; purpose is not
    # evident from this file -- confirm whether it is still needed.
    time.sleep(0.01)
    i+=1
  print(f'Depth processing completed, processed {i} files')
  # Previously 'purge' was accepted but silently ignored
  if purge:
    shutil.rmtree(src)

def main_process(image_dir='/content/extracted/images/', mask_dir='/content/extracted/labels/', depth_dir='/content/extracted/disparity/', dst='/content/processed/', purge=False):
  """Run image, mask and depth preprocessing on the extracted files.

  Results are written to '<dst>images/', '<dst>masks/<class name>/' and
  '<dst>depth/'. Finishes by printing how many files each output holds
  (the mask count is taken from the 'pole' class directory).

  Args:
    image_dir: directory of extracted images.
    mask_dir: directory of extracted label images.
    depth_dir: directory of extracted disparity images.
    dst: root output directory.
    purge: forwarded to process_images / process_depth; when True, 'mask_dir'
      is also removed here once every mask class has been written.
  """
  print('Starting processing...')
  dst = check_dir_path(dst)

  images_out = check_dir_path(dst+'images/', gen=True)
  process_images(image_dir, images_out, purge=purge)

  # Label ids to extract into per-class masks; extend as needed
  label_names = {
      8:'sidewalk',
      11:'building',
      12:'wall',
      13:'fence',
      14:'guard_rail',
      15:'bridge',
      16:'tunnel',
      21:'vegetation',
      17:'pole',
      18:'polegroup'}
  masks_out = check_dir_path(dst+'masks/', gen=True)
  print('Starting mask processing...')
  for label_id, label_name in label_names.items():
    process_masks(src=mask_dir, dst=masks_out, id=label_id, name=label_name)
  if purge:
    shutil.rmtree(mask_dir)

  depth_out = check_dir_path(dst+'depth/', gen=True)
  process_depth(src=depth_dir, dst=depth_out, purge=purge)

  # Collect all three counts before printing any of them
  n_images = len(os.listdir(dst+'images/'))
  n_masks = len(os.listdir(dst+'masks/pole/'))
  n_depth = len(os.listdir(dst+'depth/'))
  print(f'{n_images} images post-processing')
  print(f'{n_masks} masks post-processing (pole directory)')
  print(f'{n_depth} depth images post-processing')

In [None]:
### Fix to allow for splitting into data partitions
def move_subdir(src, files, subdir):
  """Move the listed files from 'src' into its sub-directory 'subdir'.

  Args:
    src: directory currently holding the files.
    files: sequence of file names; the FIRST entry is skipped.
    subdir: name of the sub-directory of 'src' to move into (created if
      necessary).

  Files that cannot be moved (for example because they are not present in
  this directory) are skipped silently.
  """
  src = check_dir_path(src)
  dst = check_dir_path(src+subdir, gen=True)
  # NOTE(review): the first entry is skipped -- presumably a header row read
  # as data by the caller (read_csv with header=None); confirm.
  for f in files[1:]:
    try:
      os.rename(f'{src}{f}', f'{dst}{f}')
    except OSError:
      # Best-effort move: skip files that cannot be renamed, but no longer
      # swallow KeyboardInterrupt or unrelated programming errors.
      continue

def split_dir(src, dict_path):
  """Distribute the files in 'src' into 'val', 'test' and 'train' sub-directories.

  For each partition the file list is taken from column 1 of
  '<dict_path><partition>.csv' (read without a header row) and handed to
  move_subdir.

  Args:
    src: directory whose files are to be partitioned.
    dict_path: directory containing val.csv, test.csv and train.csv.
  """
  src = check_dir_path(src)
  dict_path = check_dir_path(dict_path)
  for partition in ('val', 'test', 'train'):
    split_table = pd.read_csv(f'{dict_path}{partition}.csv', header=None)
    members = list(split_table[1])
    move_subdir(src, members, partition)

def apply_split(src='/content/processed/', dict_path='/content/drive/MyDrive/tensorflow_datasets/', download=False):
  """Apply the train/val/test split to every processed output directory.

  Each sub-directory of 'src' is partitioned with split_dir; for 'masks' the
  split is applied to each of its class sub-directories instead.

  Args:
    src: root of the processed data.
    dict_path: directory containing the split CSV files used by split_dir.
    download: if True, each sub-directory of 'src' is zipped to
      '/content/drive/MyDrive/<subdir>.zip' and downloaded.
  """
  src = check_dir_path(src)
  # Only directories can be split; ignore stray files in the root
  subdirs = [d for d in os.listdir(src) if os.path.isdir(src + d)]
  for sub in subdirs:
    sub_path = check_dir_path(src + sub)
    if sub == 'masks':
      for mask_class in os.listdir(sub_path):
        split_dir(check_dir_path(sub_path + mask_class), dict_path)
    else:
      split_dir(sub_path, dict_path)
  if download:
    for sub in subdirs:
      # Archive the individual sub-directory. The original passed the root
      # 'src' here, zipping the entire processed tree once per sub-directory.
      download_dir(check_dir_path(src + sub), '/content/drive/MyDrive/', sub)
#def split_files():
#  files = os.listdir('/content/processed/images/')
#  print(f'splitting total of {len(files)}')
#  train, test_tmp = sklearn.model_selection.train_test_split(files, test_size=.2, train_size=.8, shuffle=True)
#  test, val = sklearn.model_selection.train_test_split(test_tmp, test_size=.5, train_size=.5) 

In [None]:
if __name__ == "__main__":
  # Full pipeline, in dependency order:
  #   1. move the samples that contain sidewalks into /content/extracted/
  #   2. preprocess images, masks and depth into /content/processed/
  #   3. apply the train/val/test split, then zip and download the results
  main_extract()
  main_process()
  apply_split(download=True)