In [1]:
import os
import glob
import pandas as pd
import numpy as np
import rasterio as rio
import concurrent.futures
import tqdm

In [2]:
# Root directory that holds one sub-folder of tiles per city.
DIR_FILES = '/mnt/mount-point-directory/datasets2/'
# Fraction of the filtered, shuffled rows that goes to the training split.
TRAIN_FRAC = 0.8
# Base directory under which the CSV outputs are written.
SAVE_DIR = './data/'

# Sub-folder of SAVE_DIR that receives train.csv, val.csv and cities.csv.
folder_name = 'data_2'

In [3]:
# City sub-folders of DIR_FILES whose tiles are indexed by this notebook.
cities = ['burnaby_5_percent', 'surrey-newton-2021']
# cities = ['birmingham_5_percent']

In [4]:
# Empty index table; the per-city loop adds one row per tile with the image path and its three mask paths.
df_2 = pd.DataFrame(columns = ['city', 'path_img', 'path_road_mask', 'path_driveway_mask', 'path_building_fp'])

In [5]:
def add_entry(df_, city_name, path_imgs, path_road_masks, path_driveway_masks, path_building_fps):
    """
    Return ``df_`` extended with one row per tile of ``city_name``.

    The four path sequences are paired element-wise (``zip``), so the i-th
    image is stored next to the i-th road mask, driveway mask and building
    footprint. If the sequences differ in length, the extra trailing entries
    of the longer ones are ignored.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Table to extend; it is not modified.
    city_name : str
        Value written to the ``city`` column of every new row.
    path_imgs, path_road_masks, path_driveway_masks, path_building_fps : iterable of str
        File paths for the ``path_img``, ``path_road_mask``,
        ``path_driveway_mask`` and ``path_building_fp`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df_`` itself when there is nothing to add, otherwise a new frame
        with a fresh 0..n-1 index.
    """
    records = [
        {'city': city_name,
         'path_img': path_img,
         'path_road_mask': path_road_mask,
         'path_driveway_mask': path_driveway_mask,
         'path_building_fp': path_building_fp}
        for path_img, path_road_mask, path_driveway_mask, path_building_fp
        in zip(path_imgs, path_road_masks, path_driveway_masks, path_building_fps)
    ]
    if not records:
        return df_
    # DataFrame.append was removed in pandas 2.0; building all rows first and
    # concatenating once also avoids copying the whole frame for every row.
    return pd.concat([df_, pd.DataFrame(records)], ignore_index=True)

In [6]:
for city in cities:
    # One alphabetically sorted list of .jp2 tiles per sub-folder; rows are later
    # paired by position, which presumably relies on every sub-folder holding
    # identically named tiles -- TODO confirm.
    path_imgs, path_road_masks, path_driveway_masks, path_building_fps = (
        sorted(glob.glob(os.path.join(DIR_FILES, city, sub_dir, 'tiles', '*.jp2')))
        for sub_dir in ('VRT_driveway', 'mask_road', 'mask_driveway', 'mask_building_fp')
    )

    df_2 = add_entry(df_2, city, path_imgs, path_road_masks, path_driveway_masks, path_building_fps)

In [8]:
# Number of indexed tiles per city, displayed as a one-column table.
x = pd.DataFrame(df_2['city'].value_counts())
x

Unnamed: 0,city
surrey-newton-2021,183
burnaby_5_percent,20


In [13]:
df_2.head()

Unnamed: 0,city,path_img,path_road_mask,path_driveway_mask,path_building_fp
0,burnaby_5_percent,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...
1,burnaby_5_percent,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...
2,burnaby_5_percent,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...
3,burnaby_5_percent,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...
4,burnaby_5_percent,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...,/mnt/mount-point-directory/datasets2/burnaby_5...


In [14]:
def remove_empty_files(index_):
    """
    Tell whether the row at position ``index_`` of ``df_2`` should be dropped.

    Despite its name, this function removes nothing. It reads the driveway
    mask and the image referenced by the row and returns ``True`` when either
    raster contains only zeros, ``False`` otherwise.

    Parameters
    ----------
    index_ : int
        Positional (``iloc``) index of the row in the global ``df_2``.

    Returns
    -------
    bool
        ``True`` if the driveway mask or the image is all zeros.
    """
    # Column used as the filter criterion; change it to filter on another mask.
    path_file = df_2['path_driveway_mask'].iloc[index_]
    # Context managers close the datasets; without them every call leaks two
    # open file handles, which adds up when mapped over the whole table.
    with rio.open(path_file) as src_mask:
        mask_has_data = src_mask.read().any()

    path_file_img = df_2['path_img'].iloc[index_]
    with rio.open(path_file_img) as src_img:
        img_has_data = src_img.read().any()

    return not mask_has_data or not img_has_data

In [15]:
# Check every row of df_2 in parallel and store the result in a 'drop' column.
# executor.map yields results in input order, so the flags line up with the rows.
n_rows = len(df_2)
with concurrent.futures.ThreadPoolExecutor() as executor:
    # total= lets tqdm show a percentage and an ETA instead of a bare counter.
    df_2['drop'] = list(tqdm.tqdm(executor.map(remove_empty_files, range(n_rows)), total=n_rows))

203it [04:25,  1.31s/it]


In [26]:
# Keep only the rows that were not flagged for dropping.
keep_mask = df_2['drop'].eq(False)
df_2D = df_2.loc[keep_mask]

In [27]:
# Fixed seed so that the shuffle, and therefore the train/val split, is the
# same on every run of the notebook.
SHUFFLE_SEED = 42
df_shuffled = df_2D.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [28]:
# Number of rows in the training split.
train_split = int(len(df_shuffled) * TRAIN_FRAC)
# Positional slicing: .loc[:train_split] is label-inclusive and would put
# train_split + 1 rows into the training set.
df_train = df_shuffled.iloc[:train_split]
df_val = df_shuffled.iloc[train_split:]

In [29]:
def find_entry(df_, city_name):
    """
    Select the rows of ``df_`` whose ``city`` column equals ``city_name``.

    The matching rows are returned as a dataframe with their original index
    labels; ``df_`` itself is left untouched.
    """
    is_requested_city = df_['city'] == city_name
    return df_.loc[is_requested_city]

In [30]:
def add_probability(df_):
    """
    Return a copy of ``df_`` with a ``probability`` column added.

    Each row gets ``1 / (number of rows in df_ that share its city)``, so the
    values of every city sum to 1.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Table with a ``city`` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra float column ``probability``.
    """
    city_counts = df_['city'].value_counts()
    # Look the counts up by label. The previous `v.loc[row][0]` indexed a
    # label-indexed Series by integer position, which pandas has deprecated.
    probability = 1.0 / df_['city'].map(city_counts)
    # Assign by position so that the result does not depend on index alignment.
    return df_.assign(probability=probability.to_numpy())

In [31]:
# Attach the per-city sampling probability to both splits; each split is
# weighted using its own city counts.
df_train, df_val = (add_probability(split) for split in (df_train, df_val))

In [34]:
# Create the output folder if needed; exist_ok avoids the separate
# check-then-create step and is safe when the folder already exists.
out_dir = os.path.join(SAVE_DIR, folder_name)
os.makedirs(out_dir, exist_ok=True)

# index_label=False leaves the index column without a header field.
df_train.to_csv(os.path.join(out_dir, 'train.csv'), index_label=False)
df_val.to_csv(os.path.join(out_dir, 'val.csv'), index_label=False)

In [35]:
# Store the list of indexed cities next to the train/val CSV files.
df_cities = pd.DataFrame({'cities': cities})
cities_csv_path = os.path.join(SAVE_DIR, folder_name, 'cities.csv')
df_cities.to_csv(cities_csv_path, index_label=False)