In [1]:
from tqdm import tqdm
import numpy as np
import os
import sys

sys.path.append('../')
from batchers import dataset_constants, batcher
from preprocessing.helper import (get_first_feature_map, get_feature_types, print_scalar_values)

In [2]:
# Band names in the order they are stacked into the image's channel axis.
BAND_ORDER = [
    'BLUE', 'GREEN', 'RED', 'SWIR1', 'SWIR2', 'TEMP1', 'NIR',
    'NIGHTLIGHTS',
]
# Same layout, but with the single nightlights entry split into its two
# named variants.
BAND_ORDER_NLSPLIT = BAND_ORDER[:-1] + ['DMSP', 'VIIRS']

# Key used to select the per-dataset statistics below.
DATASET = '2009-17'

# Constants re-exported from the project's dataset_constants module.
SIZES, SURVEY_NAMES = dataset_constants.SIZES, dataset_constants.SURVEY_NAMES

# Per-band normalization statistics for DATASET, indexed by band name.
MEANS, STD_DEVS = (
    dataset_constants.MEANS_DICT[DATASET],
    dataset_constants.STD_DEVS_DICT[DATASET],
)

In [3]:
# One array of TFRecord paths per split, looked up in train / val / test order.
train_tfrecord_paths, val_tfrecord_paths, test_tfrecord_paths = (
    np.asarray(batcher.get_tfrecord_paths(DATASET, split_name))
    for split_name in ('train', 'val', 'test')
)

In [4]:
def band_keys_for_year(band_keys, year):
    '''
    Swaps the generic nightlights key for the year-specific one.

    Args
    - band_keys: list of str, must contain 'NIGHTLIGHTS'
    - year: numeric

    Returns
    - new list equal to band_keys, except that the first 'NIGHTLIGHTS' entry
        is 'DMSP' when year < 2012 and 'VIIRS' otherwise; band_keys itself is
        left untouched

    Raises
    - ValueError: if 'NIGHTLIGHTS' is not in band_keys
    '''
    if year < 2012:
        nightlights_key = 'DMSP'
    else:
        nightlights_key = 'VIIRS'

    position = band_keys.index('NIGHTLIGHTS')
    renamed = list(band_keys)  # work on a copy so the caller's list is unchanged
    renamed[position] = nightlights_key
    return renamed

def get_image_from_file(path):
    '''
    Builds a normalized image and a metadata vector from the feature map
    returned by get_first_feature_map(path).

    Args
    - path: passed unchanged to get_first_feature_map

    Returns
    - img_normalized: np.array, shape [255, 255, len(BAND_ORDER)], where each
        band has its MEANS entry subtracted and is divided by its STD_DEVS entry
    - details: 1-D np.array built from
        [label, lat, lon, country, year, cluster_index, svyid]
        NOTE: the entries mix floats, ints and strs, so NumPy stores them
        under one common dtype (strings in practice)
    '''
    features = get_first_feature_map(path)

    def first_float(key):
        # first value of a float-list feature
        return features[key].float_list.value[0]

    def first_str(key):
        # first value of a bytes-list feature, decoded to str
        return features[key].bytes_list.value[0].decode()

    label = np.float32(first_float('wealthpooled'))
    lat = np.float32(first_float('lat'))
    lon = np.float32(first_float('lon'))
    country = first_str('country')
    year = int(first_float('year'))
    cluster_index = int(first_float('cluster_index'))
    svyid = first_str('svyid')

    # pixel data is read under the BAND_ORDER names, but the normalization
    # statistics are looked up under the year-specific names ('DMSP'/'VIIRS')
    stat_keys = band_keys_for_year(BAND_ORDER, year)

    def normalized_band(band_name, stat_key):
        raw = np.asarray(features[band_name].float_list.value, dtype=np.float32)
        return (raw.reshape(255, 255) - MEANS[stat_key]) / STD_DEVS[stat_key]

    img_normalized = np.stack(
        [normalized_band(name, key) for name, key in zip(BAND_ORDER, stat_keys)],
        axis=2,
    )
    details = np.array([label, lat, lon, country, year, cluster_index, svyid])
    return img_normalized, details

In [8]:
# First training shard: records 0..5999 of the training paths.
train_imgs1, train_details1 = [], []
for path in tqdm(train_tfrecord_paths[:6000]):
    img, details = get_image_from_file(path)
    train_imgs1.append(img)
    train_details1.append(details)

# Store images and per-record details side by side in one compressed archive.
np.savez_compressed(
    "../data/train1.npz",
    imgs=np.array(train_imgs1),
    details=np.array(train_details1),
)

  0%|          | 0/6000 [00:00<?, ?it/s]2023-12-28 00:24:44.549108: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-12-28 00:24:44.549358: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-12-28 00:24:44.568794: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
  0%|          | 2/6000 [00:00<05:49, 17.14it/s]

Metal device set to: Apple M2

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



 88%|████████▊ | 5306/6000 [03:58<00:31, 22.20it/s]

In [6]:
# Second training shard: everything after the first 6000 training paths.
# The slice is open-ended rather than stopping at a hard-coded index, so no
# record is silently dropped if the size of the training split changes.
train_imgs2 = []
train_details2 = []
for path in tqdm(train_tfrecord_paths[6000:]):
    img, details = get_image_from_file(path)
    train_imgs2.append(img)
    train_details2.append(details)

# Store images and per-record details side by side in one compressed archive.
np.savez_compressed("../data/train2.npz", imgs=np.array(train_imgs2), details=np.array(train_details2))

  0%|          | 0/6319 [00:00<?, ?it/s]2023-12-28 09:01:04.011019: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-12-28 09:01:04.013574: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-12-28 09:01:04.147220: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
  0%|          | 1/6319 [00:00<27:43,  3.80it/s]

Metal device set to: Apple M2

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



100%|██████████| 6319/6319 [04:46<00:00, 22.06it/s]


In [11]:
# Validation split: convert every validation path.
val_imgs, val_details = [], []
for path in tqdm(val_tfrecord_paths):
    img, details = get_image_from_file(path)
    val_imgs.append(img)
    val_details.append(details)

# Store images and per-record details side by side in one compressed archive.
np.savez_compressed(
    "../data/val.npz",
    imgs=np.array(val_imgs),
    details=np.array(val_details),
)

100%|██████████| 3257/3257 [02:22<00:00, 22.79it/s]


In [10]:
# Test split: convert every test path.
test_imgs, test_details = [], []
for path in tqdm(test_tfrecord_paths):
    img, details = get_image_from_file(path)
    test_imgs.append(img)
    test_details.append(details)

# Store images and per-record details side by side in one compressed archive.
np.savez_compressed(
    "../data/test.npz",
    imgs=np.array(test_imgs),
    details=np.array(test_details),
)

100%|██████████| 4093/4093 [03:04<00:00, 22.17it/s]


In [5]:
# Sanity check (currently disabled): reload the first training shard and print its array shapes.
# train = np.load("../data/train1.npz")
# print(np.shape(train['imgs']))
# print(np.shape(train['details']))

(6000, 255, 255, 8)
(6000, 7)
