In [1]:
%load_ext autoreload
%autoreload 2

# change directory to repo root, and verify
%cd '../'
!pwd

/cephyr/users/markpett/Alvis/satellite_poverty_prediction
/cephyr/users/markpett/Alvis/satellite_poverty_prediction


In [2]:
import data_handling
import numpy as np
import tensorflow as tf
import json
import os
import pickle

data_dir = '/mimer/NOBACKUP/groups/globalpoverty1/data'
fold_config = 'incountry'
model_fold = 'A'

2022-10-17 09:41:24.684497: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0


In [3]:
# get all tfrecords
tfrecord_files = np.asarray(data_handling.create_full_tfrecords_paths(data_dir))

# get train, val, test fold
folds_file_path = os.path.join(data_dir, 'folds', fold_config + '.pkl')
with open(folds_file_path, 'rb') as pickle_file:
    cv_indices = pickle.load(pickle_file)

# get band stats
stats_file_path = os.path.join(data_dir, 'band_stats', fold_config + '.json')
with open(stats_file_path) as band_stats_file:
    all_folds_band_stats = json.load(band_stats_file)
band_stats = all_folds_band_stats[model_fold]

In [11]:
def get_hist(img, label):
    bands = tf.reshape(img, (224 * 224, 8)).numpy()
    img_hist = np.apply_along_axis(lambda band: np.histogram(band, bins=band_bin_edges)[0], 0, bands).T
    return img_hist, label.numpy()

def get_hists(ds):
    band_bin_edges = np.concatenate([
        [-1e5],
        np.arange(0, 1, 0.01),
        [1e5]
    ])

    img_hists = []
    labels = []
    sample_indices = []

    for x, y in ds.take(100):
        i = x['frame_index']
        img = x['model_input'][i]
        img_hist, label = get_hist(img, y)
        img_hists.append(img_hist)
        labels.append(label)
        sample_indices.append(x['sample_index'])
        
    img_hists = np.stack(img_hists)
    labels = np.asarray(labels)
    sample_indices = np.asarray(sample_indices)
    
    return img_hists, labels, sample_indices

def save_fold_hists(save_dir, fold, hists, labels, indices):
    # Create save_dir if it doesn't exist
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Create dir for fold
    fold_save_dir = os.path.join(save_dir, fold)
    os.makedirs(fold_save_dir)

    # Save variables
    hists_path = os.path.join(fold_save_dir, 'hists.npy')
    with open(hists_path, 'wb') as f:
        np.save(f, hists)
    print(f'Saved histograms with to {hists_path}')

    labels_path = os.path.join(fold_save_dir, 'labels.npy')
    with open(labels_path, 'wb') as f:
        np.save(f, labels)
    print(f'Saved labels with shape {labels.shape} to {labels_path}')

    indices_path = os.path.join(fold_save_dir, 'indices.npy')
    with open(indices_path, 'wb') as f:
        np.save(f, indices)
    print(f'Saved indices with shape {indices.shape} to {indices_path}')

In [14]:
save_dir = os.path.join(data_dir, 'hists', fold_config, model_fold) #'/cephyr/NOBACKUP/groups/globalpoverty1/data/hists/incountry'

for fold in 'ABCDE':
    fold_indices = cv_indices[fold]['test']
    fold_files = tfrecord_files[fold_indices]

    ds = data_handling.get_inference_dataset(fold_files, batch_size=0, 
                        band_stats=band_stats, labeled=True)
    
    hists, labels, sample_indices = get_hists(ds)
    
    save_fold_hists(save_dir, fold, hists, labels, sample_indices)

Saved histograms with to /mimer/NOBACKUP/groups/globalpoverty1/data/hists/incountry/A/A/hists.npy
Saved labels with shape (100, 1) to /mimer/NOBACKUP/groups/globalpoverty1/data/hists/incountry/A/A/labels.npy
Saved indices with shape (100,) to /mimer/NOBACKUP/groups/globalpoverty1/data/hists/incountry/A/A/indices.npy
Saved histograms with to /mimer/NOBACKUP/groups/globalpoverty1/data/hists/incountry/A/B/hists.npy
Saved labels with shape (100, 1) to /mimer/NOBACKUP/groups/globalpoverty1/data/hists/incountry/A/B/labels.npy
Saved indices with shape (100,) to /mimer/NOBACKUP/groups/globalpoverty1/data/hists/incountry/A/B/indices.npy
Saved histograms with to /mimer/NOBACKUP/groups/globalpoverty1/data/hists/incountry/A/C/hists.npy
Saved labels with shape (100, 1) to /mimer/NOBACKUP/groups/globalpoverty1/data/hists/incountry/A/C/labels.npy
Saved indices with shape (100,) to /mimer/NOBACKUP/groups/globalpoverty1/data/hists/incountry/A/C/indices.npy
Saved histograms with to /mimer/NOBACKUP/group

In [29]:
# ds = data_handling.get_inference_dataset(tfrecord_files, batch_size=0, normalize=False, labeled=True)

fold = 'A'
fold_indices = cv_indices[fold]['test']
fold_files = tfrecord_files[fold_indices]

ds = data_handling.get_inference_dataset(fold_files, batch_size=0, 
                    band_stats=band_stats, labeled=True)

In [53]:
band_bin_edges = np.concatenate([
    [-1e5],
    np.arange(0, 1, 0.01),
    [1e5]
])

def get_hist(img, label):
    bands = tf.reshape(img, (224 * 224, 8)).numpy()
    img_hist = np.apply_along_axis(lambda band: np.histogram(band, bins=band_bin_edges)[0], 0, bands).T
    return img_hist, label.numpy()

img_hists = []
labels = []
sample_indices = []

for x, y in ds.take(100):
    i = x['frame_index']
    img = x['model_input'][i]
    img_hist, label = get_hist(img, y)
    img_hists.append(img_hist)
    labels.append(label)
    sample_indices.append(x['sample_index'])

In [54]:
img_hists = np.stack(img_hists)
labels = np.asarray(labels)
sample_indices = np.asarray(sample_indices)

In [57]:
tf.math.reduce_mean(img_hists, axis=0)

<tf.Tensor: shape=(8, 101), dtype=int64, numpy=
array([[    0,   249,  7713, 20993, 11282,  6546,  2145,   619,   450,
          140,    28,     3,     1,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [    0,     0,   434,  6997, 18628,  8944,  6138,  4988,  2347,
          586,   375,   270,   221,   169,    44,    23,     5,     0,
      

In [50]:
def save_fold_hists(save_dir, fold, hists, labels, indices):
    # Create save_dir if it doesn't exist
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Create dir for fold
    fold_save_dir = os.path.join(save_dir, fold)
    os.makedirs(fold_save_dir)

    # Save variables
    hists_path = os.path.join(fold_save_dir, 'hists.npy')
    with open(hists_path, 'wb') as f:
        np.save(f, hists)
    print(f'Saved histograms with to {hists_path}')

    labels_path = os.path.join(fold_save_dir, 'labels.npy')
    with open(labels_path, 'wb') as f:
        np.save(f, labels)
    print(f'Saved labels with shape {labels.shape} to {labels_path}')

    indices_path = os.path.join(fold_save_dir, 'indices.npy')
    with open(indices_path, 'wb') as f:
        np.save(f, indices)
    print(f'Saved indices with shape {indices.shape} to {indices_path}')

[<tf.Tensor: shape=(), dtype=int32, numpy=1>,
 <tf.Tensor: shape=(), dtype=int32, numpy=3>,
 <tf.Tensor: shape=(), dtype=int32, numpy=9>,
 <tf.Tensor: shape=(), dtype=int32, numpy=16>]

In [45]:
img_hist

array([[    0,     0,     0, 15026, 32290,  2846,    14,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [    0,     0,     0,     7, 12534, 30997,  5841,   790,     7,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     

In [None]:
# Get and save features for all folds
for fold in eval_folds:
    print(f'Processing fold {fold}...')
    fold_indices = cv_indices[fold]['test']
    fold_files = tfrecord_files[fold_indices]
    fold_ds = data_handling.get_inference_dataset(fold_files, batch_size=16, 
                band_stats=band_stats, labeled=True)
    if single_frame_model:
        fold_ds = mask_single_frame_ds(fold_ds)

    features, labels, weights, sample_indices = get_fold_features(feat_model, fold_ds, single_frame_model, ten_frame_model)
    save_fold_features(save_dir, fold, features, labels, weights, sample_indices)