In [1]:
import sys
import os
sys.path.insert(0, os.path.abspath('..'))
from os.path import expanduser

os.environ["TMPDIR"] = "/data/tmp/rv"

%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv
load_dotenv()

from google.cloud import storage
from project_config import GCP_PROJECT_NAME, DATASET_JSON_PATH

gcp_client = storage.Client(project=GCP_PROJECT_NAME)



In [2]:
import numpy as np
from rastervision.core.data import RasterioSource
from utils.schemas import ObservationPointer
from utils.data_management import observation_factory

## Proportion between mine and non-mine areas

In [3]:
from experiment_configs.configs import *
config = satmae_large_config


In [4]:
from torch.utils.data import ConcatDataset
import json
from utils.rastervision_pipeline import observation_to_scene, scene_to_training_ds, scene_to_validation_ds, scene_to_inference_ds
from utils.data_management import observation_factory, characterize_dataset
import random

#set the seed
random.seed(13)

# get the current working directory
root_dir = os.getcwd()

# define the relative path to the dataset JSON file
json_rel_path = '../' + DATASET_JSON_PATH

# combine the root directory with the relative path
json_abs_path = os.path.join(root_dir, json_rel_path)

dataset_json = json.load(open(json_abs_path, 'r'))
all_observations = observation_factory(dataset_json)

#find the highest cluster id
max_cluster_id = max([observation['cluster_id'] for observation in dataset_json])


# Randomly split the data into training and validation
val_split = random.randint(0, max_cluster_id+1)

training_scenes = []
validation_scenes = []

scene_sizes = []
scalar = 100 / 1e6 # 10m x 10m / 10,000 m^2 = Sq km

for observation in all_observations:
    if observation.cluster_id == val_split: 
        validation_scenes.append(observation_to_scene(config, observation))
        h,w = validation_scenes[-1].label_source.get_label_arr().shape
        id = validation_scenes[-1].id
        scene_dict = {'id': id, 'size' : h*w*scalar}
        scene_sizes.append(scene_dict)

        print(validation_scenes[-1].id, h, w)
    else:
        training_scenes.append(observation_to_scene(config, observation))
        h,w = training_scenes[-1].label_source.get_label_arr().shape
        id = training_scenes[-1].id
        scene_dict = {'id': id, 'size' : h*w*scalar}
        scene_sizes.append(scene_dict)


    


training_datasets = [
    scene_to_training_ds(config, scene) for scene in training_scenes #random window sampling happens here
]
validation_datasets = [
    # scene_to_validation_ds(config, scene) for scene in validation_scenes
    scene_to_inference_ds(config, scene, full_image=False, stride=int(config.tile_size/2)) for scene in validation_scenes # better performance with this
]

train_dataset_merged = ConcatDataset(training_datasets)
val_dataset_merged = ConcatDataset(validation_datasets)

print('Validation split cluster_id:', val_split)
print ('Training dataset size: {:4d} images | Number of observations: {:}'.format(len(train_dataset_merged), len(training_scenes)))
print ('Testing dataset size: {:4d}  images | Number of observations: {:}'.format(len(val_dataset_merged), len(validation_scenes)))

mine_percentage_aoi = characterize_dataset(training_scenes, validation_scenes)

print(scene_sizes)

2024-02-18 10:07:05:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/s2/Kathajodi_Cuttack_85-85_20-44_2022-05-01_s2.tif to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/s2/Kathajodi_Cuttack_85-85_20-44_2022-05-01_s2.tif...
2024-02-18 10:07:05:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/annotations/Kathajodi_Cuttack_85-85_20-44_2022-05-01_annotations.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/annotations/Kathajodi_Cuttack_85-85_20-44_2022-05-01_annotations.geojson...
2024-02-18 10:07:05:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_media

Ken_Banda_80-35_25-68_2022-06-01 451 572


2024-02-18 10:07:07:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Ken_Banda_80-35_25-68_median/rivers/Ken_Banda_80-35_25-68_rivers_1000m.geojson.
2024-02-18 10:07:07:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}}) specified in https://storage.googleapis.com/sand_mining_median/labels/Ken_Banda_80-35_25-68_median/rivers/Ken_Banda_80-35_25-68_rivers_1000m.geojson and assuming EPSG:4326 instead.
2024-02-18 10:07:07:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Narmada_Sehore_77-32_22-56_median/s2/Narmada_Sehore_77-32_22-56_2022-01-01_s2.tif to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Narmada_Sehore_77-32_22-56_median/s2/Narmada_Sehore_77-32_22-56_2022-01-01_s2.tif...
2024-02-18 10:07:07:rast

Ken_Banda_80-35_25-68_2022-01-01 451 572


2024-02-18 10:07:07:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}}) specified in https://storage.googleapis.com/sand_mining_median/labels/Narmada_Sehore_77-32_22-56_median/rivers/Narmada_Sehore_77-32_22-56_rivers_1000m.geojson and assuming EPSG:4326 instead.
2024-02-18 10:07:07:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Narmada_Sehore_77-32_22-56_median/s2/Narmada_Sehore_77-32_22-56_2022-10-01_s2.tif to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Narmada_Sehore_77-32_22-56_median/s2/Narmada_Sehore_77-32_22-56_2022-10-01_s2.tif...
2024-02-18 10:07:07:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Narmada_Sehore_77-32_22-56_median/annotations/Narmada_Sehore_77-32_22-56_2022-10-01_annotations.geojson to /data/tmp

 68%|######7   | 120M/177M [00:05<00:02, 24.6MB/s]

2024-02-18 10:07:18:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Sone_Rohtas_84-21_24-91_median/annotations/Sone_Rohtas_84-21_24-91_2022-02-01_annotations.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Sone_Rohtas_84-21_24-91_median/annotations/Sone_Rohtas_84-21_24-91_2022-02-01_annotations.geojson...
2024-02-18 10:07:18:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Sone_Rohtas_84-21_24-91_median/rivers/Sone_Rohtas_84-21_24-91_rivers_1000m.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Sone_Rohtas_84-21_24-91_median/rivers/Sone_Rohtas_84-21_24-91_rivers_1000m.geojson...
2024-02-18 10:07:18:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}}) specified in https://storage.g

 64%|######3   | 112M/176M [00:05<00:03, 21.5MB/s]

2024-02-18 10:07:27:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Sone_Rohtas_84-21_24-91_median/annotations/Sone_Rohtas_84-21_24-91_2022-05-01_annotations.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Sone_Rohtas_84-21_24-91_median/annotations/Sone_Rohtas_84-21_24-91_2022-05-01_annotations.geojson...
2024-02-18 10:07:27:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Sone_Rohtas_84-21_24-91_median/rivers/Sone_Rohtas_84-21_24-91_rivers_1000m.geojson.
2024-02-18 10:07:27:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}}) specified in https://storage.googleapis.com/sand_mining_median/labels/Sone_Rohtas_84-21_24-91_median/rivers/Sone_Rohtas_84-21_24-91_rivers_1000m.geojson and assuming EPS

Chambal_More_77-92_26-66_2023-01-01 189 541


2024-02-18 10:07:38:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Chambal_More_77-92_26-66_median/annotations/Chambal_More_77-92_26-66_2023-06-01_annotations.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Chambal_More_77-92_26-66_median/annotations/Chambal_More_77-92_26-66_2023-06-01_annotations.geojson...
2024-02-18 10:07:38:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Chambal_More_77-92_26-66_median/rivers/Chambal_More_77-92_26-66_rivers_1000m.geojson.
2024-02-18 10:07:38:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}}) specified in https://storage.googleapis.com/sand_mining_median/labels/Chambal_More_77-92_26-66_median/rivers/Chambal_More_77-92_26-66_rivers_1000m.geojson and assu

Chambal_More_77-92_26-66_2023-06-01 189 541


2024-02-18 10:07:42:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Damodar_PurbaBardhaman_87-73_23-24_median/annotations/Damodar_PurbaBardhaman_87-73_23-24_2022-11-01_annotations.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Damodar_PurbaBardhaman_87-73_23-24_median/annotations/Damodar_PurbaBardhaman_87-73_23-24_2022-11-01_annotations.geojson...
2024-02-18 10:07:42:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Damodar_PurbaBardhaman_87-73_23-24_median/rivers/Damodar_PurbaBardhaman_87-73_23-24_rivers_1000m.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Damodar_PurbaBardhaman_87-73_23-24_median/rivers/Damodar_PurbaBardhaman_87-73_23-24_rivers_1000m.geojson...
2024-02-18 10:07:43:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 

 91%|#########1| 72.1M/79.1M [00:05<00:00, 14.5MB/s]

2024-02-18 10:08:10:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-49_25-84_median/annotations/Betwa_Jalaun_79-49_25-84_2022-10-01_annotations.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-49_25-84_median/annotations/Betwa_Jalaun_79-49_25-84_2022-10-01_annotations.geojson...
2024-02-18 10:08:10:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-49_25-84_median/rivers/Betwa_Jalaun_79-49_25-84_rivers_1000m.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-49_25-84_median/rivers/Betwa_Jalaun_79-49_25-84_rivers_1000m.geojson...
2024-02-18 10:08:10:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}}) specified in https://s

Betwa_Jalaun_79-49_25-84_2022-10-01 1425 2072


2024-02-18 10:08:14:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-49_25-84_median/annotations/Betwa_Jalaun_79-49_25-84_2023-05-01_annotations.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-49_25-84_median/annotations/Betwa_Jalaun_79-49_25-84_2023-05-01_annotations.geojson...
2024-02-18 10:08:14:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-49_25-84_median/rivers/Betwa_Jalaun_79-49_25-84_rivers_1000m.geojson.
2024-02-18 10:08:14:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}}) specified in https://storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-49_25-84_median/rivers/Betwa_Jalaun_79-49_25-84_rivers_1000m.geojson and assu

Betwa_Jalaun_79-49_25-84_2023-05-01 1425 2072


2024-02-18 10:08:18:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-79_25-89_median/annotations/Betwa_Jalaun_79-79_25-89_2022-10-01_annotations.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-79_25-89_median/annotations/Betwa_Jalaun_79-79_25-89_2022-10-01_annotations.geojson...
2024-02-18 10:08:18:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-79_25-89_median/rivers/Betwa_Jalaun_79-79_25-89_rivers_1000m.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-79_25-89_median/rivers/Betwa_Jalaun_79-79_25-89_rivers_1000m.geojson...
2024-02-18 10:08:18:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}}) specified in https://s

Betwa_Jalaun_79-79_25-89_2022-10-01 910 4394


2024-02-18 10:08:23:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-79_25-89_median/annotations/Betwa_Jalaun_79-79_25-89_2023-05-01_annotations.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-79_25-89_median/annotations/Betwa_Jalaun_79-79_25-89_2023-05-01_annotations.geojson...
2024-02-18 10:08:23:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-79_25-89_median/rivers/Betwa_Jalaun_79-79_25-89_rivers_1000m.geojson.
2024-02-18 10:08:23:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}}) specified in https://storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-79_25-89_median/rivers/Betwa_Jalaun_79-79_25-89_rivers_1000m.geojson and assu

Betwa_Jalaun_79-79_25-89_2023-05-01 910 4394


2024-02-18 10:08:26:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Mahanadi_Angul_84-52_20-71_median/annotations/Mahanadi_Angul_84-52_20-71_2021-11-01_annotations.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Mahanadi_Angul_84-52_20-71_median/annotations/Mahanadi_Angul_84-52_20-71_2021-11-01_annotations.geojson...
2024-02-18 10:08:26:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Mahanadi_Angul_84-52_20-71_median/rivers/Mahanadi_Angul_84-52_20-71_rivers_1000m.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Mahanadi_Angul_84-52_20-71_median/rivers/Mahanadi_Angul_84-52_20-71_rivers_1000m.geojson...
2024-02-18 10:08:27:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}}) specif

 82%|########1 | 108M/132M [00:05<00:01, 22.6MB/s]

2024-02-18 10:09:17:rastervision.pipeline.file_system.utils: INFO - Downloading https://storage.googleapis.com/sand_mining_median/labels/Godavari_BhadradriKothagudem_80-79_18-04_median/annotations/Godavari_BhadradriKothagudem_80-79_18-04_2022-05-01_annotations.geojson to /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Godavari_BhadradriKothagudem_80-79_18-04_median/annotations/Godavari_BhadradriKothagudem_80-79_18-04_2022-05-01_annotations.geojson...
2024-02-18 10:09:18:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Godavari_BhadradriKothagudem_80-79_18-04_median/rivers/Godavari_BhadradriKothagudem_80-79_18-04_rivers_1000m.geojson.
2024-02-18 10:09:18:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}}) specified in https://storage.googleapis.com/sand_mining_median/labe

Validation split cluster_id: 4
Training dataset size: 3088 images | Number of observations: 72
Testing dataset size: 1456  images | Number of observations: 8
Total dataset has 2.93%  mining area.
Training dataset has 3.05%  mining area.
Validation dataset has 2.15%  mining area.
Within AOIs, total dataset has 6.72%  mining area.
Outside AOIs, total dataset has 0.06%  mining area.

The median percentage of mine in an observation is 3.26%
The median number of mine pixels in an observation is 38059

The median number pixels in an observation is 1301008
[{'id': 'Kathajodi_Cuttack_85-85_20-44_2022-05-01', 'size': 105.786}, {'id': 'Kathajodi_Cuttack_85-85_20-44_2022-02-01', 'size': 105.786}, {'id': 'Ken_Banda_80-35_25-68_2022-06-01', 'size': 25.7972}, {'id': 'Ken_Banda_80-35_25-68_2022-01-01', 'size': 25.7972}, {'id': 'Narmada_Sehore_77-32_22-56_2022-01-01', 'size': 20.3218}, {'id': 'Narmada_Sehore_77-32_22-56_2022-10-01', 'size': 20.3218}, {'id': 'Tawa_Hoshangabad_77-80_22-74_2022-04-01', '

In [6]:
def is_training(cluster_id):
    if cluster_id != val_split:
        return True


def is_validation(cluster_id):
    if cluster_id == val_split:
        return True

## Flag all small scenes

In [5]:
sq_area_threshold = 50.

for scene in scene_sizes:
    if scene['size'] <= sq_area_threshold:
        print (scene)

{'id': 'Ken_Banda_80-35_25-68_2022-06-01', 'size': 25.7972}
{'id': 'Ken_Banda_80-35_25-68_2022-01-01', 'size': 25.7972}
{'id': 'Narmada_Sehore_77-32_22-56_2022-01-01', 'size': 20.3218}
{'id': 'Narmada_Sehore_77-32_22-56_2022-10-01', 'size': 20.3218}
{'id': 'Sone_Rohtas_83-86_24-46_2023-01-01', 'size': 23.6676}
{'id': 'Sone_Rohtas_83-86_24-46_2023-06-01', 'size': 23.6676}
{'id': 'Ganges_Patna_85-23_25-62_2022-02-01', 'size': 41.1482}
{'id': 'Ganges_Patna_85-23_25-62_2022-05-01', 'size': 41.1482}
{'id': 'Chambal_More_77-92_26-66_2023-01-01', 'size': 10.2249}
{'id': 'Chambal_More_77-92_26-66_2023-06-01', 'size': 10.2249}
{'id': 'Mayurakshi_Birbhum_87-66_23-61_2022-10-01', 'size': 26.416}
{'id': 'Mayurakshi_Birbhum_87-66_23-61_2023-05-01', 'size': 26.416}
{'id': 'Mahananda_Jalpaiguri_88-4_26-68_2020-11-01', 'size': 22.3016}
{'id': 'Teesta_Jalpaiguri_88-6_26-84_2021-12-01', 'size': 23.8854}
{'id': 'Mahananda_Jalpaiguri_88-4_26-68_2021-05-01', 'size': 22.3016}
{'id': 'Teesta_Jalpaiguri_88-64

In [12]:
from utils.rastervision_pipeline import observation_to_scene
from experiment_configs.configs import unet_config as config
from project_config import GCP_PROJECT_NAME, DATASET_JSON_PATH
import rasterio
import json

labels_train_raveled = []
labels_val_raveled = []
label_in_aoi_raveled = []
label_outside_aoi_raveled = []


root_dir = os.getcwd()

# define the relative path to the dataset JSON file
json_rel_path = '../' + DATASET_JSON_PATH

# combine the root directory with the relative path
json_abs_path = os.path.join(root_dir, json_rel_path)

dataset_json = json.load(open(json_abs_path, 'r'))
all_observations = observation_factory(dataset_json)


for observation in all_observations:
    is_train = is_training(observation.cluster_id)
    is_val = is_validation(observation.cluster_id)
    if not is_train and not is_val:
        print(f"Ignoring {observation.name}")
        continue

    scene = observation_to_scene(config, observation)
    label_arr = scene.label_source.get_label_arr()
    label_arr_raveled = label_arr.ravel()
    mask = rasterio.features.rasterize(scene.aoi_polygons, label_arr.shape)
    mask_raveled = mask.ravel()

    label_in_aoi_raveled.append(
        label_arr_raveled[mask_raveled != 0]
    )
    label_outside_aoi_raveled.append(
        label_arr_raveled[mask_raveled == 0]
    )
    
    if is_train:
        labels_train_raveled.append(label_arr_raveled)
    else:
        labels_val_raveled.append(label_arr_raveled)

all_labels_outside_aoi = np.hstack(label_outside_aoi_raveled)
all_labels_aoi = np.hstack(label_in_aoi_raveled)
all_labels_train = np.hstack(labels_train_raveled)
all_labels_val = np.hstack(labels_val_raveled)


2024-02-18 10:23:49:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/s2/Kathajodi_Cuttack_85-85_20-44_2022-05-01_s2.tif.
2024-02-18 10:23:49:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/s1/Kathajodi_Cuttack_85-85_20-44_2022-05-01_s1.tif.
2024-02-18 10:23:49:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/annotations/Kathajodi_Cuttack_85-85_20-44_2022-05-01_annotations.geojson.
2024-02-18 10:23:49:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/rivers/Kathajodi_Cuttack_85-85_20-44_r

2024-02-18 10:23:49:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/annotations/Kathajodi_Cuttack_85-85_20-44_2022-02-01_annotations.geojson.
2024-02-18 10:23:49:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/rivers/Kathajodi_Cuttack_85-85_20-44_rivers_1000m.geojson.
2024-02-18 10:23:49:rastervision.core.data.vector_source.geojson_vector_source: INFO - Ignoring CRS ({'type': 'name', 'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}}) specified in https://storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/rivers/Kathajodi_Cuttack_85-85_20-44_rivers_1000m.geojson and assuming EPSG:4326 instead.
2024-02-18 10:23:50:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cac

In [13]:
from project_config import CLASS_CONFIG

class_mine_id = CLASS_CONFIG.get_class_id('sandmine')
class_nonmine_id = CLASS_CONFIG.get_class_id('other')

def calc_class_proportion(labels):
    mask_mine = (labels == class_mine_id)
    mask_nonmine = (labels == class_nonmine_id)

    count_mine = np.sum(mask_mine)
    count_nonemine = np.sum(mask_nonmine)
    count_total = len(labels)

    assert count_total == count_mine + count_nonemine
    mine_percentage = count_mine/count_total * 100
    nonmine_percentage = count_nonemine/count_total * 100
    return mine_percentage, count_mine

mine_percentage, _ = calc_class_proportion(np.hstack([all_labels_train, all_labels_val]))
print(f"Total dataset has {mine_percentage:.2f}%  mining area.")

mine_percentage, _ = calc_class_proportion(all_labels_train)
print(f"Training dataset has {mine_percentage:.2f}%  mining area.")

mine_percentage, _ = calc_class_proportion(all_labels_val)
print(f"Validation dataset has {mine_percentage:.2f}%  mining area.")

mine_percentage, _ = calc_class_proportion(all_labels_aoi)
print(f"Within AOIs, total dataset has {mine_percentage:.2f}%  mining area.")

mine_percentage, _ = calc_class_proportion(all_labels_outside_aoi)
print(f"Outside AOIs, total dataset has {mine_percentage:.2f}%  mining area.")

mine_percentage_per_observation = []
n_mine_pixels_per_observation = []
labels_full_dataset_raveled = [*labels_train_raveled, *labels_val_raveled]
for labels_of_observation in labels_full_dataset_raveled:
    mine_percentage_this_observation, n_mine_pixels_this_observation = calc_class_proportion(labels_of_observation)
    mine_percentage_per_observation.append(mine_percentage_this_observation)
    n_mine_pixels_per_observation.append(n_mine_pixels_this_observation)

print()
print(f"The median percentage of mine in an observation is {np.mean(mine_percentage_per_observation):.2f}%")
print(f"The median number of mine pixels in an observation is {np.mean(n_mine_pixels_per_observation):.0f}")

print()
n_total_pixels_per_observations = [len(labels_single_observation) for labels_single_observation in labels_full_dataset_raveled]
print(f"The median number pixels in an observation is {np.mean(n_total_pixels_per_observations):.0f}")


Total dataset has 2.93%  mining area.
Training dataset has 3.05%  mining area.
Validation dataset has 2.15%  mining area.
Within AOIs, total dataset has 6.72%  mining area.
Outside AOIs, total dataset has 0.06%  mining area.

The median percentage of mine in an observation is 3.26%
The median number of mine pixels in an observation is 38059

The median number pixels in an observation is 1301008


## Mean and Std of S1 images

In [15]:
from rastervision.core.data.raster_transformer.nan_transformer import NanTransformer

all_observations = observation_factory(dataset_json)

all_observations: List[ObservationPointer] = list(all_observations)

all_vv_raveled = []
all_vh_raveled = []
for observation in all_observations:
    raster_source = RasterioSource(
        observation.uri_to_s1,
        raster_transformers=[NanTransformer()]  # replaces NaNs with 0
    )

    vv_img = raster_source.get_raw_chip(raster_source.extent)[:,:,0]
    vh_img = raster_source.get_raw_chip(raster_source.extent)[:,:,1]
    all_vv_raveled.append(vv_img.ravel())
    all_vh_raveled.append(vh_img.ravel())
    
all_vv = np.hstack(all_vv_raveled)
all_vh = np.hstack(all_vh_raveled)


2024-02-18 10:25:29:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/s1/Kathajodi_Cuttack_85-85_20-44_2022-05-01_s1.tif.
2024-02-18 10:25:30:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/s1/Kathajodi_Cuttack_85-85_20-44_2022-02-01_s1.tif.
2024-02-18 10:25:30:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Ken_Banda_80-35_25-68_median/s1/Ken_Banda_80-35_25-68_2022-06-01_s1.tif.
2024-02-18 10:25:30:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Ken_Banda_80-35_25-68_median/s1/Ken_Banda_80-35_25-68_2022-01-01_s1.tif.
2024-02-18 10:25:30:rastervision.pipelin

In [16]:
print(f"VV: Mean = {np.mean(all_vv)}, Std = {np.std(all_vv)}")
print(f"VH: Mean = {np.mean(all_vh)}, Std = {np.std(all_vh)}")

VV: Mean = nan, Std = nan
VH: Mean = nan, Std = nan


## Area of observations

In [17]:
all_observations

[ObservationPointer(uri_to_s1='https://storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/s1/Kathajodi_Cuttack_85-85_20-44_2022-05-01_s1.tif', uri_to_s2='https://storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/s2/Kathajodi_Cuttack_85-85_20-44_2022-05-01_s2.tif', uri_to_s2_l1c='https://storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/s2_l1c/Kathajodi_Cuttack_85-85_20-44_2022-05-01_s2_l1c.tif', uri_to_rgb='https://storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/rgb/Kathajodi_Cuttack_85-85_20-44_2022-05-01_rgb.tif', uri_to_annotations='https://storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/annotations/Kathajodi_Cuttack_85-85_20-44_2022-05-01_annotations.geojson', uri_to_rivers='https://storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/rivers/Kathajodi_Cuttack_85-85_20-4

In [18]:
from utils.data_management import get_location_from_key

dataset_summary = {}

observations_per_locations = {}
for observation in all_observations:
    is_train = is_training(observation.name)
    is_val = is_validation(observation.name)
    if not is_train and not is_val:
        print(f"Ignoring {observation.name}")
        # continue

    location = get_location_from_key(observation.name)

    print(location)
    if location in observations_per_locations:
        observations_per_locations[location].append(observation)
    else:
        observations_per_locations[location] = [observation]

total_area_km2 = 0
training_area_km = 0
validation_area_km = 0
smallest_area = 9999999
largest_area = 0
for location, observation_list in observations_per_locations.items():
    is_train = is_training(observation_list[0].cluster_id)
    is_val = is_validation(observation_list[0].cluster_id)

    # To determine the patch size, we only look into the first observations.
    # We expect that all observations cover the same geographical extent.
    raster_source = RasterioSource(observation_list[0].uri_to_s2, allow_streaming=False)
    coverage_area_km2 = raster_source.shape[0] * raster_source.shape[1] / 1e4  # Each pixel covers 100m^2
    summary_of_location = {
        "Number of observations": len(observation_list),
        "Patch size": raster_source.shape[:2],
        "Coverage area [km^2]": round(coverage_area_km2, 2),
        "Split": "TRAIN" if is_train else "VAL"
    }
    dataset_summary[location] = summary_of_location
    
    total_area_km2 += coverage_area_km2
    if is_train:
        training_area_km += coverage_area_km2
    if is_val:
        validation_area_km += coverage_area_km2

    smallest_area = min(smallest_area, coverage_area_km2)
    largest_area = max(largest_area, coverage_area_km2)


print(dataset_summary)
print(f"Total of {len(observations_per_locations)} locations")
print(f"Total area is {total_area_km2} km2")
print(f"Training area is {training_area_km} km2")
print(f"Validation area is {validation_area_km} km2")
print(f"Smallest location is {smallest_area} km2")
print(f"Largest location is {largest_area} km2")


2024-02-18 10:26:53:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Kathajodi_Cuttack_85-85_20-44_median/s2/Kathajodi_Cuttack_85-85_20-44_2022-05-01_s2.tif.
2024-02-18 10:26:53:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Ken_Banda_80-35_25-68_median/s2/Ken_Banda_80-35_25-68_2022-06-01_s2.tif.
2024-02-18 10:26:53:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Narmada_Sehore_77-32_22-56_median/s2/Narmada_Sehore_77-32_22-56_2022-01-01_s2.tif.
2024-02-18 10:26:53:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Tawa_Hoshangabad_77-80_22-74_median/s2/Tawa_Hoshangabad_77-80_22-74_2022-04-01_s2.tif.
2024-02-18 10:26:53:rastervision

Kathajodi_Cuttack_85-85_20-44
Kathajodi_Cuttack_85-85_20-44
Ken_Banda_80-35_25-68
Ken_Banda_80-35_25-68
Narmada_Sehore_77-32_22-56
Narmada_Sehore_77-32_22-56
Tawa_Hoshangabad_77-80_22-74
Tawa_Hoshangabad_77-80_22-74
Sone_Rohtas_84-21_24-91
Sone_Rohtas_84-21_24-91
Sone_Rohtas_83-86_24-46
Sone_Rohtas_83-86_24-46
Bhargavi_Khordha_85-88_20-26
Bhargavi_Khordha_85-88_20-26
Ganges_Patna_85-23_25-62
Ganges_Patna_85-23_25-62
Ganges_Patna_85-1_25-66
Ganges_Patna_85-1_25-66
Chambal_More_77-92_26-66
Chambal_More_77-92_26-66
Damodar_PurbaBardhaman_87-73_23-24
Damodar_PurbaBardhaman_87-73_23-24
Mayurakshi_Birbhum_87-66_23-61
Mayurakshi_Birbhum_87-66_23-61
Sone_Patna_84-76_25-44
Sone_Patna_84-76_25-44
Mahananda_Jalpaiguri_88-4_26-68
Damodar_PurbaBardhaman_87-39_23-42
Teesta_Jalpaiguri_88-6_26-84
Mahananda_Jalpaiguri_88-4_26-68
Betwa_Jalaun_79-49_25-84
Betwa_Jalaun_79-49_25-84
Betwa_Jalaun_79-79_25-89
Betwa_Jalaun_79-79_25-89
Mahanadi_Angul_84-52_20-71
Mahanadi_Angul_84-52_20-71
Mahananda_UttarDinajpu

2024-02-18 10:26:53:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Mahanadi_Angul_84-52_20-71_median/s2/Mahanadi_Angul_84-52_20-71_2021-11-01_s2.tif.
2024-02-18 10:26:53:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Mahananda_UttarDinajpur_88-25_26-46_median/s2/Mahananda_UttarDinajpur_88-25_26-46_2020-10-01_s2.tif.
2024-02-18 10:26:53:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Teesta_Jalpaiguri_88-64_26-85_median/s2/Teesta_Jalpaiguri_88-64_26-85_2020-01-01_s2.tif.
2024-02-18 10:26:53:rastervision.pipeline.file_system.utils: INFO - Using cached file /data/tmp/rv/cache/http/storage.googleapis.com/sand_mining_median/labels/Gangadhar_CoochBehar_89-86_26-46_median/s2/Gangadhar_CoochBehar_89-86_26-46_2021-12-01_s2.t

{'Kathajodi_Cuttack_85-85_20-44': {'Number of observations': 2, 'Patch size': (653, 1620), 'Coverage area [km^2]': 105.79, 'Split': 'TRAIN'}, 'Ken_Banda_80-35_25-68': {'Number of observations': 2, 'Patch size': (451, 572), 'Coverage area [km^2]': 25.8, 'Split': 'VAL'}, 'Narmada_Sehore_77-32_22-56': {'Number of observations': 2, 'Patch size': (278, 731), 'Coverage area [km^2]': 20.32, 'Split': 'TRAIN'}, 'Tawa_Hoshangabad_77-80_22-74': {'Number of observations': 2, 'Patch size': (1164, 1528), 'Coverage area [km^2]': 177.86, 'Split': 'TRAIN'}, 'Sone_Rohtas_84-21_24-91': {'Number of observations': 2, 'Patch size': (2306, 2524), 'Coverage area [km^2]': 582.03, 'Split': 'TRAIN'}, 'Sone_Rohtas_83-86_24-46': {'Number of observations': 2, 'Patch size': (726, 326), 'Coverage area [km^2]': 23.67, 'Split': 'TRAIN'}, 'Bhargavi_Khordha_85-88_20-26': {'Number of observations': 2, 'Patch size': (1231, 459), 'Coverage area [km^2]': 56.5, 'Split': 'TRAIN'}, 'Ganges_Patna_85-23_25-62': {'Number of observ