Run training jobs on the cloud
===

- RV pipeline
- AWS SageMaker

In [None]:
from os.path import join
from pathlib import Path

import albumentations as A
from sklearn.model_selection import KFold

from rastervision.pipeline.file_system.utils import (json_to_file, list_paths)
from rastervision.core.data import (
    ClassConfig, ClassInferenceTransformerConfig, DatasetConfig,
    GeoJSONVectorSourceConfig, MultiRasterSourceConfig, RasterioSourceConfig,
    RasterizedSourceConfig, RasterizerConfig, SceneConfig,
    SemanticSegmentationLabelSourceConfig)
from rastervision.core.rv_pipeline import (
    SemanticSegmentationConfig, SemanticSegmentationPredictOptions,
    WindowSamplingConfig, WindowSamplingMethod)
from rastervision.pytorch_learner import (
    ExternalModuleConfig, SemanticSegmentationGeoDataConfig,
    SemanticSegmentationModelConfig, PlotOptions, SolverConfig)
from rastervision.pytorch_backend import PyTorchSemanticSegmentationConfig

In [None]:
# S3 locations of the inputs. Going by the path names: three directories of
# VIIRS SNPP corrected-reflectance imagery (true color plus two other band
# combinations) and one directory of GeoJSON labels.
img_dir = 's3://raster-vision-ahassan/un-sandstorm/data/gibs/img/VIIRS_SNPP_CorrectedReflectance_TrueColor/'
m11_i2_i1_dir = 's3://raster-vision-ahassan/un-sandstorm/data/gibs/img/VIIRS_SNPP_CorrectedReflectance_BandsM11-I2-I1/'
m3_i3_dir = 's3://raster-vision-ahassan/un-sandstorm/data/gibs/img/VIIRS_SNPP_CorrectedReflectance_BandsM3-I3-M11/'
label_dir = 's3://raster-vision-ahassan/un-sandstorm/data/gibs/labels_geojson/'

In [None]:
# Two-class setup: 'background' (also registered as the null class) and
# 'dust', each with a display color.
class_config = ClassConfig(
    names=['background', 'dust'],
    colors=['lightgray', 'maroon'],
    null_class='background')

In [None]:
# Sorted '.tif' paths returned by list_paths for img_dir. The other URI lists
# in this notebook are derived from this list, so they share its ordering.
img_uris = sorted(list_paths(img_dir, ext='.tif'))
# Bare expression: shows the number of images as the cell output.
len(img_uris)

In [None]:
# For every true-color image, build the URI of the same-named .tif in each of
# the two extra band-combination directories. The URIs are only constructed
# here; nothing checks that the files exist.
m11_i2_i1_uris = [
    join(m11_i2_i1_dir, Path(rgb_uri).stem + '.tif') for rgb_uri in img_uris
]
m3_i3_uris = [
    join(m3_i3_dir, Path(rgb_uri).stem + '.tif') for rgb_uri in img_uris
]

In [None]:
# Label files share the image's file stem, with a .json extension, under
# label_dir.
label_uris = [
    join(label_dir, Path(rgb_uri).stem + '.json') for rgb_uri in img_uris
]

In [None]:
def make_scene(img_uri: str,
               label_uri: str,
               class_config: ClassConfig,
               extra_raster_uris: list[str] | None = None):
    """Build the ``SceneConfig`` for one image and its GeoJSON labels.

    Args:
        img_uri: URI of the main raster. Its file stem becomes the scene ID.
        label_uri: URI of the GeoJSON file holding the labels.
        class_config: Class config. Its 'background' class ID is used both as
            the default class for label features and as the rasterizer's
            background class.
        extra_raster_uris: Optional URIs of additional rasters. When this is
            not None (even if empty), the rasters are wrapped, main image
            first, in a ``MultiRasterSourceConfig``. When None, a single
            ``RasterioSourceConfig`` for ``img_uri`` is used.

    Returns:
        The assembled ``SceneConfig``.
    """
    background_id = class_config.get_class_id('background')

    # Imagery: either the lone main raster or a stack with the main one first.
    if extra_raster_uris is not None:
        stacked_uris = [img_uri] + extra_raster_uris
        imagery = MultiRasterSourceConfig(raster_sources=[
            RasterioSourceConfig(uris=one_uri) for one_uri in stacked_uris
        ])
    else:
        imagery = RasterioSourceConfig(uris=img_uri)

    # Labels: 'dust_over_land' and 'dust_over_water' are both mapped onto the
    # single 'dust' class; the default class is background.
    class_inference = ClassInferenceTransformerConfig(
        default_class_id=background_id,
        class_name_mapping={
            'dust_over_land': 'dust',
            'dust_over_water': 'dust',
        },
    )
    vector_labels = GeoJSONVectorSourceConfig(
        uris=label_uri, transformers=[class_inference])

    # Rasterize the vector labels so they can serve as segmentation targets.
    rasterizer = RasterizerConfig(
        background_class_id=background_id,
        all_touched=True,
    )
    rasterized_labels = RasterizedSourceConfig(
        vector_source=vector_labels, rasterizer_config=rasterizer)

    return SceneConfig(
        id=Path(img_uri).stem,
        raster_source=imagery,
        label_source=SemanticSegmentationLabelSourceConfig(
            raster_source=rasterized_labels),
    )


In [None]:
# One scene per true-color image. The matching M11-I2-I1 and M3-I3 rasters
# are passed, in that order, as the extra rasters (collected by the starred
# target into a two-element list).
scenes = [
    make_scene(
        rgb_uri, geojson_uri, class_config, extra_raster_uris=band_uris)
    for rgb_uri, geojson_uri, *band_uris in zip(
        img_uris, label_uris, m11_i2_i1_uris, m3_i3_uris)
]

In [None]:
# Augmentation pipeline: a flip followed by shift/scale/rotate. No arguments
# are passed, so both transforms run with the library's defaults.
# make_pipeline serializes this object with A.to_dict and hands it to the
# data config.
aug_transform = A.Compose([
    A.Flip(),
    A.ShiftScaleRotate(),
])


def make_pipeline(out_uri: str, scene_dataset: 'DatasetConfig',
                  class_config: 'ClassConfig', chip_sz: int, img_sz: int,
                  num_channels: int, channel_display_groups: dict,
                  solver: 'SolverConfig'):
    """Assemble a semantic segmentation pipeline config for one dataset.

    Reads the module-level ``aug_transform`` for data augmentation.

    Args:
        out_uri: Root URI of the pipeline.
        scene_dataset: Dataset whose ``train_scenes`` and
            ``validation_scenes`` get per-scene window sampling configs.
        class_config: Class config; its length is passed to the model as the
            number of classes.
        chip_sz: Window size for validation sampling and prediction, and the
            lower bound of the training windows' ``size_lims``.
        img_sz: Image size given to the data config, to the training window
            configs (as ``size``) and to the model (as ``out_size``).
        num_channels: Number of image channels for the data config and model.
        channel_display_groups: Channel groups passed to ``PlotOptions``.
        solver: Solver config passed to the backend.

    Returns:
        The ``SemanticSegmentationConfig`` for the pipeline.
    """
    # Training scenes: up to 16 randomly placed windows per scene, no padding.
    train_windows = {
        scene.id: WindowSamplingConfig(
            method=WindowSamplingMethod.random,
            size_lims=(chip_sz, chip_sz + 1),
            size=img_sz,
            max_windows=16,
            padding=0,
        )
        for scene in scene_dataset.train_scenes
    }
    # Validation scenes: sliding windows that advance by half a chip.
    val_windows = {
        scene.id: WindowSamplingConfig(
            method=WindowSamplingMethod.sliding,
            size=chip_sz,
            stride=(chip_sz // 2))
        for scene in scene_dataset.validation_scenes
    }
    # Validation entries are merged last so they win on a shared scene ID.
    window_sampling_opts = {**train_windows, **val_windows}

    plot_options = PlotOptions(channel_display_groups=channel_display_groups)
    data = SemanticSegmentationGeoDataConfig(
        scene_dataset=scene_dataset,
        sampling=window_sampling_opts,
        img_sz=img_sz,
        img_channels=num_channels,
        num_workers=4,
        aug_transform=A.to_dict(aug_transform),
        plot_options=plot_options)

    # The model definition is loaded from an external GitHub repo; these
    # keyword arguments go to its 'make_fpn_resnet' entrypoint.
    fpn_kwargs = {
        'name': 'resnet18',
        'fpn_type': 'panoptic',
        'num_classes': len(class_config),
        'fpn_channels': 128,
        'in_channels': num_channels,
        'out_size': (img_sz, img_sz),
    }
    external_def = ExternalModuleConfig(
        github_repo='AdeelH/pytorch-fpn:0.3',
        name='fpn',
        entrypoint='make_fpn_resnet',
        entrypoint_kwargs=fpn_kwargs)
    model = SemanticSegmentationModelConfig(external_def=external_def)

    # TensorBoard logging and serving are both switched off.
    backend = PyTorchSemanticSegmentationConfig(
        data=data,
        model=model,
        solver=solver,
        log_tensorboard=False,
        run_tensorboard=False,
    )

    return SemanticSegmentationConfig(
        root_uri=out_uri,
        dataset=scene_dataset,
        backend=backend,
        predict_options=SemanticSegmentationPredictOptions(chip_sz=chip_sz))

In [None]:
# Root output URI for this run; each cross-validation split is given its own
# 'split-<n>' subdirectory under it. The commented-out line is an alternative
# output location (going by its name, for a run without the M3-I3 bands).
# out_dir = 's3://raster-vision-ahassan/un-sandstorm/out/2022-04-26_train_cv_w2_m11i2i1'
out_dir = 's3://raster-vision-ahassan/un-sandstorm/out/2022-04-26_train_cv_w2_m11i2i1_m3i3'

In [None]:
# 2-fold cross-validation: one pipeline per fold. KFold gets no arguments
# other than n_splits, and it is applied to img_uris, whose indices line up
# with those of `scenes`.
kf = KFold(n_splits=2)

# Pipelines are appended in fold order, so the list index equals split_num.
split_pipelines = []
for split_num, (train_inds, val_inds) in enumerate(kf.split(img_uris)):
    scene_dataset = DatasetConfig(
        class_config=class_config,
        train_scenes=[scenes[i] for i in train_inds],
        validation_scenes=[scenes[i] for i in val_inds],
    )
    split_out_dir = join(out_dir, f'split-{split_num}')
    # NOTE(review): num_channels=8 and the display groups below assume the
    # stacked rasters come out ordered RGB, M11-I2-I1, M3-I3 with 8 channels
    # in total -- confirm against the actual band counts of the inputs.
    split_pipelines.append(
        make_pipeline(
            out_uri=split_out_dir,
            scene_dataset=scene_dataset,
            class_config=class_config,
            chip_sz=512,
            img_sz=256,
            num_channels=8,
            channel_display_groups={
                'RGB': [0, 1, 2],
                'M11-I2-I1': [3, 4, 5],
                'M3-I3': [6, 7]
            },
            solver=SolverConfig(batch_sz=16, lr=1e-4),
        ))

In [None]:
# Local path for the serialized pipeline configs, inside a directory named
# after the last component of out_dir. `.name` is used rather than `.stem`
# because out_dir is a directory: `.stem` would cut a run name containing a
# dot (e.g. 'run_v1.2' -> 'run_v1'). For a dot-free name the two are equal.
pipelines_json_path = join('data', 'train', Path(out_dir).name, 'configs.json')
# Bare expression: shows the path as the cell output.
pipelines_json_path

In [None]:
# Write all split pipelines, converted to dicts, as one JSON list to
# pipelines_json_path; the `rastervision run` cell below reads this file.
json_to_file([p.dict() for p in split_pipelines], pipelines_json_path)

---

In [None]:
# Run the 'train' command of the saved pipeline configs with the 'sagemaker'
# runner.
# NOTE(review): the run name mentions only m11i2i1, while out_dir ends in
# m11i2i1_m3i3 -- confirm the name is intentional.
!rastervision run sagemaker {pipelines_json_path} train --pipeline-run-name "cv-w2-m11i2i1"

In [None]:
# List SageMaker training jobs with the AWS CLI, e.g. to check on the jobs
# submitted by the previous cell.
!aws sagemaker list-training-jobs

---