In [None]:
!pip install transformers datasets evaluate albumentations torchinfo torchmetrics accelerate timm wandb peft

In [None]:
#natten is needed for dinat checkpoints
!pip3 install natten -f https://shi-labs.com/natten/wheels/cu113/torch1.10.1/index.html --quiet

In [None]:
import datasets, transformers, evaluate, accelerate, natten
from datasets import load_dataset
from transformers import AutoProcessor, AutoModelForUniversalSegmentation, OneFormerForUniversalSegmentation
from transformers import Trainer, TrainingArguments
from huggingface_hub import hf_hub_download

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from torch.optim import AdamW
from tqdm.auto import tqdm
import torchinfo, torchmetrics

from pathlib import Path
import requests
import zipfile
import json

import numpy as np
import matplotlib.pyplot as plt

import wandb
from huggingface_hub import notebook_login

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
wandb.login()

In [None]:
notebook_login()

In [None]:
# Dataset: RailSem19 semantic-segmentation splits hosted on the Hugging Face Hub
railsem_ds = load_dataset("BhavanaMalla/railsem19-semantic-split355")

# Label metadata: the dataset repo ships a JSON config describing every class
config_file = hf_hub_download(
    repo_id="BhavanaMalla/railsem19-semantic-split355",
    filename="rs19-config.json",
    repo_type="dataset",
    local_dir="/content",
)

config_json = json.loads(Path(config_file).read_text())
json_labels = config_json["labels"]

# One parallel list per field of interest, all in the order of the config file
extract_labels = [entry["name"] for entry in json_labels]
color_palette = [entry["color"] for entry in json_labels]
readable_labels = [entry["readable"] for entry in json_labels]

# Class index <-> class name lookups (the index is the position in the config file)
id2label = dict(enumerate(extract_labels))
label2id = {name: idx for idx, name in id2label.items()}
labels = extract_labels

In [None]:
railsem_ds["train"][0].keys(), railsem_ds["train"].features, railsem_ds["train"][0]["image"].mode

(dict_keys(['image', 'semantic_mask_label', 'json']),
 {'image': Image(decode=True, id=None),
  'semantic_mask_label': Image(decode=True, id=None),
  'json': {'frame': Value(dtype='string', id=None),
   'imgHeight': Value(dtype='int64', id=None),
   'imgWidth': Value(dtype='int64', id=None),
   'objects': [{'boundingbox': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
     'label': Value(dtype='string', id=None),
     'polygon': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
     'polyline': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
     'polyline-pair': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None)}]}},
 'RGB')

In [None]:
# Other OneFormer checkpoints that can be swapped in (the dinat ones need natten):
# checkpoint = "shi-labs/oneformer_coco_swin_large"
# checkpoint = "shi-labs/oneformer_cityscapes_swin_large"
# checkpoint = "shi-labs/oneformer_ade20k_swin_large"
# checkpoint = "shi-labs/oneformer_coco_dinat_large"
# checkpoint = "shi-labs/oneformer_cityscapes_dinat_large"
# checkpoint = "shi-labs/oneformer_ade20k_dinat_large"
checkpoint = "shi-labs/oneformer_ade20k_swin_tiny"

# NOTE: the "coco_swin" in the two names below is historical; both objects are
# built from whatever `checkpoint` is set to above.
# Passing id2label replaces the checkpoint's label map with the dataset's one.
oneformer_coco_swin = AutoModelForUniversalSegmentation.from_pretrained(
    checkpoint,
    id2label=id2label,
    is_training=True,
    ignore_mismatched_sizes=True,
)

# The processor's own resize / rescale / normalize / label-reduction steps are
# switched off; the albumentations pipelines below do the image preparation.
coco_swin_processor = AutoProcessor.from_pretrained(
    checkpoint,
    do_resize=False,
    do_rescale=False,
    do_normalize=False,
    do_reduce_labels=False,
)
coco_swin_processor.image_processor.num_text = (
    oneformer_coco_swin.config.num_queries - oneformer_coco_swin.config.text_encoder_n_ctx
)


train_dataset = railsem_ds["train"]
val_dataset = railsem_ds["validation"]

# jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)
# Per-channel statistics on a 0-255 scale, divided down to 0-1. Up to float
# rounding these are the usual ImageNet values:
#   mean (0.485, 0.456, 0.406), std (0.229, 0.224, 0.225)
MEAN = np.array([123.675, 116.280, 103.530]) / 255
STD = np.array([58.395, 57.120, 57.375]) / 255

# Training: shrink so the longest side is at most 1333, take a random 512x512
# crop, flip horizontally half of the time, then normalize.
train_transform = A.Compose(
    [
        A.LongestMaxSize(max_size=1333),
        A.RandomCrop(height=512, width=512),
        A.HorizontalFlip(p=0.5),
        A.Normalize(mean=MEAN, std=STD),
    ]
)

# Evaluation: deterministic resize to 512x512, then normalize.
test_transform = A.Compose(
    [
        A.Resize(height=512, width=512),
        A.Normalize(mean=MEAN, std=STD),
    ]
)


def train_transforms(batch):
    """Augment a raw training batch and encode it with the OneFormer processor.

    Args:
        batch: mapping with an 'image' and a 'semantic_mask_label' entry, each an
            iterable of per-sample images (anything `np.array` can convert).

    Returns:
        Whatever `coco_swin_processor` returns for the augmented images and masks
        with the task fixed to "semantic" and `return_tensors="pt"`.
    """
    images_chw = []
    masks = []

    for raw_image, raw_mask in zip(batch['image'], batch['semantic_mask_label']):
        augmented = train_transform(image=np.array(raw_image), mask=np.array(raw_mask))

        # albumentations works channel-last; hand the processor (C, H, W) arrays
        images_chw.append(augmented['image'].transpose(2, 0, 1))
        masks.append(augmented['mask'])

    return coco_swin_processor(
        images=images_chw,
        task_inputs=["semantic"] * len(images_chw),
        segmentation_maps=masks,
        return_tensors="pt",
    )


def val_transforms(batch):
    """Prepare a raw validation batch and encode it with the OneFormer processor.

    Uses the deterministic `test_transform` pipeline. The previous version ran
    `train_transform` here, so validation images were randomly cropped and
    flipped and evaluation results changed from run to run.

    Args:
        batch: mapping with an 'image' and a 'semantic_mask_label' entry, each an
            iterable of per-sample images (anything `np.array` can convert).

    Returns:
        Whatever `coco_swin_processor` returns for the transformed images and
        masks with the task fixed to "semantic" and `return_tensors="pt"`.
    """
    transformed_images = []
    transformed_segmentation_maps = []

    for image, segmentation_map in zip(batch['image'], batch['semantic_mask_label']):
        image_np = np.array(image)
        segmentation_map_np = np.array(segmentation_map)
        # evaluation pipeline only: no random augmentation
        transformed = test_transform(image=image_np, mask=segmentation_map_np)

        # convert the transformed image to C, H, W
        transformed_image = transformed['image'].transpose(2, 0, 1)
        transformed_images.append(transformed_image)
        transformed_segmentation_maps.append(transformed['mask'])

    inputs = coco_swin_processor(images=transformed_images,
                                 task_inputs=["semantic"] * len(transformed_images),
                                 segmentation_maps=transformed_segmentation_maps,
                                 return_tensors="pt")
    return inputs


# Set transforms
# Attach the batch-level preprocessing callables to each split. After this,
# indexing a split returns processor outputs (pixel_values, mask_labels, ...)
# rather than the raw 'image' / 'semantic_mask_label' columns.
train_dataset.set_transform(train_transforms)
val_dataset.set_transform(val_transforms)

def collate_fn(batch): #List[Dict]
    """Merge a list of per-sample dicts into one batch dict.

    'pixel_values', 'pixel_mask', 'text_inputs' and 'task_inputs' are stacked
    along a new leading dimension. 'class_labels' and 'mask_labels' stay plain
    Python lists with one entry per sample.
    """
    def column(key):
        # collect one field from every example, preserving batch order
        return [example[key] for example in batch]

    return {
        "pixel_values": torch.stack(column("pixel_values")),
        "pixel_mask": torch.stack(column("pixel_mask")),
        "class_labels": column("class_labels"),
        "mask_labels": column("mask_labels"),
        "text_inputs": torch.stack(column("text_inputs")),
        "task_inputs": torch.stack(column("task_inputs")),
    }



Some weights of OneFormerForUniversalSegmentation were not initialized from the model checkpoint at shi-labs/oneformer_ade20k_swin_tiny and are newly initialized: ['model.text_mapper.text_encoder.transformer.layers.5.self_attn.out_proj.bias', 'model.text_mapper.text_encoder.transformer.layers.5.mlp.fc2.bias', 'model.text_mapper.text_encoder.transformer.layers.1.self_attn.out_proj.weight', 'model.text_mapper.text_encoder.transformer.layers.0.self_attn.out_proj.weight', 'model.text_mapper.text_encoder.transformer.layers.4.mlp.fc2.bias', 'model.text_mapper.text_encoder.transformer.layers.1.mlp.fc1.bias', 'model.text_mapper.text_encoder.transformer.layers.1.self_attn.in_proj_bias', 'model.text_mapper.text_encoder.positional_embedding', 'model.text_mapper.text_encoder.transformer.layers.0.self_attn.in_proj_weight', 'model.text_mapper.text_encoder.transformer.layers.2.mlp.fc2.weight', 'model.text_mapper.text_encoder.transformer.layers.0.layer_norm2.weight', 'model.text_mapper.text_encoder.tr

In [None]:
oneformer_coco_swin.config.to_dict().keys()

dict_keys(['backbone_config', 'ignore_value', 'num_queries', 'no_object_weight', 'class_weight', 'mask_weight', 'dice_weight', 'contrastive_weight', 'contrastive_temperature', 'train_num_points', 'oversample_ratio', 'importance_sample_ratio', 'init_std', 'init_xavier_std', 'layer_norm_eps', 'is_training', 'use_auxiliary_loss', 'output_auxiliary_logits', 'strides', 'task_seq_len', 'text_encoder_width', 'text_encoder_context_length', 'text_encoder_num_layers', 'text_encoder_vocab_size', 'text_encoder_proj_layers', 'text_encoder_n_ctx', 'conv_dim', 'mask_dim', 'hidden_dim', 'encoder_feedforward_dim', 'norm', 'encoder_layers', 'decoder_layers', 'use_task_norm', 'num_attention_heads', 'dropout', 'dim_feedforward', 'pre_norm', 'enforce_input_proj', 'query_dec_layers', 'common_stride', 'num_hidden_layers', 'return_dict', 'output_hidden_states', 'output_attentions', 'torchscript', 'torch_dtype', 'use_bfloat16', 'tf_legacy_loss', 'pruned_heads', 'tie_word_embeddings', 'is_encoder_decoder', 'is_

In [None]:
 oneformer_coco_swin.config.num_labels, oneformer_coco_swin.config.id2label, oneformer_coco_swin.config.num_classes

(19,
 {0: 'road',
  1: 'sidewalk',
  2: 'construction',
  3: 'tram-track',
  4: 'fence',
  5: 'pole',
  6: 'traffic-light',
  7: 'traffic-sign',
  8: 'vegetation',
  9: 'terrain',
  10: 'sky',
  11: 'human',
  12: 'rail-track',
  13: 'car',
  14: 'truck',
  15: 'trackbed',
  16: 'on-rails',
  17: 'rail-raised',
  18: 'rail-embedded'},
 150)

In [None]:
oneformer_coco_swin

In [None]:
coco_swin_processor.image_processor

OneFormerImageProcessor {
  "_max_size": 1333,
  "class_info_file": "coco_panoptic.json",
  "do_normalize": false,
  "do_reduce_labels": false,
  "do_rescale": false,
  "do_resize": false,
  "ignore_index": 255,
  "image_mean": [
    0.48500001430511475,
    0.4560000002384186,
    0.4059999883174896
  ],
  "image_processor_type": "OneFormerImageProcessor",
  "image_std": [
    0.2290000021457672,
    0.2239999920129776,
    0.22499999403953552
  ],
  "metadata": {
    "0": "person",
    "1": "bicycle",
    "10": "fire hydrant",
    "100": "road",
    "101": "roof",
    "102": "sand",
    "103": "sea",
    "104": "shelf",
    "105": "snow",
    "106": "stairs",
    "107": "tent",
    "108": "towel",
    "109": "wall-brick",
    "11": "stop sign",
    "110": "wall-stone",
    "111": "wall-tile",
    "112": "wall-wood",
    "113": "water-other",
    "114": "window-blind",
    "115": "window-other",
    "116": "tree-merged",
    "117": "fence-merged",
    "118": "ceiling-merged",
    "119

In [None]:
coco_swin_processor.image_processor.num_text, coco_swin_processor.image_processor.num_labels

(134, 133, 133)

In [None]:
sample = train_dataset[0]
sample.keys(), sample["pixel_values"].shape, sample["pixel_mask"].shape, sample["mask_labels"].shape, sample["class_labels"].shape, sample["text_inputs"].shape, sample["task_inputs"].shape

(dict_keys(['pixel_values', 'pixel_mask', 'mask_labels', 'class_labels', 'text_inputs', 'task_inputs']),
 torch.Size([3, 512, 512]),
 torch.Size([512, 512]),
 torch.Size([11, 512, 512]),
 torch.Size([11]),
 torch.Size([134, 77]),
 torch.Size([77]))

In [None]:
import numpy as np
(sample["pixel_values"].max(), sample["pixel_values"].min()), np.unique(sample["pixel_mask"]), np.unique(sample["mask_labels"]), np.unique(sample["class_labels"]), np.unique(sample["text_inputs"]), np.unique(sample["task_inputs"])

((tensor(2.6400), tensor(-2.1008)),
 array([1]),
 array([0., 1.], dtype=float32),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 11, 18]),
 array([    0,   320,   593,  1125,  1550,  1615,  1691,  2292,  2533,
         2840,  3231,  4440,  4629,  9629, 10297, 11652, 16451, 29119,
        49406, 49407]),
 array([    0,   518,   533,  1550, 10549, 29119, 49406, 49407]))

In [None]:
# Training settings
device = "cuda" if torch.cuda.is_available() else "cpu"

# Mean-IoU metric from the `evaluate` hub
metric = evaluate.load("mean_iou")

training_args = TrainingArguments(
    output_dir="rail19_semantic_seg",
    # optimisation
    learning_rate=5e-5,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # evaluate / checkpoint / log every couple of steps (smoke-test cadence)
    evaluation_strategy="steps",
    eval_steps=2,
    eval_accumulation_steps=5,
    save_strategy="steps",
    save_steps=2,
    save_total_limit=3,
    load_best_model_at_end=True,
    logging_steps=1,
    # the on-the-fly transforms read the raw dataset columns, so keep them
    remove_unused_columns=False,
    # fp16=True,
    # push_to_hub=True,
    # hub_model_id=hub_model_id,
    # hub_strategy="end",
)

#Metrics
def compute_metrics(eval_pred):
    """Compute mean-IoU style metrics from raw evaluation outputs.

    Args:
        eval_pred: a pair ``(logits, labels)``. ``logits`` is a NumPy array that
            is upsampled over its last two dimensions and arg-maxed over
            dimension 1; ``labels`` is an array whose last two dimensions give
            the target resolution.
            NOTE(review): this assumes a single (N, C, h, w) logits array and
            (N, H, W) label maps. Confirm that the model's evaluation outputs
            have this layout before passing this function to the Trainer.

    Returns:
        dict: the aggregate values returned by the metric, plus one
        ``accuracy_<label>`` and one ``iou_<label>`` entry per class in
        ``id2label``.
    """
    logits, labels = eval_pred

    with torch.no_grad():
        # scale the logits to the size of the label, then pick the best class
        upsampled_logits = nn.functional.interpolate(
            torch.from_numpy(logits),
            size=labels.shape[-2:],
            mode="bilinear",
            align_corners=False,
        )
        pred_labels = upsampled_logits.argmax(dim=1).detach().cpu().numpy()

    # `_compute` is the metric's private entry point, kept from the original code.
    # The label-reduction flag lives on the wrapped image processor, not on the
    # processor itself.
    metrics = metric._compute(
        predictions=pred_labels,
        references=labels,
        num_labels=len(id2label),
        ignore_index=255,
        reduce_labels=coco_swin_processor.image_processor.do_reduce_labels,
    )

    # add per category metrics as individual key-value pairs
    per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
    per_category_iou = metrics.pop("per_category_iou").tolist()

    metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
    metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})

    return metrics

# Stock Trainer wired to the model, both splits and the custom collator.
trainer = Trainer(
    args=training_args,
    model=oneformer_coco_swin,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # compute_metrics=compute_metrics,
)

In [None]:
#Train
trainer.train()

RuntimeError: output with shape [] doesn't match the broadcast shape [1]

In [None]:
#Override Trainer
class MyTrainer(Trainer):
    """Trainer with step-level debug prints that always reports a 0-dim loss.

    The model used in this notebook returns its loss with shape ``[1]``. The
    training loop adds each step loss in place to a running total, and with a
    ``[1]``-shaped step loss that addition fails with
    ``RuntimeError: output with shape [] doesn't match the broadcast shape [1]``.
    `training_step` therefore reduces the loss to a scalar before returning it.
    """

    def training_step(self, model, inputs, *args, **kwargs):
        """Run one optimisation step and return the detached loss as a scalar.

        Any extra positional / keyword arguments are forwarded unchanged, so the
        override keeps working if the parent method is called with more than
        ``(model, inputs)``.
        """
        print("inside training step")
        print(inputs.keys())
        print(inputs["pixel_values"].shape)

        loss = super().training_step(model, inputs, *args, **kwargs)

        print(loss, loss.shape) #tensor([110.4585])
        # Collapse the [1]-shaped loss to shape [] so it can be accumulated in
        # place by the training loop. The value is unchanged for one element.
        return loss.mean()

    def _inner_training_loop(self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None):
        """Announce that the subclass is in use, then defer to the parent loop."""
        print("Inside mine")
        return super()._inner_training_loop(batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)



# Minimal, training-only arguments for the debugging run (this rebinds the
# `training_args` name used by the earlier Trainer cell).
training_args = TrainingArguments(
    output_dir="rail19_semantic_seg",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    logging_steps=1,
    # the on-the-fly transforms read the raw dataset columns, so keep them
    remove_unused_columns=False,
)

# Trainer subclass with the extra step-level prints; no evaluation split here.
mytrainer = MyTrainer(
    args=training_args,
    model=oneformer_coco_swin,
    data_collator=collate_fn,
    train_dataset=train_dataset,
)

# Launch training
mytrainer.train()

Inside mine
inside training step
dict_keys(['pixel_values', 'pixel_mask', 'class_labels', 'mask_labels', 'text_inputs', 'task_inputs'])
torch.Size([2, 3, 512, 512])
tensor([104.6475]) torch.Size([1])


RuntimeError: output with shape [] doesn't match the broadcast shape [1]