In [1]:
from datasets import load_dataset

# Hub identifier of the dataset; the same identifier is reused later in the
# notebook to fetch the id2label.json file.
hf_dataset_identifier = "segments/sidewalk-semantic"

# Load the dataset by its identifier (the recorded output shows it being
# served from the local cache).
ds = load_dataset(hf_dataset_identifier)


  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset parquet (/home/morten/.cache/huggingface/datasets/segments___parquet/segments--sidewalk-semantic-2-1680095ec7b2b03f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 650.78it/s]


In [2]:
# Shuffle once with a fixed seed, then hold out 20% of the data for evaluation.
# train_test_split shuffles again by default, so it needs its own seed:
# without one the train/test membership differs on every fresh run even
# though the preceding shuffle is seeded.
# NOTE: this cell rebinds `ds`. Re-running it without re-running the load cell
# would split the already-reduced "train" split a second time.
ds = ds.shuffle(seed=1)
ds = ds["train"].train_test_split(test_size=0.2, seed=1)
train_ds = ds["train"]
test_ds = ds["test"]


Loading cached shuffled indices for dataset at /home/morten/.cache/huggingface/datasets/segments___parquet/segments--sidewalk-semantic-2-1680095ec7b2b03f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-dc5e003bf815860f.arrow


In [3]:
import json
from huggingface_hub import hf_hub_download

# NOTE(review): repo_id is not used by the download call below, which passes
# the bare dataset identifier together with repo_type="dataset". It is kept
# here in case another cell reads it.
repo_id = f"datasets/{hf_dataset_identifier}"
filename = "id2label.json"

# Fetch the class-id -> class-name mapping file from the dataset repository.
id2label_path = hf_hub_download(
    repo_id=hf_dataset_identifier, filename=filename, repo_type="dataset"
)
# Read through a context manager so the file handle is closed right away
# instead of being left for the garbage collector.
with open(id2label_path, "r", encoding="utf-8") as id2label_file:
    id2label = json.load(id2label_file)

# JSON object keys are strings; convert them to integer class ids and build
# the inverse (name -> id) mapping.
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}

num_labels = len(id2label)

In [4]:
from torchvision.transforms import ColorJitter
from transformers import SegformerFeatureExtractor

# Photometric augmentation used only by the training transform.
jitter = ColorJitter(
    brightness=0.25,
    contrast=0.25,
    saturation=0.25,
    hue=0.1,
)
# Feature extractor with its default settings; shared by both transforms.
feature_extractor = SegformerFeatureExtractor()

def train_transforms(example_batch):
    """Build model inputs for a training batch.

    Every image under ``example_batch['pixel_values']`` is passed through the
    module-level ``jitter`` augmentation; the augmented images and the entries
    under ``example_batch['label']`` are then handed to the module-level
    ``feature_extractor``, whose result is returned unchanged.
    """
    augmented_images = list(map(jitter, example_batch['pixel_values']))
    segmentation_maps = list(example_batch['label'])
    return feature_extractor(augmented_images, segmentation_maps)


def val_transforms(example_batch):
    """Build model inputs for an evaluation batch.

    Same as the training transform but without augmentation: the images under
    ``example_batch['pixel_values']`` and the entries under
    ``example_batch['label']`` go straight to the module-level
    ``feature_extractor``, whose result is returned unchanged.
    """
    plain_images = list(example_batch['pixel_values'])
    segmentation_maps = list(example_batch['label'])
    return feature_extractor(plain_images, segmentation_maps)


# Set transforms: the training split gets train_transforms (with colour
# jitter), the test split gets val_transforms (no jitter).
train_ds.set_transform(train_transforms)
test_ds.set_transform(val_transforms)



In [5]:
from transformers import SegformerForSemanticSegmentation

# Checkpoint to start from. The recorded output shows that its classifier
# weights are not used and that the decode head is newly initialised.
pretrained_model_name = "nvidia/mit-b0"

# Pass both label mappings so the model config carries the dataset's classes.
model = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name, id2label=id2label, label2id=label2id
)


Some weights of the model checkpoint at nvidia/mit-b0 were not used when initializing SegformerForSemanticSegmentation: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing SegformerForSemanticSegmentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SegformerForSemanticSegmentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b0 and are newly initialized: ['decode_head.linear_c.1.proj.bias', 'decode_head.linear_c.3.proj.weight', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.batch_norm.bias', 'decode_head.batch_norm.running_var', 'decode_he

In [6]:
from transformers import TrainingArguments

# Optimisation hyper-parameters.
batch_size = 2
lr = 6e-5
epochs = 50

# NOTE(review): hub_model_id is defined here but is not passed to
# TrainingArguments below.
hub_model_id = "segformer-b0-finetuned-segments-sidewalk-2"

training_args = TrainingArguments(
    output_dir="segformer-b0-finetuned-segments-sidewalk-outputs",
    # Optimisation
    num_train_epochs=epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and checkpointing share the same 20-step cadence.
    evaluation_strategy="steps",
    eval_steps=20,
    eval_accumulation_steps=5,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Log the training loss at every step.
    logging_steps=1,
    hub_strategy="end",
)


  return torch._C._cuda_getDeviceCount() > 0


In [7]:
import torch
from torch import nn
import evaluate

# Load the "mean_iou" metric once at cell level; compute_metrics below reads
# this module-level object.
metric = evaluate.load("mean_iou")

def compute_metrics(eval_pred):
    """Compute mean-IoU metrics for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` must be a NumPy
            array (it is wrapped with ``torch.from_numpy``) whose class axis
            is dim 1; the last two dims of ``labels`` give the spatial size
            the logits are upsampled to.

    Returns:
        The dict produced by ``metric._compute``, with the
        ``per_category_accuracy`` and ``per_category_iou`` arrays replaced by
        one ``accuracy_<label>`` / ``iou_<label>`` entry per class, named via
        ``id2label``.
    """
    logits, labels = eval_pred
    target_size = labels.shape[-2:]
    # Number of samples upsampled at a time. Interpolating the whole
    # evaluation set in one call materialises a full-resolution float tensor
    # for every sample and class at once; working in chunks and keeping only
    # the argmax bounds peak memory and yields the same predictions, because
    # interpolation is independent per sample.
    upsample_chunk = 16

    with torch.no_grad():
        logits_tensor = torch.from_numpy(logits)
        pred_chunks = [
            # scale the logits to the size of the label, then pick the class
            nn.functional.interpolate(
                chunk,
                size=target_size,
                mode="bilinear",
                align_corners=False,
            ).argmax(dim=1)
            for chunk in torch.split(logits_tensor, upsample_chunk, dim=0)
        ]
        pred_labels = torch.cat(pred_chunks, dim=0).detach().cpu().numpy()

    # currently using _compute instead of compute
    # see this issue for more info: https://github.com/huggingface/evaluate/pull/328#issuecomment-1286866576
    metrics = metric._compute(
        predictions=pred_labels,
        references=labels,
        num_labels=len(id2label),
        ignore_index=0,
        reduce_labels=feature_extractor.reduce_labels,
    )

    # add per category metrics as individual key-value pairs
    per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
    per_category_iou = metrics.pop("per_category_iou").tolist()

    metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
    metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})

    return metrics


In [8]:
from transformers import Trainer

# Wire the model, the training arguments, both dataset splits and the metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

In [9]:
# Start fine-tuning with the Trainer configured above. The recorded output
# below ends in a KeyboardInterrupt, i.e. this particular run was stopped early.
trainer.train()


  0%|          | 1/20000 [00:02<12:50:21,  2.31s/it]

{'loss': 3.57, 'learning_rate': 5.9997e-05, 'epoch': 0.0}


  0%|          | 2/20000 [00:04<12:07:53,  2.18s/it]

{'loss': 3.525, 'learning_rate': 5.9994e-05, 'epoch': 0.01}


  0%|          | 3/20000 [00:06<11:17:57,  2.03s/it]

{'loss': 3.5377, 'learning_rate': 5.9991e-05, 'epoch': 0.01}


  0%|          | 4/20000 [00:08<11:13:18,  2.02s/it]

{'loss': 3.5269, 'learning_rate': 5.9988e-05, 'epoch': 0.01}


  0%|          | 5/20000 [00:10<10:54:58,  1.97s/it]

{'loss': 3.4482, 'learning_rate': 5.9985e-05, 'epoch': 0.01}


  0%|          | 6/20000 [00:12<10:48:47,  1.95s/it]

{'loss': 3.421, 'learning_rate': 5.9982e-05, 'epoch': 0.01}


  0%|          | 7/20000 [00:13<10:44:42,  1.93s/it]

{'loss': 3.3612, 'learning_rate': 5.9979e-05, 'epoch': 0.02}


  0%|          | 8/20000 [00:15<10:53:44,  1.96s/it]

{'loss': 3.2857, 'learning_rate': 5.9976e-05, 'epoch': 0.02}


  0%|          | 9/20000 [00:17<10:51:51,  1.96s/it]

{'loss': 3.2928, 'learning_rate': 5.9973e-05, 'epoch': 0.02}


  0%|          | 10/20000 [00:19<10:47:45,  1.94s/it]

{'loss': 3.2421, 'learning_rate': 5.9970000000000004e-05, 'epoch': 0.03}


  0%|          | 11/20000 [00:21<11:04:15,  1.99s/it]

{'loss': 3.2163, 'learning_rate': 5.9967e-05, 'epoch': 0.03}


  0%|          | 12/20000 [00:23<10:57:42,  1.97s/it]

{'loss': 3.2062, 'learning_rate': 5.9964e-05, 'epoch': 0.03}


  0%|          | 13/20000 [00:25<10:51:20,  1.96s/it]

{'loss': 3.1446, 'learning_rate': 5.9961e-05, 'epoch': 0.03}


  0%|          | 14/20000 [00:27<11:07:00,  2.00s/it]

{'loss': 3.1417, 'learning_rate': 5.9958e-05, 'epoch': 0.04}


  0%|          | 15/20000 [00:30<11:24:50,  2.06s/it]

{'loss': 3.1061, 'learning_rate': 5.9955e-05, 'epoch': 0.04}


  0%|          | 16/20000 [00:32<11:13:22,  2.02s/it]

{'loss': 3.1773, 'learning_rate': 5.9952e-05, 'epoch': 0.04}


  0%|          | 17/20000 [00:34<11:13:57,  2.02s/it]

{'loss': 3.1807, 'learning_rate': 5.9949e-05, 'epoch': 0.04}


  0%|          | 18/20000 [00:36<11:29:25,  2.07s/it]

{'loss': 2.9615, 'learning_rate': 5.9946e-05, 'epoch': 0.04}


  0%|          | 19/20000 [00:38<11:20:55,  2.04s/it]

{'loss': 2.8567, 'learning_rate': 5.9943e-05, 'epoch': 0.05}


  0%|          | 20/20000 [00:40<11:12:15,  2.02s/it]

{'loss': 3.1587, 'learning_rate': 5.994e-05, 'epoch': 0.05}


  acc = total_area_intersect / total_area_label
                                                     
  0%|          | 20/20000 [01:55<11:12:15,  2.02s/it]

{'eval_loss': 3.213449001312256, 'eval_mean_iou': 0.07917630857357273, 'eval_mean_accuracy': 0.13518397182860822, 'eval_overall_accuracy': 0.5873472740428315, 'eval_accuracy_unlabeled': nan, 'eval_accuracy_flat-road': 0.23938308227483113, 'eval_accuracy_flat-sidewalk': 0.9177352903548885, 'eval_accuracy_flat-crosswalk': 0.0, 'eval_accuracy_flat-cyclinglane': 0.0009647395477901635, 'eval_accuracy_flat-parkingdriveway': 1.0583930569415465e-05, 'eval_accuracy_flat-railtrack': nan, 'eval_accuracy_flat-curb': 6.759104561443592e-05, 'eval_accuracy_human-person': 8.245464170159995e-05, 'eval_accuracy_human-rider': 0.0, 'eval_accuracy_vehicle-car': 0.8790664822815283, 'eval_accuracy_vehicle-truck': 0.0, 'eval_accuracy_vehicle-bus': 0.0, 'eval_accuracy_vehicle-tramtrain': 0.0, 'eval_accuracy_vehicle-motorcycle': 0.0, 'eval_accuracy_vehicle-bicycle': 0.0, 'eval_accuracy_vehicle-caravan': 0.0, 'eval_accuracy_vehicle-cartrailer': 0.0, 'eval_accuracy_construction-building': 0.7988563750162302, 'eva

  0%|          | 21/20000 [01:58<138:05:26, 24.88s/it]

{'loss': 2.8781, 'learning_rate': 5.9937e-05, 'epoch': 0.05}


  0%|          | 22/20000 [02:00<100:24:56, 18.09s/it]

{'loss': 2.8355, 'learning_rate': 5.9934e-05, 'epoch': 0.06}


  0%|          | 23/20000 [02:02<74:05:29, 13.35s/it] 

{'loss': 2.8731, 'learning_rate': 5.9931e-05, 'epoch': 0.06}


  0%|          | 24/20000 [02:05<55:34:17, 10.01s/it]

{'loss': 2.9335, 'learning_rate': 5.9928e-05, 'epoch': 0.06}


  0%|          | 25/20000 [02:07<42:19:51,  7.63s/it]

{'loss': 2.7629, 'learning_rate': 5.9925000000000006e-05, 'epoch': 0.06}


  0%|          | 26/20000 [02:09<33:13:00,  5.99s/it]

{'loss': 2.7309, 'learning_rate': 5.992200000000001e-05, 'epoch': 0.07}


  0%|          | 27/20000 [02:11<27:09:28,  4.90s/it]

{'loss': 2.5991, 'learning_rate': 5.991900000000001e-05, 'epoch': 0.07}


  0%|          | 28/20000 [02:13<22:48:22,  4.11s/it]

{'loss': 2.6223, 'learning_rate': 5.991600000000001e-05, 'epoch': 0.07}


  0%|          | 29/20000 [02:16<19:50:06,  3.58s/it]

{'loss': 2.6574, 'learning_rate': 5.991300000000001e-05, 'epoch': 0.07}


  0%|          | 30/20000 [02:18<17:23:08,  3.13s/it]

{'loss': 2.843, 'learning_rate': 5.991000000000001e-05, 'epoch': 0.07}


  0%|          | 31/20000 [02:20<15:51:32,  2.86s/it]

{'loss': 2.4956, 'learning_rate': 5.9907e-05, 'epoch': 0.08}


  0%|          | 32/20000 [02:22<14:47:07,  2.67s/it]

{'loss': 2.4139, 'learning_rate': 5.9904e-05, 'epoch': 0.08}


  0%|          | 33/20000 [02:24<13:44:58,  2.48s/it]

{'loss': 2.4411, 'learning_rate': 5.9901e-05, 'epoch': 0.08}


  0%|          | 34/20000 [02:27<13:30:05,  2.43s/it]

{'loss': 2.6778, 'learning_rate': 5.9898e-05, 'epoch': 0.09}


  0%|          | 35/20000 [02:29<12:54:35,  2.33s/it]

{'loss': 2.631, 'learning_rate': 5.9895e-05, 'epoch': 0.09}


  0%|          | 36/20000 [02:31<13:04:41,  2.36s/it]

{'loss': 2.5382, 'learning_rate': 5.9892e-05, 'epoch': 0.09}


  0%|          | 37/20000 [02:33<12:52:22,  2.32s/it]

{'loss': 2.4045, 'learning_rate': 5.9889e-05, 'epoch': 0.09}


  0%|          | 38/20000 [02:36<13:51:01,  2.50s/it]

{'loss': 2.4094, 'learning_rate': 5.9886e-05, 'epoch': 0.1}


  0%|          | 39/20000 [02:39<14:18:29,  2.58s/it]

{'loss': 2.43, 'learning_rate': 5.9883e-05, 'epoch': 0.1}


  0%|          | 40/20000 [02:42<14:36:41,  2.64s/it]

{'loss': 2.3817, 'learning_rate': 5.988e-05, 'epoch': 0.1}




KeyboardInterrupt: 