In [None]:
#@title License
# Copyright 2022 The Pix2Seq Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [1]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Pix2seq: A Language Modeling Framework for Object Detection
<a href="https://colab.research.google.com/github/google-research/pix2seq/blob/master/colabs/pix2seq_finetuning_object_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


This colab presents a demo of fine-tuning Pix2seq for object detection. The table below summarizes the models pretrained on the Objects365 dataset and gives their storage locations; any of them can be used as the initialization for fine-tuning.

Backbone       | Total params (M) | Image size | Google cloud storage location
-------------: | ---------------: | ---------: | -----------:
ResNet-50      | 36.6             | 640x640    | [gs://pix2seq/obj365_pretrain/resnet_640x640_b256_s400k](https://console.cloud.google.com/storage/browser/pix2seq/obj365_pretrain/resnet_640x640_b256_s400k)
ResNet-50 (C4) | 84.7             | 640x640    | [gs://pix2seq/obj365_pretrain/resnetc_640x640_b256_s400k](https://console.cloud.google.com/storage/browser/pix2seq/obj365_pretrain/resnetc_640x640_b256_s400k)
ViT-B          | 115.2            | 640x640    | [gs://pix2seq/obj365_pretrain/vit_b_640x640_b256_s400k](https://console.cloud.google.com/storage/browser/pix2seq/obj365_pretrain/vit_b_640x640_b256_s400k)
ViT-L          | 341.2            | 640x640    | [gs://pix2seq/obj365_pretrain/vit_l_640x640_b256_s400k](https://console.cloud.google.com/storage/browser/pix2seq/obj365_pretrain/vit_l_640x640_b256_s400k)


In [2]:
# Change into the project folder on the mounted Drive; the next cell records
# this directory via os.getcwd().
%cd /content/drive/MyDrive/Matority

/content/drive/MyDrive/Matority


In [3]:
# pip installs.
!pip install ml_collections
!pip install tensorflow-addons
# !git clone https://github.com/google/pix2seq.git

import os, sys

sys.path.append(os.getcwd())
root_dir = os.getcwd()
sys.path.insert(1, 'pix2seq')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ml_collections
  Downloading ml_collections-0.1.1.tar.gz (77 kB)
[K     |████████████████████████████████| 77 kB 5.1 MB/s 
Building wheels for collected packages: ml-collections
  Building wheel for ml-collections (setup.py) ... [?25l[?25hdone
  Created wheel for ml-collections: filename=ml_collections-0.1.1-py3-none-any.whl size=94524 sha256=e977e3928173b9cf06f46e3f22fad6938e575415a69d9a3648c9e39704d5b96c
  Stored in directory: /root/.cache/pip/wheels/b7/da/64/33c926a1b10ff19791081b705879561b715a8341a856a3bbd2
Successfully built ml-collections
Installing collected packages: ml-collections
Successfully installed ml-collections-0.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.18.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |███

In [4]:
# Enter the pix2seq checkout; the following cell imports `utils`, `data`,
# `models` and `tasks` as top-level modules.
%cd /content/drive/MyDrive/Matority/pix2seq

/content/drive/MyDrive/Matority/pix2seq


In [5]:

import os, sys
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from PIL import Image
import requests
import json

import ml_collections
import utils
from data.dataset import Dataset
from models import model as model_lib
from models import ar_model
from tasks import task as task_lib
from tasks import object_detection

# Define a Dataset class to use for finetuning.
class VocDataset(Dataset):
  """Dataset used for fine-tuning; parses serialized detection examples."""

  def extract(self, example, training):
    """Extracts needed features & annotations into flat dictionaries.

    Note: be conscious about 0 in label, which should probably be reserved for
       special use (such as padding).

    Args:
      example: serialized `tf.train.Example` proto, parsed here with
        `tf.io.parse_single_example`.
      training: `bool` of training vs eval mode (not used by this
        implementation).

    Returns:
      A `(features, labels)` tuple of dicts. `features` holds the decoded
      float32 `image` and a dummy `image/id`; `labels` holds `label`, `bbox`
      (columns ordered ymin, xmin, ymax, xmax), `area` and `is_crowd`.
    """
    feature_description = {
        'image/encoded': tf.io.VarLenFeature(tf.string),
        'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
        'image/object/class/label': tf.io.VarLenFeature(tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_description)

    # 'image/encoded' is parsed as a VarLen feature; the JPEG bytes are its
    # first (index 0) entry.
    encoded_img = tf.sparse.to_dense(parsed['image/encoded'])
    decoded_img = tf.io.decode_jpeg(encoded_img[0], channels=3)

    features = {
        'image': tf.image.convert_image_dtype(decoded_img, tf.float32),
        'image/id': 0,  # dummy int.
    }

    # The following labels are needed by the object detection task.
    label = tf.sparse.to_dense(parsed['image/object/class/label']) + 1  # 0 is reserved for padding.
    xmax = tf.sparse.to_dense(parsed['image/object/bbox/xmax'])
    xmin = tf.sparse.to_dense(parsed['image/object/bbox/xmin'])
    ymax = tf.sparse.to_dense(parsed['image/object/bbox/ymax'])
    ymin = tf.sparse.to_dense(parsed['image/object/bbox/ymin'])
    bbox = tf.stack([ymin, xmin, ymax, xmax], axis=1)

    # Box areas computed with plain TF ops. This replaces a Python loop wrapped
    # in tf.numpy_function, which is not serialized with the graph and forces
    # every example through the Python interpreter.
    areas = (ymax - ymin) * (xmax - xmin)
    # Kept from the original: the reshape fails if the number of boxes does not
    # match the number of labels.
    areas = tf.reshape(areas, [tf.shape(label)[0]])

    labels = {
        'label': label,
        'bbox': bbox,
        'area': areas,
        'is_crowd': tf.zeros_like(label, tf.bool),
    }

    return features, labels



In [13]:

# Load config for the pretrained model.
pretrained_model_dir = 'gs://pix2seq/obj365_pretrain/resnetc_640x640_b256_s400k'
with tf.io.gfile.GFile(os.path.join(pretrained_model_dir, 'config.json'), 'r') as f:
  config = ml_collections.ConfigDict(json.loads(f.read()))

# Update config for finetuning (some configs were missing at initial pretraining time).
config.dataset.tfds_name = 'voc'
config.dataset.batch_duplicates = 1
config.dataset.coco_annotations_dir = '/content/drive/MyDrive/Matority/data'
config.dataset.train_filename = 'train_anno.json'
config.dataset.val_filename = 'test_anno.json'
config.training = True
# Assignment, not comparison: the original `==` evaluated to a bool that was
# discarded, so the task name was never actually set here.
config.task.name = 'object_detection'
config.task.vocab_id = 10  # object_detection task vocab id.
config.task.weight = 1.
config.task.max_instances_per_image_test = 10
config.tasks = [config.task]
config.train.batch_size = 2
config.model.name = 'encoder_ar_decoder'  # name of model and trainer in registries.
config.model.pretrained_ckpt = pretrained_model_dir
config.optimization.learning_rate = 1e-4
config.optimization.warmup_steps = 10

# Use a smaller image_size to speed up finetuning here.
# You can use any image_size of choice.
config.model.image_size = 600
config.task.image_size = 600


In [14]:
# Perform training for 1000 steps. This takes about ~20 minutes on a regular Colab GPU.
# Requires `config` and `VocDataset` from the cells above.
train_steps = 1000
use_tpu = False  # Set this accordingly.
steps_per_loop = 10  # Training steps executed per call of train_multiple_steps.
tf.config.run_functions_eagerly(False)

strategy = utils.build_strategy(use_tpu=use_tpu, master='')

# The following snippets are mostly copied and simplified from run.py.
with strategy.scope():
  # Get dataset.

  dataset = VocDataset(config)
  
  
  # Count the training examples by iterating once over the whole TFRecord file;
  # the count is passed to the trainer below.
  tmp_dataset = tf.data.TFRecordDataset("/content/drive/MyDrive/Matority/data/color_fashion_tfrec_train")
  num_train_examples = 0
  for i in tmp_dataset:
    num_train_examples += 1
  
  # Get task.
  task = task_lib.TaskRegistry.lookup(config.task.name)(config)
  tasks = [task]

  # Create tf.data.Dataset.
  ds = dataset.pipeline(
      process_single_example=task.preprocess_single,
      global_batch_size=config.train.batch_size,
      training=True)
  datasets = [ds]
  
  print("Data Pipeline Created!")
  
  # Setup training elements.
  trainer = model_lib.TrainerRegistry.lookup(config.model.name)(
      config, model_dir='model_dir',
      num_train_examples=num_train_examples, train_steps=train_steps)
  data_iterators = [iter(dataset) for dataset in datasets]

  print("Data Iterators Created!")

  @tf.function
  def train_multiple_steps(data_iterators, tasks):
    """Runs `steps_per_loop` training steps inside one traced function call."""
    # `ts=tasks` binds the task list as a default argument of the lambda.
    train_step = lambda xs, ts=tasks: trainer.train_step(xs, ts, strategy)
    for _ in tf.range(steps_per_loop):  # using tf.range prevents unroll.
      with tf.name_scope(''):  # prevent `while_` prefix for variable names.
        strategy.run(train_step, ([next(it) for it in data_iterators],))

  # The optimizer's iteration counter serves as the global step; training
  # continues until it reaches `train_steps`.
  global_step = trainer.optimizer.iterations
  cur_step = global_step.numpy()
  while cur_step < train_steps:
    train_multiple_steps(data_iterators, tasks)
    cur_step = global_step.numpy()
    print(f"Done training {cur_step} steps.")

loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
Pre-extract
Post-extract
Data Pipeline Created!
Data Iterators Created!
Forward pass started
[(<tf.Tensor 'IteratorGetNext:0' shape=(2, 600, 600, 3) dtype=float32>, <tf.Tensor 'strided_slice_1:0' shape=(2, 500) dtype=int64>, <tf.Tensor 'strided_slice_2:0' shape=(2, 500) dtype=int64>, <tf.Tensor 'SelectV2_6:0' shape=(2, 500) dtype=float32>)]
before logits
after logits
Forward pass finished
Optimiziation started
Optimiziation finished
Forward pass started
[(<tf.Tensor 'IteratorGetNext:0' shape=(2, 600, 600, 3) dtype=float32>, <tf.Tensor 'strided_slice_1:0' shape=(2, 500) dtype=int64>, <tf.Tensor 'strided_slice_2:0' shape=(2, 500) dtype=int64>, <tf.Tensor 'SelectV2_6:0' shape=(2, 500) dtype=float32>)]
before logits
after logits
Forward pass finished
Optimiziation started
Optimiziation finished
Done training 10 steps.
Done training 20 steps.
Done training 30 steps.
Done training 40 steps.
Done training 50 st

In [15]:
# Build the evaluation input pipeline (training=False), reusing the task
# created in the training cell and the training batch size.
eval_dataset = VocDataset(config)

# Create tf.data.Dataset.
eval_ds = eval_dataset.pipeline(
    process_single_example=task.preprocess_single,
    global_batch_size=config.train.batch_size,
    training=False)

Pre-extract
Post-extract


In [16]:
# Run inference over `eval_ds`, one batch per step, until the iterator is
# exhausted (or `eval_steps` is reached when it is non-zero). Collects the
# per-step records and rendered visualizations.

# Set category names in task for visualization.
# The category names for COCO are picked up from the coco annotation files. For
# other datasets, they can be added manually in the code. If they are missing,
# the visualization will not contain category names for predicted boxes, but
# no other things will be impacted.
# category_names = [
#     'Aeroplane', 'Bicycle', 'Bird', 'Boat', 'Bottle', 'Bus', 'Car', 'Cat',
#     'Chair', 'Cow', 'Dining table', 'Dog', 'Horse', 'Motorbike', 'People',
#     'Potted plant', 'Sheep', 'Sofa', 'Train', 'TV/monitor']
category_names = ['none', 'sunglass', 'hat', 'jacket', 'shirt', 'pants', 'shorts',
        'skirt', 'dress', 'bag', 'shoe']
# Category ids are 1-based here (list index + 1).
# NOTE(review): 'none' therefore gets id 1 -- confirm this matches the label
# ids produced by the dataset.
task._category_names = {
    i + 1 : {'name': name} for i, name in enumerate(category_names)}

def single_step(examples):
  """Preprocesses a batch, runs model inference and the task's TPU-side postprocessing."""
  preprocessed_outputs = task.preprocess_batched(examples, training=False)
  infer_outputs = task.infer(trainer.model, preprocessed_outputs)
  return task.postprocess_tpu(*infer_outputs)

records = []
visuals = []
# 0 disables the step limit (the `eval_steps and ...` check below is falsy),
# so the loop runs until the iterator raises OutOfRangeError.
eval_steps = 0 #10


with strategy.scope():
  @tf.function
  def run_single_step(iterator):
    """Pulls the next batch, runs `single_step` under the strategy and gathers the outputs."""
    examples = next(iterator)
    outputs = strategy.run(single_step, (examples,))
    if outputs is not None:
      outputs = [strategy.gather(t, axis=0) for t in outputs]
    return outputs


  iterator = iter(eval_ds)
  cur_step = 0
  while True:
      print("eval step:", cur_step)
      if eval_steps and cur_step >= eval_steps:
        break
      try:
        per_step_outputs = run_single_step(iterator)
        # train_step is a fixed placeholder value here rather than the actual
        # global step.
        vis = task.postprocess_cpu(
            per_step_outputs,
            train_step=100, # global_step.numpy(),
            eval_step=cur_step,
            ret_results=True)
        
        records.append(vis['records'])
        visuals.append(vis['pred'])
        cur_step += 1
      except tf.errors.OutOfRangeError:
        print('Break due to OutOfRangeError exception')
        break


  # per_step_outputs = run_single_step(iterator)
  # vis = task.postprocess_cpu(
  #     per_step_outputs,
  #     train_step=50,
  #     eval_step=0,
  #     ret_results=True)
  # records.append(vis['records'])


# Convert every record value to a NumPy array in place (the records are
# pickled in a later cell).
for record in records:
  for key in record.keys():
    record[key] = record[key].numpy()
  

eval step: 0
eval step: 1
eval step: 2
eval step: 3
eval step: 4
eval step: 5
eval step: 6
eval step: 7
eval step: 8
eval step: 9
eval step: 10
eval step: 11
eval step: 12
eval step: 13
eval step: 14
eval step: 15
eval step: 16
eval step: 17
eval step: 18
eval step: 19
eval step: 20
eval step: 21
eval step: 22
eval step: 23
eval step: 24
eval step: 25
eval step: 26
eval step: 27
eval step: 28
eval step: 29
eval step: 30
eval step: 31
eval step: 32
eval step: 33
eval step: 34
eval step: 35
eval step: 36
eval step: 37
eval step: 38
eval step: 39
eval step: 40
eval step: 41
eval step: 42
eval step: 43
eval step: 44
eval step: 45
eval step: 46
eval step: 47
eval step: 48
eval step: 49
eval step: 50
eval step: 51
eval step: 52
eval step: 53
eval step: 54
eval step: 55
eval step: 56
eval step: 57
eval step: 58
eval step: 59
eval step: 60
eval step: 61
eval step: 62
eval step: 63
eval step: 64
eval step: 65
eval step: 66
eval step: 67
eval step: 68
eval step: 69
eval step: 70
eval step: 71
ev

In [17]:
# Number of evaluation steps for which records were collected.
print(len(records))

268


In [18]:
import os
import pickle

# Persist the evaluation records so metrics can be recomputed without
# re-running inference.
out_fname = 'resnet50c_eval_10-24-15-07'
# open() does not create missing parent directories; without this the cell
# fails with FileNotFoundError on a fresh checkout.
os.makedirs('outputs', exist_ok=True)
with open('outputs/'+out_fname+'.pickle', 'wb') as f:
    pickle.dump(records, f)
  

In [24]:
import pickle

# Pick which saved evaluation run to score; switch the commented line to load
# the other run. Only unpickle files you produced yourself.
#out_fname = 'resnet50c_eval_10-24-15-07'
out_fname = 'resnet50_eval_10-24-12-59'
records_path = f'outputs/{out_fname}.pickle'
with open(records_path, 'rb') as records_file:
    loaded_recs = pickle.load(records_file)


In [None]:
# mapcalc provides calculate_map / calculate_map_range used in the next cell.
!pip install mapcalc

In [25]:
from mapcalc import calculate_map, calculate_map_range

# Running sums of the per-image metrics; divided by `n_samples` at the end.
ap = 0
ap_50 = 0
ap_75 = 0
n_samples = 0

for rec in loaded_recs:
  # Iterate over the record's own batch dimension instead of
  # config.train.batch_size: the pickle may come from a run with a different
  # batch size, and a final partial batch would otherwise raise IndexError
  # (or, with a larger saved batch, images would be silently skipped).
  for i in range(len(rec['gt_classes'])):
    # Class id 0 is padding, so the non-zero count is the number of real boxes.
    n_boxes = np.count_nonzero(rec['gt_classes'][i])
    ground_truth = {
        'boxes': rec['gt_bboxes'][i][:n_boxes],
        'labels': rec['gt_classes'][i][:n_boxes]}
    # NOTE(review): predictions are truncated to the number of ground-truth
    # boxes, which discards any extra predictions -- confirm this is intended.
    result_dict = {
        'boxes': rec['pred_bboxes'][i][:n_boxes],
        'labels': rec['pred_classes'][i][:n_boxes],
        'scores': rec['scores'][i][:n_boxes]}

    n_samples += 1

    ap += calculate_map_range(ground_truth, result_dict, 0.5, 0.95, 0.05)
    ap_50 += calculate_map(ground_truth, result_dict, 0.5)
    ap_75 += calculate_map(ground_truth, result_dict, 0.75)

if n_samples == 0:
  raise ValueError('No evaluation samples found in loaded_recs; cannot average AP.')

print('Model:', out_fname)
print("AP: {:.4f}, AP50: {:.4f}, AP75: {:.4f}".format(ap / n_samples, ap_50 / n_samples, ap_75 / n_samples))

Model: resnet50_eval_10-24-12-59
AP: 0.5616, AP50: 0.7944, AP75: 0.6060


In [None]:
# summary_writer = tf.summary.create_file_writer('model_dir/')
# eval_tag = config.eval.tag

# cur_step = global_step.numpy()
# result = task.evaluate(summary_writer, cur_step, eval_tag)
# result.update({'global_step': cur_step})
# print(result)

In [35]:
# Visualization.
# Show the predictions of the first evaluation step: concatenate the per-image
# visualizations of that batch along axis 0 and render them as one image.
vis = visuals[0]
im = tf.concat([vis[i] for i in range(config.train.batch_size)], 0)
# Values are scaled by 255 and cast to uint8 for PIL.
Image.fromarray(np.uint8(im.numpy() * 255))

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Return to the pix2seq checkout before running the git commands below.
%cd /content/drive/MyDrive/Matority/pix2seq

/content/drive/MyDrive/Matority/pix2seq


In [None]:
# Point `main` at `master` and force-push it.
# WARNING: `-f` overwrites the local `main` branch and the remote history.
!git checkout master   
!git branch main master -f    
!git checkout main  
!git push origin main -f 

M	colabs/Copy of Pix2Seq Finetuning Object Detection.ipynb
M	data/__pycache__/dataset.cpython-37.pyc
M	models/__pycache__/ar_model.cpython-37.pyc
M	tasks/__pycache__/object_detection.cpython-37.pyc
Already on 'master'
M	colabs/Copy of Pix2Seq Finetuning Object Detection.ipynb
M	data/__pycache__/dataset.cpython-37.pyc
M	models/__pycache__/ar_model.cpython-37.pyc
M	tasks/__pycache__/object_detection.cpython-37.pyc
Switched to branch 'main'
Total 0 (delta 0), reused 0 (delta 0)
To https://github.com/mehrdadsaberi/pix2seq-M.git
 + 6d45f77...0db54da main -> main (forced update)


In [None]:
# Initialize a git repository in the current directory.
!git init

Initialized empty Git repository in /content/drive/MyDrive/Matority/pix2seq/.git/


In [None]:
!git remote set-url origin https://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@github.com/mehrdadsaberi/pix2seq-M.git

In [None]:
# Commit the staged changes.
!git commit -m "outputs fixed"

fatal: Unable to create '/content/drive/MyDrive/Matority/pix2seq/.git/index.lock': File exists.

Another git process seems to be running in this repository, e.g.
an editor opened by 'git commit'. Please make sure all processes
are terminated then try again. If it still fails, a git process
may have crashed in this repository earlier:
remove the file manually to continue.


In [None]:
# Push the local `main` branch to the `origin` remote.
!git push origin main

Counting objects: 13, done.
Delta compression using up to 2 threads.
Compressing objects:   7% (1/13)   Compressing objects:  15% (2/13)   Compressing objects:  23% (3/13)   Compressing objects:  30% (4/13)   Compressing objects:  38% (5/13)   Compressing objects:  46% (6/13)   Compressing objects:  53% (7/13)   Compressing objects:  61% (8/13)   Compressing objects:  69% (9/13)   Compressing objects:  76% (10/13)   Compressing objects:  84% (11/13)   Compressing objects:  92% (12/13)   Compressing objects: 100% (13/13)   Compressing objects: 100% (13/13), done.
Writing objects:   7% (1/13)   Writing objects:  15% (2/13)   Writing objects:  23% (3/13)   Writing objects:  30% (4/13)   Writing objects:  38% (5/13)   Writing objects:  46% (6/13)   Writing objects:  53% (7/13)   Writing objects:  61% (8/13)   Writing objects:  69% (9/13)   Writing objects:  76% (10/13)   Writing objects:  84% (11/13)   Writing objects:  92% (12/13)   Writing objects: 100% (13/13)  

In [None]:
# Set the global git identity used for commits made from this runtime.
# NOTE(review): this stores a personal e-mail address in the notebook.
!git config --global user.email "merhdads@gmail.com"
!git config --global user.name "mehrdadsaberi"

In [None]:
!rm -rf ~

In [None]:
# Stage all changes under the current directory (verbose).
!git add . -v

fatal: not a git repository (or any parent up to mount point /content)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


In [None]:
# Human-readable size of every entry in the current directory, plus a total.
!du -shc *

105K	architectures
3.3M	colabs
22K	configs
1.5K	CONTRIBUTING.md
81K	data
12K	LICENSE
22K	metrics
55K	models
12M	pix2seq.gif
237K	pix2seq.png
16K	__pycache__
8.0K	README.md
1.5K	registry.py
512	requirements.txt
9.5K	run.py
7.0K	sample_run.py
215K	tasks
12K	utils.py
2.0K	vocab.py
16M	total
