In [None]:
# Upload all local Weights & Biases run folders, including ones already marked as synced.
!wandb sync --sync-all --include-synced

# Prerequisites

In [None]:
# Root folder of the project data; the image archive, metadata, splits and
# checkpoints used below are all addressed relative to it.
# Alternative location (Google Drive mount path):
# mex_path = '/content/drive/MyDrive/MEX'
mex_path = '/app/MEX'

In [None]:
## Categories sample scrape (kept for reference only)
# {1: 'title', 2: 'main-photo', 3: 'breadcrumb', 4: 'price', 5: 'image-slider', 6: 'description'}

## Categories CoVA: integer class id -> label name
categories = {0: 'price', 1: 'title', 2: 'image'}

In [None]:
# id2label = {1: 'title', 2: 'main-photo', 3: 'breadcrumb', 4: 'price', 5: 'image-slider', 6: 'description'}
# Class id -> name mapping passed to the models; the reverse lookup is derived from it.
id2label = categories
label2id = dict((label_name, class_id) for class_id, label_name in id2label.items())

In [None]:
# Show the GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
# Install the notebook's dependencies into the kernel environment.
# transformers comes from a fork branch (see the git URL below).
# NOTE(review): every other package is unpinned, so a fresh install may pull
# versions that differ from the ones this notebook was developed with.
%pip install datasets[vision] \
   git+https://github.com/CreatlV/transformers.git@fix/deformable-detr-post-processing-hard-coded-value \
   timm evaluate matplotlib \
   torchmetrics[detection] ipywidgets wandb \
   python-dotenv scipy


In [None]:
import datasets
import torch
from transformers import DetrImageProcessor, DetrForObjectDetection
from transformers import ConditionalDetrImageProcessor, ConditionalDetrForObjectDetection
from transformers import DeformableDetrImageProcessor, DeformableDetrForObjectDetection
from huggingface_hub import notebook_login
from transformers import TrainingArguments

import random

import datetime
from evaluate import load
import os
import wandb
from dotenv import load_dotenv

from display_utils import show_image_bbox_dataset, plot_results_all, detr_plot_results1
import dataset_utils
from evaluation import run_evaluation_cond
from CustomTrainer import CustomCDetrTrainer, get_collate_fn
from torch.utils.data import DataLoader


In [None]:
# Re-import edited modules automatically before each cell runs, so changes to the
# local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Load variables from a .env file into the process environment
# (HUGGINGFACE_TOKEN is read from the environment further down).
load_dotenv()

# Data preparation

### CoVA - Load from source

In [None]:
%%capture 
# Prevent output: the unzip listing of this cell is captured instead of printed.

# Set the path to the zip file
zip_path = mex_path + '/imgs.zip'

# Set the path to the folder where the contents of the zip file will be extracted
# NOTE(review): this is an absolute /content path while mex_path points to /app — confirm it exists in the current runtime.
extract_path = '/content/images'

# Unzip the file
!unzip "{zip_path}" -d "{extract_path}"

In [None]:
metadata_path = mex_path + '/covametadata.jsonl' 
data_path = extract_path + '/imgs' + '/metadata.jsonl'

print(metadata_path, data_path)

# Copy metadata file
! cp {metadata_path} {data_path}

In [None]:
# Load from the folder the archive was extracted to. The former relative path
# "./images/imgs" only matched `extract_path` when the working directory was /content.
dataset = datasets.load_dataset("imagefolder", data_dir=extract_path + "/imgs")

In [None]:
## Filter out data points without annotations

# print("From: ", len(dataset))
# keep = [i for i in range(len(dataset)) if len(dataset[i]["objects"]["bbox"]) > 0]
# dataset = dataset.select(keep)
# print("To: ", len(dataset))

In [None]:
## Make sure it contains the features: ['image', 'image_id', 'height', 'width', 'objects', 'annotations']
dataset

In [None]:
# Apply dataset_utils.transform_rgb to the dataset in batches
# (presumably converts the images to RGB — confirm in dataset_utils).
dataset = dataset.map(dataset_utils.transform_rgb, batched=True)

In [None]:
## Folds 1..5 are available

path = mex_path + "/splits/Fold-1"


def _load_image_ids(file_suffix):
    """Return the set of whitespace-stripped lines of the file at ``path + file_suffix``."""
    with open(path + file_suffix, "r") as handle:
        return {raw_line.strip() for raw_line in handle}


# Read the image_ids from the text files
train_image_ids = _load_image_ids("/train_imgs.txt")
val_image_ids = _load_image_ids("/val_imgs.txt")
test_image_ids = _load_image_ids("/test_imgs.txt")

# Define a function to filter the dataset based on image_ids
def filter_dataset_by_ids(example, image_ids):
    """Tell whether ``example`` belongs to the given collection of image ids.

    Args:
        example: Mapping with an "image_id" entry.
        image_ids: Container of ids to keep.

    Returns:
        True when the example's "image_id" is contained in ``image_ids``.
    """
    current_id = example["image_id"]
    return current_id in image_ids

In [None]:
# `dataset` is a DatasetDict, so filtering it directly would return another
# DatasetDict and the splits would end up nested. Filter the single split
# produced by the image-folder loader ("train") to get plain Dataset objects.
# NOTE(review): the ids read from the split files are strings — confirm the
# "image_id" column holds strings too, otherwise nothing matches.
# NOTE: run this before dataset["train"] is overwritten with the training subset.
all_examples = dataset["train"]
train_dataset = all_examples.filter(lambda x: filter_dataset_by_ids(x, train_image_ids))
val_dataset = all_examples.filter(lambda x: filter_dataset_by_ids(x, val_image_ids))
test_dataset = all_examples.filter(lambda x: filter_dataset_by_ids(x, test_image_ids))

In [None]:
# Store the three subsets under the split names used by the rest of the notebook.
# Note that "train" is overwritten here, so the unfiltered data is no longer
# reachable through `dataset` afterwards.
dataset["train"] = train_dataset
dataset["val"] = val_dataset
dataset["test"] = test_dataset

In [None]:
# Persist the split dataset below the project root so later sessions can reload it.
dataset.save_to_disk(mex_path + "/cova-dataset")

### CoVA - Load dataset from disk

In [None]:
# Reload the dataset from the location `save_to_disk` wrote it to, using the
# configurable project root instead of a hard-coded Google Drive path.
dataset = datasets.load_from_disk(mex_path + "/cova-dataset")

### Load from the Hugging Face Hub

In [None]:
# Read the Hugging Face access token from the environment (None when it is not set)
token = os.environ.get("HUGGINGFACE_TOKEN")

In [None]:
from huggingface_hub import HfApi, HfFolder, login

# Fail early with a clear message instead of handing None to the Hub client.
if not token:
    raise RuntimeError(
        "HUGGINGFACE_TOKEN is not set; add it to the environment or to the .env file."
    )

# Kept so that code relying on these names keeps working.
api = HfApi()
folder = HfFolder()

# `login` stores the token for subsequent Hub calls. It replaces
# `HfApi.set_access_token`, which recent huggingface_hub releases no longer provide.
login(token=token)

In [None]:
# Load the dataset from its Hub repository (alternative to the local loading paths).
dataset = datasets.load_dataset("CreatlV/cova-coco-v2")

In [None]:
# Display `dataset` to check what was loaded.
dataset

### Choose feature extraction

In [None]:
# Image processor for DETR with default settings. Several cells in this section
# assign `feature_extractor`; run only the one matching the model in use.
feature_extractor = DetrImageProcessor()

In [None]:
# Image processor for Conditional DETR with default settings. Several cells in this
# section assign `feature_extractor`; run only the one matching the model in use.
feature_extractor = ConditionalDetrImageProcessor()

In [None]:
# Image processor for Deformable DETR with default settings. Several cells in this
# section assign `feature_extractor`; run only the one matching the model in use.
feature_extractor = DeformableDetrImageProcessor()

### General

In [None]:
# Display `dataset` to confirm the expected splits are present.
dataset

In [None]:
# Visualise a randomly chosen training example with its bounding boxes.
train_split = dataset["train"]
random_index = random.randint(0, len(train_split) - 1)
show_image_bbox_dataset(random_index, train_split, categories)

In [None]:
def transform(example_batch):
    """Turn a raw batch into model inputs using the active ``feature_extractor``.

    Args:
        example_batch: Batch mapping with an "image" list and an "annotations"
            list (one list of annotation dicts per image). When present, the
            "image_id" list is used as a fallback for images without annotations.

    Returns:
        The value returned by ``feature_extractor`` for the batch (called with
        ``return_tensors="pt"``).

    Raises:
        ValueError: If an image has no annotations and the batch carries no
            "image_id" column to take its id from.
    """
    images = example_batch["image"]
    all_annotations = example_batch["annotations"]
    # An image without annotations cannot provide its id through annotations[0]
    # (that lookup raised IndexError); use the batch-level column for it instead.
    fallback_ids = example_batch["image_id"] if "image_id" in example_batch else None

    targets = []
    for position, annotations in enumerate(all_annotations):
        if annotations:
            image_id = annotations[0]["image_id"]
        elif fallback_ids is not None:
            image_id = fallback_ids[position]
        else:
            raise ValueError(
                f"Example {position} has no annotations and the batch has no 'image_id' column."
            )
        targets.append({"image_id": image_id, "annotations": annotations})
    return feature_extractor(images=images, annotations=targets, return_tensors="pt")

In [None]:
# Attach the on-the-fly `transform` to every split; the resulting names are used
# by the training and evaluation cells.
dataset_train, dataset_test, dataset_val = (
    dataset[split_name].with_transform(transform)
    for split_name in ("train", "test", "val")
)

In [None]:
# Display the training split that has the transform attached.
dataset_train

In [None]:
# Display the test split that has the transform attached.
dataset_test

In [None]:
# Display the validation split that has the transform attached.
dataset_val

# Fine-tuning

In [None]:
# Timestamp used as a suffix for the output folder names below.
now = datetime.datetime.now()
date_time_str = f"{now:%Y-%m-%d_%H:%M:%S}"

### Deformable DETR

In [None]:
# Timestamped output folder for a Deformable DETR run, below the project root.
output_path = mex_path + "/deformable-detr-resnet-50_fine_tuned_cova_" + date_time_str

In [None]:
# Load the pretrained Deformable DETR checkpoint with a head sized for the
# notebook's categories and 20 object queries. `ignore_mismatched_sizes=True`
# lets loading proceed where checkpoint and model shapes differ.
deformable_detr_options = dict(
    num_labels=len(categories),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
    num_queries=20,
)
model = DeformableDetrForObjectDetection.from_pretrained(
    "SenseTime/deformable-detr", **deformable_detr_options
)

### Conditional DETR

In [None]:
# Timestamped output folder for a Conditional DETR run, below the project root.
output_path = mex_path + "/conditional-detr-resnet-50_fine_tuned_cova_" + date_time_str

In [None]:
## Use pretrained model
# Start from the pretrained Conditional DETR checkpoint, with a head sized for
# the notebook's categories and 20 object queries.
# NOTE(review): the checkpoint name says resnet-50 while backbone="resnet101" is
# requested — confirm that overriding the backbone here is intentional.
model = ConditionalDetrForObjectDetection.from_pretrained(
    "microsoft/conditional-detr-resnet-50",
    num_labels=len(categories.keys()),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
    num_queries=20,
    backbone="resnet101",
    use_pretrained_backbone=True,
    use_timm_backbone=True,    
)


In [None]:
## Instantiate Conditional DETR with randomly initialized weights for the Transformer,
## but pre-trained weights for the backbone (relies on the config defaults for the backbone)
from transformers import ConditionalDetrConfig

# `ignore_mismatched_sizes` is an argument of `from_pretrained`, not a model
# setting. Passing it to the config only stored a meaningless attribute, so it
# is no longer passed here.
config = ConditionalDetrConfig(
    num_labels=len(categories.keys()),
    id2label=id2label,
    label2id=label2id,
    num_queries=20,
)

model = ConditionalDetrForObjectDetection(config)

### Common

In [None]:
# Training arguments used when fine-tuning a pretrained checkpoint (100 epochs).
fine_tuning_training_args = TrainingArguments(
    output_dir=output_path,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=100,
    logging_steps=50,
    max_grad_norm=0.1,  # Gradient clipping
    weight_decay=10e-4,  # NOTE(review): 10e-4 equals 1e-3 — confirm 1e-4 was not intended
    evaluation_strategy="steps",
    eval_steps=1000,
    # learning_rate=1e-4,
    remove_unused_columns=False,  # keep the raw columns that `transform` reads
)

In [None]:
# Training arguments used when training from randomly initialized weights
# (200 epochs; otherwise the same values as the fine-tuning arguments).

retrain_training_args = TrainingArguments(
    output_dir=output_path,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=200,
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=1000,
    weight_decay=10e-4,  # NOTE(review): 10e-4 equals 1e-3 — confirm 1e-4 was not intended
    max_grad_norm=0.1, # Gradient clipping
    # learning_rate=1e-4,
    remove_unused_columns=False
)

In [None]:
from typing import Literal


# Selects which of the two argument sets is used for this run. A single Literal
# is used because `Literal[...] | Literal[...]` is evaluated at runtime and
# raises a TypeError on Python versions before 3.10.
training_mode: Literal["retrain", "finetune"] = "finetune"

if training_mode == "retrain":
    training_args = retrain_training_args
elif training_mode == "finetune":
    training_args = fine_tuning_training_args
else:
    # Without this branch a typo would silently reuse a stale `training_args`
    # (or fail later with a NameError on a fresh kernel).
    raise ValueError(f"Unknown training_mode: {training_mode!r}")

# Log to Weights & Biases; use the commented line to disable reporting.
# training_args.report_to = []
training_args.report_to = ["wandb"]

print(dataset_train)

# The image processor is handed over through the `tokenizer` argument.
trainer = CustomCDetrTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    tokenizer=feature_extractor,
)

In [None]:
# Start training with the trainer configured above.
trainer.train()

In [None]:
# Interactive Hugging Face login, needed before pushing to the Hub.
notebook_login()

In [None]:
# Upload the current `model` to the Hub.
# NOTE(review): the repository name is hard-coded (Conditional DETR, "40_epochs") —
# adjust it when pushing a different model or training length.
model.push_to_hub("conditional-detr-resnet-50_fine_tuned_cova_v2_40_epochs")

In [None]:
# Upload through the trainer instead of the bare model.
trainer.push_to_hub()

# Model inference and evaluation

In [None]:
### Download folder from google drive

# Local checkpoint used for inference, assembled as
# <mex_path>/<training run folder>/<checkpoint folder>.
# model_checkpoint_inference = output_path + "/checkpoint-10400"
model_checkpoint_inference_path = "/conditional-detr-resnet-50_fine_tuned_cova_2023-04-22_14:57:20"
model_checkpoint = "/checkpoint-31500"
model_checkpoint_inference = mex_path + model_checkpoint_inference_path + model_checkpoint



In [None]:
# Hub repository id derived from the run folder name: its leading "/" joins it
# to the "CreatlV" namespace and every ":" is replaced by ".".
hub_repo_suffix = model_checkpoint_inference_path.replace(":", ".")
huggingface_link = "CreatlV" + hub_repo_suffix
print(huggingface_link)

In [None]:
## COND DETR model_checkpoint
# Overrides the local checkpoint path set earlier with a Hub repository id.
model_checkpoint_inference = "CreatlV/conditional-detr-resnet-50_fine_tuned_cova_v2"

In [None]:
### Conditional DETR
# Load the fine-tuned Conditional DETR weights referenced by `model_checkpoint_inference`.
conditional_detr_options = dict(
    num_labels=len(categories),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
    num_queries=20,
)
model = ConditionalDetrForObjectDetection.from_pretrained(
    model_checkpoint_inference, **conditional_detr_options
)

In [None]:
### DETR on huggingface
# Load a DETR model from the Hub repository named by `huggingface_link`.
# NOTE(review): `huggingface_link` is derived from a conditional-detr run folder —
# confirm the repository really holds plain DETR weights.
detr_options = dict(
    num_labels=len(categories),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
    num_queries=100,
)
model = DetrForObjectDetection.from_pretrained(huggingface_link, **detr_options)

In [None]:
# Display `dataset` before running inference on its test split.
dataset

## Examples from test dataset

In [None]:
test_image_index = random.randint(0, len(dataset_test) - 1)
# test_image_index = 0

# Fetch the sample once: every indexing of `dataset_test` re-runs `transform`.
test_sample = dataset_test[test_image_index]

# Inference only: put the model in evaluation mode and skip building the
# autograd graph. Inputs are moved to the device the model lives on.
model.eval()
with torch.no_grad():
    outputs = model(
        # unsqueeze adds a leading batch dimension (assumes an unbatched sample)
        test_sample["pixel_values"].unsqueeze(-4).to(model.device),
        test_sample["pixel_mask"].unsqueeze(-3).to(model.device),
        return_dict=True,
    )

print(outputs.logits.size())

# rescale bounding boxes: the reversed image `.size`, with a batch dimension added
target_sizes = torch.tensor(
    dataset["test"][test_image_index]["image"].size[::-1]
).unsqueeze(0)

In [None]:
## DETR

# Class probabilities for the single image in the batch.
probas = outputs.logits.softmax(-1)[0]
# A threshold of 0 is passed so that no prediction is filtered out by score.
postprocessed_outputs = feature_extractor.post_process_object_detection(
    outputs, threshold=0, target_sizes=target_sizes
)
first_prediction = postprocessed_outputs[0]
bboxes_scaled = first_prediction["boxes"]
test_image = dataset["test"][test_image_index]["image"]
detr_plot_results1(test_image, probas, bboxes_scaled, model)

In [None]:
## COND DETR

# Post-process with a threshold of 0.0 and plot boxes, labels and scores for the
# first (only) image of the batch.
postprocessed_outputs = feature_extractor.post_process_object_detection(
    outputs, threshold=0.0, target_sizes=target_sizes
)
first_prediction = postprocessed_outputs[0]
plot_results_all(
    dataset["test"][test_image_index]["image"],
    first_prediction["boxes"],
    first_prediction["labels"],
    first_prediction["scores"],
    model=model,
)

In [None]:
# Display the raw (untransformed) first test example.
dataset["test"][0]

In [None]:
# Show the first test example through the dataset visualisation helper.
show_image_bbox_dataset(0, dataset["test"], categories=categories)

## Test upload example

In [None]:
# Load the uploaded example image(s) from a folder.
# NOTE(review): the folder is a hard-coded absolute path — adjust it to the current runtime.
single = datasets.load_dataset("imagefolder", data_dir="/app/single")

In [None]:
# Display the loaded example dataset.
single

In [None]:
# Apply the same dataset_utils.transform_rgb preprocessing as for the main dataset.
single = single.map(dataset_utils.transform_rgb, batched=True)

In [None]:
def transform_test(example_batch):
    """Encode the images of a batch for inference; no annotations are involved.

    Args:
        example_batch: Batch mapping with an "image" entry.

    Returns:
        The value returned by ``feature_extractor`` (called with
        ``return_tensors="pt"``).
    """
    batch_images = example_batch["image"]
    encoded = feature_extractor(images=batch_images, return_tensors="pt")
    return encoded

In [None]:
# Display the raw first example of the uploaded data.
single["train"][0]

In [None]:
# Attach the annotation-free transform so that indexing returns model inputs.
dataset_single = single["train"].with_transform(transform_test)

In [None]:
# Display the transformed first example.
dataset_single[0]

In [None]:
# Fetch the sample once: every indexing of `dataset_single` re-runs `transform_test`.
single_sample = dataset_single[0]

# Inference only: put the model in evaluation mode and skip building the
# autograd graph. Inputs are moved to the device the model lives on.
model.eval()
with torch.no_grad():
    outputs = model(
        # unsqueeze adds a leading batch dimension (assumes an unbatched sample)
        single_sample["pixel_values"].unsqueeze(-4).to(model.device),
        single_sample["pixel_mask"].unsqueeze(-3).to(model.device),
    )

In [None]:
# Rescale the predictions to the size of the uploaded image, keep detections
# scoring above 0.8 and plot them.
single_image = single["train"][0]["image"]
target_sizes = torch.tensor(single_image.size[::-1]).unsqueeze(0)
postprocessed_outputs = feature_extractor.post_process_object_detection(
    outputs, threshold=0.8, target_sizes=target_sizes
)
single_prediction = postprocessed_outputs[0]
plot_results_all(
    single_image,
    single_prediction["boxes"],
    single_prediction["labels"],
    single_prediction["scores"],
    model=model,
)

## mAP calculation

### Custom EVAL - Experiment

In [None]:
## CURRENT DETR EVALUATION CAN ONLY WORK WITH BATCH SIZE 1
## Conditional DETR can work with batch size > 1
# Collate function built from the active image processor.
collate_fn = get_collate_fn(feature_extractor)


# Batch size 6 is for Conditional DETR; switch to the commented line (batch size 1) for DETR.
test_dataloader = DataLoader(dataset_test, collate_fn=collate_fn, batch_size=6)
# test_dataloader = DataLoader(dataset_test, collate_fn=collate_fn, batch_size=1)

### Run the evaluation (DETR and Conditional DETR)

In [None]:
# Evaluate on the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

run_evaluation_cond(model, feature_extractor, test_dataloader, device, plot=False)


### Trainer evaluation

In [None]:
# Timestamped output folder and arguments for an evaluation-only Trainer run.
now = datetime.datetime.now()
date_time_str = f"{now:%Y-%m-%d_%H:%M:%S}"

output_path = "".join(
    [mex_path, "/EVALUATE-conditional-detr-resnet-50_fine_tuned_cova_", date_time_str]
)

evaluation_options = dict(
    output_dir=output_path,
    per_device_eval_batch_size=16,
    remove_unused_columns=False,
    push_to_hub=False,
)
training_args = TrainingArguments(**evaluation_options)

In [None]:
# Disable all logging integrations for the evaluation run.
training_args.report_to = []

# Trainer that only receives an evaluation dataset (the test split); the image
# processor is handed over through the `tokenizer` argument.
trainer = CustomCDetrTrainer(
    model=model,
    args=training_args,
    eval_dataset=dataset_test,
    tokenizer=feature_extractor,
)

In [None]:
# Run the trainer's evaluation on the test split and display the returned metrics.
trainer.evaluate()