# Hands-on tutorial for DETR

In this notebook, we showcase how to:
* use the pre-trained models that we provide to make predictions
* visualize the attentions of the model to gain insights on the way it sees the images.

## Preliminaries
This section contains the boilerplate necessary for the other sections. Run it first.

In [None]:
import math

from sklearn import preprocessing
from scipy.spatial import distance as dist
import cv2

from PIL import Image
import requests
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

import ipywidgets as widgets
from IPython.display import display, clear_output

import torch
from torch import nn
from torchvision.models import resnet50
import torchvision.transforms as T
torch.set_grad_enabled(False);

In [None]:
# COCO class labels, indexed by the class id predicted by the model
# (weight_score looks labels up with CLASSES[probas[idx].argmax()]).
# NOTE(review): the 'N/A' entries look like placeholders for unused COCO
# category ids so that list positions line up with the model's ids - confirm
# before inserting or removing entries, since that would shift every label.
CLASSES = [
    'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
    'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
    'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush'
]

# Six colors for visualization, each a triple of floats in [0, 1]
# (presumably RGB - confirm against the plotting code that consumes them).
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
          [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]

In [None]:
# Preprocessing applied to every PIL image before it is fed to the model:
# resize, convert to a tensor, then apply the standard per-channel mean/std
# input normalization.
transform = T.Compose([
    T.Resize(size=800),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# for output bounding box post-processing
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to corner format.

    Args:
        x: tensor whose last dimension has size 4 and holds
            (center_x, center_y, width, height). Any number of leading
            dimensions is accepted: a single box ``(4,)``, a list of boxes
            ``(N, 4)`` or a batch ``(B, N, 4)``.

    Returns:
        Tensor of the same shape holding (x_min, y_min, x_max, y_max).
    """
    # Work on the last dimension so the same code handles every input rank.
    x_c, y_c, w, h = x.unbind(-1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=-1)

def rescale_bboxes(out_bbox, size):
    """Convert relative (cx, cy, w, h) boxes to corner boxes in image scale.

    Args:
        out_bbox: tensor of boxes in (center_x, center_y, width, height)
            format; the caller in this notebook passes the model's
            ``pred_boxes``, described there as lying in [0; 1].
        size: ``(width, height)`` pair of the image, e.g. ``PIL.Image.size``.

    Returns:
        Tensor of (x_min, y_min, x_max, y_max) boxes multiplied by the image
        width / height.
    """
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    # Build the scale on the same device as the boxes so that the
    # multiplication also works when the model output lives on a GPU.
    scale = torch.tensor([img_w, img_h, img_w, img_h],
                         dtype=torch.float32, device=b.device)
    return b * scale

# Detection - using a pre-trained model from TorchHub

In this section, we showcase how to load a model from Torch Hub, run it on a custom image, and print the result.
Here we load the simplest model (DETR-R50) for fast inference. You can swap it with any other model from the model zoo.

In [None]:
# Fetch the pre-trained DETR-R50 checkpoint through Torch Hub and put the
# network in evaluation mode (the trailing ';' hides the module repr).
model = torch.hub.load(
    repo_or_dir='facebookresearch/detr',
    model='detr_resnet50',
    pretrained=True,
)
model.eval();

Using cache found in /root/.cache/torch/hub/facebookresearch_detr_master


In [None]:
# Colab-only step: mount Google Drive so that the /content/gdrive/... paths
# used by the following cells (image list, images, result files) exist.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Input: text file that lists one image file name per line.
im_path = '/content/gdrive/MyDrive/Colab Notebooks/persontasks.txt'

# Outputs: boxes of "major" persons, boxes of "minor" persons, and the number
# of detected persons per image.
majorbox = '/content/gdrive/MyDrive/Colab Notebooks/majorbox.txt'
minorbox = '/content/gdrive/MyDrive/Colab Notebooks/minorbox.txt'
countfile = '/content/gdrive/MyDrive/Colab Notebooks/count.txt'

# The handles are module-level on purpose: weight_score() writes to count_f
# and the main loop writes to major_f / minor_f and closes all three.
# 'a+' appends, so re-running the notebook adds to earlier results.
major_f, minor_f, count_f = (
    open(path, 'a+') for path in (majorbox, minorbox, countfile)
)

In [None]:
def weight_score(conv_features,keep,probas,im_name):
  """Score every kept 'person' detection by the size of its attention mask.

  For each kept query whose most likely class is 'person', the decoder
  attention map of that query is min-max scaled and the number of cells with
  a scaled value above 0.5 is used as the score.

  Besides its arguments the function relies on module-level state:
  ``dec_attn_weights`` (decoder attention captured by the main loop),
  ``CLASSES`` and the open file ``count_f``, to which it appends one
  ``"<im_name>,<count>"`` line.

  Args:
    conv_features: backbone hook output; ``conv_features['0'].tensors`` is
      read to get the feature-map height and width.
    keep: boolean tensor selecting the queries kept by the confidence filter.
    probas: per-query class probabilities, indexable by query index.
    im_name: image file name, used only for the ``count_f`` record.

  Returns:
    Tuple ``(scores, idxlist, count)``:
      scores: dict mapping person number (0..count-1) to its cell count.
      idxlist: for each person number, its position among the kept queries
        (i.e. the row to use in tensors that were indexed with ``keep``).
      count: number of kept 'person' detections.
  """
  min_max_scaler = preprocessing.MinMaxScaler()

  # get the feature map shape
  h, w = conv_features['0'].tensors.shape[-2:]

  idxlist=[]
  scores={}

  # kept_pos is the position inside the kept detections, idx the query index.
  for kept_pos, idx in enumerate(keep.nonzero()):
    if CLASSES[probas[idx].argmax()]!='person':
      continue

    attn_map=dec_attn_weights[0, idx].view(h, w)
    # NOTE(review): MinMaxScaler scales every column of the map on its own,
    # not the map as a whole - confirm that per-column scaling is intended.
    scaled = min_max_scaler.fit_transform(attn_map)
    mask = torch.tensor(scaled).gt(0.5)

    # Persons are numbered in detection order; count the cells above 0.5 in
    # one vectorized call instead of a Python loop over all cells.
    scores[len(idxlist)]=int(mask.sum())
    idxlist.append(kept_pos)

  count=len(idxlist)
  print(count)
  count_f.write(im_name+','+str(count)+'\n')

  return scores,idxlist,count

In [None]:
def distance_score(img_name,idxlist,count):
  """Measure how far each person box center is from the image center.

  Relies on the module-level ``bboxes_scaled`` tensor produced by the main
  loop (boxes of the kept detections as (xmin, ymin, xmax, ymax)).

  Args:
    img_name: file name of the image inside the 'person' folder on Drive.
    idxlist: for each person number, the row of its box in ``bboxes_scaled``.
    count: number of persons; only ``idxlist[0:count]`` is used.

  Returns:
    Dict mapping person number (0..count-1) to the Euclidean distance between
    the center of its box and the center of the image.

  Raises:
    OSError: if the image cannot be read.
  """
  # Read the image; it is only needed for its height and width.
  img_path='/content/gdrive/MyDrive/Colab Notebooks/person/'+img_name
  realimg = cv2.imread(img_path)
  if realimg is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise OSError('could not read image: '+img_path)
  h=realimg.shape[0]
  w=realimg.shape[1]
  xA=w/2.0
  yA=h/2.0

  box_score={}

  for j in range(0,count):
    index=idxlist[j]
    (xmin, ymin, xmax, ymax)=bboxes_scaled[index]
    # center of the box
    xB=xmin+(xmax-xmin)/2.0
    yB=ymin+(ymax-ymin)/2.0
    box_score[j]=dist.euclidean((xA, yA), (xB, yB))

  return box_score

  


In [None]:
def _dense_rank_points(best_first, top_points):
  """Give the best item `top_points`; each later distinct value gets one less.

  `best_first` is an iterable of (key, value) pairs ordered from best to
  worst. Neighbouring items with equal values share the same points.
  Returns a dict mapping key -> points.
  """
  points={}
  current=top_points
  previous=None
  for position,(key,value) in enumerate(best_first):
    if position>0 and value!=previous:
      current=current-1
    points[key]=current
    previous=value
  return points


def total_score(weight_score,box_score,idxlist,count):
  """Split the detected persons into 'major' and 'minor' ones.

  Every person gets rank points from two criteria and the two are blended:
    * attention size: the largest ``weight_score`` gets ``count`` points,
    * centrality: the smallest ``box_score`` (distance) gets ``count`` points,
  total = 0.7 * attention points + 0.3 * centrality points.

  Args:
    weight_score: dict person number -> attention cell count; keys are
      expected to be 0..count-1.
    box_score: dict person number -> distance to the image center; same keys.
    idxlist: unused, kept for interface compatibility.
    count: number of persons.

  Returns:
    Tuple ``(major, minor)`` of person-number lists. ``major`` is ordered
    from highest total downwards, ``minor`` holds the rest in ascending
    order. Both are empty when ``count`` is 0.
  """
  sort_weight=sorted(weight_score.items(), key=lambda item:item[1])
  print(sort_weight)
  sort_box=sorted(box_score.items(), key=lambda item:item[1])
  print(sort_box)

  major=[]
  minor=[]

  # Nothing to rank: without this guard the selection below would index an
  # empty list.
  if count<=0:
    print(major)
    print(minor)
    return major,minor

  # Largest attention first / smallest distance first.
  weight_points=_dense_rank_points(reversed(sort_weight[:count]),count)
  # The distance ranking must come from sort_box; ranking sort_weight a
  # second time would make the distance criterion a no-op.
  box_points=_dense_rank_points(sort_box[:count],count)

  # Insert in person order so that equal totals keep a stable order below.
  score={}
  for i in range(count):
    score[i]=weight_points[i]*0.7+box_points[i]*0.3

  sort_score=sorted(score.items(), key=lambda item:item[1])

  # Number of major persons: one for very small (<=2) or very crowded (>10)
  # images, otherwise int(40% of count).
  # NOTE(review): a single major person for count > 10 looks deliberate but
  # is not explained anywhere - confirm.
  if count<=2 or count>10:
    top=count-1
  else:
    top=count-int(count*0.4)

  for t in range(count-1,top-1,-1):
    print(sort_score[t][0])
    major.append(sort_score[t][0])

  chosen=set(major)
  minor=[j for j in range(count) if j not in chosen]

  print(major)
  print(minor)
  return major,minor

In [None]:


# Minimum class probability for a query to count as a detection.
CONFIDENCE_THRESHOLD = 0.9
# Folder that holds the images listed in im_path.
IMAGE_DIR = '/content/gdrive/MyDrive/Colab Notebooks/person/'


def _write_boxes(out_file, img_name, boxes, person_ids, kept_ids, echo=False):
  """Append one "name,xmin,ymin,xmax,ymax" line per selected person.

  person_ids are person numbers; kept_ids maps a person number to its row in
  boxes. With echo=True every line is also printed.
  """
  for person in person_ids:
    (xmin, ymin, xmax, ymax)=boxes[kept_ids[person]]
    box=img_name+","+str(xmin.item())+","+str(ymin.item())+","+str(xmax.item())+","+str(ymax.item())+"\n"
    if echo:
      print(box)
    out_file.write(box)


# NOTE: weight_score() and distance_score() read dec_attn_weights and
# bboxes_scaled as module-level variables, so those names must stay global.
try:
  with open(im_path) as name_file:
    for line in name_file:
      img_name = line.rstrip('\r\n')
      if not img_name:
        continue  # ignore blank lines in the list

      # The normalization below expects 3 channels, so force RGB.
      im = Image.open(IMAGE_DIR+img_name).convert('RGB')

      # mean-std normalize the input image (batch-size: 1)
      img = transform(im).unsqueeze(0)

      # use lists to store the outputs via up-values
      conv_features, enc_attn_weights, dec_attn_weights = [], [], []

      hooks = [model.backbone[-2].register_forward_hook(lambda self, input, output: conv_features.append(output)),
           model.transformer.encoder.layers[-1].self_attn.register_forward_hook(lambda self, input, output: enc_attn_weights.append(output[1])),
           model.transformer.decoder.layers[-1].multihead_attn.register_forward_hook(lambda self, input, output: dec_attn_weights.append(output[1])),]

      # A single forward pass yields both the predictions and the hooked
      # activations; always detach the hooks, even if the pass fails.
      try:
        outputs = model(img)
      finally:
        for hook in hooks:
          hook.remove()

      # don't need the list anymore
      conv_features = conv_features[0]
      enc_attn_weights = enc_attn_weights[0]
      dec_attn_weights = dec_attn_weights[0]

      # keep only predictions with CONFIDENCE_THRESHOLD+ confidence
      probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
      keep = probas.max(-1).values > CONFIDENCE_THRESHOLD

      # convert boxes from [0; 1] to image scales
      bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)

      w_score,id_list,p_num= weight_score(conv_features,keep,probas,img_name)
      if p_num==0:
        continue  # no person detected: the count is recorded, nothing to rank

      d_score=distance_score(img_name,id_list,p_num)

      major_list,minor_list=total_score(w_score,d_score,id_list,p_num)
      print(id_list)

      _write_boxes(major_f, img_name, bboxes_scaled, major_list, id_list, echo=True)
      _write_boxes(minor_f, img_name, bboxes_scaled, minor_list, id_list)
finally:
  # Close (and thereby flush) the result files even if an image fails.
  # Re-run the cell that opens them before running this cell again.
  major_f.close()
  minor_f.close()
  count_f.close()





4
[(1, 113), (2, 116), (3, 186), (0, 199)]
[(1, 43.192467692816024), (2, 87.77296224296956), (0, 183.21204186172966), (3, 184.69143686095674)]
0
[0]
[1, 2, 3]
[3, 6, 7, 11]
00.jpg,75.68751525878906,155.69503784179688,227.14047241210938,466.7680358886719
7
[(6, 93), (3, 107), (5, 113), (2, 115), (1, 127), (0, 128), (4, 142)]
[(4, 78.25323834762847), (6, 130.33917315248817), (0, 225.12526854501843), (3, 245.30480235138654), (5, 259.3638680520867), (2, 285.2132262690672), (1, 348.2382263740306)]
4
0
[4, 0]
[1, 2, 3, 5, 6]
[0, 1, 2, 4, 5, 6, 7]
01.jpg,295.7816162109375,187.15301513671875,420.0859680175781,544.8167114257812
01.jpg,196.33692932128906,167.08984375,212.1751251220703,210.50796508789062
4
[(2, 99), (0, 137), (1, 158), (3, 185)]
[(2, 73.92649540767248), (0, 94.35785752018889), (3, 167.6304901995397), (1, 190.7848613998717)]
3
[3]
[0, 1, 2]
[1, 2, 4, 7]
02.jpg,492.5950012207031,141.24703979492188,628.597900390625,554.8521728515625
3
[(1, 101), (0, 134), (2, 161)]
[(2, 91.068862428