# **CSE 4095 Vision Transformer Project**

There is a lot of opportunity to automate the detection of violence through surveillance cameras. Our goal was to find a way to automatically detect people carrying guns in public, to help police understand where to focus their attention in times of danger.

We used a transformer model because we believed it would offer some benefits over other methods; in particular, the attention mechanism may be better at capturing the context of the environment. The DETR model is an encoder-decoder transformer with a convolutional backbone. The encoder extracts meaningful representations of the image, and the decoder uses these representations to predict whether there is a person with a gun in the image.
<br />
<br />

![](https://cdn-thumbnails.huggingface.co/social-thumbnails/models/facebook/detr-resnet-50.png)

# Loading the Dataset
We use an API key to load our firearm object detection dataset from a Roboflow workspace.

In [None]:
# Install the Roboflow client, then download version 1 of the
# "firearm-detection-5ioov" project as a COCO-format export.
!pip install roboflow

from roboflow import Roboflow
# NOTE(review): the API key below is hard-coded in the notebook, so anyone who
# can read this file can use it. Consider rotating it and loading it from an
# environment variable or the notebook host's secret store instead.
rf = Roboflow(api_key="Hx4hLh7HJ91dQsTehTZK")
project = rf.workspace("playground-wxriu").project("firearm-detection-5ioov")
version = project.version(1)
# `dataset` is kept as a notebook global; the recorded output shows the export
# being extracted into a "Firearm-Detection-1" folder.
dataset = version.download("coco")

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in Firearm-Detection-1 to coco:: 100%|██████████| 72019/72019 [00:03<00:00, 22420.82it/s]





Extracting Dataset Version Zip to Firearm-Detection-1 in coco:: 100%|██████████| 1192/1192 [00:00<00:00, 2872.57it/s]


# Arranging files
The following block of code arranges all images and annotations in a way that is convenient for the DETR framework

In [None]:
!mkdir -p /content/data/custom/annotations/

!mv /content/Firearm-Detection-1/train/_annotations.coco.json /content/data/custom/annotations/custom_train.json
!mv /content/Firearm-Detection-1/test/_annotations.coco.json /content/data/custom/annotations/custom_val.json

!mkdir -p /content/data/custom/train2017/

!mv /content/Firearm-Detection-1/train/*.jpg /content/data/custom/train2017/

!mkdir -p /content/data/custom/val2017/

!mv /content/Firearm-Detection-1/test/*.jpg /content/data/custom/val2017/

dataDir='/content/data/custom/'
dataType='train2017'
annFile='{}annotations/custom_train.json'.format(dataDir)

mv: cannot stat '/content/Firearm-Detection-1/train/_annotations.coco.json': No such file or directory
mv: cannot stat '/content/Firearm-Detection-1/test/_annotations.coco.json': No such file or directory
mv: cannot stat '/content/Firearm-Detection-1/train/*.jpg': No such file or directory
mv: cannot stat '/content/Firearm-Detection-1/test/*.jpg': No such file or directory


# Important imports

The following code blocks cover all remaining imports for the code.

In [None]:
import torch, torchvision
import torchvision.transforms as T
from PIL import Image
import requests
%matplotlib inline
import pycocotools.coco as coco
from pycocotools.coco import COCO
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab

# Default size (width, height) for every figure drawn in this notebook.
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
# Report the installed PyTorch version and whether CUDA is usable.
print(torch.__version__, torch.cuda.is_available())
# Disable autograd for the whole notebook process: the cells below only run
# inference in-process (fine-tuning is launched as a separate `!python`
# command). The trailing ';' keeps the notebook from echoing the return value.
torch.set_grad_enabled(False);

2.2.1+cu121 True


# Function setup

Sets up important functions for the code.

In [None]:
# Preprocessing applied to every PIL image before it is passed to the model:
# resize to 800, convert to a tensor, then apply the standard PyTorch
# mean-std input image normalization.
transform = T.Compose(
    [
        T.Resize(size=800),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# for output bounding box post-processing
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to corner form.

    Args:
        x: tensor with one box per row; dimension 1 must hold exactly the
            four values (cx, cy, w, h).

    Returns:
        Tensor with the same number of rows holding
        (x_min, y_min, x_max, y_max) along dimension 1.
    """
    center_x, center_y, width, height = x.unbind(1)
    half_w = 0.5 * width
    half_h = 0.5 * height
    corners = [
        center_x - half_w,
        center_y - half_h,
        center_x + half_w,
        center_y + half_h,
    ]
    return torch.stack(corners, dim=1)

def rescale_bboxes(out_bbox, size):
    """Turn relative (cx, cy, w, h) boxes into absolute corner boxes.

    Args:
        out_bbox: tensor with one (cx, cy, w, h) box per row, expressed as
            fractions of the image size (assumes the model's normalized
            [0, 1] box output -- confirm against the caller).
        size: ``(width, height)`` of the target image, e.g. ``PIL.Image.size``.

    Returns:
        Tensor of (x_min, y_min, x_max, y_max) boxes scaled to ``size``.
    """
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    # Create the scale factors on the same device as the boxes; a CPU-only
    # scale tensor makes the multiplication fail when the model outputs live
    # on a CUDA device.
    scale = torch.tensor([img_w, img_h, img_w, img_h],
                         dtype=torch.float32,
                         device=b.device)
    return b * scale

def filter_bboxes_from_outputs(outputs,
                               threshold=0.7,
                               size=None):
    """Keep the confident detections of the first image in a batch.

    Args:
        outputs: mapping with ``'pred_logits'`` and ``'pred_boxes'`` tensors
            as returned by the model; only batch element 0 is inspected.
        threshold: minimum class probability a prediction needs to be kept.
        size: ``(width, height)`` used to rescale the boxes. When omitted, the
            size of the notebook-global image ``im`` is used, which is the
            behaviour existing callers rely on.

    Returns:
        ``(probas_to_keep, bboxes_scaled)``: the per-class probabilities of the
        kept predictions and their boxes as absolute
        (x_min, y_min, x_max, y_max) coordinates.
    """
    if size is None:
        # Backward-compatible fallback: callers that do not pass a size get
        # the dimensions of the global `im`, so `im` must then be the very
        # image the outputs were computed from.
        size = im.size

    # Softmax over classes, then drop the last class column (presumably the
    # model's "no object" class -- confirm) before thresholding.
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > threshold
    probas_to_keep = probas[keep]

    # convert boxes from [0; 1] to image scales
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], size)
    return probas_to_keep, bboxes_scaled

In [None]:
# Fetch the DETR fork used for fine-tuning and switch to its `finetune`
# branch. The notebook's working directory ends up inside the checkout, so
# relative paths in later cells resolve against /content/detr.
%cd /content/

# Remove any previous checkout so the clone always starts clean.
!rm -rf detr
!git clone https://github.com/woctezuma/detr.git

%cd detr/

!git checkout finetune

/content
Cloning into 'detr'...
remote: Enumerating objects: 265, done.
remote: Counting objects: 100% (11/11), done.
remote: Compressing objects: 100% (8/8), done.
remote: Total 265 (delta 4), reused 6 (delta 3), pack-reused 254
Receiving objects: 100% (265/265), 325.43 KiB | 7.40 MiB/s, done.
Resolving deltas: 100% (127/127), done.
/content/detr
Already on 'finetune'
Your branch is up to date with 'origin/finetune'.


In [None]:
# Download the pretrained DETR-R50 checkpoint (hash-verified, loaded on CPU).
checkpoint = torch.hub.load_state_dict_from_url(
    url='https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth',
    map_location='cpu',
    check_hash=True,
)

# Discard the classification head's parameters so that the saved file carries
# no class weights (the training command later resumes from this file with a
# different --num_classes).
checkpoint["model"].pop("class_embed.weight")
checkpoint["model"].pop("class_embed.bias")

# Write the head-less checkpoint into the current working directory.
torch.save(checkpoint, 'detr-r50_no-class-head.pth')

In [None]:
# initialize COCO api for instance annotations
# NOTE: this rebinds the name `coco`, which the imports cell bound to the
# `pycocotools.coco` module; from here on `coco` is the COCO index built from
# `annFile` (the training annotations).
coco=COCO(annFile)

loading annotations into memory...


FileNotFoundError: [Errno 2] No such file or directory: '/content/data/custom/annotations/custom_train.json'

In [None]:
# Print the dataset's category names, followed by the distinct
# super-categories they belong to.
cats = coco.loadCats(coco.getCatIds())

nms = [entry['name'] for entry in cats]
print('Categories: {}'.format(nms))

nms = {entry['supercategory'] for entry in cats}
print('Super-categories: {}'.format(nms))

In [None]:
# Pick a random image that has at least one 'gun' annotation and display it.
# catIds, img and I stay notebook globals: the next cell reuses them to draw
# the annotations on top of the same picture.
catIds = coco.getCatIds(catNms=['gun'])
imgIds = coco.getImgIds(catIds=catIds)

# Uniformly random index into the matching image ids.
img_id = imgIds[np.random.randint(0, len(imgIds))]
print('Image n°{}'.format(img_id))

# loadImgs returns a list; take the single record for this id.
img = coco.loadImgs(img_id)[0]

img_name = '%s/%s/%s' % (dataDir, dataType, img['file_name'])
print('Image name: {}'.format(img_name))

I = io.imread(img_name)
plt.figure()
plt.imshow(I)

In [None]:
# Overlay the 'gun' annotations of the image chosen in the previous cell.
# Relies on the globals `img`, `catIds` and `I` set there.
annIds = coco.getAnnIds(imgIds=img['id'], catIds=catIds)
anns = coco.loadAnns(annIds)
# Redraw the picture first so the annotations land on a fresh axes.
plt.imshow(I)
coco.showAnns(anns, draw_bbox=True)

In [None]:
# Label names for the fine-tuned model, indexed by predicted class id.
finetuned_classes = ['Firearm', 'gun']

# Id of the first class and the number of classes handed to the model
# constructor and to the training script (one per label above).
first_class_index = 0
num_classes = len(finetuned_classes)

print('First class index: {}'.format(first_class_index))
print('Parameter num_classes: {}'.format(num_classes))
print('Fine-tuned classes: {}'.format(finetuned_classes))

In [None]:
# Make sure the working directory is the DETR checkout: the checkpoint paths
# and `main.py` used in the following cells are relative to it.
%cd /content/detr/

# Loading saved model weights

In [None]:
# Build the DETR-R50 architecture with a classification head sized for
# `num_classes`; pretrained=False because the weights come from the local
# checkpoint loaded below.
model = torch.hub.load(
    'facebookresearch/detr',
    'detr_resnet50',
    pretrained=False,
    num_classes=num_classes,
)

# Restore weights from `checkpoint.pth` in the current working directory.
# NOTE(review): strict=False means missing or unexpected keys do not raise,
# so a checkpoint that does not match this architecture would go unnoticed.
checkpoint = torch.load('checkpoint.pth', map_location='cpu')
model.load_state_dict(checkpoint['model'], strict=False)

# Switch to evaluation mode; the trailing ';' stops the notebook from echoing
# the whole model.
model.eval();

# Fine-tuning

In [None]:
# Fine-tune DETR on the custom dataset by running the repository's training
# script as a shell command: 15 epochs, starting from the head-less checkpoint
# saved earlier, with results written to ./outputs.
# `$num_classes` is filled in by IPython from the notebook variable.
!python main.py \
  --dataset_file "custom" \
  --coco_path "/content/data/custom/" \
  --output_dir "outputs" \
  --resume "detr-r50_no-class-head.pth" \
  --num_classes $num_classes \
  --epochs 15

In [None]:
# Build the DETR-R50 architecture with a classification head sized for
# `num_classes`; pretrained=False because the weights come from the
# fine-tuning run's checkpoint loaded below.
model = torch.hub.load(
    'facebookresearch/detr',
    'detr_resnet50',
    pretrained=False,
    num_classes=num_classes,
)

# Restore the weights written by the training command to outputs/.
# NOTE(review): strict=False means missing or unexpected keys do not raise,
# so a checkpoint that does not match this architecture would go unnoticed.
checkpoint = torch.load('outputs/checkpoint.pth', map_location='cpu')
model.load_state_dict(checkpoint['model'], strict=False)

# Switch to evaluation mode; the trailing ';' stops the notebook from echoing
# the whole model.
model.eval();

In [None]:
# colors for visualization
COLORS = [
    [0.000, 0.447, 0.741],
    [0.850, 0.325, 0.098],
    [0.929, 0.694, 0.125],
    [0.494, 0.184, 0.556],
    [0.466, 0.674, 0.188],
    [0.301, 0.745, 0.933],
]


def plot_finetuned_results(pil_img, prob=None, boxes=None):
    """Show an image with its detections drawn on top.

    Args:
        pil_img: image to display.
        prob: per-detection class scores (one row per detection), or None.
        boxes: tensor of (xmin, ymin, xmax, ymax) boxes matching ``prob``
            row for row, or None.

    Boxes are drawn only when both ``prob`` and ``boxes`` are given; each box
    is labelled with the best-scoring entry of ``finetuned_classes`` and its
    score.
    """
    plt.figure(figsize=(16, 10))
    plt.imshow(pil_img)
    axes = plt.gca()

    # Repeat the palette so that zip() does not run out of colors early.
    palette = COLORS * 100

    have_detections = prob is not None and boxes is not None
    if have_detections:
        for scores, corners, colour in zip(prob, boxes.tolist(), palette):
            xmin, ymin, xmax, ymax = corners
            outline = plt.Rectangle(
                (xmin, ymin),
                xmax - xmin,
                ymax - ymin,
                fill=False,
                color=colour,
                linewidth=3,
            )
            axes.add_patch(outline)

            # Label the box with its most likely class and that class' score.
            best = scores.argmax()
            label = f'{finetuned_classes[best]}: {scores[best]:0.2f}'
            axes.text(
                xmin,
                ymin,
                label,
                fontsize=15,
                bbox=dict(facecolor='yellow', alpha=0.5),
            )

    plt.axis('off')
    plt.show()

def run_worflow(my_image, my_model):
    """Run the detector on one image and plot the confident detections.

    Args:
        my_image: PIL image to analyse; it is also the picture the boxes are
            drawn on.
        my_model: detection model called on the preprocessed image.

    NOTE: filter_bboxes_from_outputs is called without an image size here and
    reads the notebook-global `im` to rescale boxes, so `my_image` should be
    that same global image.
    """
    # mean-std normalize the input image and add a batch axis (batch-size: 1)
    batch = transform(my_image).unsqueeze(0)

    # propagate through the model
    outputs = my_model(batch)

    # One plot per confidence threshold (currently only 0.9).
    for threshold in (0.9,):
        kept_probas, kept_boxes = filter_bboxes_from_outputs(
            outputs, threshold=threshold
        )
        plot_finetuned_results(my_image, kept_probas, kept_boxes)

In [None]:
# Run the detector on one image from the training split.
# The image has to be bound to the global name `im`:
# filter_bboxes_from_outputs reads `im.size` when rescaling the boxes.
img_name = '/content/data/custom/train2017/100.rf.21be8bfacac0dc677f00640161e8643e.jpg'
im = Image.open(img_name)
run_worflow(im, model)

In [None]:
# Run the detector on one image from the validation split.
# The image has to be bound to the global name `im`:
# filter_bboxes_from_outputs reads `im.size` when rescaling the boxes.
img_name = '/content/data/custom/val2017/real-video-360-.rf.21daaa9da4acc88a0dbc2f69eaa74674.jpg'
im = Image.open(img_name)
run_worflow(im, model)

# Saving the model

In [None]:
# Persist only the model's parameters (its state_dict) to `model_weights.pth`
# in the current working directory.
torch.save(model.state_dict(), 'model_weights.pth')

# **Future Work**
We plan to train this model again for longer and with a better dataset (or several datasets).