# AI Engine

# Import and functions

In [15]:
import torch
import cv2
import os
import time
import datetime
import math
from torch.utils.data import Dataset, DataLoader
import sys
from PIL import Image
import torchvision.transforms.functional as transform
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.ssd import SSDClassificationHead
from torch.utils.tensorboard import SummaryWriter
import numpy
from matplotlib import pyplot as plt
os.environ['KMP_DUPLICATE_LIB_OK']='True'

sys.path.insert(0, './pytorchutils/')
from pytorchutils.engine import train_one_epoch, evaluate
import pytorchutils.utils
import pytorchutils.multibox_loss

#!pip install -r https://raw.githubusercontent.com/ultralytics/yolov5/master/requirements.txt

# Calculates the Intersection Over Union for two specified bounding boxes
def calc_iou(bb1, bb2):
    # Get the coordinates of the intersecting box
    inter_x = max(bb1[0], bb2[0])
    inter_y = max(bb1[1], bb2[1])
    inter_x2 = min(bb1[2], bb2[2])
    inter_y2 = min(bb1[3], bb2[3])
    
    if inter_x2 < inter_x or inter_y2 < inter_y:
        return 0.0
    
    inter_area = (inter_x2 - inter_x) * (inter_y2 - inter_y)

    # If intersection area is or lower than 0 we dont have an intersection
    #if inter_area <= 0:
    #    return 0.0

    bb1_area = (bb1[2] - bb1[0]) * (bb1[3] - bb1[1])
    bb2_area = (bb2[2] - bb2[0]) * (bb2[3] - bb2[1])
    iou = inter_area / float(bb1_area + bb2_area - inter_area)
    return iou

def calc_intersection(bb1, bb2):
    inter_x = max(bb1[0], bb2[0])
    inter_y = max(bb1[1], bb2[1])
    inter_x2 = min(bb1[2], bb2[2])
    inter_y2 = min(bb1[3], bb2[3])
    
    if inter_x2 < inter_x or inter_y2 < inter_y:
        return 0.0
    
    return (inter_x2 - inter_x) * (inter_y2 - inter_y)

# Calculates the area of a bounding box
def calc_area(bb):
    return (bb[2] - bb[0]) * (bb[3] - bb[1])

# Padds a bounding box by a specific number, doubles the padding if text is specified
def pad_bb(bb, pad, text=False):
    x,y,x2,y2 = bb
    if text:
        return [x-pad*2, y-pad, x2+pad*2, y2+pad]
    return [x-pad, y-pad, x2+pad, y2+pad]

# Returns the smallest bounding box between two specified boxes
def return_smallest(bb1, bb2):
    bb1_x,bb1_y,bb1_x2,bb1_y2 = bb1
    bb2_x,bb2_y,bb2_x2,bb2_y2 = bb2
    bb1_size = (bb1_x2-bb1_x)*(bb1_y2-bb1_y)
    bb2_size = (bb2_x2-bb2_x)*(bb2_y2-bb2_y)
    
    return bb2 if bb1_size > bb2_size else bb1

# Gets the bounding boxes from an image by processing the image
def get_bbs_from_image(im, clean=True, pad=0, text=False):
    imgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    ret, thresh = cv2.threshold(imgray, 100, 255, cv2.THRESH_BINARY_INV)
    contours, hierarchy = cv2.findContours(thresh, cv2.RETR_LIST , cv2.CHAIN_APPROX_SIMPLE)
    bbs = []
    for cntr in contours:
        x,y,w,h = cv2.boundingRect(cntr)
        #cv2.rectangle(im, (x, y), (x+w, y+h), (0, 255, 255), 2)
        if x > 40 and y > 40 and y < im.shape[:2][1]-40 and x < im.shape[:2][0]-40:
            bbs.append([x,y,x+w,y+h])
                
    t_bbs = []
    [t_bbs.append(x) for x in bbs if x not in t_bbs]
    
    t_bbs = remove_small_bb_list(t_bbs, 300)
    combined_bbs = combine_bb_list(t_bbs, pad=pad, text=text)
    
    if clean:
        return clean_bb_list(combined_bbs, pad=pad)
    
    return combined_bbs

def combine_bb_list(bb_list, pad=0, text=False):
    bbs = bb_list.copy()
    iou_non_zero = True
    while iou_non_zero:
        iou_non_zero = False
        for i in range(len(bbs)-1):
            for c in range(i, len(bbs)):
                if bbs[i] == bbs[c]:
                    continue
                    
                iou = calc_iou(pad_bb(bbs[i], pad, text=text), bbs[c])
                
                if iou != 0:
                    iou_non_zero = True
                    bb = combine_bb(bbs[i], bbs[c])
                    bb1 = bbs[i].copy()
                    bb2 = bbs[c].copy()
                    
                    bbs.remove(bb1)
                    bbs.remove(bb2)
                    bbs.append(bb)
                    break;
            if iou_non_zero:
                break;
    return bbs

def combine_bb(bb1, bb2):
    bb1_x,bb1_y,bb1_x2,bb1_y2 = bb1
    bb2_x,bb2_y,bb2_x2,bb2_y2 = bb2

    if bb2_x < bb1_x:
        bb1_x = bb2_x
    if bb2_y < bb1_y:
        bb1_y = bb2_y
    if bb2_x2 > bb1_x2:
        bb1_x2 = bb2_x2
    if bb2_y2 > bb1_y2:
        bb1_y2 = bb2_y2
        
    return [bb1_x, bb1_y, bb1_x2, bb1_y2]

def clean_bb_list(bb_list, pad=0, text=False):
    bbs = bb_list.copy()
    iou_non_zero = True
    while iou_non_zero:
        iou_non_zero = False
        for i in range(len(bbs)):
            if i == len(bbs)-1:
                break;
                
            iou = calc_iou(pad_bb(bbs[i], pad, text=text), bbs[i+1])

            if iou == 0:
                continue

            iou_non_zero = True
            bb = return_smallest(bbs[i], bbs[i+1])
            bbs.remove(bb)
            break;
                
    return bbs

def remove_small_bb_list(bb_list, size):
    cleaned_list = []
    for bb in bb_list:
        x,y,x2,y2 = bb
        w = x2-x
        h = y2-y
        if w*h > size:
            cleaned_list.append(bb)
            
    return cleaned_list

# Normalizes a pixel specific bounding box [x, y, x2, y2] to normalized bounding box [x, y, w, h]
def normalize_bb(bb, shape):
    h_img,w_img = shape
    x,y,x2,y2 = bb
    norm_w,norm_h = [(x2-x)/w_img, (y2-y)/h_img]
    return [((x+x2)/2)/w_img, ((y+y2)/2)/h_img, norm_w, norm_h]

# Denormalizes a normalized bounding box [x, y, w, h] to pixel specific bounding box [x, y, x2, y2]
def denormalize_bb(bb, shape):
    h_img,w_img = shape
    x,y,w,h = bb
    x_min,y_min = [int(x*w_img-(w*w_img)/2), int(y*h_img-(h*h_img)/2)]
    return [x_min, y_min, x_min+int(w*w_img), y_min+int(h*h_img)]

# Stringifies a bounding box for output
def bb_to_str(bb):
    return str(bb[0])+' '+str(bb[1])+' '+str(bb[2])+' '+str(bb[3])

# Destringifies a bounding box
def str_to_bb(bb_str):
    str_arr = bb_str.split(' ')
    return [float(str_arr[0]), float(str_arr[1]), float(str_arr[2]), float(str_arr[3]), float(str_arr[4])]

# Generates dataset structure by generating boundingbox labels, spliting data into train and validition sets
# also providing the found boundingboxes for verification of labeling being successfull 
def generate_dataset(root_folder, labels=[], split_components=True, train_val_ratio=0.8):
    os.mkdir('./'+root_folder+'_generated/')
    os.mkdir('./'+root_folder+'_generated/images/')
    os.mkdir('./'+root_folder+'_generated/images/train/')
    os.mkdir('./'+root_folder+'_generated/images/val/')
    os.mkdir('./'+root_folder+'_generated/images/bbs/')
    os.mkdir('./'+root_folder+'_generated/labels/')

    if (split_components):    
        for component in os.listdir('./'+root_folder):
            os.mkdir('./'+root_folder+'_generated/images/train/'+component+'/')
            os.mkdir('./'+root_folder+'_generated/images/val/'+component+'/')
            images = os.listdir('./'+root_folder+'/'+component)
            for i in range(len(images)):
                image = images[i]
                img_type = 'val' if i > math.floor(len(images)*train_val_ratio) else 'train'
                im = cv2.imread('./'+root_folder+'/'+component+'/'+image)
                cv2.imwrite('./'+root_folder+'_generated/images/'+img_type+'/'+component+'/'+image, im)
                bbs = get_bbs_from_image(im, clean=True, pad=30, text=True)
                bbs_str = '' 
                for bb in bbs:
                    bbs_str += str(labels[component])+' '+bb_to_str(normalize_bb(bb, im.shape[:2]))+'\n'
                    x,y,x2,y2 = pad_bb(bb, 5)
                    cv2.rectangle(im, (x, y), (x2, y2), (0, 0, 255), 2)
                cv2.imwrite('./'+root_folder+'_generated/images/bbs/'+image, im)
                f = open('./'+root_folder+'_generated/labels/'+image[:-3]+"txt", "a")
                f.write(bbs_str[:-1])
                f.close()
    else:
        images = os.listdir('./'+root_folder)
        for i in range(len(images)):
            image = images[i]
            img_type = 'val' if i > math.floor(len(images)*train_val_ratio) else 'train'
            im = cv2.imread('./'+root_folder+'/'+image)
            cv2.imwrite('./'+root_folder+'_generated/images/'+img_type+'/'+image, im)
            bbs = get_bbs_from_image(im, clean=True, pad=30, text=True)
            
            bbs_str = '' 
            c = 0
            for bb in bbs:
                c = c + 1
                bbs_str += str(c)+' '+bb_to_str(normalize_bb(bb, im.shape[:2]))+'\n'
                x,y,x2,y2 = pad_bb(bb, 5)
                cv2.rectangle(im, (x, y), (x2, y2), (0, 0, 255), 2)
                cv2.putText(im, str(c), (int((x+x2)/2)-50,int((y+y2)/2)), cv2.FONT_HERSHEY_SIMPLEX, 5, (0, 255, 255), 5, cv2.LINE_AA)
            
            cv2.imwrite('./'+root_folder+'_generated/images/bbs/'+image, im)
            f = open('./'+root_folder+'_generated/labels/'+image[:-3]+"txt", "a")
            f.write(bbs_str[:-1])
            f.close()
            
def predict_and_save(model, root_dir, save_dir, labels=[], threshold=0.5, IoU=0, mask=False):
    # Check if the 4th final character is a dot aka if the input directory is a file
    if root_dir[-4] == '.':
        im = root_dir
        img = cv2.imread(im)
        cv2_img = cv2.imread(im)
        if mask:
            imgray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            ret, img = cv2.threshold(imgray, 100, 255, cv2.THRESH_BINARY_INV)
        tensor_img = torch.tensor(transform.to_tensor(img))
        tensor_img = torch.reshape(tensor_img, (1, tensor_img.size(0), tensor_img.size(1), tensor_img.size(2)))

        predictions = model(tensor_img)
        print(predictions)
        dont_print_id = []
        for i in range(len(predictions[0]['boxes'])):
            score = predictions[0]['scores'][i].item()
            if IoU > 0:
                if i in dont_print_id:
                    continue
                bb1 = predictions[0]['boxes'][i].detach().numpy()
                for c in range(i, len(predictions[0]['boxes'])):
                    bb2 = predictions[0]['boxes'][c].detach().numpy()
                    if calc_intersection(bb1, bb2) > calc_area(bb1)*IoU:
                        dont_print_id.append(c)
            if score > threshold:
                x,y,x2,y2 = predictions[0]['boxes'][i].detach().numpy()
                cv2.rectangle(cv2_img, (int(x), int(y)), (int(x2), int(y2)), (0, 0, 255), 2)
                cv2.putText(cv2_img, str(score*100)[:5], (int((x+x2)/2)-200,int((y+y2)/2)), cv2.FONT_HERSHEY_SIMPLEX, 5, (0, 255, 255), 5, cv2.LINE_AA)
                
                if len(labels) > 1:
                    cv2.putText(cv2_img, labels[predictions[0]['labels'][i].item()-1], (int((x+x2)/2)-250,int((y+y2)/2)-150), cv2.FONT_HERSHEY_SIMPLEX, 4, (0, 255, 255), 5, cv2.LINE_AA)

        cv2.imwrite(save_dir+(root_dir.split("/")[-1]), cv2_img)
    else:
        for image in os.listdir(root_dir):
            im = root_dir+image
            img = Image.open(im)
            cv2_img = cv2.imread(im)
            tensor_img = torch.tensor(transform.to_tensor(img))
            tensor_img = torch.reshape(tensor_img, (1, tensor_img.size(0), tensor_img.size(1), tensor_img.size(2)))

            predictions = model(tensor_img)
            print(predictions)
            for i in range(len(predictions[0]['boxes'])):
                score = predictions[0]['scores'][i].item()
                if score > threshold:
                    x,y,x2,y2 = predictions[0]['boxes'][i].detach().numpy()
                    cv2.rectangle(cv2_img, (int(x), int(y)), (int(x2), int(y2)), (0, 0, 255), 2)
                    cv2.putText(cv2_img, str(score*100)[:5], (int((x+x2)/2)-200,int((y+y2)/2)), cv2.FONT_HERSHEY_SIMPLEX, 5, (0, 255, 255), 5, cv2.LINE_AA)

                    if len(labels) > 1:
                        cv2.putText(cv2_img, labels[predictions[0]['labels'][i].item()-1], (int((x+x2)/2)-250,int((y+y2)/2)-150), cv2.FONT_HERSHEY_SIMPLEX, 4, (0, 255, 255), 5, cv2.LINE_AA)

            cv2.imwrite(save_dir+image, cv2_img)

## Preprocessing

In [5]:
'IMG20220127120628'
'IMG20220127120506'
'IMG20220127120913'

'IMG20220127120913'

In [16]:
im = cv2.imread('./whiteboard website.jpg')
bbs = get_bbs_from_image(im, clean=True, pad=10, text=True)

for bb in bbs:
    x,y,x2,y2 = pad_bb(bb, 5)
    cv2.rectangle(im, (x, y), (x2, y2), (0, 0, 255), 2)

#r = 800 / float(im.shape[:2][1])
#im = cv2.resize(im, (int(im.shape[:2][0] * r), 800))
cv2.imwrite('whiteboard website BB.jpg', im)
#cv2.waitKey()

True

In [17]:
f = open('./dataset/labels.txt', "r")
data = f.read().split('\n')
f.close()
labels = {data[i]:i for i in range(len(data))}
time_before = time.time()
generate_dataset('temp/big_data', labels)
print(str(datetime.timedelta(seconds=round(time.time()-time_before))))

KeyboardInterrupt: 

In [42]:
im = cv2.imread('./new_more_dataset/Checkbox/IMG20220209145109.jpg')
#bbs = get_bbs_from_image(im, clean=True, pad=30, text=True)

#f = open("test_bb.txt", "w")
#for bb in bbs:
#    norm_bb = normalize_bb(bb, im.shape[:2])
#    print(bb,'->',norm_bb)
#    f.write("0 " + bb_to_str(norm_bb) + ('\n' if bbs.index(bb) != len(bbs)-1 else ''))
#
#f.close()

f = open("./new_more_dataset_bb/labels/IMG20220209145109.txt", "r")
for bb in f.read().split('\n'):
    denorm_bb = denormalize_bb(str_to_bb(bb)[1:], im.shape[:2])
    x,y,w,h = pad_bb(denorm_bb, 5)
    cv2.rectangle(im, (x, y), (w, h), (0, 0, 255), 2)
    cv2.putText(im, '1', (int((x+w)/2)-50,int((y+h)/2)), cv2.FONT_HERSHEY_SIMPLEX, 5, (0, 255, 255), 5, cv2.LINE_AA)
    
f.close()
cv2.imwrite('checkbox test.jpg', im)
#cv2.waitKey()
#f.close()

True

### Yolo formating to box testing

In [26]:
im = cv2.imread('./image_unsorted/IMG20220129155542.jpg')
#h,w = im.shape[:2]
label = [8, 0.556466, 0.402322, 0.479053, 0.086066]
x,y,w,h = denormalize_bb(label[1:], im.shape[:2])
print(x,y,w,h)
cv2.rectangle(im, (x,y), (w, h), (0,0,255), 2)
cv2.imwrite('test2.jpg', im)
#cv2.waitKey()

1669 1609 3106 1953


True

### Mark all in a folder

In [412]:
images = os.listdir('./image_unsorted')
for image in images:
    im = cv2.imread('./image_unsorted/'+image)
    bbs = get_bbs_from_image(im, clean=True, pad=30, text=True)
    for bb in bbs:
        x,y,x2,y2 = pad_bb(bb, 5)
        cv2.rectangle(im, (x, y), (x2, y2), (0, 0, 255), 2)
    cv2.imwrite('./image_unsorted_bb/'+image, im)

## Object detection

### Faster RCNN

In [6]:
class FasterRCNNSketchDataset(Dataset):
    def __init__(self, root_dir, set_type, single_component=False):
        self.images = []
        self.labels = []
        self.component_names = []
        if not single_component:
            for component in os.listdir(root_dir+"/images/"+set_type):
                self.component_names.append(component)
                for image in os.listdir(root_dir+"/images/"+set_type+"/"+component):
                    self.images.append(root_dir+"/images/"+set_type+"/"+component+"/"+image)
                    self.labels.append(root_dir+"/labels/"+(image.split('.')[0]+'.txt'))
        else:
            self.component_names.append(single_component)
            for image in os.listdir(root_dir+"/images/"+set_type+"/"+single_component):
                self.images.append(root_dir+"/images/"+set_type+"/"+single_component+"/"+image)
                self.labels.append(root_dir+"/labels/"+(image.split('.')[0]+'.txt'))
            
        self.root = root_dir
        self.single_component = single_component
        
    def __len__(self):
        return len(self.images)-1
    
    def __getitem__(self, idx):
        im = cv2.imread(self.images[idx])
        imgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
        ret, img = cv2.threshold(imgray, 100, 255, cv2.THRESH_BINARY_INV)
        #img = cv2.cvtColor(img_tresh, cv2.COLOR_GRAY2RGB)
        #normImg = cv2.normalize(img, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
        
        f = open(self.labels[idx], "r")
        data = f.read().split('\n')
        f.close()

        N = len(data)
        boxes = torch.zeros([N, 4], dtype=torch.float)
        labels = torch.zeros([N], dtype=torch.int64)
        areas = torch.zeros([N])
        
        for i in range(N):
            bb = denormalize_bb(str_to_bb(data[i])[1:], img.shape[:2])
            boxes[i][0],boxes[i][1],boxes[i][2],boxes[i][3] = bb
            areas[i] = calc_area(bb)
            
            if not self.single_component:
                labels[i] = int(data[i][0])+1
                continue
                
            labels[i] = 1
                
        return transform.to_tensor(img), {'boxes':boxes, 'labels':labels, 'image_id':torch.LongTensor([idx]), 'area':areas, 'iscrowd':torch.zeros([N], dtype=torch.int64)}


def train_model(model, optimizer, data_loader, data_loader_val, device, num_epochs, model_type, model_name, lr_scheduler=False):
    writer = SummaryWriter()
    total_time = time.time()
    
    date = datetime.datetime.now().strftime("%b-%d_%H-%M")
    if not os.path.exists('./models/'+model_type+'/'+date+'/'):
        os.mkdir('./models/'+model_type+'/'+date+'/')

    for epoch in range(num_epochs):
        epoch_time = time.time()
        epoch_loss = []
        batch_nr = 0
        
        for images, targets in data_loader:
            batch_time = time.time()
            
            # Send them to device if using GPU
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            
            pred = model(images, targets)
            losses = sum(loss for loss in pred.values())
            
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            
            epoch_loss.append(losses.item())
            
            writer.add_scalars(model_type+'_'+model_name, {
                'train_loss': losses.item(),
            }, epoch*len(data_loader)+batch_nr)
            
            batch_nr = batch_nr + 1
            print(
                '\r[Train] Epoch {} [{}/{}] - Loss: {} \tProgress [{}%] \tEpoch time elapsed: {}'.format(
                    epoch+1, batch_nr, len(data_loader), losses.item(), round(((epoch/num_epochs)+(1/num_epochs*batch_nr/len(data_loader)))*100, 2), str(datetime.timedelta(seconds=round(time.time()-epoch_time)))
                ),
                end=''
            )
        
            
        writer.add_scalars(model_type+'_'+model_name, {
            'avg_epoch_loss': numpy.average(epoch_loss),
        }, (epoch+1))
            
        if lr_scheduler:
            lr_scheduler.step()
        
        print()
        evaluate_model(model, data_loader_val, device, writer, model_type, model_name, epoch+1)
        #model.train()
        print()
        
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            }, './models/'+model_type+'/'+date+'/'+model_name+'-'+str(epoch+1)+'.pt')

    print(
        '\rTraining completed! Loss: {} \tTotal time elapsed: {}'.format(
            losses.item(), str(datetime.timedelta(seconds=round(time.time()-total_time)))
        ),
        end=''
    )
    
def evaluate_model(model, data_loader, device, writer, model_type, model_name, epoch):
    with torch.no_grad():
        epoch_time = time.time()
        avg_loss = []
        batch_nr = 0
        for images, targets in data_loader:
            # Send them to device if using GPU
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            pred = model(images, targets)
            losses = sum(loss for loss in pred.values())
            avg_loss.append(losses.item())
            
            batch_nr = batch_nr + 1
            print(
                '\r[Val] [{}/{}] - Loss: {} \tEpoch time elapsed: {}'.format(
                    batch_nr, len(data_loader), losses.item(), str(datetime.timedelta(seconds=round(time.time()-epoch_time)))
                ),
                end=''
            )

        writer.add_scalars(model_type+'_'+model_name, {
            'val_loss': numpy.average(avg_loss),
        }, epoch)
                
                

In [9]:
#dataset_train = FasterRCNNSketchDataset('./dataset', 'train')
#dataset_val = FasterRCNNSketchDataset('./dataset', 'val')

dataset_train = FasterRCNNSketchDataset('./dataset', 'train')
dataset_val = FasterRCNNSketchDataset('./dataset', 'val')

data_loader = torch.utils.data.DataLoader(
        dataset_train, batch_size=5, shuffle=True, num_workers=0,
        collate_fn=pytorchutils.utils.collate_fn)
data_loader_val = torch.utils.data.DataLoader(
        dataset_val, batch_size=2, shuffle=False, num_workers=0,
        collate_fn=pytorchutils.utils.collate_fn)

device = torch.device('cpu')#torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
num_classes = 13
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, 
                            momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

train_model(model, optimizer, data_loader, data_loader_val, device, 7, 'Faster-RCNN', 'All-Comp_v4_SGD-StepLR', lr_scheduler)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


[Train] Epoch 1 [3/72] - Loss: 0.35160717368125916 	Progress [0.6%] 	Epoch time elapsed: 0:02:19

KeyboardInterrupt: 

In [7]:
model.eval()
im ="./components.jpg"
image = "components.jpg"
#label = "./dataset/labels/"+(image.split('.')[0]+'.txt')
imgR = cv2.imread(im)
cv2_img = cv2.imread(im)
imgray = cv2.cvtColor(imgR, cv2.COLOR_BGR2GRAY)
ret, img = cv2.threshold(imgray, 100, 255, cv2.THRESH_BINARY_INV)
tensor_img = torch.tensor(transform.to_tensor(img))
tensor_img = torch.reshape(tensor_img, (1, tensor_img.size(0), tensor_img.size(1), tensor_img.size(2)))

predictions = model(tensor_img)
print(predictions)

for i in range(len(predictions[0]['boxes'])):
    score = predictions[0]['scores'][i].item()
    if score > 0.5:
        x,y,x2,y2 = predictions[0]['boxes'][i].detach().numpy()
        cv2.rectangle(cv2_img, (int(x), int(y)), (int(x2), int(y2)), (0, 0, 255), 2)
        cv2.putText(cv2_img, str(score*100)[:5], (int((x+x2)/2)-200,int((y+y2)/2)), cv2.FONT_HERSHEY_SIMPLEX, 5, (0, 255, 255), 5, cv2.LINE_AA)

        print('Faster-RCNN scores for bbs:', score)
cv2.imwrite('./results/input-rcnn/treshold-'+image, cv2_img)

  if __name__ == '__main__':


[{'boxes': tensor([[1498.8979,  440.7491, 2635.7700,  925.8327],
        [1671.9338, 1396.2307, 2609.1558, 1840.4751],
        [ 384.7604, 2191.0439, 1274.4561, 2615.8643],
        [1751.2051, 2046.9060, 2542.6919, 2358.1477],
        [ 232.9498, 1143.3949, 1021.7910, 1524.4823],
        [1674.9076, 2003.4613, 2608.4773, 2747.4641],
        [ 248.2850, 1527.6628,  940.2410, 1858.7635],
        [  45.3643,  689.6697, 1078.6929, 1179.3794],
        [1730.3732, 2401.5906, 2543.0100, 2734.2966],
        [ 211.7554, 1162.8032, 1062.4457, 1917.1985],
        [1700.6423, 2238.8347, 2578.4087, 2591.6472],
        [ 120.0673,  992.3790, 1072.0443, 1627.6317],
        [  90.8544,  721.5524, 1092.3401,  818.2288],
        [ 100.1335,  728.2725, 1125.5175,  949.4650],
        [1656.1731, 1784.4149, 2516.7114, 2143.3918],
        [1541.0342,  411.2225, 2771.5647, 2294.4272],
        [ 265.9842, 1421.7349, 1119.7715, 1751.7324],
        [1722.5110, 2288.2554, 2556.0334, 2466.1270],
        [1606.837

True

In [11]:
model_all.eval()
f = open('./dataset/labels.txt', "r")
data = f.read().split('\n')
f.close()
labels = [data[i] for i in range(len(data))]

predict_and_save(model_all, './whiteboard website.jpg', './results/all-rcnn/treshold/', labels=labels, threshold=0.5, IoU=0.3, mask=True)



[{'boxes': tensor([[1783.3864, 2026.4620, 2546.8032, 2330.3635],
        [ 500.4083,  875.5958, 1492.7535, 1217.0627],
        [1757.9944,  900.5966, 2455.7290, 1256.8989],
        [ 531.2827, 1385.3551, 1605.5140, 3323.6848],
        [ 410.3445, 1207.7856, 2723.2495, 3191.3757],
        [ 474.0057,  183.6731, 1663.3750, 1212.9236],
        [ 499.0638, 1353.7314, 1672.7955, 3171.4692],
        [ 603.2289,  242.6476, 1480.7758,  622.3920],
        [1690.5858, 1610.7550, 2904.0471, 1857.0385],
        [1756.4015,  906.1067, 2541.8386, 1250.3099],
        [ 531.0112,  316.0464, 1669.2383, 1282.9100],
        [ 481.7568, 1260.1385, 2934.2300, 3167.8040],
        [1234.6787,  834.4679, 2935.7051, 1981.5819],
        [1678.0809,  963.8422, 2850.2180, 2378.3411],
        [ 459.6611,  158.2962, 1693.2533,  999.1198],
        [ 557.6180,  877.3840, 1577.7971, 1226.6521],
        [1743.1876, 1590.8253, 1949.9871, 1802.0271],
        [1729.7051, 1600.1277, 1932.2109, 1796.7493],
        [1723.452

### SSD

In [95]:
class SSDSketchDataset(Dataset):
    def __init__(self, root_dir, set_type, single_component=False):
        self.images = []
        self.labels = []
        self.component_names = []
        if not single_component:
            for component in os.listdir(root_dir+"/images/"+set_type):
                self.component_names.append(component)
                for image in os.listdir(root_dir+"/images/"+set_type+"/"+component):
                    self.images.append(root_dir+"/images/"+set_type+"/"+component+"/"+image)
                    self.labels.append(root_dir+"/labels/"+(image.split('.')[0]+'.txt'))
        else:
            self.component_names.append(single_component)
            for image in os.listdir(root_dir+"/images/"+set_type+"/"+single_component):
                self.images.append(root_dir+"/images/"+set_type+"/"+single_component+"/"+image)
                self.labels.append(root_dir+"/labels/"+(image.split('.')[0]+'.txt'))
            
        self.root = root_dir
        self.single_component = single_component
        
    def __len__(self):
        return len(self.images)-1
    
    def __getitem__(self, idx):
        img = cv2.imread(self.images[idx])
        #imgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
        #ret, img_tresh = cv2.threshold(imgray, 100, 255, cv2.THRESH_BINARY_INV)
        #img = cv2.cvtColor(img_tresh, cv2.COLOR_GRAY2RGB)
        #normImg = cv2.normalize(img, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
        
        f = open(self.labels[idx], "r")
        data = f.read().split('\n')
        f.close()

        N = len(data)
        boxes = torch.zeros([N, 4], dtype=torch.float)
        labels = torch.zeros([N], dtype=torch.int64)
        areas = torch.zeros([N])
        
        for i in range(N):
            bb = denormalize_bb(str_to_bb(data[i])[1:], img.shape[:2])
            boxes[i][0],boxes[i][1],boxes[i][2],boxes[i][3] = bb
            
            if not self.single_component:
                labels[i] = int(data[i][0])+1
                continue
                
            labels[i] = 1
                
        return transform.to_tensor(img), {'boxes':boxes, 'labels':labels}                

In [117]:
#dataset_train = FasterRCNNSketchDataset('./dataset', 'train')
#dataset_val = FasterRCNNSketchDataset('./dataset', 'val')

dataset_train = SSDSketchDataset('./dataset', 'train')
dataset_val = SSDSketchDataset('./dataset', 'val')

data_loader = torch.utils.data.DataLoader(
        dataset_train, batch_size=5, shuffle=True, num_workers=0,
        collate_fn=pytorchutils.utils.collate_fn)
data_loader_val = torch.utils.data.DataLoader(
        dataset_val, batch_size=2, shuffle=False, num_workers=0,
        collate_fn=pytorchutils.utils.collate_fn)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model_ssd = torchvision.models.detection.ssd300_vgg16(pretrained=True)

num_classes = 13
in_channels = [512, 1024, 512, 256, 256, 256]
num_anchors = [4, 6, 6, 6, 4, 4]
model_ssd.head.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes)


model_ssd.to(device)


params = [p for p in model_ssd.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, 
                            momentum=0.9, weight_decay=0.0005)

# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

train_model(model_ssd, optimizer, data_loader, data_loader_val, device, 7, 'SSD', 'AllComp_v3_SGD-StepLR', lr_scheduler)

[Train] Epoch 1 [6/72] - Loss: nan 	Progress [1.19%] 	Epoch time elapsed: 0:00:46: 0:00:39:00:30

KeyboardInterrupt: 

In [27]:
model_ssd.eval()

im ="./components.jpg"
image ="components.jpg"
#label = "./dataset/labels/"+(image.split('.')[0]+'.txt')
img = cv2.imread(im)
cv2_img = cv2.imread(im)
#imgray = cv2.cvtColor(imgR, cv2.COLOR_BGR2GRAY)
#ret, img = cv2.threshold(imgray, 100, 255, cv2.THRESH_BINARY_INV)
tensor_img = torch.tensor(transform.to_tensor(img))
tensor_img = torch.reshape(tensor_img, (1, tensor_img.size(0), tensor_img.size(1), tensor_img.size(2)))

predictions = model_ssd(tensor_img)

#pred = model_ssd(dataset_train[0][0])
print(predictions)

for i in range(len(predictions[0]['boxes'])):
    score = predictions[0]['scores'][i].item()
    if score > 0.5:
        x,y,x2,y2 = predictions[0]['boxes'][i].detach().numpy()
        cv2.rectangle(cv2_img, (int(x), int(y)), (int(x2), int(y2)), (0, 0, 255), 2)
        cv2.putText(cv2_img, str(score*100)[:5], (int((x+x2)/2)-200,int((y+y2)/2)), cv2.FONT_HERSHEY_SIMPLEX, 5, (0, 255, 255), 5, cv2.LINE_AA)

        print('Faster-RCNN scores for bbs:', score)
cv2.imwrite('./results/all-ssd/'+image, cv2_img)

  # Remove the CWD from sys.path while we load stuff.


[{'boxes': tensor([[ 893.3196, 1647.6664, 2066.8818, 2961.3760],
        [1756.8842, 1239.7880, 2076.8975, 1519.0928],
        [ 791.0107, 1241.5090, 2697.6719, 2285.1575],
        [1053.7938,    0.0000, 1488.4951, 1137.0680],
        [2144.0461, 1058.5895, 2431.6250, 1755.1636],
        [2361.3345,  961.9185, 2580.6604, 1573.9338],
        [1109.5116,  939.8868, 1727.2297, 1578.8533],
        [1984.7820, 1090.2424, 2211.6326, 1639.1440],
        [2145.4233, 2433.6357, 2665.4685, 3517.1460],
        [ 951.5978,  299.9469, 1718.7595,  870.7869],
        [1057.4376,  492.3156, 1273.1948, 2936.8867],
        [1163.5031,  162.9524, 1477.0153, 2667.8718],
        [1005.8318, 1389.4165, 1240.4116, 3302.3110],
        [1276.8878,    0.0000, 1710.3306,  890.1877],
        [2208.7209,  588.4589, 2589.9365, 1778.2502],
        [1200.8909, 1624.9567, 2564.8635, 2391.3796],
        [ 812.0310,  802.6419, 1193.2656, 3180.7698],
        [ 131.1717, 2241.9607,  358.6216, 3605.3259],
        [1387.226

Faster-RCNN scores for bbs: 0.8755373954772949
Faster-RCNN scores for bbs: 0.875408947467804
Faster-RCNN scores for bbs: 0.8743588924407959
Faster-RCNN scores for bbs: 0.8739598393440247
Faster-RCNN scores for bbs: 0.8731203675270081
Faster-RCNN scores for bbs: 0.8707557320594788
Faster-RCNN scores for bbs: 0.8690065741539001


True

## Loading model

In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_all = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)

num_classes = 13
in_features = model_all.roi_heads.box_predictor.cls_score.in_features
model_all.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

model_all.to(device)

params = [p for p in model_all.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, weight_decay=0.0005)

checkpoint = torch.load('./models/Faster-RCNN/All_v4/All-Comp_v4_SGD-StepLR-7.pt')
model_all.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
model_all.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=1e-05)
          (relu

## Optical character recognition


In [3]:
import torchvision.models.detection.ssd
torchvision.models.detection.ssd

<module 'torchvision.models.detection.ssd' from 'C:\\Users\\Cryslacks\\anaconda3\\envs\\nnlm\\lib\\site-packages\\torchvision\\models\\detection\\ssd.py'>

## Post processing engine

In [12]:
def mixup_data(x, y, alpha=1.0, use_cuda=False):
    '''Returns mixed inputs, pairs of targets, and lambda'''
    if alpha > 0:
        lam = numpy.clip(numpy.random.beta(alpha, alpha), 0.4, 0.6)
    else:
        lam = 1

    batch_size = x.size()[0]
    if use_cuda:
        index = torch.randperm(batch_size).cuda()
    else:
        index = torch.randperm(batch_size)
        
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y.numpy(), y[index].numpy()
    mixedup_bboxes = []
    for bbox, s_bbox in zip(y_a, y_b):
        mixedup_bboxes.append([bbox,s_bbox])
        
    return mixed_x, index, torch.tensor(mixedup_bboxes)

In [15]:
im1 = cv2.imread('./new_more_dataset/Checkbox/IMG20220209145109.jpg')
im2 = cv2.imread('./new_more_dataset/Button/IMG20220209143703.jpg')
im3 = cv2.imread('./new_more_dataset/Header/IMG20220209144530.jpg')
im4 = cv2.imread('./new_more_dataset/List/IMG20220209145958_01.jpg')
#image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(numpy.float32)
#image /= 255.0

mix = mixup_data(torch.tensor(numpy.array([im1, im2])), [[1],[2]])
#print(mix.numpy())
#cv2.imshow('img', mix.numpy())
cv2.imwrite('./Looool1.jpg', mix[0].numpy()[0])
cv2.imwrite('./Looool2.jpg', mix[0].numpy()[1])
cv2.imwrite('./Looool3.jpg', mix[0].numpy()[2])
cv2.imwrite('./Looool4.jpg', mix[0].numpy()[3])

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [64]:
# Copy right of https://github.com/sunshiding/ssd-pytorch-custom/blob/7d036770a6ec616fa7374bc8c798a3fd05888b33/layers/modules/multibox_loss.py
import torch
import torch.nn as nn

class MultiBoxLoss(nn.Module):
    """SSD Weighted Loss Function
    Compute Targets:
        1) Produce Confidence Target Indices by matching  ground truth boxes
           with (default) 'priorboxes' that have jaccard index > threshold parameter
           (default threshold: 0.5).
        2) Produce localization target by 'encoding' variance into offsets of ground
           truth boxes and their matched  'priorboxes'.
        3) Hard negative mining to filter the excessive number of negative examples
           that comes with using a large number of default bounding boxes.
           (default negative:positive ratio 3:1)
    Objective Loss:
        L(x,c,l,g) = (Lconf(x, c) + aLloc(x,l,g)) / N
        Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss
        weighted by a which is set to 1 by cross val.
        Args:
            c: class confidences,
            l: predicted boxes,
            g: ground truth boxes
            N: number of matched default boxes
        See: https://arxiv.org/pdf/1512.02325.pdf for more details.
    """

    def __init__(self, num_classes, overlap_thresh, prior_for_matching,
                 bkg_label, neg_mining, neg_pos, neg_overlap, encode_target,
                 device):
        super(MultiBoxLoss, self).__init__()
        self.use_gpu = False if device.type == 'cpu' else True
        self.device = device
        self.num_classes = num_classes
        self.threshold = overlap_thresh
        self.background_label = bkg_label
        self.encode_target = encode_target
        self.use_prior_for_matching = prior_for_matching
        self.do_neg_mining = neg_mining
        self.negpos_ratio = neg_pos
        self.neg_overlap = neg_overlap
        self.variance = [0.1, 0.2]

    def forward(self, predictions, targets):
        """Multibox Loss
        Args:
            predictions (tuple): A tuple containing loc preds, conf preds,
            and prior boxes from SSD net.
                conf shape: torch.size(batch_size,num_priors,num_classes)
                loc shape: torch.size(batch_size,num_priors,4)
                priors shape: torch.size(num_priors,4)
            targets (tensor): Ground truth boxes and labels for a batch,
                shape: [batch_size,num_objs,5] (last idx is the label).
        """
        loc_data, conf_data, priors = predictions
        num = loc_data.size(0)
        priors = priors[:loc_data.size(1), :]
        num_priors = (priors.size(0))
        num_classes = self.num_classes

        # match priors (default boxes) and ground truth boxes
        loc_t = torch.Tensor(num, num_priors, 4)
        conf_t = torch.LongTensor(num, num_priors)
        for idx in range(num):
            truths = targets[idx][:, :-1].data
            labels = targets[idx][:, -1].data
            defaults = priors.data
            match(self.threshold, truths, defaults, self.variance, labels,
                  loc_t, conf_t, idx)
        loc_t = loc_t.to(self.device)
        conf_t = conf_t.to(self.device)
        # wrap targets
        loc_t = Variable(loc_t, requires_grad=False)
        conf_t = Variable(conf_t, requires_grad=False)

        pos = conf_t > 0
        num_pos = pos.sum(dim=1, keepdim=True)

        # Localization Loss (Smooth L1)
        # Shape: [batch,num_priors,4]
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_t[pos_idx].view(-1, 4)
        loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False)

        # Compute max conf across batch for hard negative mining
        batch_conf = conf_data.view(-1, self.num_classes)
        loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))

        # Hard Negative Mining
        loss_c[pos.view(-1, 1)] = 0  # filter out pos boxes for now UPDATED
        loss_c = loss_c.view(num, -1)
        _, loss_idx = loss_c.sort(1, descending=True)
        _, idx_rank = loss_idx.sort(1)
        num_pos = pos.view(1, -1).long().sum(1, keepdim=True) # UPDATED
        num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)
        neg = idx_rank < num_neg.expand_as(idx_rank)

        # Confidence Loss Including Positive and Negative Examples
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)
        targets_weighted = conf_t[(pos+neg).gt(0)]
        loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False)

        # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + aLloc(x,l,g)) / N

        N = num_pos.data.sum().float() # UPDATED
        loss_l /= N
        loss_c /= N
        return loss_l, loss_c