In [13]:
import torch
import torchvision
from torchvision import transforms as T
import cv2
torch.device = "cuda" if torch.cuda.is_available() else "cpu"
print(torch.device)

cpu


In [14]:
model = torchvision.models.detection.ssd300_vgg16(pretrained=True)
model.eval()



SSD(
  (backbone): SSDFeatureExtractorVGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=

In [2]:
classnames = []
with open ('classes.txt', 'r') as f:
    classnames = f.read().splitlines()
    print(classnames)

['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'street sign', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk', 'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'hair brush']


In [16]:
def load_image(image_path):
    image = cv2.imread(image_path)
    return image

def transform_image(image):
    img_transform = T.ToTensor()
    image_tensor = img_transform(image)
    return image_tensor

In [17]:
import time 
import entity

def detect_objects(model, image_tensor, entities, confidence_threshold=0.8):
    
    ## The entity system is going to determine if there are any known entities that were recently found
    ## within the frame. It'll use this to remove the entity scan areas from image_tensor, turning them into 
    ## zeros. This should improve the speed for the search for new entities.
    start_time = time.time()
    print(start_time)
    entities = {}
    if entities == {}:
        with torch.no_grad():
            y_pred = model([image_tensor])
        
        bbox, scores, labels = y_pred[0]['boxes'], y_pred[0]['scores'], y_pred[0]['labels']
        indices = torch.nonzero(scores > confidence_threshold).squeeze(1)
        
        filtered_bbox = bbox[indices]
        filtered_scores = scores[indices]
        filtered_labels = labels[indices]
        
        if len(filtered_bbox) == len(filtered_labels):
            new_entities = []
            for l, i in enumerate(filtered_labels):
                x, y, w, h = bbox[i].numpy().astype('int')
                class_label = classnames[l]
                if class_label not in entities.keys():
                    entities[class_label] = []
                entity = {"bbox":(x, y,w,h), "label":classnames[l]}
                print(entity)
                entities[class_label].append(entity)
        end_time = time.time()
        print(f"time taken: {end_time - start_time}")
        return filtered_bbox, filtered_scores, filtered_labels
    else:
        print("There are known entities")
    end_time = time.time()
    
    print(f"time taken: {end_time - start_time}")
    
    


In [18]:
#There is an issue with the system currently. It's not drawing the label.
def draw_boxes_and_labels(image, bbox, labels, class_names):
    img_copy = image.copy()
    
    for i in range(len(bbox)):
        x, y, w, h = bbox[i].numpy().astype('int')
        cv2.rectangle(img_copy, (x, y), (w, h), (0, 0, 255), 2)
        
        class_index = labels[i].numpy().astype('int')
        class_detected = class_names[class_index]
        try:
            cv2.putText(img_copy, class_detected, (x, y-10), cv2.FONT_HERSHEY_COMPLEX, .5, (0, 180, 0), 1, cv2.LINE_AA)
        except:
            continue        
    return img_copy


In [20]:

image_path = '../Visuals/test2.jpg'
img = load_image(image_path)
img_tensor = transform_image(img)
print(img_tensor.shape)
bbox, scores, labels = detect_objects(model, img_tensor, entities=[])

result_img = draw_boxes_and_labels(img, bbox, labels, classnames)


cv2.imshow("The Image", result_img)
cv2.waitKey()
cv2.destroyAllWindows()


torch.Size([3, 480, 640])
1706859711.6011868
{'bbox': (84, 351, 103, 400), 'label': 'person'}
{'bbox': (84, 351, 103, 400), 'label': 'bicycle'}
time taken: 0.4830169677734375


In [21]:
def determine_similar_entity(entity_image, scan_image):
    
    entity_image.resize_(3, 220, 220)
    scan_image.resize_(3, 220, 220)
    
    cosi = torch.nn.CosineSimilarity(dim=0)
    sim = cosi(entity_image, scan_image)
    sim = torch.mean(sim)
    if sim > .7:
        return True
    else:
        return False

In [22]:
import time
def capture_and_display_current_frame_rate(image, start_time):
    fr = round(1.0/(time.time() - start_time))
    image = cv2.putText(image, f"Frame Rate: {str(fr) if fr < 120 else 120}", (0, 15), cv2.FONT_HERSHEY_COMPLEX, .5, (0, 0, 0), 1, cv2.LINE_AA)
    return round(1.0 / (time.time() - start_time)), image

In [23]:
def general_image_detection_1(frame):
    ### This is our general image detection. It uses it's model to determine if any of those classes exist in the image.
    bbox, labels, classnames = detect_objects(model, transform_image(frame), entities, confidence_threshold=.8)
    frame = draw_boxes_and_labels(frame, bbox, labels, classnames)
    return frame
    

In [24]:
class UI_Display():
    def __init__(self):
        print("Initializing UI")
        self.flags = {"main UI settings":{"framerate":True}}
        
    def Display_Text(self, image, text, line , size=.5, color=(), thickness = 1):
        try:
            cv2.putText(image, text, (0, (15*line)), cv2.FONT_HERSHEY_COMPLEX, size, color, thickness, cv2.LINE_AA)
            return image
        except:
            print("Cannot display text. Unknown error")
            
    def UI_Update(self):
        for f in self.flags.keys():
            print("f=", f, type(f))
            if type(self.flags[f]) == dict:
                for sub in self.flags[f].keys():
                    print(sub)

In [25]:
### This one runs the same model through the webcam. It's slow. I do not like this. 

# import the opencv library 
import cv2 
  
# define a video capture object 
vid = cv2.VideoCapture(0) 
currentFrame = 0
entities = []
image_display_flags = {}
image_display_flags["general_0"] = True
entities = {}
display = UI_Display()
### This is where the magic starts. We'll get the metrics to determine our frame rate, and grab the frame from the webcam. 
### All the ai and post processing happens through this central process.
while(True): 
      
    # Capture the video frame 
    # by frame 
    startFrame = time.time()
    ret, frame = vid.read() 
    
    if image_display_flags["general_0"] == True:
        frame = general_image_detection_1(frame)
    
        
    # result_img = draw_boxes_and_labels(frame, bbox, labels, classnames)
    # cv2.putText(result_img, f"FPS: {round(1.0 / (time.time() - startFrame), 2)}", (0, 15), cv2.FONT_HERSHEY_COMPLEX, .5, (0, 0, 0), 1, cv2.LINE_AA)
    # cv2.putText(result_img, f"Current Frame: {currentFrame}", (0, 30), cv2.FONT_HERSHEY_COMPLEX, .5, (0, 0, 0), 1, cv2.LINE_AA)
    fr, frame = capture_and_display_current_frame_rate(frame, startFrame)
    display.UI_Update()
    cv2.imshow("Ai Object Detection 1", frame)
    currentFrame += 1
      
    # the 'q' button is set as the 
    # quitting button you may use any 
    # desired button of your choice 
    if cv2.waitKey(1) & 0xFF == ord('q'): 
        
        break
  
# After the loop release the cap object 
vid.release() 
# Destroy all the windows 
cv2.destroyAllWindows() 

Initializing UI
1706859730.2142894
time taken: 0.39067578315734863
f= main UI settings <class 'str'>
framerate
1706859730.6312456
time taken: 0.40299129486083984
f= main UI settings <class 'str'>
framerate
1706859731.0418806
time taken: 0.4328420162200928
f= main UI settings <class 'str'>
framerate
1706859731.4827251
time taken: 0.4147984981536865
f= main UI settings <class 'str'>
framerate
1706859731.9033546
time taken: 0.39026689529418945
f= main UI settings <class 'str'>
framerate
1706859732.3048549
time taken: 0.3917422294616699
f= main UI settings <class 'str'>
framerate
1706859732.7092512
time taken: 0.3651716709136963
f= main UI settings <class 'str'>
framerate
1706859733.0833528
time taken: 0.37475156784057617
f= main UI settings <class 'str'>
framerate
1706859733.4750752
time taken: 0.3693206310272217
f= main UI settings <class 'str'>
framerate
1706859733.86471
time taken: 0.35861706733703613
f= main UI settings <class 'str'>
framerate
1706859734.2407832
time taken: 0.34342980