##### The libraries using in the project are
- opencv-python and opencv-contrib-python == 4.2.0
- numpy == 1.19.5
- os
- torchvision == 0.8.2+cu101
- torch == 1.7.1+cu101
- sklearn == 0.21.3

In [1]:
import os
import cv2 as cv
import numpy as np

def imshow(x: np.array):
    cv.imshow('img',x)
    cv.waitKey(0)
    cv.destroyAllWindows()

##### Task 1

- Target extraction of curling rock is based on Hough Circles with multiple parameters to encapsulate all of them and reduced overlaping bbox to one
- Extracted patches from target extraction are compared to the histogram of a base curling rock (red and yellow) based on that value if they pass a threshold are counted at inference

In [2]:


base_path = "test/Task1/"
files = os.listdir(base_path)
files = [x for x in files if '.png' in x ]
files = sorted(files, key=lambda x : int(x.split(".")[0]))

def extract_targets(x: str, base_path: str,debug : bool = False) -> list:
    targets = []
    img = cv.imread(base_path+x,0)
    cimg = cv.imread(base_path+x)
    # First pass for hough circles to contain with these parameters
    circles = cv.HoughCircles(img,cv.HOUGH_GRADIENT,1,20,
                                param1=40,param2=28,minRadius=12,maxRadius=24)
    if circles is not None:
        circles = np.uint16(np.around(circles))
        for i in circles[0,:]:
            x1,y1 = int(i[0])-int(i[2]),int(i[1])-int(i[2])
            x2,y2 = int(i[0])+int(i[2]),int(i[1])+int(i[2])
            # fail safe for bbox out of bounds
            if x1 <0:
                x1 = 0
            if x2 <0:
                x2 = 0
            if y1 <0:
                y1 = 0
            if y2 <0:
                y2 = 0
            targets.append([x1,y1,x2-x1,y2-y1])
            if debug:
                cv.rectangle(cimg,(x1,y1),(x2,y2),(0,255,0),2)
    # Second pass for hough circles to contain with these parameters
    img = cv.GaussianBlur(img,(5,5),0)
    circles = cv.HoughCircles(img,cv.HOUGH_GRADIENT,1,20,
                            param1=40,param2=28,minRadius=12,maxRadius=24)
    if circles is not None:
        circles = np.uint16(np.around(circles))
        for i in circles[0,:]:
            x1,y1 = int(i[0])-int(i[2]),int(i[1])-int(i[2])
            x2,y2 = int(i[0])+int(i[2]),int(i[1])+int(i[2])
            if x1 <0:
                x1 = 0
            if x2 <0:
                x2 = 0
            if y1 <0:
                y1 = 0
            if y2 <0:
                y2 = 0
            targets.append([x1,y1,x2-x1,y2-y1])
            if debug:
                cv.rectangle(cimg,(x1,y1),(x2,y2),(0,255,0),2)
    # Last pass for hough circles to contain with these parameters
    img = cv.imread(base_path+x,0)
    circles = cv.HoughCircles(img,cv.HOUGH_GRADIENT,1,90,
                                param1=35,param2=18,minRadius=15,maxRadius=25)
    if circles is not None:
        circles = np.uint16(np.around(circles))
        for i in circles[0,:]:
            x1,y1 = int(i[0])-int(i[2]),int(i[1])-int(i[2])
            x2,y2 = int(i[0])+int(i[2]),int(i[1])+int(i[2])
            if x1 <0:
                x1 = 0
            if x2 <0:
                x2 = 0
            if y1 <0:
                y1 = 0
            if y2 <0:
                y2 = 0
            targets.append([x1,y1,x2-x1,y2-y1])
            if debug:
                cv.rectangle(cimg,(x1,y1),(x2,y2),(0,255,0),2)
    if debug:
        imshow(cimg)
    return cv.groupRectangles(targets,1,0.4)
def calc_similarity(patch):
    
    ball1 = cv.imread('color1.png')
    ball1 = cv.resize(ball1,(45,45))
    ball2 = cv.imread('color2.png')
    ball2 = cv.resize(ball2,(45,45))
    
    hist_base = cv.calcHist([patch], [0, 1, 2], None, [4, 4, 4],[0, 256, 0, 256, 0, 256])
    hist_base = cv.normalize(hist_base, hist_base).flatten()

    hist_1 = cv.calcHist([ball1], [0, 1, 2], None, [4, 4, 4],[0, 256, 0, 256, 0, 256])
    hist_1 = cv.normalize(hist_1, hist_1).flatten()

    hist_2 = cv.calcHist([ball2], [0, 1, 2], None, [4, 4, 4],[0, 256, 0, 256, 0, 256])
    hist_2 = cv.normalize(hist_2, hist_2).flatten()

    val1 = cv.compareHist(hist_base, hist_1, 0)
    val2 = cv.compareHist(hist_base, hist_2, 0)
    return val1,val2
def infer_similarity(base_path: str, x: str) -> list:
    yellow = 0 
    red = 0
    img = cv.imread(base_path+x)
    for y in extract_targets(x,base_path,False)[0]:
        #extract patch
        image = img[y[1]:y[1]+y[3],y[0]:y[0]+y[2],:]
        #calculate similarity between patch and curling ball1 and ball2
        val1,val2 = calc_similarity(image)
        if val1 > 0.50 or val2 > 0.50:
            if val1 > 0.49:
                yellow+=1
            else:
                red+=1
    return [red+yellow,red,yellow]

In [8]:
# For accuracy calculation
correct = 0
for x in files:
    file = open(base_path+"ground-truth/"+x.replace(".png",'_gt.txt'))
    out = [int(x.replace("\n"," ")) for x in file.readlines()]
    #print(out,infer_similarity(base_path,x))
    correct+=out == infer_similarity(base_path,x)
    file.close()
print(correct)

24


In [6]:
# Write predictions
for idx,x in enumerate(files):
    outstring = " ".join(str(x) for x in infer_similarity(base_path,x)).replace(" ","\n")
    #print(outstring)
    file = open('evaluation/submission_files/Dumitrascu_Claudiu_Cristian_407/Task1/'+str(idx+1)+"_predicted.txt",'w')
    file.writelines(outstring)
    file.close()

##### Task 2
- For application we use the last frame from a video
- Target extraction of curling rock is based on Hough Circles for the button part and expande it based on its radius
- Extracted patches from target extraction are compared to the histogram of a base curling rock (red and yellow) based on that value if they pass a threshold are counted at inference and we take the bounding circles and see in which circle they are in, being sorted from button to the last circle, we use the function circle to see in which circle is present in

In [3]:
base_path = "test/Task2/"
files = os.listdir(base_path)
files = [x for x in files if '.mp4' in x ]
files = sorted(files, key=lambda x : int(x.split(".")[0]))

def get_last_frame(video_path):
    """
    This function takes the video path and returns the last frame.
    :param video_path: Path to the video
    """
    frames = []
    cap = cv.VideoCapture(video_path)  
    count = cap.get(cv.CAP_PROP_FRAME_COUNT)
    cap.set(cv.CAP_PROP_POS_FRAMES,count-1)
    if cap.isOpened() == False: 
        raise Exception("Error opening video stream or file") 
        return frames
    while cap.isOpened():  
        ret, frame = cap.read() # Read the frame
        if ret is True:
            frames.append(frame)
        else:
            break
    cap.release()
    return frames[0]

def circle(x1, y1, x2, y2, r1, r2):
    # calculate if two circle overlap of it contains it
    distSq = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2);
    radSumSq = (r1 + r2) * (r1 + r2);
    if (distSq == radSumSq):
        return 1
    elif (distSq > radSumSq):
        return -1
    else:
        return 0
    
def infer_score(base_path: str, x: str, debug: bool = False) -> list:
    # extract the button and then infer the rest of the circles
    score = [0,0]
    img = get_last_frame(base_path+x)
    cv.imwrite("temp.png",img)
    img = cv.GaussianBlur(img,(5,5),0)
    gray = cv.cvtColor(img,cv.COLOR_BGR2GRAY)
    circles = cv.HoughCircles(gray,cv.HOUGH_GRADIENT,1,60,
                                param1=35,param2=45,minRadius=50,maxRadius=135)
    if circles is not None:
        circles = np.uint16(np.around(circles))
        for i in circles[0,:]:
            c1 = [i[0],i[1],i[2]//2-12]
            c2 = [i[0],i[1],i[2]]
            c3 = [i[0],i[1],2*i[2]-5]
            c4 = [i[0],i[1],3*i[2]-10]
            center = [i[0],i[1]]
            if debug:
                cv.circle(img,(i[0],i[1]),i[2]//2-12,(0,255,0),2)
                cv.circle(img,(i[0],i[1]),i[2],(0,255,0),2)
                cv.circle(img,(i[0],i[1]),2*i[2]-5,(0,255,0),2)
                cv.circle(img,(i[0],i[1]),3*i[2]-10,(0,255,0),2)
    rocks = extract_targets("temp.png",'./')
    # sort circles by distance from button center
    for y in sorted(rocks[0],key=lambda x: np.sqrt((center[0] - x[0])**2 + (center[1] - x[1])**2)):        
        image = img[y[1]:y[1]+y[3]-2,y[0]:y[0]+y[2]-2,:]
        yellow = 0
        red = 0
        val1,val2 = calc_similarity(image)
        was_button = 0
        was_second = 0
        # calculating score based on rules
        if val1 > 0.50 or val2 > 0.50:
            if val1 > 0.49:
                yellow=1
            else:
                red=1
            y_circ1,y_circ2,y_r = y[0]+y[2]//2 +1, y[1]+y[2]//2 +1, y[2]//2
            first = circle(c4[0],c4[1],y_circ1,y_circ2,c4[2],y_r)
            if first != -1:
                button = circle(c1[0],c1[1],y_circ1,y_circ2,c1[2],y_r)
                third = circle(c2[0],c2[1],y_circ1,y_circ2,c2[2],y_r)
                second = circle(c3[0],c3[1],y_circ1,y_circ2,c3[2],y_r)
                if button != -1:
                    was_button = 1
                    if yellow:
                        score[0]+=1
                    else:
                        score[1]+=1
                elif second != -1:
                    was_second=1
                    if yellow and score[1] == 0:
                        score[0]+=1
                    elif yellow and was_button == 0:
                        score[0]= 1
                        score[1]= 0
                    elif score[0] ==0:
                        score[1]+= 1
                else:
                    if yellow and score[1] == 0:
                        score[0]+=1
                    elif yellow and was_second == 0 and was_button == 0:
                        score[0]= 1
                        score[1]= 0
                    elif score[0] ==0:
                        score[1]+= 1
                if debug:
                    cv.circle(img,(y_circ1,y_circ2),y_r,(0,0,255),3)
                    imshow(img)
    score.reverse()
    if debug:
        imshow(img)
    return score

In [13]:
# calculate accuracy
correct = 0
for x in files:
    file = open(base_path+"ground-truth/"+x.replace(".mp4",'_gt.txt'))
    out = [int(x.replace("\n"," ")) for x in file.readlines()]
    #print(out,infer_similarity(base_path,x))
    correct+=out == infer_score(base_path,x)
    file.close()
print(correct)

11


In [9]:
# prediction
for idx,x in enumerate(files):
    outstring = " ".join(str(x) for x in infer_score(base_path,x)).replace(" ","\n")
    #print(outstring)
    file = open('evaluation/submission_files/Dumitrascu_Claudiu_Cristian_407/Task2/'+str(idx+1)+"_predicted.txt",'w')
    file.writelines(outstring)
    file.close()

#### Task 3
The method is based on CSRT tracker from Opencv in which takes the first bbox of the object and then is tracked along the way, in case of failing for the tracking, a new tracker is created with the curling rock extracted based on method 1 and then given as the new bbox for the tracker

In [4]:
base_path = "test/Task3/"
files = os.listdir(base_path)
files = [x for x in files if '.mp4' in x ]
files = sorted(files, key=lambda x : int(x.split(".")[0]))

def get_all_frame(video_path):
    """
    This function takes the video path and returns the a list of frames.
    :param video_path: Path to the video
    """
    frames = []
    cap = cv.VideoCapture(video_path)  
    if cap.isOpened() == False: 
        raise Exception("Error opening video stream or file") 
        return frames
    while cap.isOpened():  
        ret, frame = cap.read() # Read the frame
        if ret is True:
            frames.append(frame)
        else:
            break
    cap.release()
    return frames

def bb_intersection_over_union(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # compute the area of intersection rectangle
    interArea = abs(max((xB - xA, 0)) * max((yB - yA), 0))
    if interArea == 0:
        return 0
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = abs((boxA[2] - boxA[0]) * (boxA[3] - boxA[1]))
    boxBArea = abs((boxB[2] - boxB[0]) * (boxB[3] - boxB[1]))

    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    iou = interArea / float(boxAArea + boxBArea - interArea)

    # return the intersection over union value
    return iou

# tracker picker to test multiple 
def create_tracked(x=7):
    tracker_types = ['BOOSTING', 'MIL','KCF', 'TLD', 'MEDIANFLOW', 'GOTURN', 'MOSSE', 'CSRT']
    tracker_type = tracker_types[x]
    if tracker_type == 'BOOSTING':
        tracker = cv.TrackerBoosting_create()
    if tracker_type == 'MIL':
        tracker = cv.TrackerMIL_create()
    if tracker_type == 'KCF':
        tracker = cv.TrackerKCF_create()
    if tracker_type == 'TLD':
        tracker = cv.TrackerTLD_create()
    if tracker_type == 'MEDIANFLOW':
        tracker = cv.TrackerMedianFlow_create()
    if tracker_type == 'GOTURN':
        tracker = cv.TrackerGOTURN_create()
    if tracker_type == 'MOSSE':
        tracker = cv.TrackerMOSSE_create()
    if tracker_type == "CSRT":
        tracker = cv.TrackerCSRT_create()
    return tracker
correct = 0 
predict = []
for idx,x in enumerate(files):
    bounding_box = []
    debug=False
    all_frames = get_all_frame(base_path+x)
    tracking = open(base_path+x.replace(".mp4",'.txt'))
    out = [x.replace("\n","").split() for x in tracking.readlines()][-1]
    out = [int(x) for x in out]
    tracking.close()
    tracker = create_tracked(7)
    #init trackerde
    ok = tracker.init(all_frames[0], (out[1],out[2],out[3]-out[1],out[4]-out[2]))
    tracking.close()
    bad = 0
    for count,frame in enumerate(all_frames):
        # Update tracker
        ok, bbox = tracker.update(frame)
        bbox = np.int32(np.round(bbox))

        # Draw bounding box
        if ok:
            # Tracking success
            p1 = (int(bbox[0]), int(bbox[1]))
            p2 = (int(bbox[0] + bbox[2]), int(bbox[1] + bbox[3]))
            if debug:
                cv.rectangle(frame, p1, p2, (255,0,0), 2, 1)
        else:
            tracker = create_tracked(7)
            #print('fail')
            cv.imwrite("temp.png",frame)
            for y in extract_targets("temp.png",'./',False)[0]:
                image = frame[y[1]:y[1]+y[3],y[0]:y[0]+y[2],:]
                val1,val2 = calc_similarity(image)
                if val1 > 0.50 or val2 > 0.50:
                    ok = tracker.init(frame, (y[0],y[1],y[2],y[3]))
                    ok, bbox = tracker.update(frame)
                    bbox = np.int32(np.round(bbox))
                    # Draw bounding box
                    if ok:
                        # Tracking success
                        p1 = (int(bbox[0]), int(bbox[1]))
                        p2 = (int(bbox[0] + bbox[2]), int(bbox[1] + bbox[3]))
                        if debug:
                            cv.rectangle(frame, p1, p2, (255,0,0), 2, 1)
                break
        # Display result
        if debug:
            cv.imshow("Tracking", frame)
        predict = [count,bbox[0],bbox[1],bbox[0]+bbox[2],bbox[1]+bbox[3]]
        bounding_box.append(predict)
        if debug:
            # Exit if ESC pressed
            if count == len(all_frames)-3:
                cv.waitKey(0)
                cv.destroyAllWindows()
                break
            k = cv.waitKey(1) & 0xff
            if k == 27 : 
                cv.destroyAllWindows()
                break
                
                
    # prediction writing
    base_string = str(len(all_frames)) +" -1 -1 -1 -1\n"
    for x in bounding_box:
        if x == bounding_box[-1]:
            base_string+=str(x[0])+" "+str(x[1])+" "+str(x[2])+" "+str(x[3])+" "+str(x[4])
        else:
            base_string+=str(x[0])+" "+str(x[1])+" "+str(x[2])+" "+str(x[3])+" "+str(x[4])+"\n" 
    file = open('evaluation/submission_files/Dumitrascu_Claudiu_Cristian_407/Task3/'+str(idx+1)+"_predicted.txt",'w')
    file.writelines(base_string)
    file.close()

KeyboardInterrupt: 

### Task 4

This one is going to prove a little bit difficult, we have 10 files in which at the start of the frame the curling rock is not present and can appear any time so we cannot use something that we used previously the solution would be a pattern matching for mutiple views of the curling rock untill a part of the video and apply what we did for task 1,2 and 3 from there.

In [5]:
base_path = "test/Task4/"
files = os.listdir(base_path)
files = [x for x in files if '.mp4' in x ]
files = sorted(files, key=lambda x : int(x.split(".")[0]))

total = 0
frames = []
labels = []
for idx,x in enumerate(files):
    all_frames = get_all_frame(base_path+x)
    frames.append(all_frames)
    total+=len(all_frames)
    tracking = open('train/Task4/'+"ground-truth/"+x.replace(".mp4",'_gt.txt'))
    out = [x.replace("\n","").split() for x in tracking.readlines()]
    out = [[int(y) for y in x ]for x in out]
    labels.append(out)
print("Total amount of frames",total)

if len(labels) == len(frames):
    print("something wrong")
    
new_labels = []
no_object = [0,0,0,0,0]
for i,label in enumerate(labels):
    label = label[1:]
    for idx,x in enumerate(frames[i]):
        no_object = [0,0,0,0,0]
        #print(label[idx])
        if idx+1 == len(frames[i])-1:
            break
        if idx+1 > len(labels)-2:
            label.insert(idx+1,no_object)
        if label[idx+1][0] != idx+1:
            no_object[0] = idx+1
            label.insert(idx+1,no_object)
    if len(frames[i]) == len(label):
        print("good")
        new_labels.append(label)
    else:
        print("bad")
all_frames = []
for x in frames:
    for y in x:
        all_frames.append(y)
labels = []
for x in new_labels:
    for y in x:
        labels.append(y)
        
print(len(all_frames),len(labels))

Total amount of frames 6802
something wrong
bad
bad
bad
bad
bad
bad
bad
bad
bad
bad
6802 0


Deep learning model for object detection

In [6]:
import torch.nn as nn
import torch
import torch.nn.functional as F

In [7]:
from sklearn.model_selection import train_test_split
# spliting for train,validation
x_train,x_test,y_train,y_test = train_test_split(all_frames, np.zeros((6802,4)), test_size=0.20, random_state=1)

In [8]:
from torch.utils.data import Dataset

class DataClass(Dataset):
    def __init__(self, data, labels, transform=None):
        """
        :param data: data list with features
        :param labels: label list
        :param transform: Applying preprocessing on the data
        """
        self.df = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx: int) -> (torch.tensor, torch.tensor):
        # resize for memory limitation
        sample =torch.tensor(cv.resize(self.df[idx],(0,0),fx=0.25,fy=0.25))
        label = torch.Tensor(self.labels[idx][1:])
        sample = sample.permute(2,1,0)
        if self.transform:
            sample = self.transform(sample)
        return sample, label

In [9]:
len(x_train)

5441

In [10]:
len(x_test)

1361

In [11]:
from torch.utils.data import DataLoader
from torchvision import transforms

train_loader = DataLoader(DataClass(x_train,torch.zeros((5441,5)),None),batch_size=16,shuffle=True)
test_loader = DataLoader(DataClass(x_test,torch.zeros((1361,5)),None),batch_size=16,shuffle=True)

In [12]:
import torchvision.models as models

# model based on inception_v3 and 2 fully connected with sigmoid output for coordinates
class CustomModel(nn.Module):
    def __init__(self):
        super(CustomModel, self).__init__()
        self.backbone = models.inception_v3(pretrained=True, aux_logits=False,transform_input =True)
        self.linear1 = nn.Linear(1000,500)
        self.linear2 = nn.Linear(500,4)

    def forward(self, x: torch.Tensor):
        x = F.relu(self.backbone(x))
        #print(x)
        x = F.dropout(x,0.2)
        x = F.relu(self.linear1(x))
        x = self.linear2(x)
        return torch.sigmoid(x)

In [13]:
model = CustomModel().cuda()
optim = torch.optim.Adam(model.parameters(),lr=1e-4) #optimizer
loss = nn.L1Loss() #classification loss

In [14]:
def train(epoch,train_loader, optim, loss,network):
    batch_interval=50
    for batch_idx,(x,y) in enumerate(train_loader):
        # we do this step to transform labels of bbox to sigmoid values
        y[:,0] = y[:,0]/ 1280
        y[:,1] = y[:,1]/ 720
        y[:,2] = y[:,2]/ 1280
        y[:,3] = y[:,3]/ 720
        x = x.cuda()
        y = y.cuda()
        optim.zero_grad()

        output = network(x)

        loss_value = loss(output, y)

        loss_value.backward()

        optim.step()
        if batch_idx % batch_interval == 0:
                    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch, batch_idx * len(x), len(train_loader.dataset),
                               100. * batch_idx / len(train_loader), loss_value.item()))
    print('Train Epoch: {} Length {} \tLoss: {:.6f}'.format(epoch, len(train_loader), loss_value.item()))
    
def bb_intersection_over_union(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = torch.max(boxA[0], boxB[0])
    yA = torch.max(boxA[1], boxB[1])
    xB = torch.min(boxA[2], boxB[2])
    yB = torch.min(boxA[3], boxB[3])
    # compute the area of intersection rectangle
    if xB - xA + 1  > 0 and yB - yA + 1 > 0:
        interArea = (xB - xA + 1) *  (yB - yA + 1)
    elif  xB - xA + 1 < 0 or yB - yA + 1 < 0:
        interArea = torch.tensor(0)
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the intersection area
    iou = interArea / (boxAArea + boxBArea - interArea)
    # return the intersection over union value
    return iou

def test(epoch,test_loader,network,loss):
    mse= 0 
    iou_score = 0
    with torch.no_grad():
        for data, target in test_loader:
            target = target.cuda()
            data = data.cuda()

            outputs = network(data)
            
            test_loss = loss(outputs, target)
            # we transform outputs to bbox coordinates and calculate if IOU is higher then 0.20
            outputs[:,0] = outputs[:,0]* 1280
            outputs[:,1] = outputs[:,1]* 720
            outputs[:,2] = outputs[:,2]* 1280
            outputs[:,3] = outputs[:,3]* 720
            good=0
            for bbox1,bbox2 in zip(target,outputs):
                intersect = bb_intersection_over_union(bbox1,bbox2)
                if intersect > 0.20:
                    good+=1
            mse += F.mse_loss(outputs,target)
            iou_score +=good
    print('Test Epoch: {} Length {} \tLoss: {:.6f}'.format(epoch, len(test_loader), test_loss.item()))
    print("MSE - ", mse)
    print("IOU with score > 0.20 ",iou_score," total ",len(test_loader)*16)

Training using as loss L1Loss which is good for anomalies like the curling rock not being present and optimizer Adam with lre 1e-4, the model was trained on approx 6000 frames with labels of the bounding box of the curling rock, some labels we're added to train the model to say if the curling rock is present with the normal label or with [0,0,0,0] if not present

In [60]:

for e in range(12):
    train(e,train_loader,optim,loss, model)
    test(e,test_loader,model,loss)

Train Epoch: 0 Length 300 	Loss: 0.047535
Test Epoch: 0 Length 75 	Loss: 0.024249
MSE -  tensor(645929.8750, device='cuda:0')
IOU with score > 0.20  577  total  1200
Train Epoch: 1 Length 300 	Loss: 0.018565
Test Epoch: 1 Length 75 	Loss: 0.018074
MSE -  tensor(566694.1250, device='cuda:0')
IOU with score > 0.20  843  total  1200
Train Epoch: 2 Length 300 	Loss: 0.121511
Test Epoch: 2 Length 75 	Loss: 0.038285
MSE -  tensor(225783.6562, device='cuda:0')
IOU with score > 0.20  782  total  1200
Train Epoch: 3 Length 300 	Loss: 0.026148
Test Epoch: 3 Length 75 	Loss: 0.020408
MSE -  tensor(257150.2500, device='cuda:0')
IOU with score > 0.20  882  total  1200
Train Epoch: 4 Length 300 	Loss: 0.014671
Test Epoch: 4 Length 75 	Loss: 0.066369
MSE -  tensor(303010.7188, device='cuda:0')
IOU with score > 0.20  909  total  1200
Train Epoch: 5 Length 300 	Loss: 0.013981
Test Epoch: 5 Length 75 	Loss: 0.047489
MSE -  tensor(171316.4062, device='cuda:0')
IOU with score > 0.20  895  total  1200
Trai

We created a model based on inception_v3 for object detection on frames which achieves the accuracy 92% accuracy on IOU at least 20%

In [66]:
# We save the model to have it for predictions
#torch.save(model.state_dict(), "./task4.pth")

In [15]:
# loading the pretrained model
model.load_state_dict(torch.load("./task4.pth"))

<All keys matched successfully>

Write prediction

In [19]:
model.eval()
all_pred = []
with torch.no_grad():
    for x in frames:
        #print(x[:5][:5])
        #print(y[:5])
        pred = []
        all_loader = DataLoader(DataClass(x, torch.zeros((len(x),5)),None),batch_size=16)
        for idx,(data,target) in enumerate(all_loader):
            data = data.cuda()
            out = model(data)
            out[:,0] = out[:,0]* 1280
            out[:,1] = out[:,1]* 720
            out[:,2] = out[:,2]* 1280
            out[:,3] = out[:,3]* 720
            out = torch.round(out).int().detach().cpu().numpy()
            pred.append(out)
        all_pred.append(np.vstack(pred))

In [20]:
all_pred[0][:15]

array([[576, 500, 722, 632],
       [549, 508, 711, 651],
       [548, 508, 705, 643],
       [556, 518, 724, 651],
       [563, 509, 711, 644],
       [556, 508, 724, 651],
       [553, 505, 728, 645],
       [559, 515, 731, 651],
       [558, 507, 728, 644],
       [552, 509, 726, 654],
       [558, 511, 728, 643],
       [561, 518, 733, 653],
       [556, 511, 718, 650],
       [560, 509, 720, 641],
       [564, 508, 718, 643]])

In [33]:
# Submission
for idx,bbox in enumerate(all_pred):
    base_string = str(len(bbox)) +" -1 -1 -1 -1\n"
    for i,x in enumerate(bbox):
        #print(bbox[-1])
        #print(x)
        if sum(x) < 25:
            continue
        if x is bbox[0][-1]:
            base_string+=str(i)+" "+str(x[0])+" "+str(x[1])+" "+str(x[2])+" "+str(x[3])
        else:
            base_string+=str(i)+" "+str(x[0])+" "+str(x[1])+" "+str(x[2])+" "+str(x[3])+"\n" 
    file = open('evaluation/submission_files/Dumitrascu_Claudiu_Cristian_407/Task4/'+str(idx+1)+"_predicted.txt",'w')
    file.writelines(base_string)
    file.close()

In [34]:
# For vizualization

for item1,item2 in zip(frames,all_pred):
    for count,(x,y) in enumerate(zip(item1,item2)):
        cv.rectangle(x, (y[0],y[1]), (y[2],y[3]), (255,0,0), 2, 1)    
        #if debug:
        cv.imshow("Tracking", x)
        #predict = [count,bbox[0],bbox[1],bbox[0]+bbox[2],bbox[1]+bbox[3]]
        #bounding_box.append(predict)
        #if debug:
        # Exit if ESC pressed
        if count == len(all_frames)-3:
            cv.waitKey(0)
            cv.destroyAllWindows()
            break
        k = cv.waitKey(1) & 0xff
        if k == 27 : 
            cv.destroyAllWindows()
            break