In [None]:
# import required libraries
import torch
from PIL import Image, ImageDraw
import os
import matplotlib.pyplot as plt
import cv2 as cv
import time
import numpy as np 
import torchvision.models as models
from PIL import Image
import torch.nn as nn
import json
from torch.utils.data import DataLoader

In [None]:
def setDevice():
    """Choose the compute device for this notebook.

    Prints the choice and returns a ``torch.device``: CUDA when
    ``torch.cuda.is_available()`` reports True, otherwise the CPU.
    """
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    chosen = torch.device(backend)
    print(f"Using {chosen}")
    return chosen

In [None]:
def loadYoloModel():
    """Fetch the pretrained ``yolov5s`` entry point of the ``ultralytics/yolov5`` hub repo and return it."""
    repo, variant = "ultralytics/yolov5", "yolov5s"
    return torch.hub.load(repo, variant, pretrained=True)

In [None]:
def _boxCenter(box):
    """Return the (x, y) centre of a box given as [xmin, ymin, xmax, ymax, ...]."""
    return ((box[2] + box[0]) // 2, (box[3] + box[1]) // 2)


def _drawDot(draw, center, radius):
    """Draw a filled red dot of the given radius around ``center``."""
    draw.ellipse(
        (center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius),
        fill="red", outline="red",
    )


def detectSingleImage(imagePath, device, model):
    """Run the detector on one image and return an annotated copy of it.

    Parameters
    ----------
    imagePath : path of the image file; opened with PIL and converted to RGB.
    device : accepted for interface compatibility; this function does not use it.
    model : callable detector. Its result must offer ``print()``,
        ``pandas().xyxy[0]["name"]`` and ``xyxy[0]`` whose rows start with
        xmin, ymin, xmax, ymax, confidence (the fields read below).

    Returns
    -------
    The RGB PIL image with, for every detection in order:
      * if it is a "person": for each non-person detection, a green marker
        pixel at that box's top-left corner, a red line between the two box
        centres and a red dot on each centre;
      * then the detection's own box (blue for persons, green otherwise).
        Non-person detections with confidence below 0.4 get no box.

    Also prints the detector summary and the elapsed time.
    """
    # Imported locally so the timer keeps working even after the module-level
    # name `time` is rebound (a later cell executes `from time import time`).
    from time import perf_counter

    start = perf_counter()
    radius = 4

    image = Image.open(imagePath).convert('RGB')
    with torch.no_grad():
        result = model(image)
    result.print()

    resultNames = result.pandas().xyxy[0]["name"].tolist()
    predictions = [row.tolist() for row in result.xyxy[0]]
    isPerson = [name == "person" for name in resultNames]

    # One drawing context for the whole image: every annotation is applied in
    # place, so no numpy round-trip of the full image is needed per pair.
    draw = ImageDraw.Draw(image)

    for i, prediction in enumerate(predictions):
        if isPerson[i]:
            personCenter = _boxCenter(prediction)  # invariant for the inner loop
            for j, other in enumerate(predictions):
                if isPerson[j]:
                    continue

                # Debug marker: one green pixel at the object's top-left corner.
                # draw.point clips silently, so a corner that rounds onto the
                # image border can no longer raise IndexError.
                draw.point((round(other[0]), round(other[1])), fill=(0, 255, 0))

                # NOTE(review): lines are drawn to every non-person detection,
                # including ones below the 0.4 confidence cut-off used for boxes.
                objCenter = _boxCenter(other)
                draw.line(
                    (personCenter[0], personCenter[1], objCenter[0], objCenter[1]),
                    fill="red", width=3,
                )
                _drawDot(draw, personCenter, radius)
                _drawDot(draw, objCenter, radius)

        # Low-confidence boxes are skipped unless they are persons.
        if prediction[4] < 0.4 and not isPerson[i]:
            continue

        color = "blue" if isPerson[i] else "green"
        box = [round(value) for value in prediction[:4]]
        draw.rectangle(box, outline=color, width=3)

    print(f"Took {perf_counter()-start} seconds.")
    return image

In [None]:
# Select the compute device once (CUDA when available, otherwise CPU).
# Later cells read this global `device`.
device = setDevice()

In [None]:
# Load the pretrained YOLOv5s detector through torch.hub (see loadYoloModel).
yoloModel = loadYoloModel()
# Bare expression: the notebook shows the model's repr as this cell's output.
yoloModel

In [None]:
# Run the detector on one sample training image; the path is relative to the
# notebook's working directory.
resultImage = detectSingleImage("./dataset/train2015/train/HICO_train2015_00000015.jpg", device, yoloModel)
# Bare expression: display the annotated PIL image as this cell's output.
resultImage

In [None]:
def readAnnotationFiles():
    """Load the annotation files stored under ``./annotations``.

    Every file is opened with a context manager, so its handle is released
    even when reading or JSON parsing raises.

    Returns
    -------
    tuple
        ``(trainAnnotationDictionary, testAnnotationDictionary, verbDictionary,
        verbFileArray, hoiFileArray, objFileArray)``:

        * the three dictionaries are the parsed JSON files;
        * ``verbFileArray`` and ``hoiFileArray`` are the text files split on
          ``"\\n"`` (a trailing newline therefore yields a final empty string);
        * ``objFileArray`` holds, per line of the object list, the last
          space-separated token after stripping surrounding spaces.

    Raises
    ------
    OSError
        If one of the files cannot be opened.
    json.JSONDecodeError
        If one of the JSON files is malformed.
    """
    def readLines(path):
        # Returns the file content split into lines on "\n".
        with open(path, "r") as handle:
            return handle.read().split("\n")

    def readJson(path):
        # Returns the parsed JSON document stored at `path`.
        with open(path) as handle:
            return json.load(handle)

    # reading and configuring txt files
    verbFileArray = readLines('./annotations/hico_list_vb.txt')
    hoiFileArray = readLines('./annotations/hico_list_hoi.txt')
    objFileArray = [
        line.strip(" ").split(" ")[-1]
        for line in readLines('./annotations/hico_list_obj.txt')
    ]

    # loading json files
    trainAnnotationDictionary = readJson('./annotations/train_data_with_obj_id.json')
    verbDictionary = readJson('./annotations/verb_dict.json')
    testAnnotationDictionary = readJson("./annotations/test_data_with_obj_id.json")

    return trainAnnotationDictionary, testAnnotationDictionary, verbDictionary, verbFileArray, hoiFileArray, objFileArray


def targetVectorOfAllImages(AnnotationDictionary, verbDictionary, verbFileArray, hoiFileArray):
    """Build one 117-element multi-hot verb vector per annotated image.

    Iterates the keys of ``AnnotationDictionary["name"]``; for every action
    number of that key the verb is taken as the last space-separated token of
    ``hoiFileArray[action - 1]`` and its position among the values of
    ``verbDictionary`` is set to 1. A line with the image name, its object
    list and the verb is printed for every action.

    ``verbFileArray`` is accepted but not used.

    Returns a list of numpy arrays, in the iteration order of the name keys.
    """
    namesByKey = AnnotationDictionary["name"]
    actionsByKey = AnnotationDictionary["action_no"]
    objectsByKey = AnnotationDictionary["obj_list"]
    # Looked up but never used afterwards; the access is retained so a missing
    # "obj_id" key still raises exactly as before.
    AnnotationDictionary["obj_id"]

    verbs = list(verbDictionary.values())

    def encode(key):
        # Multi-hot verb vector for the image stored under `key`.
        vector = np.zeros(117)
        for hoiNumber in actionsByKey[key]:
            # hoi numbers are 1-based positions into hoiFileArray
            verb = hoiFileArray[hoiNumber - 1].strip(" ").split(" ")[-1]
            vector[verbs.index(verb)] = 1
            print([namesByKey[key]], objectsByKey[key], [verb])
        return vector

    return [encode(key) for key in namesByKey]
 

def getIndividualTargetVector(imageName, AnnotationDictionary, verbDictionary, verbFileArray, hoiFileArray, objFileArray):
    """Build the training target of a single image.

    Parameters
    ----------
    imageName : file name to look up among the values of ``AnnotationDictionary["name"]``.
    AnnotationDictionary : dict with the entries "name", "action_no" and
        "obj_list"; their values are matched by position.
    verbDictionary : dict whose values, in order, define the verb positions.
    verbFileArray : accepted for interface compatibility; not used.
    hoiFileArray : list of lines; the verb of action ``n`` is the last
        space-separated token of ``hoiFileArray[n - 1]``.
    objFileArray : list of object names; the position of a name is its index.

    Returns
    -------
    list
        ``[objIndex, targetVector]`` where ``objIndex`` is the position in
        ``objFileArray`` of the first object listed for the image and
        ``targetVector`` is a 117-element numpy array with a 1 at the position
        of every verb annotated for the image.

    Raises
    ------
    ValueError
        If the image name, a verb or the object name cannot be found.
    """
    imageNameArray = list(AnnotationDictionary["name"].values())
    actionNumbers = list(AnnotationDictionary["action_no"].values())
    objectList = list(AnnotationDictionary["obj_list"].values())
    verbArray = list(verbDictionary.values())

    # 0-based position of the image; the same position is used for both the
    # action list and the object list.
    imageIndex = imageNameArray.index(imageName)

    # The object does not depend on the actions, so it is resolved once, up
    # front. (Previously it was read from position imageIndex - 1, i.e. from the
    # neighbouring image, and only inside the action loop, which left it
    # undefined for images without actions.)
    objIndex = objFileArray.index(objectList[imageIndex][0])

    targetVector = np.zeros(117)
    for hoiNumber in actionNumbers[imageIndex]:
        # hoi numbers are 1-based positions into hoiFileArray
        verb = hoiFileArray[hoiNumber - 1].strip(" ").split(" ")[-1]
        targetVector[verbArray.index(verb)] = 1

    return [objIndex, targetVector]



In [None]:
"""read needed files and configure"""
# Load every annotation structure once; the names bound here are globals that
# the dataset-building cell further down reads.
trainAnnotationDictionary, testAnnotationDictionary, verbDictionary, verbFileArray, hoiFileArray, objFileArray = readAnnotationFiles()

# The bare triple-quoted strings in this cell are no-op expression statements
# used as section labels.
"""create target vector for all the images"""
# Disabled call, kept for reference:
# targetVectorOfAllImages(trainAnnotationDictionary, verbDictionary, verbFileArray, hoiFileArray)

"""return target vector of given image"""

# Smoke test on one training image; the result is the [objIndex, targetVector]
# list returned by getIndividualTargetVector.
targetVector = getIndividualTargetVector("HICO_train2015_00000016.jpg",trainAnnotationDictionary,verbDictionary, verbFileArray, hoiFileArray, objFileArray)
# NOTE(review): the imports below sit mid-cell; torch and torch.nn are already
# imported by the first cell, and `summary` is not used in the visible cells.
import torch
import torch.nn as nn

import torchvision
import torchvision.transforms as transforms

from torchsummary import summary


# Show the target computed above.
print(targetVector)  

In [None]:
import torch
import torch.nn as nn

import torchvision
import torchvision.transforms as transforms

import os
import numpy as np
import pandas as pd
from PIL import Image
from time import time
from matplotlib import pyplot as plt
from IPython.display import display


dataset_path = './dataset/train2015'

# Per-channel normalisation constants.
# NOTE(review): these look like the usual ImageNet statistics — confirm.
mean = torch.tensor([0.485, 0.456, 0.406], dtype=torch.float32)
std = torch.tensor([0.229, 0.224, 0.225], dtype=torch.float32)
transform = transforms.Compose([
    # A (height, width) pair forces every image to 300x300. A bare int only
    # fixes the shorter edge, which leaves tensors of different shapes that the
    # default DataLoader collate cannot stack into one batch.
    transforms.Resize((300, 300)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])


dataset = torchvision.datasets.ImageFolder(dataset_path, transform=transform)


myDataset = []


# Replace the folder-derived class index of every sample with the annotation
# target [objIndex, targetVector] of that image.
# NOTE(review): relies on ImageFolder reading its samples from the list exposed
# as `imgs` — confirm for the installed torchvision version.
for i in range(len(dataset.imgs)):
    imagePath = dataset.imgs[i][0]
    # os.path.basename understands the separators of the running platform;
    # splitting on "\\" only extracted the file name on Windows.
    tempImageName = os.path.basename(imagePath)
    targetVector = getIndividualTargetVector(tempImageName,trainAnnotationDictionary,verbDictionary, verbFileArray, hoiFileArray, objFileArray)
    dataset.imgs[i] = (imagePath, targetVector)
    # Debug aid (disabled): stop after the first 1000 images.
    # if (i % 1000 == 999):
    #     break  # remove this before a full run

dataset.classes = list(verbDictionary.values())

print("hawli")



In [None]:
import torch.optim as optim
import torch.nn.functional as F1


class MyModel(nn.Module):
    """Two-headed classifier over 512-dimensional feature vectors.

    ``forward`` returns ``(out, out2)``:

    * ``out``  — 117 values from ``linear2`` applied to the residual sum of the
      input and a hidden branch (``linear1`` + softmax, applied twice);
    * ``out2`` — 80 values from ``linear3`` applied directly to the input.
    """

    def __init__(self):
        super(MyModel, self).__init__()

        self.linear1 = nn.Linear(512, 512)
        # Not used by forward; kept so the module's attributes stay unchanged.
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(512, 117)

        self.linear3 = nn.Linear(512,80)

    def forward(self, x):
        """Compute both heads for ``x`` whose last dimension has 512 features."""
        residual = x

        # Softmax is taken over the feature dimension explicitly; relying on
        # the implicit dimension is deprecated.
        out = F1.softmax(self.linear1(x), dim=-1)
        out = F1.softmax(self.linear1(out), dim=-1)

        # Add the skip connection to the hidden branch. Adding `x` to
        # `residual` (both the raw input) dropped the branch altogether, so
        # linear1 never influenced the output or received gradients.
        out = self.linear2(out + residual)

        out2 = self.linear3(x)

        return out, out2



# Pretrained ResNet-18 used as a fixed feature extractor: with `fc` replaced by
# an empty Sequential (an identity) it emits the pooled features that MyModel
# consumes.
model = models.resnet18(pretrained=True)
model.fc = nn.Sequential()
# The backbone receives inputs that were moved to `device`, so it has to live
# there as well; otherwise the forward pass fails on CUDA.
model = model.to(device)
# The backbone's parameters are not handed to the optimizer, so keep it in
# inference mode (no batch-norm statistic updates) and skip its gradients.
model.eval()
model.requires_grad_(False)

myModel = MyModel().to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(myModel.parameters(),lr= 0.001)

In [None]:


epoch = 50

# criterion1: object head (class index target); criterion2: verb head
# (multi-hot vector target).
criterion1 = nn.CrossEntropyLoss()
criterion2 = nn.MSELoss()

optimizer = optim.Adam(myModel.parameters(),lr= 0.001)

train_dataloader = DataLoader(dataset=dataset, batch_size = 16, shuffle = False)


for i in range(epoch):

    # Per-batch loss values of the current epoch, kept for inspection.
    epoch_loss = []

    for j, batch in enumerate(train_dataloader, 1):

        minput = batch[0].to(device)
        # Each sample's target is the [objIndex, targetVector] pair, so the
        # collated batch target is indexed the same way. Both parts must be on
        # the same device as the model outputs.
        target = batch[1]
        objectTarget = target[0].to(device)
        verbTarget = target[1].to(device).float()

        # Only myModel is optimized, so the backbone features are computed
        # without building an autograd graph.
        with torch.no_grad():
            features = model(minput)
        moutput = myModel(features)

        loss2 = criterion2(moutput[0].float(), verbTarget)
        loss1 = criterion1(moutput[1].float(), objectTarget)

        loss = loss1 + loss2

        print("loss", loss.item(), j)
        epoch_loss.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print("Epoch Finished", i)
        
        
        
