In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import json
from PIL import Image

In [2]:
labelled_json = json.load(open('./data/labeled_images.json', 'r'))
labelled_json[0]

{'image': 'frame_0655.jpg',
 'points': [{'x': 523, 'y': 348}, {'x': 548, 'y': 347}]}

In [3]:
def build_dataset(json_data):
    X, Y = [], []

    count = 0
    for item in json_data[:800]:
        count += 1
        img = Image.open('./data/raw_images/' + item['image'])
        img = img.resize((round(16/9 * 128), 128))
        img_tensor = torch.tensor(list(img.getdata()), dtype=torch.float32).view(3, 128, round(16/9 * 128))
        
        X.append(img_tensor)
        point_tensor = torch.tensor([item['points'][0]['x'], item['points'][0]['y'], item['points'][1]['x'], item['points'][1]['y']], dtype=torch.float32)
        Y.append(point_tensor)

        if count % 100 == 0:
            print(count)
    
    X = torch.stack(X)
    Y = torch.stack(Y)
    return X, Y


In [4]:
X, Y = build_dataset(labelled_json)

100
200
300
400
500
600
700
800


In [5]:
X[0].shape

torch.Size([3, 128, 228])

In [6]:
class SimpleConvNet(nn.Module):
    def __init__(self):
        super(SimpleConvNet, self).__init__()

        # Define the convolutional layers
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Define fully connected layers
        self.fc1 = nn.Linear(32 * 32 * 57, 128)  # Adjust the input size based on your image dimensions
        self.relu3 = nn.ReLU()

        # Output layer
        self.fc2 = nn.Linear(128, 4)  # Output 4 logits

    def forward(self, x):
        # Input: (batch_size, channels, height, width)
        # Example: (batch_size, 3, 128, 228)

        # Convolutional layers
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.pool1(x)

        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool2(x)

        # Flatten the output for the fully connected layers
        x = x.view(x.size(0), -1)

        # Fully connected layers
        x = self.fc1(x)
        x = self.relu3(x)

        # Output layer
        x = self.fc2(x)

        return x

m = SimpleConvNet()

In [7]:
m = m.cuda()
X, Y = X.cuda(), Y.cuda()

In [8]:
# Define the loss function and optimizer
criterion = nn.MSELoss().cuda()
optimizer = optim.Adam(m.parameters(), lr=3e-4)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
lossi = []

In [135]:
# training loop
for epoch in range(1000):
    for i in range(10):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = m(X.float())
        loss = criterion(outputs, Y)
        loss.backward()
        optimizer.step()

        lossi.append(loss.item())

        # print statistics
        if i + epoch % 100 == 0:
            print(f"loss: {loss.item()}")

[1,    10] loss: 184978.620
[2,    10] loss: 73839.895
[3,    10] loss: 55015.146
[4,    10] loss: 46177.176
[5,    10] loss: 38832.037
[6,    10] loss: 31920.326
[7,    10] loss: 25686.871
[8,    10] loss: 20936.455
[9,    10] loss: 17437.223
[10,    10] loss: 14596.008
[11,    10] loss: 12053.584
[12,    10] loss: 10167.028
[13,    10] loss: 8196.679
[14,    10] loss: 7523.983
[15,    10] loss: 6024.914
[16,    10] loss: 4935.030
[17,    10] loss: 3987.459
[18,    10] loss: 3178.038
[19,    10] loss: 2526.537
[20,    10] loss: 2390.085
[21,    10] loss: 1759.755
[22,    10] loss: 1443.801
[23,    10] loss: 7613.203
[24,    10] loss: 3134.801
[25,    10] loss: 1834.161
[26,    10] loss: 2051.145
[27,    10] loss: 1650.855
[28,    10] loss: 1120.639
[29,    10] loss: 899.518
[30,    10] loss: 782.235
[31,    10] loss: 706.003
[32,    10] loss: 649.373
[33,    10] loss: 601.878
[34,    10] loss: 560.069
[35,    10] loss: 521.982
[36,    10] loss: 486.994
[37,    10] loss: 454.523
[38,  

In [136]:
m(X[0].unsqueeze(0))

tensor([[523.0009, 348.0005, 547.9999, 346.9999]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

In [146]:

img = Image.open('./test.jpg')
img = img.resize((round(16/9 * 128), 128))
# img = img.resize((1920, 1080), resample=Image.BILINEAR)
img.save('./test_resized.jpg')
img_tensor = torch.tensor(list(img.getdata()), dtype=torch.float32).view(3, 128, round(16/9 * 128))


In [147]:
m(img_tensor.unsqueeze(0).cuda())

tensor([[619.3729, 743.3245, 550.9766, 711.8602]], device='cuda:0',
       grad_fn=<AddmmBackward0>)