In [None]:
!pip install fastai -q

In [5]:
from fastai import basics
from fastai.vision.all import *




In [6]:
# Root folder of the Kaggle dataset; `ls()` shows its top-level entries.
path = Path('/kaggle/input/gaze-points')
path.ls()

(#6) [Path('/kaggle/input/gaze-points/30-21-07-08_1255_458'),Path('/kaggle/input/gaze-points/54-53-14-30_2530_68'),Path('/kaggle/input/gaze-points/31-13-06-58_3440_1440_705_1422'),Path('/kaggle/input/gaze-points/31-17-03-53_2927_453'),Path('/kaggle/input/gaze-points/30-17-55-03_2798_1000'),Path('/kaggle/input/gaze-points/59-59-13-30_1609_410')]

In [8]:
# Number of image files found under `path`.
len(get_image_files(path))

6731

In [21]:
def get_coords(file: Path, screen_size=(3440.0, 1440.0)):
    """Return the gaze target encoded in an image filename, scaled by screen size.

    The filename stem is split on ``_``; the second and third fields are parsed
    as the integer pixel coordinates ``x`` and ``y`` (for example
    ``<prefix>_<x>_<y>.png``). Using the stem means any image extension works,
    not only ``.png``.

    Args:
        file: Path of the image; only its filename is inspected.
        screen_size: ``(width, height)`` in pixels that the coordinates are
            divided by. Defaults to 3440x1440.

    Returns:
        A float32 tensor of shape ``(2,)`` holding ``[x / width, y / height]``.

    Raises:
        IndexError: if the stem has fewer than three ``_``-separated fields.
        ValueError: if the coordinate fields are not integers.
    """
    import torch  # local import so this cell does not depend on a later import cell

    fields = file.stem.split('_')
    x, y = int(fields[1]), int(fields[2])
    width, height = screen_size

    pixels = torch.tensor([x, y], dtype=torch.float32)
    extent = torch.tensor([width, height], dtype=torch.float32)
    return pixels / extent


In [22]:
# Sanity-check the label parser on the first image file.
image_files = get_image_files(path)

get_coords(image_files[0])

tensor([0.8288, 0.1819])

In [31]:
# Image -> 2-value regression target parsed from the filename by `get_coords`.
# The splitter is seeded so the train/validation split (and therefore the
# reported metrics) is the same on every Restart & Run All.
# Flip/rotate/zoom/warp are all switched off and only lighting augmentation is
# kept - presumably because the regression target is not transformed together
# with the image, so geometric changes would invalidate the label.
dbock = DataBlock(
    blocks=(ImageBlock, RegressionBlock(n_out=2)),
    get_items=get_image_files,
    get_y=get_coords,
    splitter=RandomSplitter(seed=42),
    item_tfms=Resize((240, 320)),
    batch_tfms=aug_transforms(
        do_flip=False,
        max_rotate=0,
        max_zoom=1,
        max_warp=0,
        max_lighting=0.35,
        p_lighting=.85,
    ),
)
dls = dbock.dataloaders(path, bs=64, shuffle=True, pin_memory=True)

In [35]:
# Inspect the first image tensor of one batch.
dls.one_batch()[0][0]

TensorImage([[[0.9456, 0.8982, 0.7898,  ..., 0.2669, 0.2533, 0.2443],
              [0.9573, 0.8982, 0.7944,  ..., 0.2443, 0.2443, 0.2488],
              [0.9655, 0.8944, 0.8079,  ..., 0.2354, 0.2443, 0.2533],
              ...,
              [0.0394, 0.0394, 0.0394,  ..., 0.8907, 0.8831, 0.7619],
              [0.0418, 0.0394, 0.0418,  ..., 0.8792, 0.8713, 0.7524],
              [0.0418, 0.0418, 0.0418,  ..., 0.8792, 0.8713, 0.7476]],

             [[0.9924, 0.9601, 0.8551,  ..., 0.2578, 0.2443, 0.2354],
              [0.9993, 0.9601, 0.8551,  ..., 0.2399, 0.2399, 0.2488],
              [1.0000, 0.9629, 0.8592,  ..., 0.2310, 0.2443, 0.2488],
              ...,
              [0.0468, 0.0443, 0.0443,  ..., 0.9983, 0.9906, 0.8982],
              [0.0494, 0.0468, 0.0494,  ..., 0.9940, 0.9887, 0.8944],
              [0.0494, 0.0520, 0.0520,  ..., 0.9940, 0.9867, 0.8869]],

             [[0.9924, 0.9573, 0.8510,  ..., 0.2399, 0.2354, 0.2354],
              [0.9983, 0.9515, 0.8426,  ..., 0.2

In [36]:
import torch.nn as nn
import torch.nn.functional as F
import torch

In [37]:
class ResidualBlock(nn.Module):
    """Residual block with two convolutions: ``relu(main(x) + shortcut(x))``.

    The main path is ``conv1 -> ReLU -> BN -> conv2 -> ReLU -> BN`` (note that
    the activation comes *before* each batch norm here).

    Args:
        n_in: Number of input channels.
        n_out: Number of output channels of both convolutions.
        kernel_sz: Kernel size used by both convolutions.
        stride: Stride of the first convolution (the second always uses 1).
        padding: Padding of the first convolution.
        downsample: Optional module applied to the input to produce the
            shortcut. Required whenever the main path changes the tensor shape.

    Note:
        ``conv2`` always uses ``padding=1``, so it only preserves the spatial
        size for ``kernel_sz=3``. For any other kernel size the shortcut must
        produce the correspondingly smaller/larger feature map.
    """

    def __init__(self, n_in, n_out, kernel_sz=3, stride=1, padding=0, downsample: nn.Module = None):
        super().__init__()
        # Construction order is kept as-is: it determines the state_dict key
        # order and the order in which random initialisation is drawn.
        self.conv1 = nn.Conv2d(n_in, n_out, kernel_size=kernel_sz, stride=stride, padding=padding)
        self.conv2 = nn.Conv2d(n_out, n_out, kernel_size=kernel_sz, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(n_out)
        self.bn2 = nn.BatchNorm2d(n_out)
        self.downsample = downsample

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum of both paths."""
        identity = x

        out = self.bn1(F.relu(self.conv1(x)))
        out = self.bn2(F.relu(self.conv2(out)))

        # Explicit None check: module truthiness depends on `__len__`
        # (e.g. for nn.Sequential) and is not a reliable "is set" test.
        if self.downsample is not None:
            identity = self.downsample(identity)

        # Out-of-place add: an in-place `+=` raises if the shortcut has a wider
        # dtype than the main path, whereas this promotes normally.
        return F.relu(out + identity)
    

In [None]:
# Scratch instance: 3 -> 3 channels, kernel_sz=1, stride=1, moved to the GPU.
# NOTE(review): with kernel_sz=1 the block's second conv (fixed padding=1) grows
# the feature map by 2 px per side-pair, so a forward pass would fail at the
# residual add; this object is not used in the cells shown.
b1 = ResidualBlock(3, 3, 1, 1).cuda()

In [38]:
from prettytable import PrettyTable

def count_parameters(model):
    """Print a per-tensor table of trainable parameter counts and return the total.

    Parameters whose ``requires_grad`` is false are left out of both the table
    and the total.
    """
    table = PrettyTable(["Modules", "Parameters"])

    trainable = [
        (name, parameter.numel())
        for name, parameter in model.named_parameters()
        if parameter.requires_grad
    ]
    for name, count in trainable:
        table.add_row([name, count])
    total_params = sum(count for _, count in trainable)

    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
    


In [66]:
"""
This model is super dodgy; I'm still not sure what layout is best for it.
It essentially just needs to extract depth, eye rotation and offset features.
It feels like I should be able to get away with far fewer parameters than the
roughly 2.6 million in this model.
"""




class Resnet(nn.Module):
    """Small ResNet-style CNN that regresses two values from an RGB image.

    Layout:
        * Stem: 5x5 stride-2 convolution (3 -> 64 channels), batch norm, ReLU.
        * Seven ``ResidualBlock``s at 64 channels. ``layer5`` (stride 2) and
          ``layer7`` (5x5, stride 3) reduce the spatial size and therefore use
          the projection shortcuts ``downsample1`` / ``downsample2``.
        * Head: flatten, then three linear layers (lazy -> 64 -> 64 -> 2).
        * Output: sigmoid rescaled to the open interval (0, 1.3).

    Notes:
        ``fc1`` is an ``nn.LazyLinear``: its input width is inferred on the
        first forward pass, so run one dry forward pass before training.

        ``downsample1`` / ``downsample2`` are registered both on this module
        and on the block that uses them, so they show up under both names in
        ``state_dict()``. The attribute names are kept for compatibility.
    """

    def __init__(self):
        super().__init__()

        # Stem.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(64)

        # Projection shortcuts; kernel and stride mirror the first conv of the
        # block they are paired with.
        self.downsample1 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=0),
            nn.BatchNorm2d(64),
        )
        self.downsample2 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=5, stride=3),
            nn.BatchNorm2d(64),
        )

        # Residual trunk.
        self.layer1 = ResidualBlock(64, 64, kernel_sz=3, stride=1, padding=1)
        self.layer2 = ResidualBlock(64, 64, kernel_sz=3, stride=1, padding=1)
        self.layer3 = ResidualBlock(64, 64, kernel_sz=3, stride=1, padding=1)
        self.layer4 = ResidualBlock(64, 64, kernel_sz=3, stride=1, padding=1)
        self.layer5 = ResidualBlock(64, 64, kernel_sz=3, stride=2, padding=0, downsample=self.downsample1)
        self.layer6 = ResidualBlock(64, 64, kernel_sz=3, padding=1)
        self.layer7 = ResidualBlock(64, 64, kernel_sz=5, stride=3, padding=3, downsample=self.downsample2)

        # Regression head.
        self.fc1 = nn.LazyLinear(64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x):
        """Map a batch of images to a ``(batch, 2)`` tensor with values in (0, 1.3)."""
        x = F.relu(self.bn1(self.conv1(x)))

        trunk = (
            self.layer1, self.layer2, self.layer3, self.layer4,
            self.layer5, self.layer6, self.layer7,
        )
        for layer in trunk:
            x = layer(x)

        # `flatten` already returns a tensor; the former `tensor(...)` wrapper
        # was redundant and would silently downcast float64 activations.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)

        return sigmoid_range(x, 0, 1.3)


In [67]:
# Instantiate the model and move it to the GPU.
resnet = Resnet().to('cuda')

In [68]:
# Grab one validation batch to use as a fixed test input.
test_x, test_y = dls.valid.one_batch()

In [69]:
# Show the first image/target pair of that batch.
test_x[0], test_y[0]

(TensorImage([[[0.8118, 0.6314, 0.6000,  ..., 0.2627, 0.2510, 0.2431],
               [0.8039, 0.6353, 0.6039,  ..., 0.2549, 0.2549, 0.2549],
               [0.7843, 0.6471, 0.6039,  ..., 0.2510, 0.2549, 0.2549],
               ...,
               [0.1725, 0.1804, 0.1765,  ..., 0.6510, 0.7098, 0.6706],
               [0.1882, 0.1882, 0.1843,  ..., 0.6118, 0.6706, 0.6784],
               [0.1882, 0.1922, 0.1961,  ..., 0.5686, 0.6471, 0.6627]],
 
              [[0.8941, 0.7020, 0.6196,  ..., 0.2706, 0.2549, 0.2471],
               [0.8902, 0.7059, 0.6196,  ..., 0.2588, 0.2549, 0.2588],
               [0.8824, 0.7216, 0.6275,  ..., 0.2471, 0.2549, 0.2588],
               ...,
               [0.1922, 0.1922, 0.1922,  ..., 0.8118, 0.8667, 0.8078],
               [0.2039, 0.2000, 0.2000,  ..., 0.7569, 0.8157, 0.8039],
               [0.2000, 0.2039, 0.2078,  ..., 0.6980, 0.7804, 0.7843]],
 
              [[0.8745, 0.6588, 0.5843,  ..., 0.2196, 0.2275, 0.2314],
               [0.8706, 0.6706,

In [70]:
# Smoke test on a single image (batch of one): difference between the untrained
# model's output and the target. Call the module itself rather than `.forward()`
# so that any registered hooks run.
result = resnet(test_x[0].unsqueeze(0))
result - test_y[0]

TensorImage([[-0.1557, -0.2778]], device='cuda:0', grad_fn=<AliasBackward0>)

In [71]:
# Tabulate the trainable parameters per tensor and return the total.
count_parameters(resnet)

+----------------------+------------+
|       Modules        | Parameters |
+----------------------+------------+
|     conv1.weight     |    4800    |
|      conv1.bias      |     64     |
|      bn1.weight      |     64     |
|       bn1.bias       |     64     |
| downsample1.0.weight |   36864    |
|  downsample1.0.bias  |     64     |
| downsample1.1.weight |     64     |
|  downsample1.1.bias  |     64     |
| downsample2.0.weight |   102400   |
|  downsample2.0.bias  |     64     |
| downsample2.1.weight |     64     |
|  downsample2.1.bias  |     64     |
| layer1.conv1.weight  |   36864    |
|  layer1.conv1.bias   |     64     |
| layer1.conv2.weight  |   36864    |
|  layer1.conv2.bias   |     64     |
|  layer1.bn1.weight   |     64     |
|   layer1.bn1.bias    |     64     |
|  layer1.bn2.weight   |     64     |
|   layer1.bn2.bias    |     64     |
| layer2.conv1.weight  |   36864    |
|  layer2.conv1.bias   |     64     |
| layer2.conv2.weight  |   36864    |
|  layer2.co

2642050

In [72]:
# Fresh model on the GPU plus a plain MSE loss.
# NOTE(review): in the cells shown, `resnet` is re-created before the Learner is
# built and the Learner is given MSELossFlat() rather than `loss_func`, so this
# cell looks redundant - confirm before removing.
resnet = Resnet().cuda()
loss_func = nn.MSELoss()

In [73]:
def pixel_error(pred, targ, screen_size=(3440, 1440)):
    """Mean absolute error in pixels, averaged over both axes and the batch.

    Args:
        pred: Predicted normalised coordinates; expected shape ``(batch, 2)``.
        targ: Target normalised coordinates, same shape as ``pred``.
        screen_size: ``(width, height)`` in pixels used to scale the normalised
            error back to pixels. Defaults to 3440x1440.

    Returns:
        A scalar tensor on the same device as ``pred``.
    """
    # Build the scale on pred's device/dtype instead of hard-coding `.cuda()`,
    # so the metric also works on CPU or any other device.
    scale = torch.tensor(screen_size, dtype=pred.dtype, device=pred.device)
    return ((pred - targ).abs() * scale).mean(dim=1).mean()

In [74]:
resnet = Resnet().to('cuda')

# `fc1` is an nn.LazyLinear, so its weights only exist after one forward pass;
# do a dry run before handing the model to the Learner. eval() + no_grad() keep
# the dry run from updating BatchNorm statistics or building an autograd graph,
# and calling the module (not `.forward()`) makes sure hooks run.
resnet.eval()
with torch.no_grad():
    resnet(test_x[0].unsqueeze(0))
resnet.train()

learn = Learner(dls, resnet, loss_func=MSELossFlat(), metrics=pixel_error).to_fp16()

In [75]:
# Learning-rate range test.
# NOTE(review): the saved output shows this run ended in a KeyboardInterrupt.
learn.lr_find()


KeyboardInterrupt



In [76]:
# Short warm-up: 3 one-cycle epochs, max LR 2e-5, weight decay 0.05.
learn.fit_one_cycle(3, 2e-5,wd=0.05)

epoch,train_loss,valid_loss,pixel_error,time
0,0.098569,0.095313,665.409302,01:03
1,0.089806,0.086245,630.588501,01:02
2,0.08391,0.084303,624.220581,01:03


In [77]:
# Main training: 10 one-cycle epochs, max LR 1e-4, weight decay 0.05.
learn.fit_one_cycle(10, 1e-4,wd=0.05)

epoch,train_loss,valid_loss,pixel_error,time
0,0.079848,0.078572,588.207153,01:04
1,0.075565,0.072946,546.726196,00:55
2,0.07054,0.070858,530.573425,00:50
3,0.062219,0.06447,506.459778,00:50
4,0.052526,0.057106,474.967682,00:50
5,0.042738,0.055488,460.704407,00:50
6,0.033744,0.042489,409.229675,00:50
7,0.026603,0.038257,379.007111,00:49
8,0.021834,0.036355,365.779724,00:50
9,0.019517,0.03582,362.77951,00:50


In [78]:
# Another 10-epoch cycle with the same settings, continuing from the current weights.
learn.fit_one_cycle(10, 1e-4,wd=0.05)

epoch,train_loss,valid_loss,pixel_error,time
0,0.019963,0.033806,349.055176,00:50
1,0.021193,0.031469,323.904114,00:50
2,0.020482,0.030562,301.127197,00:50
3,0.015373,0.023746,284.164307,00:50
4,0.011805,0.018478,235.613922,00:50
5,0.008903,0.017944,231.578629,00:50
6,0.007091,0.01582,220.869522,00:50
7,0.005484,0.014402,202.246872,00:50
8,0.004443,0.014552,203.454559,00:52
9,0.004113,0.014441,202.620758,00:51


In [79]:
# Longer 20-epoch cycle with the same max LR and weight decay, continuing from the current weights.
learn.fit_one_cycle(20, 1e-4, wd=0.05)

epoch,train_loss,valid_loss,pixel_error,time
0,0.004019,0.014529,202.549133,01:03
1,0.004541,0.017427,215.583557,01:02
2,0.005355,0.016159,217.937805,01:02
3,0.006596,0.021491,260.832581,01:02
4,0.007789,0.015058,206.761429,01:03
5,0.007372,0.022185,251.199875,01:02
6,0.006719,0.014695,207.041183,01:03
7,0.005672,0.013276,196.826035,00:52
8,0.005192,0.014406,203.225647,00:50
9,0.004236,0.012071,181.710907,00:50
