The code is available on my GitHub: https://github.com/Akhilez/vision_lab/blob/master/companies/bungee/main.ipynb

The logs are available on my wandb experiment: https://wandb.ai/akhilez/bungee_test?workspace=user-akhilez

Hacker rank does not have the required packages installed, so I'm submitting this notebook instead.

I could try various different hyperparameters to get the best accuracy, but didn't find enough time for experimentation.

In [None]:
from typing import Optional
import numpy as np
import torch
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchmetrics import AverageMeter, MetricCollection, Accuracy, Precision, Recall

In [12]:

class ImageDataset(Dataset):
    def __init__(self, data, is_test=False):
        self.data = data
        self.is_test = is_test

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        data = self.data.iloc[index]
        image = data[:294]
        classes = data[294:]

        image = torch.from_numpy(np.array(image)).float()
        if not self.is_test:
            classes = torch.tensor(np.array(classes).argmax(axis=0)).long()
            return image, classes

        return image

In [13]:
class ImageDataModule(pl.LightningDataModule):
    def __init__(
        self,
        batch_size: int,
        val_split: float,
        **_,
    ):
        super().__init__()

        self.h = 28
        self.w = 28
        self.dims = (1, self.h, self.w)

        self.batch_size = batch_size
        self.val_split = val_split

        self.data_train, self.data_val, self.data_test = None, None, None

    def setup(self, stage: Optional[str] = None):
        data_full = pd.read_csv('./train.csv', header=None)
        data_train, data_val = train_test_split(data_full, test_size=self.val_split)
        self.data_train = ImageDataset(data_train)
        self.data_val = ImageDataset(data_val)

        data_test = pd.read_csv('./test.csv', header=None)
        self.data_test = ImageDataset(data_test, is_test=True)

    def train_dataloader(self):
        return DataLoader(
            self.data_train,
            batch_size=self.batch_size,
            shuffle=True,
        )

    def val_dataloader(self):
        return DataLoader(
            self.data_val,
            batch_size=self.batch_size,
            shuffle=False,
        )

    def test_dataloader(self):
        return DataLoader(
            self.data_test,
            batch_size=self.batch_size,
            shuffle=False,
        )

In [35]:
class LinearBlock(nn.Sequential):
    def __init__(self, in_units: int, out_units: int):
        super().__init__(
            nn.Linear(in_units, out_units),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),
        )


class ImageModel(pl.LightningModule):
    def __init__(self, **hp):
        super().__init__()
        self.num_classes = 6

        self.model = nn.Sequential(
            LinearBlock(294, 500),
            LinearBlock(500, self.num_classes),
        )

        self.hp = hp
        self.save_hyperparameters(hp)

        self.criterion = nn.CrossEntropyLoss()

        # Metrics
        self.loss_train = AverageMeter()
        self.loss_val = AverageMeter()
        self.metrics_train = MetricCollection({
            'accuracy': Accuracy(),
            'precision': Precision(average='macro', num_classes=self.num_classes),
            'recall': Recall(average='macro', num_classes=self.num_classes)
        }, prefix='train/')
        self.metrics_val = self.metrics_train.clone(prefix='val/')
        self.preds_test = []

    def forward(self, x):
        return self.model(x)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hp["lr_initial"])
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer,
            step_size=self.hp['lr_decay_every'],
            gamma=self.hp['lr_decay_by'],
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': lr_scheduler,
                'interval': 'epoch',
                "frequency": 1,
                "name": "learning_rate"
            }
        }

    def training_step(self, batch, batch_idx):
        images, targets = batch
        preds = self(images)
        loss = self.criterion(preds, targets)
        return {
            "loss": loss,
            "preds": preds.detach(),
            "inputs": images,
            "targets": targets,
        }

    def training_step_end(self, outs: dict):
        self.loss_train(outs["loss"])
        self.metrics_train(outs["preds"], outs["targets"])
        self.log("train/accuracy_step", self.metrics_train['accuracy'], prog_bar=True)

    def training_epoch_end(self, outs: dict):
        self.log("train/loss", self.loss_train.compute())
        self.log_dict(self.metrics_train.compute())
        self.log('learning_rate', self.lr_schedulers().get_last_lr()[0])

    def validation_step(self, batch, batch_idx):
        images, targets = batch
        preds = self(images)
        loss = self.criterion(preds, targets)
        return {
            "loss": loss,
            "preds": preds.detach(),
            "inputs": images,
            "targets": targets,
        }

    def validation_step_end(self, outs: dict):
        self.metrics_val(outs["preds"], outs["targets"])
        self.loss_val(outs["loss"])

    def on_validation_epoch_end(self) -> None:
        self.log_dict(self.metrics_val.compute())
        self.log("val/loss", self.loss_val.compute())

    def test_step(self, batch, batch_idx):
        preds = self(batch)
        preds = preds.detach().argmax(dim=1)
        return preds

    def test_step_end(self, preds):
        for pred in preds:
            self.preds_test.append(int(pred))


In [None]:
def save_predictions(preds):
    with open('./prediction.csv', 'w') as output:
        for pred in preds:
            out = list(range(6))
            out[pred] = 1
            output.write(','.join(out) + '\n')

In [36]:
hp = {
    "epochs": 200,
    "lr_initial": 0.0001,
    "lr_decay_every": 20,
    "lr_decay_by": 0.99,
}

config = {
    "batch_size": 64,
    'output_path': './output',
    'val_split': 0.1,
}

dataset = ImageDataModule(**config)
model = ImageModel(**hp, **config).float()
wandb_logger = WandbLogger(project="bungee_test", log_model=True)
trainer = pl.Trainer(
    gpus=0,
    max_epochs=hp["epochs"],
    default_root_dir=config["output_path"],
    logger=wandb_logger,
)
wandb_logger.watch(model)

trainer.fit(model, datamodule=dataset)
trainer.test(model, datamodule=dataset)
save_predictions(model.preds_test)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name          | Type             | Params
---------------------------------------------------
0 | model         | Sequential       | 150 K 
1 | criterion     | CrossEntropyLoss | 0     
2 | loss_train    | AverageMeter     | 0     
3 | loss_val      | AverageMeter     | 0     
4 | metrics_train | MetricCollection | 0     
5 | metrics_val   | MetricCollection | 0     
---------------------------------------------------
150 K     Trainable params
0         Non-trainable params
150 K     Total params
0.602     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [40]:
x = [ 0, 10, 15]
y = [ 0, 10, 20]


def closestSquaredDistance(x, y):
    x = np.array(x)
    y = np.array(y)
    diff = np.absolute(y - x)

    k = 2
    idx = np.argpartition(diff, k)[:k]

    xs = x[idx]
    ys = y[idx]

    print(xs)
    print(ys)

    print(np.diff(xs))
    print(np.diff(ys))

    result = np.diff(xs) ** 2 + np.diff(ys) ** 2

    return result

print(closestSquaredDistance(x, y))

[ 0 10]
[ 0 10]
[10]
[10]
[200]
