## Loading Dataset

In [1]:
from IndexDataset import IndexDataset
import pandas as pd
from torch.utils.data import DataLoader

dataset_name = "data/lego/inventory_parts.csv"
column_name = "part_num"
batch_size = 64

df = pd.read_csv(dataset_name)
dataset = IndexDataset(df, column_name)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [2]:
dataset.get_series()

0        0687b1
1          0901
2          0902
3          0903
4          0904
          ...  
23126    zbb013
23127    zbb014
23128    zbb015
23129    zbb018
23130    zbb022
Name: part_num, Length: 23131, dtype: object

## Training Loop

In [3]:
import lightning as L
import torch
from torch import nn, optim
import torch.nn.functional as F

class LitIndexer(L.LightningModule):
    def __init__(self, mapper, indexer):
        super().__init__()
        self.mapper = mapper
        self.indexer = indexer
        self.min_loss = float("inf")
        
    def loss(self, pred_idxs, real_idxs):
        return F.mse_loss(pred_idxs, real_idxs)

    def training_step(self, batch, batch_idx):
        strs, idxs = batch
        mapped_strs = self.mapper.forward(strs)
        pred_idxs = self.indexer(mapped_strs)
        loss = self.loss(pred_idxs, idxs.to(torch.float32))
        self.min_loss = min(self.min_loss, loss)
        print(self.min_loss)
        # print(pred_idxs, idxs)
        self.log("train_loss", loss, prog_bar=True)
        
        return loss
    
    def test_step(self, batch, batch_idx):
        strs, idxs = batch
        mapped_strs = self.mappers(strs)
        pred_idxs = self.indexer(mapped_strs)
        loss = self.loss(pred_idxs, idxs)
        self.log("test_loss", loss, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer


In [4]:
from models.BaselineHash import BaselineHash
import mmh3

hash = mmh3.hash    # 32-bits
mapper = BaselineHash(dataset, hash)

dataset = mapper.data
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [5]:
dataset.get_series('digest')

0            1
1            2
2            5
3            7
4           10
         ...  
14644    23118
14645    23121
14646    23126
14647    23127
14648    23130
Name: digest, Length: 14649, dtype: object

In [6]:
from models.BaselineHash import BaselineHash
from models.LinearModel import LinearRegressionModel
from pytorch_lightning import loggers as pl_loggers
import mmh3

hash = mmh3.hash    # 32-bits
mapper = BaselineHash(dataset, hash)
indexer = LinearRegressionModel(1)

dataset = mapper.data
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = LitIndexer(mapper, indexer)

tb_logger = pl_loggers.TensorBoardLogger('lightning_logs/')
trainer = L.Trainer(accelerator="cpu", logger=tb_logger)
trainer.fit(model, train_dataloaders=dataloader)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.

  | Name    | Type                  | Params
--------------------------------------------------
0 | indexer | LinearRegressionModel | 2     
--------------------------------------------------
2         Trainable params
0         Non-trainable params
2         Total params
0.000     Total estimated model params size (MB)
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors

Training: |          | 0/? [00:00<?, ?it/s]

  return F.mse_loss(pred_idxs, real_idxs)


tensor(30080628., grad_fn=<MseLossBackward0>)
tensor(30080628., grad_fn=<MseLossBackward0>)
tensor(30080628., grad_fn=<MseLossBackward0>)
tensor(27175682., grad_fn=<MseLossBackward0>)
tensor(27175682., grad_fn=<MseLossBackward0>)
tensor(27175682., grad_fn=<MseLossBackward0>)
tensor(27175682., grad_fn=<MseLossBackward0>)
tensor(27175682., grad_fn=<MseLossBackward0>)
tensor(27175682., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLossBackward0>)
tensor(27092622., grad_fn=<MseLoss

  return F.mse_loss(pred_idxs, real_idxs)


tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLossBackward0>)
tensor(19152866., grad_fn=<MseLoss

/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [None]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 44126), started 0:00:26 ago. (Use '!kill 44126' to kill it.)

In [None]:
trainer.test(model, dataloaders=dataloader)