## Loading Dataset

In [1]:
from IndexDataset import IndexDataset
import pandas as pd
from torch.utils.data import DataLoader

# dataset_name = "data/imdb/imdb_top_1000.csv"
# column_name = "Overview"
dataset_name = "data/lego/inventory_parts.csv"
column_name = "part_num"
batch_size = 128

df = pd.read_csv(dataset_name)
dataset = IndexDataset(df, column_name)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

## Training Loop

In [14]:
import lightning as L
import torch
from torch import nn, optim
import torch.nn.functional as F
import math

class LitIndexer(L.LightningModule):
    def __init__(self, mapper, indexer):
        super().__init__()
        self.mapper = mapper
        self.indexer = indexer
        self.min_loss = float("inf")
        self.epoch_losses = []
        
    def loss(self, pred_idxs, real_idxs):
        return F.mse_loss(pred_idxs, real_idxs)
    
    def percent_narrowed(self, loss):
        return math.sqrt(loss) / len(self.mapper.data)

    def training_step(self, batch, batch_idx):
        strs, idxs = batch
        mapped_strs = self.mapper.forward(strs)
        pred_idxs = self.indexer(mapped_strs)
        loss = self.loss(pred_idxs, idxs.to(torch.float32))
        self.min_loss = min(self.min_loss, loss)
        # print(self.percent_narrowed(loss))
        # print(self.min_loss)
        # print(pred_idxs, idxs)
        self.epoch_losses.append(loss)
        self.log("train_loss", loss, prog_bar=True)
        
        return loss
    
    def on_train_epoch_end(self):
        avg_epoch_loss = sum(self.epoch_losses) / len(self.epoch_losses)
        print(f"Epoch {self.current_epoch}, Average Epoch Loss: {avg_epoch_loss:.4f}, Percent Narrowed: {self.percent_narrowed(avg_epoch_loss):.4f}")
        self.epoch_losses = []
    
    def test_step(self, batch, batch_idx):
        strs, idxs = batch
        mapped_strs = self.mappers(strs)
        pred_idxs = self.indexer(mapped_strs)
        loss = self.loss(pred_idxs, idxs)
        self.log("test_loss", loss, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-2)
        return optimizer


## Baseline Hash

In [16]:
from models.BaselineHash import BaselineHash
from models.LinearModel import LinearRegressionModel
from pytorch_lightning import loggers as pl_loggers
import mmh3

hash = mmh3.hash    # 32-bits
mapper = BaselineHash(dataset, hash)
indexer = LinearRegressionModel(1)

dataset = mapper.data
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = LitIndexer(mapper, indexer)

tb_logger = pl_loggers.TensorBoardLogger('lightning_logs/')
trainer = L.Trainer(accelerator="cpu", logger=tb_logger)
trainer.fit(model, train_dataloaders=dataloader)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type                  | Params
--------------------------------------------------
0 | indexer | LinearRegressionModel | 2     
--------------------------------------------------
2         Trainable params
0         Non-trainable params
2         Total params
0.000     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

  return F.mse_loss(pred_idxs, real_idxs)
  return F.mse_loss(pred_idxs, real_idxs)


Epoch 0, Average Epoch Loss: 33762636.0000, Percent Narrowed: 0.6259
Epoch 1, Average Epoch Loss: 15921622.0000, Percent Narrowed: 0.4298
Epoch 2, Average Epoch Loss: 12810526.0000, Percent Narrowed: 0.3855
Epoch 3, Average Epoch Loss: 12598139.0000, Percent Narrowed: 0.3823
Epoch 4, Average Epoch Loss: 12571978.0000, Percent Narrowed: 0.3819
Epoch 5, Average Epoch Loss: 12572769.0000, Percent Narrowed: 0.3819
Epoch 6, Average Epoch Loss: 12572882.0000, Percent Narrowed: 0.3819
Epoch 7, Average Epoch Loss: 12559731.0000, Percent Narrowed: 0.3817
Epoch 8, Average Epoch Loss: 12575619.0000, Percent Narrowed: 0.3820
Epoch 9, Average Epoch Loss: 12575400.0000, Percent Narrowed: 0.3820
Epoch 10, Average Epoch Loss: 12585889.0000, Percent Narrowed: 0.3821
Epoch 11, Average Epoch Loss: 12550513.0000, Percent Narrowed: 0.3816
Epoch 12, Average Epoch Loss: 12563933.0000, Percent Narrowed: 0.3818
Epoch 13, Average Epoch Loss: 12568515.0000, Percent Narrowed: 0.3819
Epoch 14, Average Epoch Loss: 

In [None]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

## Baseline Embedding

In [15]:
from models.BaselineEmbed import BaselineEmbed
from models.LinearModel import LinearRegressionModel
from pytorch_lightning import loggers as pl_loggers

token_len = 1
embed_size = 1
mapper = BaselineEmbed(dataset, token_len, embed_size)
indexer = LinearRegressionModel(mapper.max_len * embed_size)

model = LitIndexer(mapper, indexer)

tb_logger = pl_loggers.TensorBoardLogger('lightning_logs/')
trainer = L.Trainer(accelerator="cpu", logger=tb_logger)
trainer.fit(model, train_dataloaders=dataloader)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.

  | Name    | Type                  | Params
--------------------------------------------------
0 | mapper  | BaselineEmbed         | 38    
1 | indexer | LinearRegressionModel | 16    
--------------------------------------------------
54        Trainable params
0         Non-trainable params
54        Total params
0.000     Total estimated model params size (MB)
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-

Training: |          | 0/? [00:00<?, ?it/s]

  return F.mse_loss(pred_idxs, real_idxs)
  return F.mse_loss(pred_idxs, real_idxs)


Epoch 0, Average Epoch Loss: 71533816.0000, Percent Narrowed: 0.5774
Epoch 1, Average Epoch Loss: 71218552.0000, Percent Narrowed: 0.5761
Epoch 2, Average Epoch Loss: 70450224.0000, Percent Narrowed: 0.5730
Epoch 3, Average Epoch Loss: 68993568.0000, Percent Narrowed: 0.5670
Epoch 4, Average Epoch Loss: 66678000.0000, Percent Narrowed: 0.5574
Epoch 5, Average Epoch Loss: 63426484.0000, Percent Narrowed: 0.5437
Epoch 6, Average Epoch Loss: 59574752.0000, Percent Narrowed: 0.5269
Epoch 7, Average Epoch Loss: 55471524.0000, Percent Narrowed: 0.5084
Epoch 8, Average Epoch Loss: 51179880.0000, Percent Narrowed: 0.4884
Epoch 9, Average Epoch Loss: 46987280.0000, Percent Narrowed: 0.4679
Epoch 10, Average Epoch Loss: 42977568.0000, Percent Narrowed: 0.4475
Epoch 11, Average Epoch Loss: 39282964.0000, Percent Narrowed: 0.4279
Epoch 12, Average Epoch Loss: 36018684.0000, Percent Narrowed: 0.4097
Epoch 13, Average Epoch Loss: 33229036.0000, Percent Narrowed: 0.3935
Epoch 14, Average Epoch Loss: 

/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [None]:
trainer.test(model, dataloaders=dataloader)