## Loading Dataset

In [57]:
from IndexDataset import IndexDataset
import pandas as pd
from torch.utils.data import DataLoader

# dataset_name = "data/imdb/imdb_top_1000.csv"
# column_name = "Overview"
dataset_name = "data/lego/inventory_parts.csv"
column_name = "part_num"
batch_size = 128

df = pd.read_csv(dataset_name)
dataset = IndexDataset(df, column_name)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

## Training Loop

In [50]:
import lightning as L
import torch
from torch import nn, optim
import torch.nn.functional as F
import math

class LitIndexer(L.LightningModule):
    def __init__(self, mapper, indexer, name):
        super().__init__()
        self.mapper = mapper
        self.indexer = indexer
        self.min_loss = float("inf")
        self.epoch_losses = []
        self.test_losses = []
        self.results_df = pd.DataFrame(columns=["Name", "Test Loss", "Percent Narrowed"])
        
    def forward(self, strs):
        mapped_strs = self.mapper.forward(strs)
        pred_idxs = self.indexer(mapped_strs)
        return pred_idxs
        
    def loss(self, pred_idxs, real_idxs):
        return F.mse_loss(pred_idxs, real_idxs)
    
    def percent_narrowed(self, loss):
        return math.sqrt(loss) / len(self.mapper.data)

    def training_step(self, batch, batch_idx):
        strs, idxs = batch
        mapped_strs = self.mapper.forward(strs)
        pred_idxs = self.indexer(mapped_strs)
        loss = self.loss(pred_idxs, idxs.to(torch.float32))
        self.min_loss = min(self.min_loss, loss)
        self.epoch_losses.append(loss)
        self.log("train_loss", loss, prog_bar=True)
        
        return loss
    
    def on_train_epoch_end(self):
        avg_epoch_loss = sum(self.epoch_losses) / len(self.epoch_losses)
        print(f"Epoch {self.current_epoch}, Average Epoch Loss: {avg_epoch_loss:.4f}, Percent Narrowed: {self.percent_narrowed(avg_epoch_loss):.4f}")
        self.epoch_losses = []
    
    def test_step(self, batch, batch_idx):
        strs, idxs = batch
        mapped_strs = self.mappers(strs)
        pred_idxs = self.indexer(mapped_strs)
        loss = self.loss(pred_idxs, idxs)
        self.log("test_loss", loss, prog_bar=True)
        
    def on_test_epoch_end(self):
        avg_test_loss = sum(self.test_losses) / len(self.test_losses)
        print(f"Average Test Loss: {avg_test_loss:.4f}, Percent Narrowed: {self.percent_narrowed(avg_test_loss):.4f}")
        self.test_losses = []
        self.results_df = self.results_df.append({"Experiment": self.name, 
                                                  "Test Loss": avg_test_loss, 
                                                  "Percent Narrowed": self.percent_narrowed(avg_test_loss)}, 
                                                 ignore_index=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-2)
        return optimizer


## Experiment Runner

In [None]:
from models.BaselineHash import BaselineHash
from models.BaselineEmbed import BaselineHash
from models.LinearModel import LinearRegressionModel
import mmh3

experiments = {
            "__name": {"dataset": None, "mapper": None, "indexer": None}
            }

In [None]:
from pytorch_lightning import loggers as pl_loggers

for name, experiment_dict in experiments.items():
    dataset, mapper, indexer = experiment_dict["dataset"], experiment_dict["mapper"], experiment_dict["indexer"]
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = LitIndexer(mapper, indexer, name)
    
    tb_logger = pl_loggers.TensorBoardLogger('lightning_logs/')
    trainer = L.Trainer(accelerator="cpu", logger=tb_logger)
    trainer.fit(model, train_dataloaders=dataloader)
    
    torch.save(model.state_dict(), f"models/{name}.pth")
    
    trainer.test(f"models/{name}.pth")
    

## Baseline Hash

In [None]:
from models.BaselineHash import BaselineHash
from models.LinearModel import LinearRegressionModel
from pytorch_lightning import loggers as pl_loggers
import mmh3

hash = mmh3.hash    # 32-bits
mapper = BaselineHash(dataset, hash)
indexer = LinearRegressionModel(1)

dataset = mapper.data
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = LitIndexer(mapper, indexer)

tb_logger = pl_loggers.TensorBoardLogger('lightning_logs/')
trainer = L.Trainer(accelerator="cpu", logger=tb_logger)
trainer.fit(model, train_dataloaders=dataloader)

In [52]:
torch.save(model.state_dict(), "models/basehash_lego.pth")

In [59]:
model = LitIndexer(mapper, indexer)
model.load_state_dict(torch.load("models/basehash_lego.pth"))
model.eval()

LitIndexer(
  (indexer): LinearRegressionModel(
    (linear): Linear(in_features=1, out_features=1, bias=True)
  )
)

In [None]:
model.test_step()

In [None]:
test_dataloader = DataLoader(dataset, batch_size=1)

for strs, idxs in test_dataloader:
    pred_idx = round(model(strs).item())
    print(pred_idx, idxs.item())

In [None]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

## Baseline Embedding

In [54]:
from models.BaselineEmbed import BaselineEmbed
from models.LinearModel import LinearRegressionModel
from pytorch_lightning import loggers as pl_loggers

token_len = 1
embed_size = 1
mapper = BaselineEmbed(dataset, token_len, embed_size)
indexer = LinearRegressionModel(mapper.max_len * embed_size)

model = LitIndexer(mapper, indexer)

tb_logger = pl_loggers.TensorBoardLogger('lightning_logs/')
trainer = L.Trainer(accelerator="cpu", logger=tb_logger)
trainer.fit(model, train_dataloaders=dataloader)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.

  | Name    | Type                  | Params
--------------------------------------------------
0 | mapper  | BaselineEmbed         | 40    
1 | indexer | LinearRegressionModel | 16    
--------------------------------------------------
56        Trainable params
0         Non-trainable params
56        Total params
0.000     Total estimated model params size (MB)
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-

Training: |          | 0/? [00:00<?, ?it/s]

  return F.mse_loss(pred_idxs, real_idxs)
  return F.mse_loss(pred_idxs, real_idxs)


Epoch 0, Average Epoch Loss: 177892656.0000, Percent Narrowed: 0.5766
Epoch 1, Average Epoch Loss: 174817744.0000, Percent Narrowed: 0.5716
Epoch 2, Average Epoch Loss: 168488800.0000, Percent Narrowed: 0.5612
Epoch 3, Average Epoch Loss: 159550592.0000, Percent Narrowed: 0.5461
Epoch 4, Average Epoch Loss: 148804304.0000, Percent Narrowed: 0.5274
Epoch 5, Average Epoch Loss: 136966352.0000, Percent Narrowed: 0.5060
Epoch 6, Average Epoch Loss: 124619840.0000, Percent Narrowed: 0.4826
Epoch 7, Average Epoch Loss: 112519488.0000, Percent Narrowed: 0.4586
Epoch 8, Average Epoch Loss: 101114984.0000, Percent Narrowed: 0.4347
Epoch 9, Average Epoch Loss: 90910808.0000, Percent Narrowed: 0.4122
Epoch 10, Average Epoch Loss: 82065712.0000, Percent Narrowed: 0.3916
Epoch 11, Average Epoch Loss: 74734544.0000, Percent Narrowed: 0.3737
Epoch 12, Average Epoch Loss: 69012056.0000, Percent Narrowed: 0.3591
Epoch 13, Average Epoch Loss: 64633840.0000, Percent Narrowed: 0.3476
Epoch 14, Average Epo

/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [55]:
torch.save(model.state_dict(), "models/baseembed_lego.pth")

In [29]:
model = LitIndexer(mapper, indexer)
model.load_state_dict(torch.load("models/baseembed_lego.pth"))
model.eval()

LitIndexer(
  (mapper): BaselineEmbed(
    (embed): Embedding(40, 1)
  )
  (indexer): LinearRegressionModel(
    (linear): Linear(in_features=15, out_features=1, bias=True)
  )
)

In [None]:
test_dataloader = DataLoader(dataset, batch_size=1)

for strs, idxs in test_dataloader:
    pred_idx = round(model(strs).item())
    print(pred_idx, idxs.item())