## Loading Dataset

In [26]:
from IndexDataset import IndexDataset
import pandas as pd
from torch.utils.data import DataLoader

# dataset_name = "data/imdb/imdb_top_1000.csv"
# column_name = "Overview"
dataset_name = "data/lego/inventory_parts.csv"
column_name = "part_num"
batch_size = 128

df = pd.read_csv(dataset_name)
dataset = IndexDataset(df, column_name)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

## Training Loop

In [18]:
import lightning as L
import torch
from torch import nn, optim
import torch.nn.functional as F
import math

class LitIndexer(L.LightningModule):
    def __init__(self, mapper, indexer):
        super().__init__()
        self.mapper = mapper
        self.indexer = indexer
        self.min_loss = float("inf")
        self.epoch_losses = []
        
    def forward(self, strs):
        mapped_strs = self.mapper.forward(strs)
        pred_idxs = self.indexer(mapped_strs)
        return pred_idxs
        
    def loss(self, pred_idxs, real_idxs):
        return F.mse_loss(pred_idxs, real_idxs)
    
    def percent_narrowed(self, loss):
        return math.sqrt(loss) / len(self.mapper.data)

    def training_step(self, batch, batch_idx):
        strs, idxs = batch
        mapped_strs = self.mapper.forward(strs)
        pred_idxs = self.indexer(mapped_strs)
        loss = self.loss(pred_idxs, idxs.to(torch.float32))
        self.min_loss = min(self.min_loss, loss)
        self.epoch_losses.append(loss)
        self.log("train_loss", loss, prog_bar=True)
        
        return loss
    
    def on_train_epoch_end(self):
        avg_epoch_loss = sum(self.epoch_losses) / len(self.epoch_losses)
        print(f"Epoch {self.current_epoch}, Average Epoch Loss: {avg_epoch_loss:.4f}, Percent Narrowed: {self.percent_narrowed(avg_epoch_loss):.4f}")
        self.epoch_losses = []
    
    def test_step(self, batch, batch_idx):
        strs, idxs = batch
        mapped_strs = self.mappers(strs)
        pred_idxs = self.indexer(mapped_strs)
        loss = self.loss(pred_idxs, idxs)
        self.log("test_loss", loss, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-2)
        return optimizer


## Baseline Hash

In [19]:
from models.BaselineHash import BaselineHash
from models.LinearModel import LinearRegressionModel
from pytorch_lightning import loggers as pl_loggers
import mmh3

hash = mmh3.hash    # 32-bits
mapper = BaselineHash(dataset, hash)
indexer = LinearRegressionModel(1)

dataset = mapper.data
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = LitIndexer(mapper, indexer)

tb_logger = pl_loggers.TensorBoardLogger('lightning_logs/')
trainer = L.Trainer(accelerator="cpu", logger=tb_logger)
trainer.fit(model, train_dataloaders=dataloader)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.

  | Name    | Type                  | Params
--------------------------------------------------
0 | indexer | LinearRegressionModel | 2     
--------------------------------------------------
2         Trainable params
0         Non-trainable params
2         Total params
0.000     Total estimated model params size (MB)
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors

Training: |          | 0/? [00:00<?, ?it/s]

  return F.mse_loss(pred_idxs, real_idxs)
  return F.mse_loss(pred_idxs, real_idxs)


Epoch 0, Average Epoch Loss: 31570820.0000, Percent Narrowed: 0.3836
Epoch 1, Average Epoch Loss: 31431136.0000, Percent Narrowed: 0.3827
Epoch 2, Average Epoch Loss: 31451196.0000, Percent Narrowed: 0.3828
Epoch 3, Average Epoch Loss: 31433852.0000, Percent Narrowed: 0.3827
Epoch 4, Average Epoch Loss: 31481050.0000, Percent Narrowed: 0.3830
Epoch 5, Average Epoch Loss: 31448432.0000, Percent Narrowed: 0.3828
Epoch 6, Average Epoch Loss: 31438156.0000, Percent Narrowed: 0.3828
Epoch 7, Average Epoch Loss: 31446152.0000, Percent Narrowed: 0.3828
Epoch 8, Average Epoch Loss: 31444176.0000, Percent Narrowed: 0.3828
Epoch 9, Average Epoch Loss: 31470914.0000, Percent Narrowed: 0.3830
Epoch 10, Average Epoch Loss: 31404932.0000, Percent Narrowed: 0.3826
Epoch 11, Average Epoch Loss: 31439932.0000, Percent Narrowed: 0.3828
Epoch 12, Average Epoch Loss: 31415336.0000, Percent Narrowed: 0.3826
Epoch 13, Average Epoch Loss: 31451612.0000, Percent Narrowed: 0.3828
Epoch 14, Average Epoch Loss: 

/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [20]:
torch.save(model.state_dict(), "models/base_hash.pth")

In [21]:
model = LitIndexer(mapper, indexer)
model.load_state_dict(torch.load("models/base_hash.pth"))
model.eval()

LitIndexer(
  (indexer): LinearRegressionModel(
    (linear): Linear(in_features=1, out_features=1, bias=True)
  )
)

In [25]:
test_dataloader = DataLoader(dataset, batch_size=1)

for strs, idxs in test_dataloader:
    pred_idx = round(model(strs).item())
    print(pred_idx, idxs.item())

11192 0
10422 1
2961 2
9603 3
5019 4
10362 5
4148 6
5277 7
4573 8
309 9
5742 10
9379 11
564 12
4501 13
9897 14
5199 15
5472 16
8041 17
9513 18
1777 19
8103 20
2210 21
5074 22
7235 23
8988 24
2348 25
1324 26
4656 27
1947 28
7327 29
1372 30
9443 31
8597 32
90 33
4526 34
1994 35
2951 36
10645 37
2535 38
8627 39
4919 40
7897 41
2890 42
687 43
9837 44
5772 45
343 46
925 47
4646 48
3574 49
6929 50
2300 51
8964 52
1205 53
6264 54
10434 55
2711 56
792 57
3998 58
7470 59
5756 60
655 61
8782 62
7233 63
46 64
321 65
1796 66
5971 67
9634 68
5357 69
3249 70
1429 71
6473 72
5549 73
3137 74
1055 75
3690 76
2769 77
1002 78
9052 79
8956 80
3787 81
5557 82
8964 83
672 84
2041 85
8133 86
6788 87
4998 88
2442 89
5943 90
7771 91
6511 92
8534 93
7734 94
7546 95
8676 96
601 97
4146 98
4957 99
7927 100
9040 101
8232 102
1593 103
7872 104
6489 105
2444 106
9593 107
10979 108
9477 109
6607 110
7372 111
5029 112
3162 113
562 114
9587 115
661 116
7228 117
5884 118
3367 119
9866 120
5624 121
11048 122
8434 123
594

In [None]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

## Baseline Embedding

In [27]:
from models.BaselineEmbed import BaselineEmbed
from models.LinearModel import LinearRegressionModel
from pytorch_lightning import loggers as pl_loggers

token_len = 1
embed_size = 1
mapper = BaselineEmbed(dataset, token_len, embed_size)
indexer = LinearRegressionModel(mapper.max_len * embed_size)

model = LitIndexer(mapper, indexer)

tb_logger = pl_loggers.TensorBoardLogger('lightning_logs/')
trainer = L.Trainer(accelerator="cpu", logger=tb_logger)
trainer.fit(model, train_dataloaders=dataloader)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.

  | Name    | Type                  | Params
--------------------------------------------------
0 | mapper  | BaselineEmbed         | 40    
1 | indexer | LinearRegressionModel | 16    
--------------------------------------------------
56        Trainable params
0         Non-trainable params
56        Total params
0.000     Total estimated model params size (MB)
/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-

Training: |          | 0/? [00:00<?, ?it/s]

  return F.mse_loss(pred_idxs, real_idxs)
  return F.mse_loss(pred_idxs, real_idxs)


Epoch 0, Average Epoch Loss: 177849552.0000, Percent Narrowed: 0.5765
Epoch 1, Average Epoch Loss: 174741440.0000, Percent Narrowed: 0.5715
Epoch 2, Average Epoch Loss: 168409472.0000, Percent Narrowed: 0.5610
Epoch 3, Average Epoch Loss: 159533344.0000, Percent Narrowed: 0.5460
Epoch 4, Average Epoch Loss: 148791280.0000, Percent Narrowed: 0.5273
Epoch 5, Average Epoch Loss: 136944336.0000, Percent Narrowed: 0.5059
Epoch 6, Average Epoch Loss: 124680384.0000, Percent Narrowed: 0.4827
Epoch 7, Average Epoch Loss: 112585736.0000, Percent Narrowed: 0.4587
Epoch 8, Average Epoch Loss: 101169696.0000, Percent Narrowed: 0.4348
Epoch 9, Average Epoch Loss: 90848192.0000, Percent Narrowed: 0.4121
Epoch 10, Average Epoch Loss: 82091360.0000, Percent Narrowed: 0.3917
Epoch 11, Average Epoch Loss: 74765096.0000, Percent Narrowed: 0.3738
Epoch 12, Average Epoch Loss: 69001976.0000, Percent Narrowed: 0.3591
Epoch 13, Average Epoch Loss: 64593184.0000, Percent Narrowed: 0.3475
Epoch 14, Average Epo

/Users/alexanderkumar/miniconda3/envs/graphs/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [28]:
torch.save(model.state_dict(), "models/base_embed.pth")

In [29]:
model = LitIndexer(mapper, indexer)
model.load_state_dict(torch.load("models/base_embed.pth"))
model.eval()

LitIndexer(
  (mapper): BaselineEmbed(
    (embed): Embedding(40, 1)
  )
  (indexer): LinearRegressionModel(
    (linear): Linear(in_features=15, out_features=1, bias=True)
  )
)

In [30]:
test_dataloader = DataLoader(dataset, batch_size=1)

for strs, idxs in test_dataloader:
    pred_idx = round(model(strs).item())
    print(pred_idx, idxs.item())

11627 0
9209 1
9363 2
9405 3
9267 4
2371 5
12281 6
11845 7
11878 8
11999 9
11920 10
11095 11
10862 12
10957 13
11462 14
10763 15
10862 16
11589 17
10983 18
11015 19
10907 20
11413 21
11403 22
11443 23
11380 24
11536 25
11527 26
10900 27
10782 28
10902 29
10935 30
10827 31
10857 32
11300 33
11291 34
11288 35
12202 36
10896 37
12247 38
11049 39
11554 40
11545 41
11030 42
11096 43
11601 44
11063 45
11219 46
12165 47
12159 48
12157 49
11594 50
11585 51
12130 52
12083 53
11945 54
11038 55
11543 56
12072 57
11463 58
12191 59
11431 60
11419 61
11875 62
11047 63
11014 64
11233 65
11266 66
11151 67
12075 68
11967 69
11647 70
11644 71
11652 72
11650 73
11655 74
11643 75
12512 76
11099 77
11114 78
12060 79
11929 80
10969 81
12134 82
11221 83
11236 84
11201 85
10930 86
11051 87
12123 88
12015 89
11095 90
12136 91
12058 92
12052 93
11224 94
11380 95
12319 96
12352 97
12274 98
11962 99
11955 100
11953 101
11284 102
12220 103
11192 104
12753 105
11144 106
11479 107
11470 108
11467 109
11600 110
11524