# Train Generative Model on Peptide Binding Dataset
### Import libraries

In [1]:
import sys
sys.path.append('..')
from peptide_array_generative.datasets.peptides import PeptideDataset
from peptide_array_generative.models.film import FiLMNet
from peptide_array_generative.models.mlp import MLP
from peptide_array_generative.diffusion.schedules import CosineSchedule
from peptide_array_generative.diffusion.categorical import MultinomialDiffusion
from peptide_array_generative.trainers.regression import RegressionTrainer
import random
from torch.utils.data import DataLoader

### Create dataloaders

In [2]:
# Draw one seed and pass the SAME value to both datasets below.
# NOTE(review): presumably PeptideDataset uses random_seed to partition the CSV
# into train/test, so sharing it keeps the two splits consistent — confirm
# against the dataset implementation.
#
# The upper bound is the integer 10**9, not the float literal 1e9:
# random.randint only accepts integer bounds. Float bounds were deprecated in
# Python 3.10 and raise a TypeError from Python 3.12 onwards.
random_seed = random.randint(0, 10**9)

# Training split: small shuffled batches.
data_loader_train = DataLoader(PeptideDataset(
    dataset_path='../data/peptides/Average_Uninfected.csv',
    train=True,
    remove_gsg=True,
    random_seed=random_seed
), batch_size=32, shuffle=True)

# Held-out split: same CSV and seed, with train=False and larger batches.
data_loader_test = DataLoader(PeptideDataset(
    dataset_path='../data/peptides/Average_Uninfected.csv',
    train=False,
    remove_gsg=True,
    random_seed=random_seed
), batch_size=1000, shuffle=True)

### Train validation model

In [3]:
# Peek at one training batch purely to read tensor shapes for sizing the network.
# `sequences` and `labels` are kept as notebook-level names for later cells.
sequences, labels = next(iter(data_loader_train))

# The MLP takes axes 1 and 2 of the sequence tensor flattened into one vector.
mlp_input_dim = sequences.shape[1] * sequences.shape[2]

# Regressor with a single scalar output.
mlp_config = dict(
    input_dim=mlp_input_dim,
    hidden_dim=256,
    output_dim=1,
)
neural_network = MLP(**mlp_config)

# Wrap the network in a trainer that fits on the train loader and reports
# loss on the test loader (see the "Test loss" lines in the cell output).
validation_model = RegressionTrainer(
    neural_network=neural_network,
    data_loader_train=data_loader_train,
    data_loader_test=data_loader_test,
)
validation_model.train()

2025-03-22 22:35:29 - Epoch 0
100%|██████████| 3458/3458 [00:34<00:00, 99.56it/s, loss=0.0916] 
2025-03-22 22:36:04 - Test loss: 0.04657
2025-03-22 22:36:04 - Epoch 1
100%|██████████| 3458/3458 [00:31<00:00, 110.19it/s, loss=0.0396]
2025-03-22 22:36:36 - Test loss: 0.04412
2025-03-22 22:36:36 - Epoch 2
100%|██████████| 3458/3458 [00:31<00:00, 109.95it/s, loss=0.0228]
2025-03-22 22:37:07 - Test loss: 0.04358
2025-03-22 22:37:07 - Epoch 3
100%|██████████| 3458/3458 [00:31<00:00, 108.92it/s, loss=0.026] 
2025-03-22 22:37:39 - Test loss: 0.04369
2025-03-22 22:37:39 - Epoch 4
100%|██████████| 3458/3458 [00:31<00:00, 109.21it/s, loss=0.037] 
2025-03-22 22:38:11 - Test loss: 0.04532
2025-03-22 22:38:11 - Epoch 5
100%|██████████| 3458/3458 [00:31<00:00, 109.01it/s, loss=0.0313]
2025-03-22 22:38:43 - Test loss: 0.04126
2025-03-22 22:38:43 - Epoch 6
100%|██████████| 3458/3458 [00:32<00:00, 105.98it/s, loss=0.0562]
2025-03-22 22:39:15 - Test loss: 0.03891
2025-03-22 22:39:16 - Epoch 7
100%|██████

### Train generative model

In [6]:
# The network maps a flattened sequence tensor (axes 1 and 2 multiplied out)
# to an output of the same size, conditioned on the label vector.
flat_dim = sequences.shape[1] * sequences.shape[2]
label_dim = labels.shape[-1]

neural_network = FiLMNet(
    input_dim=flat_dim,
    hidden_dim=256,
    output_dim=flat_dim,
    condition_dim=label_dim,
    hidden_layers=1,
)

# 100-step cosine noise schedule for the diffusion process.
noise_schedule = CosineSchedule(num_steps=100)

# Build the diffusion trainer, then train it. The previously trained regressor
# (validation_model.model) is passed in as the validation model; the cell
# output logs a "Validation loss" after each epoch.
diffusion = MultinomialDiffusion(
    data_loader=data_loader_train,
    neural_network=neural_network,
    noise_schedule=noise_schedule,
)
diffusion.train(validation_model=validation_model.model)

2025-03-22 22:54:34 - Epoch 0
100%|██████████| 3458/3458 [01:07<00:00, 50.99it/s, loss=0.149] 
99it [00:00, 138.35it/s]
2025-03-22 22:55:43 - Validation loss: 0.56969
2025-03-22 22:55:43 - Epoch 1
100%|██████████| 3458/3458 [01:05<00:00, 53.04it/s, loss=0.103] 
99it [00:00, 231.68it/s]
2025-03-22 22:56:49 - Validation loss: 0.40164
2025-03-22 22:56:49 - Epoch 2
100%|██████████| 3458/3458 [01:05<00:00, 52.71it/s, loss=0.144] 
99it [00:00, 231.84it/s]
2025-03-22 22:57:55 - Validation loss: 0.25522
2025-03-22 22:57:55 - Epoch 3
100%|██████████| 3458/3458 [01:05<00:00, 52.46it/s, loss=0.187] 
99it [00:00, 241.31it/s]
2025-03-22 22:59:02 - Validation loss: 0.16235
2025-03-22 22:59:02 - Epoch 4
100%|██████████| 3458/3458 [01:02<00:00, 54.98it/s, loss=0.183] 
99it [00:00, 242.90it/s]
2025-03-22 23:00:05 - Validation loss: 0.16017
2025-03-22 23:00:05 - Epoch 5
100%|██████████| 3458/3458 [01:04<00:00, 53.59it/s, loss=0.0979]
99it [00:00, 234.22it/s]
2025-03-22 23:01:10 - Validation loss: 0.1482