In [None]:
import torch
import torch.nn as nn
from torch.utils.data import random_split, DataLoader

from dataset import ChemDataset, causal_mask
import CGPT_tokenizer
from config import get_config, get_weights_file_path, latest_weights_file_path

from pathlib import Path
import pandas as pd
from CGPT_utils import *
import os
import warnings
from tqdm import tqdm

import torchmetrics
from torch.utils.tensorboard import SummaryWriter

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def get_ds(config):
    """Build both tokenizers and the train/validation dataloaders.

    The previous body was empty and returned names that were never defined in
    the function, so calling it raised ``NameError``. This implementation
    follows the same steps the training cell of this notebook performs, and
    additionally performs the 90/10 split that was sketched there.

    Args:
        config: Mapping read with the keys ``"SMILES dataset"`` (CSV path),
            ``'src_lang'``, ``'tgt_format'``, ``'seq_len'`` and
            ``'batch_size'``.

    Returns:
        A tuple ``(train_dataloader, val_dataloader, chem_tokenizer,
        text_tokenizer)``.
    """
    # Same vocabulary size the training cell of this notebook uses.
    chem_vocab_size = 500

    chem_tokenizer = CGPT_tokenizer.make_custum_tokenizer(
        csv_path=config["SMILES dataset"],
        column="SMILES",
        vocab_size=chem_vocab_size,
    )
    text_tokenizer = CGPT_tokenizer.make_default_tokenizer()

    data = pd.read_csv(config["SMILES dataset"])

    # Wrap the whole table first and split the resulting Dataset, so that
    # ChemDataset always receives a DataFrame (random_split on the DataFrame
    # itself would hand it a Subset instead).
    full_ds = ChemDataset(
        data,
        text_tokenizer,
        chem_tokenizer,
        config['src_lang'],
        config['tgt_format'],
        config['seq_len'],
    )

    # 90% for training; the remainder (so the sizes always sum to the total)
    # for validation.
    train_ds_size = int(0.9 * len(full_ds))
    validation_ds_size = len(full_ds) - train_ds_size
    train_ds, validation_ds = random_split(full_ds, [train_ds_size, validation_ds_size])

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    # Validation is consumed one example at a time.
    val_dataloader = DataLoader(validation_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, chem_tokenizer, text_tokenizer

def get_model(config, vocab_src_len, vocab_tgt_len):
    """Create the transformer for the given vocabulary sizes.

    Args:
        config: Mapping providing ``'seq_len'`` and ``'d_model'``.
        vocab_src_len: Size of the source vocabulary.
        vocab_tgt_len: Size of the target vocabulary.

    Returns:
        Whatever ``build_transformer`` returns for these arguments.
    """
    # The same length is passed twice -- presumably the source and target
    # sequence lengths; confirm against build_transformer's signature.
    seq_len = config['seq_len']
    return build_transformer(vocab_src_len, vocab_tgt_len, seq_len, seq_len, config['d_model'])


In [None]:

# for i, batch in enumerate(train_dataloader):
#     if i >= 5:  # Limit to the first 5 batches
#         break
    
#     # Print the batch
#     print(f"Batch {i + 1}:")
#     for key, value in batch.items():
#         print(f"{key}: {value}")
#     print("\n")

In [None]:
# Load the project defaults (get_config is imported in the notebook's import
# cell) and override the values that differ for this run.
config = get_config()
config['batch_size'] = 6     # examples per optimisation step
config['preload'] = None     # falsy => do not restore a checkpoint, train from scratch
config['num_epochs'] = 30    # upper bound of the epoch loop


In [None]:


# Select the compute device: CUDA first, then Apple MPS, then CPU.
# The torch.device object is created up front: while `device` was still a
# plain string, `device.index` resolved to the bound method `str.index`, which
# was then passed to the torch.cuda query functions on CUDA machines.
# Only torch.backends.mps.is_available() is consulted for MPS; the deprecated
# torch.has_mps flag reports how torch was built, not whether the device is usable.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)

if device.type == 'cuda':
    # torch.device("cuda") carries no explicit index, so query the current device.
    cuda_index = torch.cuda.current_device()
    print(f"Device name: {torch.cuda.get_device_name(cuda_index)}")
    print(f"Device memory: {torch.cuda.get_device_properties(cuda_index).total_memory / 1024 ** 3} GB")
elif device.type == 'mps':
    print("Device name: <mps>")
else:
    print("NOTE: If you have a GPU, consider using it for training.")
    print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
    print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")

# Create the checkpoint directory (and any missing parents) if it is absent.
weights_dir = Path(f"{config['model_folder']}")
weights_dir.mkdir(parents=True, exist_ok=True)

# Tokenizer / sequence settings. Only chem_vocab_size is used in this cell.
chem_vocab_size = 500
chem_seq_length = text_seq_length = 32

# One tokenizer built from the SMILES column of the CSV, one default tokenizer.
smiles_csv_path = config["SMILES dataset"]
chem_tokenizer = CGPT_tokenizer.make_custum_tokenizer(
    csv_path=smiles_csv_path,
    column="SMILES",
    vocab_size=chem_vocab_size,
)
text_tokenizer = CGPT_tokenizer.make_default_tokenizer()

data = pd.read_csv(smiles_csv_path)

# Sizes for a 90/10 split. The split itself is currently disabled, so the
# whole table is used for training and no validation loader is created.
train_ds_size = int(0.9 * len(data))
validation_ds_size = len(data) - train_ds_size
# train_ds_raw, val_ds_raw = random_split(data, [train_ds_size, validation_ds_size])
train_ds_raw = data

print(data.head())

train_ds = ChemDataset(
    train_ds_raw,
    text_tokenizer,
    chem_tokenizer,
    config['src_lang'],
    config['tgt_format'],
    config['seq_len'],
)
# validation_ds = ChemDataset(val_ds_raw, text_tokenizer, chem_tokenizer, config['src_lang'], config['tgt_format'], config['seq_len'])

train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
# val_dataloader = DataLoader(validation_ds, batch_size=1, shuffle=True)

# Sanity check: dump up to the first five encoded examples, field by field.
preview_count = min(5, len(train_ds))
for item_number in range(1, preview_count + 1):
    item = train_ds[item_number - 1]
    print(f"Item {item_number}:")
    for field_name, field_value in item.items():
        print(f"{field_name}: {field_value}")
    print("\n")
    
    
# Source vocabulary comes from the text tokenizer, target vocabulary from the
# chem tokenizer.
model = get_model(config, text_tokenizer.vocab_size, chem_tokenizer.vocab_size).to(device)
# Tensorboard
writer = SummaryWriter(config['experiment_name'])

optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

# If the user specified a model to preload before training, load it
initial_epoch = 0
global_step = 0
preload = config['preload']
if preload == 'latest':
    model_filename = latest_weights_file_path(config)
elif preload:
    model_filename = get_weights_file_path(config, preload)
else:
    model_filename = None

if model_filename:
    print(f'Preloading model {model_filename}')
    # map_location lets a checkpoint saved on one device type (e.g. CUDA) be
    # restored on whatever device this run is using.
    state = torch.load(model_filename, map_location=device)
    model.load_state_dict(state['model_state_dict'])
    # Resume with the epoch after the one stored in the checkpoint.
    initial_epoch = state['epoch'] + 1
    optimizer.load_state_dict(state['optimizer_state_dict'])
    global_step = state['global_step']
else:
    print('No model to preload, starting from scratch')

# CrossEntropyLoss needs ignore_index to be a single int, but encode() returns
# a sequence of ids (assumed to follow the Hugging Face tokenizer convention
# -- confirm). Fail early if '<pad>' is not a single token.
# NOTE(review): the model's target vocabulary is the chem tokenizer's, so the
# padding id to ignore may need to come from chem_tokenizer instead -- confirm.
pad_token_ids = text_tokenizer.encode('<pad>', add_special_tokens=False)
if len(pad_token_ids) != 1:
    raise ValueError(
        f"Expected '<pad>' to encode to exactly one token id, got {pad_token_ids!r}"
    )
loss_fn = nn.CrossEntropyLoss(ignore_index=int(pad_token_ids[0]), label_smoothing=0.1).to(device)

# Epoch loop, resuming from initial_epoch when a checkpoint was restored.
# TODO: the loop currently only moves each batch to the device; the forward
# pass, loss computation and optimizer step are not implemented yet.
batch_fields = ('encoder_input', 'decoder_input', 'encoder_mask', 'decoder_mask')
for epoch in range(initial_epoch, config['num_epochs']):
    torch.cuda.empty_cache()
    model.train()
    batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
    for batch in batch_iterator:
        # Shapes as annotated by the original author:
        #   encoder_input (b, seq_len), decoder_input (B, seq_len),
        #   encoder_mask (B, 1, 1, seq_len), decoder_mask (B, 1, seq_len, seq_len)
        encoder_input, decoder_input, encoder_mask, decoder_mask = (
            batch[field].to(device) for field in batch_fields
        )

Using device: cpu
NOTE: If you have a GPU, consider using it for training.
      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc
      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu
found tokenizer already.


             Name                                             SMILES  \
0    Benzaldehyde                                   C1=CC=C(C=C1)C=O   
1         Aspirin                           CC(=O)OC1=CC=CC=C1C(=O)O   
2  EGFR Inhibitor   C1CC1C(=O)NC2=CC=CC(=C2)NC3=NC=NC(=C3)NC4=CC=...   

                                         Description  
0   Benzaldehyde appears as a clear colorless to ...  
1   Acetylsalicylic acid appears as odorless whit...  
2                                             Ur mom  


AttributeError: 'ChemDataset' object has no attribute 'tokenizer'