# Vojta




In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding, whatever the locale says.

    ``do_setlocale`` is accepted only so the signature matches
    ``locale.getpreferredencoding``; it is not used.
    """
    return "UTF-8"


# Replace the stdlib function so every later caller sees UTF-8.
locale.getpreferredencoding = getpreferredencoding

!pip install portalocker
!pip install -q torchdata==0.3.0 torchtext==0.12 spacy==3.2 altair GPUtil
!python -m spacy download de_core_news_sm 
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.7.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.6/750.6 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m659.5/659.5 kB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.

## The dataset

 We wish to train our model on the compounds dataset available on PubChem. In PubChem terminology, a compound is a record summarizing at least one physical sample that can be obtained, analyzed, or used in an experiment. Compound records carry differing amounts of information, and only a subset contains both the SMILES string and the IUPAC name. Such compounds make up our dataset.
 To prepare the dataset we access the whole database through FTP. This is preferable to the programmatic approach, which is rate-limited. The downloaded database consists of approximately 168 million records, totalling approximately 270 gigabytes of compressed data. The data are available in chunks of XML files containing 500 000 records each; the size of one file is typically around 20 gigabytes. Due to the size of the data, we iterate over each XML file in a stream-like manner and try to extract the ID, the SMILES string, and the IUPAC name. Those compounds that contain all three are saved into a TSV file.


In [None]:
import pandas as pd
# Load the prepared compound table; the file is tab-separated.
df = pd.read_csv('concatenated_data.tsv', delimiter='\t')

In [None]:

import os
import torch
from torch.utils.data import Dataset, Subset, random_split


# Custom dataset class
class MyDataset(Dataset):
    """Map-style dataset yielding ``(SMILES, IUPAC)`` pairs from a table.

    ``data`` must support ``len()`` and lookup by the column labels
    ``" SMILES"`` and ``" IUPAC"`` (note the leading space in both labels).
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        smiles = self.data[" SMILES"][idx]
        iupac = self.data[" IUPAC"][idx]
        return (smiles, iupac)



# Wrap the loaded table so it can be indexed as (SMILES, IUPAC) pairs.
dataset = MyDataset(df)

# 80 % train / 10 % validation; the test split takes the remainder so the
# three sizes always add up to len(dataset) despite the int() truncation.
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Random split with a fixed seed (42) so the partition is reproducible.
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=torch.Generator().manual_seed(42)
)


In [None]:
df

Unnamed: 0,CID,SMILES,IUPAC
0,1,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,3-acetoxy-4-(trimethylammonio)butanoate
1,2,CC(=O)OC(CC(=O)O)C[N+](C)(C)C,(2-acetoxy-3-carboxy-propyl)-trimethyl-ammonium
2,3,C1=CC(C(C(=C1)C(=O)O)O)O,"5,6-dihydroxycyclohexa-1,3-diene-1-carboxylic ..."
3,11,C(CCl)Cl,"1,2-dichloroethane"
4,18,C(C(C(C(=O)C(=O)C(=O)O)O)O)O,"4,5,6-trihydroxy-2,3-dioxo-hexanoic acid"
...,...,...,...
376447,55226682,[B-](CN1C=CC=NC1=O)(F)(F)F,trifluoro-[(2-oxopyrimidin-1-yl)methyl]boranuide
376448,55226712,C1=CC(=C(C=C1SC2=NC=NC=C2)Cl)C(=O)O,2-chloro-4-pyrimidin-4-ylsulfanyl-benzoic acid
376449,55226859,C1=CC(=C(C=C1F)CNS(=O)(=O)C2=C(C=CS2)Br)F,"3-bromo-N-[(2,5-difluorophenyl)methyl]thiophen..."
376450,55226893,CC1=CC(=NC=N1)N2CCC(C2)(C(=O)O)C(F)(F)F,1-(6-methylpyrimidin-4-yl)-3-(trifluoromethyl)...


## Tokenization & Vocabulary 

In [None]:
from torchtext.vocab import build_vocab_from_iterator
import spacy

def char_iterator(data):
    """Yield every character of every string in ``data``, in order."""
    for text in data:
        yield from text


# Character-level vocabularies, one per side. The specials are listed in the
# order <s>, </s>, <blank>, <unk>; later code relies on the ids 0 (<s>),
# 1 (</s>) and 2 (<blank>).
vocab_source = build_vocab_from_iterator(char_iterator(df[" SMILES"]), specials=["<s>", "</s>", "<blank>", "<unk>"])
vocab_target = build_vocab_from_iterator(char_iterator(df[" IUPAC"]), specials=["<s>", "</s>", "<blank>", "<unk>"])
# Size of the source (SMILES) vocabulary, specials included.
print(len(vocab_source))


class CharacterTokenizer:
    """Callable tokenizer that splits a string into single characters.

    The ``nlp`` object is only stored on the instance; tokenization itself
    does not use it.
    """

    def __init__(self, nlp):
        self.nlp = nlp

    def __call__(self, text):
        return list(text)

# Shared tokenizer for both source and target text. The blank spaCy English
# pipeline is only stored on the instance; splitting is purely per character.
tokenizer = CharacterTokenizer(spacy.blank("en"))


71


## Model architecture

As a starting point we use the model described in "Attention Is All You Need"; the code is mostly taken from The Annotated Transformer. 
Fundamentally this model consists of two parts, an encoder and a decoder.
The encoder encodes the SMILES string into a latent space, and the decoder produces the IUPAC name from a vector in the latent space. 

In [None]:
import torch.nn as nn
class EncoderDecoder(nn.Module):
    """
    Generic encoder-decoder wrapper.

    Holds the encoder, the decoder, the two embedding pipelines and the
    output generator. ``forward`` returns the decoder output; the generator
    is stored but not applied here.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode the masked source, then decode the masked target against it."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed ``src`` and run it through the encoder."""
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run the decoder over it and the encoder memory."""
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

Our model is auto-regressive, meaning it generates one character at a time. The generator below accepts the output of the decoder (a vector) and maps it into shape (batch_size, vocab). 
Finally, log-softmax is applied to obtain the log-probability of each possible character.

In [None]:
from torch.nn.functional import log_softmax

class Generator(nn.Module):
    """Output head: linear projection to the vocabulary, then log-softmax."""

    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-probabilities over the last dimension of the projection."""
        logits = self.proj(x)
        return log_softmax(logits, dim=-1)


### Encoder

In [None]:
import copy
import torch

def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learned gain and bias.

    ``eps`` is added to the standard deviation (not the variance) before
    dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))  # gain
        self.b_2 = nn.Parameter(torch.zeros(features))  # bias
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2
    
class SublayerConnection(nn.Module):
    """
    Residual wrapper around a sublayer.

    The input is normalised *before* the sublayer runs (pre-norm), the
    sublayer output goes through dropout, and the untouched input is added
    back.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Return ``x + dropout(sublayer(norm(x)))``."""
        normed = self.norm(x)
        return x + self.dropout(sublayer(normed))
    
class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a feed-forward network.

    Each of the two steps is wrapped in its own ``SublayerConnection``.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply masked self-attention, then the feed-forward sublayer."""

        def attend(normed):
            # Query, key and value are all the same normalised input.
            return self.self_attn(normed, normed, normed, mask)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feed_forward)

class Encoder(nn.Module):
    """Stack of ``N`` cloned encoder layers followed by a final layer norm."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` through every layer with the same mask, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

### Decoder

In [None]:
class Decoder(nn.Module):
    """Stack of ``N`` cloned decoder layers followed by a final layer norm."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run ``x`` through every layer; each one gets the same memory and masks."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)
    
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, source attention, feed-forward.

    Each of the three steps is wrapped in its own ``SublayerConnection``.
    """

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply the three sublayers in order and return the result."""

        def self_attend(normed):
            # Target attends to itself under the target mask.
            return self.self_attn(normed, normed, normed, tgt_mask)

        def cross_attend(normed):
            # Target queries the encoder memory under the source mask.
            return self.src_attn(normed, memory, memory, src_mask)

        x = self.sublayer[0](x, self_attend)
        x = self.sublayer[1](x, cross_attend)
        return self.sublayer[2](x, self.feed_forward)

def subsequent_mask(size):
    """Return a ``(1, size, size)`` boolean mask hiding later positions.

    Entry ``[0, i, j]`` is True when ``j <= i``, i.e. position ``i`` may look
    at itself and at earlier positions only.
    """
    strictly_upper = torch.ones((1, size, size)).triu(diagonal=1).to(torch.uint8)
    return strictly_upper.eq(0)

### Attention

In [None]:
def attention(query, key, value, mask=None, dropout=None):
    """Compute scaled dot-product attention.

    Scores are ``query @ key^T`` divided by the square root of the query's
    last dimension. Positions where ``mask == 0`` are set to ``-1e9`` before
    the softmax. Returns ``(weighted values, attention weights)``; when
    ``dropout`` is given it is applied to the weights first.
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = torch.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    return torch.matmul(weights, value), weights

class MultiHeadedAttention(nn.Module):
    """Multi-head attention with ``h`` heads over a ``d_model``-wide input.

    ``d_model`` must be divisible by ``h``. The weights of the most recent
    call are kept in ``self.attn``.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0
        # Per-head width; the value width is assumed equal to the key width.
        self.d_k = d_model // h
        self.h = h
        # Three input projections (query, key, value) plus the output projection.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Project, attend per head, merge the heads and project once more."""
        if mask is not None:
            # Extra dimension so the same mask applies to every head.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        def split_heads(projection, tensor):
            # d_model -> (h, d_k), with the head dimension moved to position 1.
            projected = projection(tensor)
            return projected.view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        query = split_heads(self.linears[0], query)
        key = split_heads(self.linears[1], key)
        value = split_heads(self.linears[2], value)

        attended, self.attn = attention(
            query, key, value, mask=mask, dropout=self.dropout
        )

        # Put the heads back side by side in the last dimension.
        merged = attended.transpose(1, 2).contiguous()
        merged = merged.view(nbatches, -1, self.h * self.d_k)
        del query, key, value
        return self.linears[-1](merged)

### Embedding

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: linear, ReLU, dropout, linear."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = self.w_1(x).relu()
        hidden = self.dropout(hidden)
        return self.w_2(hidden)
    
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The table for positions ``0 .. max_len - 1`` is computed once and stored
    as the (non-parameter) buffer ``pe``.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        positions = torch.arange(0, max_len).unsqueeze(1)
        # Frequencies are computed in log space.
        inv_freq = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        angles = positions * inv_freq

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)  # even columns
        table[:, 1::2] = torch.cos(angles)  # odd columns
        self.register_buffer("pe", table.unsqueeze(0))

    def forward(self, x):
        # Only the first x.size(1) positions of the table are needed.
        encoding = self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x + encoding)
    
class Embeddings(nn.Module):
    """Token embedding lookup scaled by ``sqrt(d_model)``."""

    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale
    


### Full model

In [None]:
def make_model(
    src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
):
    """Build an encoder-decoder Transformer from hyperparameters.

    Args:
        src_vocab: size of the source vocabulary.
        tgt_vocab: size of the target vocabulary.
        N: number of encoder layers and of decoder layers.
        d_model: model width.
        d_ff: hidden width of the feed-forward sublayers.
        h: number of attention heads.
        dropout: dropout rate used by every sublayer, by the attention
            weights and by the positional encodings.

    Returns:
        An ``EncoderDecoder`` whose parameters with more than one dimension
        are Xavier-uniform initialised.
    """
    c = copy.deepcopy
    # Forward the requested rate; previously attention always used its own
    # default (0.1) regardless of the ``dropout`` argument.
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    # Every layer gets its own deep copy of the prototype modules.
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )

    # Initialize weight matrices with Glorot / fan_avg; one-dimensional
    # parameters (biases, layer-norm gain and bias) keep their defaults.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

## Training

In [None]:
class Batch:
    """Container for one batch plus the masks needed during training.

    ``src_mask`` marks the non-padding source positions. When ``tgt`` is
    given, it is split into the decoder input ``tgt`` (last column dropped)
    and the expected output ``tgt_y`` (first column dropped); ``ntokens``
    counts the non-padding entries of ``tgt_y``.
    """

    def __init__(self, src, tgt=None, pad=2):  # 2 = <blank>
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if tgt is None:
            return
        decoder_input = tgt[:, :-1]
        self.tgt = decoder_input
        self.tgt_y = tgt[:, 1:]
        self.tgt_mask = self.make_std_mask(decoder_input, pad)
        self.ntokens = (self.tgt_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        """Combine the padding mask with the mask that hides later positions."""
        not_padding = (tgt != pad).unsqueeze(-2)
        causal = subsequent_mask(tgt.size(-1)).type_as(not_padding.data)
        return not_padding & causal
    
class TrainState:
    """Counters for steps, examples and tokens processed in training mode."""

    step: int = 0  # Batches processed in training mode
    accum_step: int = 0  # Number of optimizer updates performed
    samples: int = 0  # total # of examples used
    tokens: int = 0  # total # of tokens processed


def run_epoch(
    data_iter,
    model,
    loss_compute,
    optimizer,
    scheduler,
    mode="train",
    accum_iter=1,
    train_state=None,
):
    """Run one pass over ``data_iter``, training unless ``mode`` says otherwise.

    Args:
        data_iter: iterable of batches exposing ``src``, ``tgt``, ``src_mask``,
            ``tgt_mask``, ``tgt_y`` and ``ntokens``.
        model: object whose ``forward(src, tgt, src_mask, tgt_mask)`` is called.
        loss_compute: callable ``(out, tgt_y, ntokens) -> (loss, loss_node)``;
            ``loss_node.backward()`` is called in training mode.
        optimizer: stepped and zeroed whenever ``i % accum_iter == 0``.
        scheduler: stepped once per batch in training mode.
        mode: ``"train"`` or ``"train+log"`` trains; anything else only
            evaluates.
        accum_iter: number of batches between optimizer updates.
        train_state: counters to update. A fresh ``TrainState`` is created
            when omitted, so separate calls never share counters.

    Returns:
        ``(total_loss / total_tokens, train_state)``.
    """
    if train_state is None:
        # A default built in the signature would be one shared instance that
        # every call without an explicit state keeps mutating.
        train_state = TrainState()
    is_training = mode in ("train", "train+log")

    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    n_accum = 0
    for i, batch in enumerate(data_iter):
        # No backward pass happens outside training, so skip building the
        # autograd graph there.
        with torch.set_grad_enabled(is_training):
            out = model.forward(
                batch.src, batch.tgt, batch.src_mask, batch.tgt_mask
            )
            loss, loss_node = loss_compute(out, batch.tgt_y, batch.ntokens)
        if is_training:
            loss_node.backward()
            train_state.step += 1
            train_state.samples += batch.src.shape[0]
            train_state.tokens += batch.ntokens
            # NOTE: this condition is true for i == 0, so the first update
            # uses a single batch; the loss is not divided by accum_iter.
            if i % accum_iter == 0:
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                n_accum += 1
                train_state.accum_step += 1
            scheduler.step()

        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 40 == 1 and is_training:
            lr = optimizer.param_groups[0]["lr"]
            elapsed = time.time() - start
            print(
                (
                    "Epoch Step: %6d | Accumulation Step: %3d | Loss: %6.2f "
                    + "| Tokens / Sec: %7.1f | Learning Rate: %6.1e"
                )
                % (i, n_accum, loss / batch.ntokens, tokens / elapsed, lr)
            )
            # Throughput is reported per logging window, so restart both.
            start = time.time()
            tokens = 0
        del loss
        del loss_node
    return total_loss / total_tokens, train_state

def rate(step, model_size, factor, warmup):
    """Learning-rate factor with linear warmup and inverse-square-root decay.

    Step 0 is treated as step 1, because ``LambdaLR`` starts at 0 and zero
    cannot be raised to a negative power.
    """
    effective_step = 1 if step == 0 else step
    decay = effective_step ** (-0.5)
    warmup_ramp = effective_step * warmup ** (-1.5)
    return factor * (model_size ** (-0.5) * min(decay, warmup_ramp))


class LabelSmoothing(nn.Module):
    """KL-divergence loss against a label-smoothed target distribution.

    The true class gets ``1 - smoothing``; the remaining mass is spread as
    ``smoothing / (size - 2)`` over the other classes. The padding class
    always gets zero, and rows whose target is padding are zeroed entirely.
    The last distribution built is kept in ``self.true_dist``.
    """

    def __init__(self, size, padding_idx, smoothing=0.0):
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction="sum")
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        assert x.size(1) == self.size
        smoothed = torch.full_like(x.data, self.smoothing / (self.size - 2))
        smoothed.scatter_(1, target.data.unsqueeze(1), self.confidence)
        smoothed[:, self.padding_idx] = 0
        # Rows whose target is the padding id contribute nothing to the loss.
        padded_rows = torch.nonzero(target.data == self.padding_idx)
        if padded_rows.dim() > 0:
            smoothed.index_fill_(0, padded_rows.squeeze(), 0.0)
        self.true_dist = smoothed
        return self.criterion(x, smoothed.clone().detach())
    

def loss(x, crit):
    """Evaluate ``crit`` on a fixed five-class prediction parameterised by ``x``.

    Class 1 gets ``x / (x + 3)``, classes 2-4 get ``1 / (x + 3)`` each and
    class 0 gets 0; the target is class 1. Returns the detached value.
    """
    denom = x + 3
    row = [0, x / denom] + [1 / denom] * 3
    predict = torch.FloatTensor([row])
    target = torch.LongTensor([1])
    return crit(predict.log(), target).data

class SimpleLossCompute:
    """Apply the generator, then the criterion on flattened predictions.

    Calling the object returns ``(loss * norm as detached data, loss)`` where
    ``loss`` is the criterion value divided by ``norm``.
    """

    def __init__(self, generator, criterion):
        self.generator = generator
        self.criterion = criterion

    def __call__(self, x, y, norm):
        scores = self.generator(x)
        # Flatten all leading dimensions: one row per predicted position.
        flat_scores = scores.contiguous().view(-1, scores.size(-1))
        flat_targets = y.contiguous().view(-1)
        sloss = self.criterion(flat_scores, flat_targets) / norm
        return sloss.data * norm, sloss


In [None]:
class DummyOptimizer(torch.optim.Optimizer):
    """No-op optimizer used where the optimizer slot must be filled.

    The base-class constructor is deliberately not called because there are
    no parameters to optimize; only ``param_groups`` is provided, as a single
    group with learning rate 0.
    """

    def __init__(self):
        self.param_groups = [{"lr": 0}]

    def step(self):
        """Do nothing."""
        return None

    def zero_grad(self, set_to_none=False):
        """Do nothing (the original body was the undefined name ``Noneo``)."""
        return None


class DummyScheduler:
    """No-op learning-rate scheduler."""

    def step(self):
        """Do nothing."""
        return None

In [None]:
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import LambdaLR
from torch.nn.functional import pad
import time



def _encode_and_pad(
    text, pipeline, vocab, bs_id, eos_id, device, max_padding, pad_id
):
    """Tokenize ``text``, wrap it in <s>/</s> ids and pad it to ``max_padding``."""
    token_ids = torch.tensor(
        vocab(pipeline(text)), dtype=torch.int64, device=device
    )
    framed = torch.cat([bs_id, token_ids, eos_id], 0)
    # warning: when the framed sequence is longer than max_padding the pad
    # amount is negative, which crops the tail (including </s>) instead.
    return pad(framed, (0, max_padding - len(framed)), value=pad_id)


def collate_batch(
    batch,
    src_pipeline,
    tgt_pipeline,
    src_vocab,
    tgt_vocab,
    device,
    max_padding=128,
    pad_id=2,
):
    """Turn a list of ``(src_text, tgt_text)`` pairs into two id tensors.

    This function used to be defined twice in a row with identical bodies;
    this single definition replaces both.

    Args:
        batch: iterable of ``(source, target)`` text pairs.
        src_pipeline, tgt_pipeline: callables turning text into tokens.
        src_vocab, tgt_vocab: callables turning a token list into ids.
        device: device on which the tensors are created.
        max_padding: length of every returned row.
        pad_id: id used to fill rows up to ``max_padding``.

    Returns:
        ``(src, tgt)``: two stacked int64 tensors with one row per pair.
    """
    bs_id = torch.tensor([0], device=device)  # <s> token id
    eos_id = torch.tensor([1], device=device)  # </s> token id
    src_list, tgt_list = [], []
    for _src, _tgt in batch:
        src_list.append(
            _encode_and_pad(
                _src, src_pipeline, src_vocab,
                bs_id, eos_id, device, max_padding, pad_id,
            )
        )
        tgt_list.append(
            _encode_and_pad(
                _tgt, tgt_pipeline, tgt_vocab,
                bs_id, eos_id, device, max_padding, pad_id,
            )
        )

    src = torch.stack(src_list)
    tgt = torch.stack(tgt_list)
    return (src, tgt)

def create_dataloaders(
    device,
    vocab_src,
    vocab_tgt,
    batch_size=12000,
    max_padding=128,
    is_distributed=True,
):
    """Build the train, validation and test ``DataLoader`` objects.

    Reads the module-level ``train_dataset``, ``val_dataset``,
    ``test_dataset`` and ``tokenizer``.

    Args:
        device: device on which ``collate_batch`` creates its tensors.
        vocab_src, vocab_tgt: vocabularies mapping tokens to ids.
        batch_size: number of examples per batch.
        max_padding: fixed sequence length produced by the collate function.
        is_distributed: use a ``DistributedSampler`` per split instead of
            plain shuffling.

    Returns:
        ``(train_dataloader, valid_dataloader, test_dataloader)``.
    """
    # Imported here because the file never imports it at top level.
    from torch.utils.data.distributed import DistributedSampler

    # Looked up once instead of rebuilding the token->id dict for every batch.
    pad_id = vocab_src.get_stoi()["<blank>"]

    def collate_fn(batch):
        return collate_batch(
            batch,
            tokenizer,
            tokenizer,
            vocab_src,
            vocab_tgt,
            device,
            max_padding=max_padding,
            pad_id=pad_id,
        )

    def make_loader(split):
        # Each split gets a sampler built over *that* split; the test sampler
        # was previously built over val_dataset.
        sampler = DistributedSampler(split) if is_distributed else None
        return DataLoader(
            split,
            batch_size=batch_size,
            # DataLoader does not accept shuffle together with a sampler.
            shuffle=(sampler is None),
            sampler=sampler,
            collate_fn=collate_fn,
        )

    train_dataloader = make_loader(train_dataset)
    valid_dataloader = make_loader(val_dataset)
    test_dataloader = make_loader(test_dataset)
    return train_dataloader, valid_dataloader, test_dataloader


def train_worker(
    gpu,
    ngpus_per_node,
    vocab_src,
    vocab_tgt,
    spacy_de,
    spacy_en,
    config,
    is_distributed=False,
):
    """Train the model on one GPU, checkpointing after every epoch.

    Args:
        gpu: CUDA device index; also used as the process rank when
            distributed.
        ngpus_per_node: number of worker processes; the configured batch
            size is divided by it.
        vocab_src, vocab_tgt: source and target vocabularies.
        spacy_de, spacy_en: unused; kept for signature compatibility.
        config: dict with the keys ``batch_size``, ``max_padding``,
            ``base_lr``, ``warmup``, ``num_epochs``, ``accum_iter``,
            ``file_prefix`` and optionally ``resume_checkpoint``.
        is_distributed: wrap the model in ``DistributedDataParallel``.

    Writes ``<file_prefix>NN.pt`` after each epoch and
    ``<file_prefix>final.pt`` at the end (main process only).
    """
    print(f"Train worker process using GPU: {gpu} for training", flush=True)
    torch.cuda.set_device(gpu)

    pad_idx = vocab_tgt["<blank>"]
    d_model = 512
    model = make_model(len(vocab_src), len(vocab_tgt), N=6)
    model.cuda(gpu)
    # ``module`` always refers to the bare model, ``model`` to the object
    # that is actually called (the DDP wrapper when distributed).
    module = model
    is_main_process = True
    if is_distributed:
        # Imported here because the file never imports them at top level.
        import torch.distributed as dist
        from torch.nn.parallel import DistributedDataParallel as DDP

        dist.init_process_group(
            "nccl", init_method="env://", rank=gpu, world_size=ngpus_per_node
        )
        model = DDP(model, device_ids=[gpu])
        module = model.module
        is_main_process = gpu == 0

    criterion = LabelSmoothing(
        size=len(vocab_tgt), padding_idx=pad_idx, smoothing=0.1
    )
    criterion.cuda(gpu)

    train_dataloader, valid_dataloader, _ = create_dataloaders(
        gpu,
        vocab_src,
        vocab_tgt,
        batch_size=config["batch_size"] // ngpus_per_node,
        max_padding=config["max_padding"],
        is_distributed=is_distributed,
    )

    optimizer = torch.optim.Adam(
        model.parameters(), lr=config["base_lr"], betas=(0.9, 0.98), eps=1e-9
    )
    lr_scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: rate(
            step, d_model, factor=1, warmup=config["warmup"]
        ),
    )

    if "resume_checkpoint" in config:
        checkpoint_path = config["resume_checkpoint"]
        checkpoint = torch.load(checkpoint_path)
        # Checkpoints store module.state_dict(), so load into the bare module;
        # loading into the DDP wrapper would not match its "module."-prefixed
        # keys.
        module.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"])
        train_state = checkpoint["train_state"]
        start_epoch = checkpoint["epoch"] + 1
        print(f"Resuming training from checkpoint: {checkpoint_path}, epoch {start_epoch}")
    else:
        train_state = TrainState()
        start_epoch = 0

    for epoch in range(start_epoch, start_epoch + config["num_epochs"]):
        if is_distributed:
            train_dataloader.sampler.set_epoch(epoch)
            valid_dataloader.sampler.set_epoch(epoch)

        model.train()
        print(f"[GPU{gpu}] Epoch {epoch} Training ====", flush=True)
        _, train_state = run_epoch(
            (Batch(b[0], b[1], pad_idx) for b in train_dataloader),
            model,
            SimpleLossCompute(module.generator, criterion),
            optimizer,
            lr_scheduler,
            mode="train+log",
            accum_iter=config["accum_iter"],
            train_state=train_state,
        )

        GPUtil.showUtilization()
        if is_main_process:
            file_path = "%s%.2d.pt" % (config["file_prefix"], epoch)
            torch.save(
                {
                    "epoch": epoch,
                    "model_state_dict": module.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "lr_scheduler_state_dict": lr_scheduler.state_dict(),
                    "train_state": train_state,
                },
                file_path,
            )
        torch.cuda.empty_cache()

        print(f"[GPU{gpu}] Epoch {epoch} Validation ====", flush=True)
        model.eval()
        # run_epoch returns (loss, state); only the loss is worth printing.
        val_loss, _ = run_epoch(
            (Batch(b[0], b[1], pad_idx) for b in valid_dataloader),
            model,
            SimpleLossCompute(module.generator, criterion),
            DummyOptimizer(),
            DummyScheduler(),
            mode="eval",
        )
        print(val_loss)
        torch.cuda.empty_cache()

    if is_main_process:
        file_path = "%sfinal.pt" % config["file_prefix"]
        torch.save(module.state_dict(), file_path)

In [None]:
!pip install torch==1.11.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/, https://download.pytorch.org/whl/cu113
Collecting torch==1.11.0+cu113
  Downloading https://download.pytorch.org/whl/cu113/torch-1.11.0%2Bcu113-cp310-cp310-linux_x86_64.whl (1637.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 GB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0
    Uninstalling torch-1.11.0:
      Successfully uninstalled torch-1.11.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.0.2+cu118 requires torch==2.0.1, but you have torch 1.11.0+cu113 which is incompatible.
torchvision 0.15.2+cu118 requires torch==2.0.1, but you have torch 1.11.0+cu113 which is incompatible.[0m[31m
[0mSuccessfully

In [None]:
from os.path import exists
import math
import GPUtil

def train_distributed_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config):
    """Spawn one ``train_worker`` process per visible GPU.

    Sets ``MASTER_ADDR`` and ``MASTER_PORT`` in the environment for the
    ``env://`` process-group initialisation done by the workers.
    """
    # Imported here because the file never imports it at top level.
    import torch.multiprocessing as mp

    ngpus = torch.cuda.device_count()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12356"
    print(f"Number of GPUs detected: {ngpus}")
    print("Spawning training processes ...")
    # mp.spawn passes the process index as the first argument (``gpu``).
    mp.spawn(
        train_worker,
        nprocs=ngpus,
        args=(ngpus, vocab_src, vocab_tgt, spacy_de, spacy_en, config, True),
    )


def train_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config):
    """Train on all GPUs or on GPU 0 only, depending on ``config["distributed"]``."""
    if not config["distributed"]:
        # Single process: GPU index 0, one worker in total.
        train_worker(
            0, 1, vocab_src, vocab_tgt, spacy_de, spacy_en, config, False
        )
        return
    train_distributed_model(vocab_src, vocab_tgt, spacy_de, spacy_en, config)

# Training configuration read by train_model / train_worker.
config = {
        "batch_size": 400,  # global batch size; train_worker divides it by the worker count
        "distributed": False,  # True -> train_model spawns one worker per GPU
        "num_epochs": 100,  # epochs to run, counted from the start (or resume) epoch
        "accum_iter": 10,  # passed to run_epoch as accum_iter
        "base_lr": 1.0,  # Adam lr, multiplied by the LambdaLR factor from rate()
        "max_padding": 72,  # fixed sequence length produced by collate_batch
        "warmup": 3000,  # warmup argument of rate()
        # Checkpoints are written as "<file_prefix>NN.pt" and "<file_prefix>final.pt".
        "file_prefix": "multi30k_model_58.pt",
        # Optional: checkpoint file to resume from.
        # "resume_checkpoint" : "multi30k_model_38.pt"
    }

def load_trained_model(force_train=False):
    """Return a model with weights loaded from the saved checkpoint.

    Training is run first when ``force_train`` is set or the checkpoint file
    does not exist.

    NOTE(review): ``train_worker`` writes ``<file_prefix>NN.pt`` and
    ``<file_prefix>final.pt``, never ``model_path`` itself, so after a fresh
    training run the load below still needs that file to be provided.
    """
    model_path = "multi30k_model_58.pt"
    if force_train or not exists(model_path):
        train_model(vocab_source, vocab_target, tokenizer, tokenizer, config)

    model = make_model(len(vocab_source), len(vocab_target), N=6)
    # The freshly built model lives on the CPU, so map the checkpoint there;
    # this also lets a GPU-saved checkpoint load on a machine without CUDA.
    checkpoint = torch.load(model_path, map_location=torch.device("cpu"))
    model.load_state_dict(checkpoint["model_state_dict"])
    return model



# Load the checkpointed model; training only runs if the checkpoint is missing.
model = load_trained_model(force_train=False)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def greedy_decode(model, src, src_mask, max_len, start_symbol, end_symbol=None):
    """Decode one sequence by always picking the most probable next token.

    Only a single source sequence is supported: the output buffer is created
    with shape ``(1, 1)`` and only the first prediction is read at each step.

    Args:
        model: object exposing ``encode``, ``decode`` and ``generator``.
        src: source token ids.
        src_mask: mask for ``src``.
        max_len: maximum length of the result, start symbol included.
        start_symbol: id placed at position 0.
        end_symbol: optional id that stops decoding once it is generated.
            With the default ``None``, exactly ``max_len`` tokens are
            returned, as before.

    Returns:
        A ``(1, length)`` tensor of token ids of the same type as ``src``.
    """
    # The chosen ids are not differentiable, so the autograd graph is never
    # needed here.
    with torch.no_grad():
        memory = model.encode(src, src_mask)
        ys = torch.zeros(1, 1).fill_(start_symbol).type_as(src.data)
        for _ in range(max_len - 1):
            out = model.decode(
                memory, src_mask, ys, subsequent_mask(ys.size(1)).type_as(src.data)
            )
            # Only the last position predicts the next token.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.data[0]
            ys = torch.cat(
                [ys, torch.zeros(1, 1).type_as(src.data).fill_(next_word)], dim=1
            )
            if end_symbol is not None and next_word.item() == end_symbol:
                break
    return ys

ahoj


In [None]:
from tqdm import tqdm

def check_outputs(
    valid_dataloader,
    model,
    vocab_src,
    vocab_tgt,
    pad_idx=2,
    eos_string="</s>",
    max_len=72,
):
    """Greedy-decode every batch and return the exact-match accuracy.

    Only the first example of each batch is compared, so the loader is
    expected to use ``batch_size=1``. Mismatches are printed as they occur
    and a running accuracy is printed every 10 samples.

    Args:
        valid_dataloader: loader yielding ``(src, tgt)`` id tensors.
        model: model passed to ``greedy_decode``.
        vocab_src: unused; kept for signature compatibility.
        vocab_tgt: target vocabulary, used to turn ids back into text.
        pad_idx: id of the padding token, skipped when building strings.
        eos_string: text of the end-of-sequence token.
        max_len: maximum decoded length (previously hard-coded to 72).

    Returns:
        Fraction of batches whose prediction equals the target string, or
        ``0.0`` for an empty loader.
    """
    total_examples = len(valid_dataloader)
    if total_examples == 0:
        return 0.0

    # get_itos() builds a list, so fetch it once rather than once per token.
    tgt_itos = vocab_tgt.get_itos()
    correct = 0

    # Decode with dropout disabled, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with tqdm(total=total_examples, desc='Validation Progress') as pbar:
            for i, b in enumerate(valid_dataloader):
                # The old test `i % 10` fired on every sample *except*
                # multiples of 10.
                if i and i % 10 == 0:
                    print("Samples", str(i), "Accuracy", correct / i)
                rb = Batch(b[0], b[1], pad_idx)

                target = "".join(
                    tgt_itos[x] for x in rb.tgt[0].tolist() if x != pad_idx
                ).replace("\n", "")

                # A single decode; the extra call whose result was thrown
                # away has been removed.
                model_out = greedy_decode(
                    model, rb.src, rb.src_mask, max_len, 0
                )[0]
                predicted = (
                    "".join(
                        tgt_itos[x] for x in model_out.tolist() if x != pad_idx
                    ).split(eos_string, 1)[0]
                    + eos_string
                ).replace("\n", "")

                correct += predicted == target
                pbar.update(1)
                if predicted != target:
                    print("\nTARGET:", target)
                    print("PREDICTED:", predicted)
    finally:
        model.train(was_training)

    return correct / total_examples

def run_model_example(n_examples=5, checkpoint_path="multi30k_model_58.pt"):
    """Load a trained checkpoint on CPU and evaluate it on the validation data.

    Reads the module-level ``vocab_source`` and ``vocab_target`` vocabularies.

    Args:
        n_examples: Currently unused; kept so existing callers keep working.
        checkpoint_path: Path of the checkpoint file. It must contain a
            ``"model_state_dict"`` entry.

    Returns:
        Tuple ``(model, example_data)`` where ``example_data`` is whatever
        ``check_outputs`` returns for the validation dataloader. The returned
        model is left in evaluation mode; call ``model.train()`` before
        resuming training with it.
    """
    print("Preparing Data ...")
    _, _, valid_dataloader = create_dataloaders(
        torch.device("cpu"),
        vocab_source,
        vocab_target,
        batch_size=1,
        is_distributed=False,
    )

    print("Loading Trained Model ...")

    model = make_model(len(vocab_source), len(vocab_target), N=6)
    checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))
    model.load_state_dict(checkpoint["model_state_dict"])
    # Switch to evaluation mode so train-only layers such as dropout do not
    # perturb the predictions being scored.
    model.eval()

    print("Checking Model Outputs:")
    example_data = check_outputs(
        valid_dataloader, model, vocab_source, vocab_target,
    )
    return model, example_data


# Run the evaluation; the returned (model, result) tuple is discarded.
run_model_example()
# NOTE(review): looks like a leftover debug marker printed once the run
# finishes; confirm before removing.
print("Pepa")

Preparing Data ...
Loading Trained Model ...
Checking Model Outputs:


Validation Progress:   0%|          | 1/37646 [00:04<44:35:58,  4.27s/it]

Samples 1 Accuracy 1.0


Validation Progress:   0%|          | 2/37646 [00:09<47:33:30,  4.55s/it]

Samples 2 Accuracy 1.0


Validation Progress:   0%|          | 3/37646 [00:14<51:38:29,  4.94s/it]

Samples 3 Accuracy 1.0


Validation Progress:   0%|          | 4/37646 [00:18<49:11:29,  4.70s/it]


TARGET: <s>mercury(1+)</s>
PREDICTED: <s>mercuric(1+)</s>
Samples 4 Accuracy 0.75


Validation Progress:   0%|          | 5/37646 [00:23<47:47:58,  4.57s/it]

Samples 5 Accuracy 0.8


Validation Progress:   0%|          | 6/37646 [00:28<49:07:59,  4.70s/it]

Samples 6 Accuracy 0.8333333333333334


Validation Progress:   0%|          | 7/37646 [00:32<49:05:51,  4.70s/it]

Samples 7 Accuracy 0.8571428571428571


Validation Progress:   0%|          | 8/37646 [00:37<47:49:12,  4.57s/it]

Samples 8 Accuracy 0.875


Validation Progress:   0%|          | 9/37646 [00:41<47:14:46,  4.52s/it]

Samples 9 Accuracy 0.8888888888888888


Validation Progress:   0%|          | 11/37646 [00:50<47:42:35,  4.56s/it]

Samples 11 Accuracy 0.9090909090909091


Validation Progress:   0%|          | 12/37646 [00:55<46:39:27,  4.46s/it]

Samples 12 Accuracy 0.9166666666666666


Validation Progress:   0%|          | 13/37646 [01:00<48:32:38,  4.64s/it]


TARGET: <s>N-(2-methyl-2,3-dihydro-1,5-benzothiazepin-4-yl)hydroxylamine</s>
PREDICTED: <s>N-(2-methyl-3,4-dihydro-2H-1,5-benzothiazepin-4-yl)hydroxylamine</s>
Samples 13 Accuracy 0.8461538461538461


Validation Progress:   0%|          | 14/37646 [01:04<48:10:49,  4.61s/it]

Samples 14 Accuracy 0.8571428571428571


Validation Progress:   0%|          | 15/37646 [01:08<47:08:27,  4.51s/it]

Samples 15 Accuracy 0.8666666666666667


Validation Progress:   0%|          | 16/37646 [01:13<47:07:31,  4.51s/it]


TARGET: <s>neptunium;oxygen(2-)</s>
PREDICTED: <s>oxygen(2-);propane</s>
Samples 16 Accuracy 0.8125


Validation Progress:   0%|          | 17/37646 [01:18<48:37:45,  4.65s/it]

Samples 17 Accuracy 0.8235294117647058


Validation Progress:   0%|          | 18/37646 [01:22<47:38:38,  4.56s/it]


TARGET: <s>methyl 2-(4-amino-5,6-dihydrothieno[2,3-d]pyrimidin-2-yl)acetate</s>
PREDICTED: <s>methyl 2-(6-amino-5,7-dihydrothieno[2,3-d]pyrimidin-4-yl)acetate</s>
Samples 18 Accuracy 0.7777777777777778


Validation Progress:   0%|          | 19/37646 [01:27<46:48:41,  4.48s/it]

Samples 19 Accuracy 0.7894736842105263


Validation Progress:   0%|          | 21/37646 [01:36<48:09:23,  4.61s/it]

Samples 21 Accuracy 0.8095238095238095


Validation Progress:   0%|          | 22/37646 [01:40<47:08:41,  4.51s/it]


TARGET: <s>4-phenylthieno[3,2-c]pyridine</s>
PREDICTED: <s>7-phenylthieno[2,3-c]pyridine</s>
Samples 22 Accuracy 0.7727272727272727


Validation Progress:   0%|          | 23/37646 [01:45<46:46:46,  4.48s/it]

Samples 23 Accuracy 0.782608695652174


Validation Progress:   0%|          | 24/37646 [01:50<49:16:43,  4.72s/it]

Samples 24 Accuracy 0.7916666666666666


Validation Progress:   0%|          | 25/37646 [01:54<47:45:31,  4.57s/it]

Samples 25 Accuracy 0.8


Validation Progress:   0%|          | 26/37646 [01:58<46:55:43,  4.49s/it]

Samples 26 Accuracy 0.8076923076923077


Validation Progress:   0%|          | 27/37646 [02:04<48:51:31,  4.68s/it]

Samples 27 Accuracy 0.8148148148148148


Validation Progress:   0%|          | 28/37646 [02:08<48:02:20,  4.60s/it]

Samples 28 Accuracy 0.8214285714285714


Validation Progress:   0%|          | 29/37646 [02:12<47:11:07,  4.52s/it]

Samples 29 Accuracy 0.8275862068965517


Validation Progress:   0%|          | 31/37646 [02:22<48:36:11,  4.65s/it]

Samples 31 Accuracy 0.8387096774193549


Validation Progress:   0%|          | 32/37646 [02:26<47:20:58,  4.53s/it]

Samples 32 Accuracy 0.84375


Validation Progress:   0%|          | 33/37646 [02:30<46:29:39,  4.45s/it]

Samples 33 Accuracy 0.8484848484848485


Validation Progress:   0%|          | 34/37646 [02:35<47:33:02,  4.55s/it]

Samples 34 Accuracy 0.8529411764705882


Validation Progress:   0%|          | 35/37646 [02:40<47:34:04,  4.55s/it]


TARGET: <s>(2R)-3-amino-2-(methylamino)-3-oxo-propanedithioic acid</s>
PREDICTED: <s>(2R)-2-(methylamino)-2-oxo-ethanedithioic acid</s>
Samples 35 Accuracy 0.8285714285714286


Validation Progress:   0%|          | 36/37646 [02:44<46:32:27,  4.45s/it]

Samples 36 Accuracy 0.8333333333333334


Validation Progress:   0%|          | 37/37646 [02:48<46:29:49,  4.45s/it]

Samples 37 Accuracy 0.8378378378378378


Validation Progress:   0%|          | 38/37646 [02:53<48:37:17,  4.65s/it]

Samples 38 Accuracy 0.8421052631578947


Validation Progress:   0%|          | 39/37646 [02:58<47:27:19,  4.54s/it]

Samples 39 Accuracy 0.8461538461538461


Validation Progress:   0%|          | 41/37646 [03:07<49:00:07,  4.69s/it]

Samples 41 Accuracy 0.8536585365853658


Validation Progress:   0%|          | 42/37646 [03:12<47:55:27,  4.59s/it]


TARGET: <s>5-chloro-N-[(5-iodo-2-furyl)methyl]-2-nitro-aniline</s>
PREDICTED: <s>5-chloro-4-[(5-iodo-2-furyl)methyl]-2-nitro-aniline</s>
Samples 42 Accuracy 0.8333333333333334


Validation Progress:   0%|          | 43/37646 [03:16<46:55:39,  4.49s/it]


TARGET: <s>6-oxa-2-azabicyclo[3.1.0]hexan-4-one</s>
PREDICTED: <s>6-oxa-2-azabicyclo[3.1.0]hexan-3-one</s>
Samples 43 Accuracy 0.813953488372093


Validation Progress:   0%|          | 44/37646 [03:21<48:08:26,  4.61s/it]

Samples 44 Accuracy 0.8181818181818182


Validation Progress:   0%|          | 45/37646 [03:25<48:22:52,  4.63s/it]

Samples 45 Accuracy 0.8222222222222222


Validation Progress:   0%|          | 46/37646 [03:30<47:11:41,  4.52s/it]


TARGET: <s>2-thioxo-1-[2-(trifluoromethoxy)phenyl]pyrimidin-4-one</s>
PREDICTED: <s>1-[2-(trifluoromethoxy)phenyl]-2-thioxo-pyrimidin-4-one</s>
Samples 46 Accuracy 0.8043478260869565


Validation Progress:   0%|          | 47/37646 [03:34<46:46:36,  4.48s/it]

Samples 47 Accuracy 0.8085106382978723


Validation Progress:   0%|          | 48/37646 [03:39<48:16:40,  4.62s/it]

Samples 48 Accuracy 0.8125


Validation Progress:   0%|          | 49/37646 [03:43<47:18:28,  4.53s/it]

Samples 49 Accuracy 0.8163265306122449


Validation Progress:   0%|          | 50/37646 [03:48<46:26:45,  4.45s/it]


TARGET: <s>6,9-dihydrodibenzofuran-3-amine</s>
PREDICTED: <s>4,9-dihydrodibenzofuran-2-amine</s>


Validation Progress:   0%|          | 51/37646 [03:53<48:13:50,  4.62s/it]

Samples 51 Accuracy 0.803921568627451


Validation Progress:   0%|          | 52/37646 [03:57<48:25:22,  4.64s/it]

Samples 52 Accuracy 0.8076923076923077


Validation Progress:   0%|          | 53/37646 [04:02<49:11:27,  4.71s/it]


TARGET: <s>1-(dideuteriomethyl)-3-methoxy-2-nitro-benzene</s>
PREDICTED: <s>1-deuterio-3-[deuterio(nitro)methyl]-2-methoxy-benzene</s>
Samples 53 Accuracy 0.7924528301886793


Validation Progress:   0%|          | 54/37646 [04:07<50:34:35,  4.84s/it]

Samples 54 Accuracy 0.7962962962962963


Validation Progress:   0%|          | 55/37646 [04:12<49:47:09,  4.77s/it]


TARGET: <s>(2S,3R,4S)-2-[(1S)-1,2-dihydroxyethyl]pyrrolidine-3,4-diol</s>
PREDICTED: <s>(2R,3S,4S)-2-[(1R)-1,2-dihydroxyethyl]pyrrolidine-3,4-diol</s>
Samples 55 Accuracy 0.7818181818181819


Validation Progress:   0%|          | 56/37646 [04:16<48:05:06,  4.61s/it]


TARGET: <s>3-isopropyl-4,5-dimethyl-aniline</s>
PREDICTED: <s>4-isopropyl-2,3-dimethyl-aniline</s>
Samples 56 Accuracy 0.7678571428571429


Validation Progress:   0%|          | 57/37646 [04:21<47:54:16,  4.59s/it]


TARGET: <s>(2R,3S,4S)-3,4-dihydroxy-2-methyl-pyrrolidine-2-carboxylic acid</s>
PREDICTED: <s>(2R,3S,4R)-3,4-dihydroxy-2-methyl-pyrrolidine-2-carboxylic acid</s>
Samples 57 Accuracy 0.7543859649122807


Validation Progress:   0%|          | 58/37646 [04:26<49:38:43,  4.75s/it]

Samples 58 Accuracy 0.7586206896551724


Validation Progress:   0%|          | 59/37646 [04:30<48:05:30,  4.61s/it]


TARGET: <s>2-cyano-N-methyl-N-(2-methylbutyl)acetamide</s>
PREDICTED: <s>2-cyano-N-(2-methylbutyl)-N-methyl-acetamide</s>
Samples 59 Accuracy 0.7457627118644068


Validation Progress:   0%|          | 61/37646 [04:39<48:06:35,  4.61s/it]

Samples 61 Accuracy 0.7540983606557377


Validation Progress:   0%|          | 62/37646 [04:44<47:52:45,  4.59s/it]


TARGET: <s>N,2-dimethylbutanediamide</s>
PREDICTED: <s>N',2-dimethylbutanediamide</s>
Samples 62 Accuracy 0.7419354838709677


Validation Progress:   0%|          | 63/37646 [04:48<46:44:39,  4.48s/it]

Samples 63 Accuracy 0.746031746031746


Validation Progress:   0%|          | 64/37646 [04:52<46:54:23,  4.49s/it]


TARGET: <s>[2-(1,3-dithiol-2-ylidene)-1-methyl-ethyl]sulfonyl butanoate</s>
PREDICTED: <s>1,2-bis(1,3-dithiol-2-ylidene)ethyl butanoate</s>
Samples 64 Accuracy 0.734375


Validation Progress:   0%|          | 65/37646 [04:57<48:20:55,  4.63s/it]

Samples 65 Accuracy 0.7384615384615385


Validation Progress:   0%|          | 66/37646 [05:02<47:12:59,  4.52s/it]


TARGET: <s>3-amino-N-tert-butoxy-N,2,2-trimethyl-propanamide</s>
PREDICTED: <s>3-amino-N-tert-butoxy-N-methyl-2-methyl-propanamide</s>
Samples 66 Accuracy 0.7272727272727273


Validation Progress:   0%|          | 67/37646 [05:06<46:24:25,  4.45s/it]

Samples 67 Accuracy 0.7313432835820896


Validation Progress:   0%|          | 68/37646 [05:11<47:50:17,  4.58s/it]

Samples 68 Accuracy 0.7352941176470589


Validation Progress:   0%|          | 69/37646 [05:15<47:22:20,  4.54s/it]

Samples 69 Accuracy 0.7391304347826086


Validation Progress:   0%|          | 71/37646 [05:25<48:20:18,  4.63s/it]

Samples 71 Accuracy 0.7464788732394366


Validation Progress:   0%|          | 72/37646 [05:29<48:26:41,  4.64s/it]


TARGET: <s>(1R,4S)-5,6-dimethyl-2-azabicyclo[2.2.1]heptan-3-one</s>
PREDICTED: <s>(1R,4R)-4,5-dimethyl-2-azabicyclo[2.2.1]heptan-2-one</s>
Samples 72 Accuracy 0.7361111111111112


Validation Progress:   0%|          | 73/37646 [05:34<47:13:18,  4.52s/it]

Samples 73 Accuracy 0.7397260273972602


Validation Progress:   0%|          | 74/37646 [05:38<46:38:58,  4.47s/it]


TARGET: <s>4-hydroxy-2,3,4,5-tetrahydro-2-benzazepin-1-one</s>
PREDICTED: <s>3-hydroxy-2,5,6,8-tetrahydro-3-2-benzazazepin-1-one</s>
Samples 74 Accuracy 0.7297297297297297


Validation Progress:   0%|          | 75/37646 [05:43<48:57:00,  4.69s/it]

Samples 75 Accuracy 0.7333333333333333


Validation Progress:   0%|          | 76/37646 [05:47<47:24:43,  4.54s/it]

Samples 76 Accuracy 0.7368421052631579


Validation Progress:   0%|          | 77/37646 [05:51<46:20:49,  4.44s/it]

Samples 77 Accuracy 0.7402597402597403


Validation Progress:   0%|          | 78/37646 [05:56<47:24:29,  4.54s/it]

Samples 78 Accuracy 0.7435897435897436


Validation Progress:   0%|          | 79/37646 [06:01<47:42:20,  4.57s/it]

Samples 79 Accuracy 0.7468354430379747


Validation Progress:   0%|          | 81/37646 [06:10<47:03:42,  4.51s/it]

Samples 81 Accuracy 0.7530864197530864


Validation Progress:   0%|          | 82/37646 [06:15<48:51:14,  4.68s/it]

Samples 82 Accuracy 0.7560975609756098


Validation Progress:   0%|          | 83/37646 [06:19<47:31:34,  4.55s/it]

Samples 83 Accuracy 0.7590361445783133


Validation Progress:   0%|          | 84/37646 [06:23<46:37:19,  4.47s/it]


TARGET: <s>2-(1,3-dithian-2-ylidene)tetralin-1-one</s>
PREDICTED: <s>2-(1,3-dithian-2-ylidene)indan-1-one</s>
Samples 84 Accuracy 0.75


Validation Progress:   0%|          | 85/37646 [06:29<49:37:32,  4.76s/it]

Samples 85 Accuracy 0.7529411764705882


Validation Progress:   0%|          | 86/37646 [06:33<48:05:31,  4.61s/it]

Samples 86 Accuracy 0.7558139534883721


Validation Progress:   0%|          | 87/37646 [06:37<46:57:22,  4.50s/it]

Samples 87 Accuracy 0.7586206896551724


Validation Progress:   0%|          | 88/37646 [06:42<47:34:56,  4.56s/it]

Samples 88 Accuracy 0.7613636363636364


Validation Progress:   0%|          | 89/37646 [06:47<47:52:15,  4.59s/it]

Samples 89 Accuracy 0.7640449438202247


Validation Progress:   0%|          | 91/37646 [06:55<46:32:45,  4.46s/it]


TARGET: <s>3-(3,5-dinitro-1,2,4-triazol-1-yl)propanoic acid</s>
PREDICTED: <s>3-(4,5-dinitro-1,2,4-triazol-1-yl)propanoic acid</s>
Samples 91 Accuracy 0.7582417582417582


Validation Progress:   0%|          | 91/37646 [06:58<47:58:45,  4.60s/it]


KeyboardInterrupt: ignored