引入所需库函数

In [None]:
import torch
from torch import Tensor

from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint

import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

import torchvision
from torchvision import transforms as tvt

from torchsummary import summary

from torchtext.data.metrics import bleu_score


import pytorch_lightning as pl

from abc import ABC, abstractmethod

from evaluate import load

import json
import re

import numpy as np
import pandas as pd

import os
import glob

对img2latex100k数据集的词典进行定义


将公式分割成标记序列（token），将标记序列（token）转换为整数序列

In [None]:
class Text(ABC):
    def __init__(self):
        self.pad_id = 0
        self.sos_id = 1
        self.eos_id = 2

    @abstractmethod
    def tokenize(self, formula: str):
        pass

    def int2text(self, x: Tensor):
        return " ".join([self.id2word[i] for i in x if i > self.eos_id])

    def text2int(self, formula: str):
        return torch.LongTensor([self.word2id[i] for i in self.tokenize(formula)])


class Text100k(Text):
    def __init__(self):
        super().__init__()
        self.id2word = json.load(open("./data/vocab/100k_vocab.json", "r"))
        self.word2id = dict(zip(self.id2word, range(len(self.id2word))))
        self.TOKENIZE_PATTERN = re.compile(
            "(\\\\[a-zA-Z]+)|" + '((\\\\)*[$-/:-?{-~!"^_`\[\]])|' + "(\w)|" + "(\\\\)"
        )
        self.n_class = len(self.id2word)

    def tokenize(self, formula: str):
        tokens = re.finditer(self.TOKENIZE_PATTERN, formula)
        tokens = list(map(lambda x: x.group(0), tokens))
        tokens = [x for x in tokens if x is not None and x != ""]
        return tokens

加载训练和预测所需的数据

In [None]:
class LatexDataset(Dataset):
    def __init__(
        self, data_path, img_path, data_type: str, n_sample: int = None, dataset="100k"
    ):
        super().__init__()
        assert data_type in ["train", "test", "validate"], "Not found data type"
        csv_path = data_path + f"/im2latex_{data_type}.csv"
        df = pd.read_csv(csv_path)
        if n_sample:
            df = df.head(n_sample)
        df["image"] = df.image.map(lambda x: img_path + "/" + x)
        self.walker = df.to_dict("records")
        # self.transform = tvt.Compose([tvt.Normalize((0.5), (0.5)),])
        self.transform = tvt.Compose([tvt.Grayscale(),])

    def __len__(self):
        return len(self.walker)

    def __getitem__(self, idx):
        item = self.walker[idx]

        formula = item["formula"]
        image = torchvision.io.read_image(item["image"])
        image = image.to(dtype=torch.float)
        image /= image.max()
        image = self.transform(image)  # transform image to [-1, 1]
        return image, formula, os.path.basename(item['image'])

class LatexPredictDataset(Dataset):
    def __init__(self, predict_img_path: str):
        super().__init__()
        if predict_img_path:
            assert os.path.exists(predict_img_path), "Image not found"
            self.walker = glob.glob(predict_img_path + '/*.png')

#             self.walker = [predict_img_path]
        else:
            self.walker = glob.glob(predict_img_path + '/*.png')
#         self.transform = tvt.Compose([tvt.Normalize((0.5), (0.5)),])
        self.transform = tvt.Compose([tvt.Grayscale(),])

    def __len__(self):
        return len(self.walker)

    def __getitem__(self, idx):
        img_path = self.walker[idx]

        image = torchvision.io.read_image(img_path)
        image = image.to(dtype=torch.float)
        image /= image.max()
        image = self.transform(image)  # transform image to [-1, 1]

        return image, os.path.basename(img_path)

组织训练、验证、测试和预测的数据加载

collate_fn对一个批次的数据进行处理和组织

接受一个批次的数据样本列表

返回元组 (image, formula, image_name)

In [None]:
class DataModule(pl.LightningDataModule):
    def __init__(
        self,
        train_set,
        val_set,
        test_set,
        predict_set,
        num_workers: int = 1,
        batch_size=20,
        text=None,
    ):
        super().__init__()
        self.train_set = train_set
        self.val_set = val_set
        self.test_set = test_set
        self.predict_set = predict_set
        self.batch_size = batch_size
        self.text = text
        self.num_workers = num_workers

    def train_dataloader(self):
        return DataLoader(
            self.train_set,
            shuffle=True,
            batch_size=self.batch_size,
            collate_fn=self.collate_fn,
            drop_last=True,
            num_workers=self.num_workers,
            persistent_workers=False,
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_set,
            shuffle=False,
            batch_size=self.batch_size,
            collate_fn=self.collate_fn,
            num_workers=self.num_workers,
            persistent_workers=False,
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_set,
            shuffle=False,
            batch_size=self.batch_size,
            collate_fn=self.collate_fn,
            num_workers=self.num_workers,
            persistent_workers=False,
        )

    def predict_dataloader(self):
        return DataLoader(self.predict_set, shuffle=False, batch_size=1,)

    def collate_fn(self, batch):
        size = len(batch)
        
        formulas = [self.text.text2int(i[1]) for i in batch]
        formula_len = torch.LongTensor([i.size(-1) + 1 for i in formulas])
        formulas = pad_sequence(formulas, batch_first=True)
        sos = torch.zeros(size, 1) + self.text.word2id["<s>"]
        eos = torch.zeros(size, 1) + self.text.word2id["<e>"]
        formulas = torch.cat((sos, formulas, eos), dim=-1).to(dtype=torch.long)

        images = [i[0] for i in batch]
        
        images_name = [i[-1] for i in batch]
        max_width, max_height = 0, 0
        for img in images:
            c, h, w = img.size()
            max_width = max(max_width, w)
            max_height = max(max_height, h)

        def padding(img):
            c, h, w = img.size()
            padder = tvt.Pad((0, 0, max_width - w, max_height - h))
            return padder(img)

        images = torch.stack(list(map(padding, images))).to(dtype=torch.float)
        
#         images = torch.stack(images).to(dtype=torch.float)
        return images, formulas, formula_len, images_name

模型部分

encoder层

cnn编码结合positional编码

In [None]:
class ConvEncoderWithPE(nn.Module):
    def __init__(self, enc_dim:int, drop_out: float=0.1):
        super(ConvEncoderWithPE, self).__init__()
        self.fe = nn.Sequential(
                nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
                nn.MaxPool2d(2, stride=2, padding=0),
                nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
                nn.MaxPool2d(2, stride=2, padding=0),
                nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(256),
                nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
                nn.MaxPool2d(kernel_size=(1,2), stride=(1,2), padding=0),
                nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(512),
                nn.MaxPool2d(kernel_size=(2,1), stride=(2,1), padding=0),
                nn.Conv2d(512, 512, kernel_size=(3,3), stride=1, padding=1),
                nn.BatchNorm2d(512)
        )
        self.dropout = nn.Dropout(drop_out)
        self.enc_dim = enc_dim
        
        self.batch_norm = nn.BatchNorm2d(512)
        
        self.div_term = []
        for i in range(0, self.enc_dim+4, 4):
            self.div_term += [i] * 2
        self.div_term = self.div_term * 2
        self.div_term = list(map(lambda x: float(x)/float(512), self.div_term))
        self.div_term = torch.tensor(self.div_term)
        self.div_term = torch.pow(10000, self.div_term)
        
        self.half_enc_dim = int(self.enc_dim / 2)
            
        
    def forward(self, x: Tensor):
        """x: Tensor (bs, h, w, c)"""
        """Return tensor size (bs, -1, c)"""
        fc = self.fe(x)
        
        fc = fc.permute(0,3,2,1)
        
        bs, h, w, c = fc.size()
        
        pe = torch.zeros(h, w, c).to(device=x.device)
        
        x_pos = torch.arange(0, w).unsqueeze(1).to(device=x.device)
        y_pos = torch.arange(0, h).unsqueeze(1).to(device=x.device)
        y_pos = y_pos.repeat(1, w).unsqueeze(-1).to(device=x.device)
    
        pe[:,:,0:self.half_enc_dim] = x_pos
        pe[:,:,self.half_enc_dim:] = y_pos
        
        pe = pe / self.enc_dim

        pe[:,:,0::2] = torch.sin(pe[:,:,0::2]).to(device=x.device)
        pe[:,:,1::2] = torch.cos(pe[:,:,1::2]).to(device=x.device)
        
        fc = fc + pe
        
        return self.dropout(fc).reshape(bs, -1, c)     

注意力层

In [None]:
class Attention(nn.Module):
    def __init__(self, enc_dim: int = 512, dec_dim: int = 512, attn_dim: int = 512):
        super().__init__()
        self.dec_attn = nn.Linear(dec_dim, attn_dim, bias=False)
        self.enc_attn = nn.Linear(enc_dim, attn_dim, bias=False)
        self.full_attn = nn.Linear(attn_dim, 1, bias=False)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, h: Tensor, V: Tensor):
        """
            input:
                h: (b, dec_dim) hidden state vector of decoder
                V: (b, w * h, enc_dim) encoder matrix representation
            output:
                context: (b, enc_dim)
        """

        attn_1 = self.dec_attn(h) # attn_1: (b, attn_dim)
        attn_2 = self.enc_attn(V) # attn_2: (b, w * h, attn_dim)
                
        # attn: (b, w*h)
        attn = self.full_attn(torch.tanh(attn_1.unsqueeze(1) + attn_2)).squeeze(2)
        
        # alpha: (b, w*h) -> each sum elements sum = 1
        alpha = self.softmax(attn)
        
        # (b, w*h, 1) * (b, w*h, dec_dim) = (b, w*h, dec_dim).sum(dim=1) -> (b, dec_dim)
        context = (alpha.unsqueeze(2) * V).sum(dim=1)
        return context


decoder层

双层双向LSTM结合注意力机制

In [None]:
import torch
from torch import nn
from image2latex.attention import Attention


class Decoder(nn.Module):
    def __init__(
        self,
        n_class: int,
        emb_dim: int = 80,
        enc_dim: int = 512,
        dec_dim: int = 512,
        attn_dim: int = 512,
        num_layers: int = 1,
        dropout: float = 0.1,
        bidirectional: bool = False,
        sos_id: int = 1,
        eos_id: int = 2,
    ):
        super().__init__()
        self.sos_id = sos_id
        self.eos_id = eos_id
        self.embedding = nn.Embedding(n_class, emb_dim)
        self.attention = Attention(enc_dim, dec_dim, attn_dim)
        self.concat = nn.Linear(emb_dim + enc_dim, dec_dim)
        self.rnn = nn.LSTM(
            dec_dim,
            dec_dim,
            num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )
        self.dropout = nn.Dropout(dropout)
        self.rnn2 = nn.LSTM(
            dec_dim,
            dec_dim,
            num_layers=1,
            batch_first=True,
            bidirectional = bidirectional,
        )
        # self.layernorm = nn.LayerNorm((dec_dim))
        self.out = nn.Linear(dec_dim, n_class)
        self.logsoftmax = nn.LogSoftmax(dim=-1)

        self.apply(self.init_weights)

    def init_weights(self, layer):
        if isinstance(layer, nn.Embedding):
            nn.init.orthogonal_(layer.weight)
        elif isinstance(layer, nn.LSTM):
            for name, param in self.rnn.named_parameters():
                if name.startswith("weight"):
                    nn.init.orthogonal_(param)

    def forward(self, y, encoder_out=None, hidden_state=None):
        """
            input:
                y: (bs, target_len)
                h: (bs, dec_dim)
                V: (bs, enc_dim, w, h)
        """

        
        h, c = hidden_state
        embed = self.embedding(y)
        attn_context = self.attention(h, encoder_out)

        rnn_input = torch.cat([embed[:, -1], attn_context], dim=1)
        rnn_input = self.concat(rnn_input)

        rnn_input = rnn_input.unsqueeze(1)
        hidden_state = h.unsqueeze(0), c.unsqueeze(0)
        
        out, hidden_state = self.rnn(rnn_input, hidden_state)
        
        out = self.dropout(out)
        
        out, hidden_state = self.rnn2(out, hidden_state)
        out = self.logsoftmax(self.out(out))
        h, c = hidden_state
        return out, (h.squeeze(0), c.squeeze(0))

Image2Latex模型定义

整合encoder，decoder层

In [None]:
class Image2Latex(nn.Module):
    def __init__(
        self,
        n_class: int,
        enc_dim: int = 512,
        enc_type: str = "conv_row_encoder",
        emb_dim: int = 80,
        dec_dim: int = 512,
        attn_dim: int = 512,
        num_layers: int = 1,
        dropout: float = 0.1,
        bidirectional: bool = False,
        decode_type: str = "greedy",
        text: Text = None,
        beam_width: int = 5,
        sos_id: int = 1,
        eos_id: int = 2,
    ):
            
        
        super().__init__()
        self.n_class = n_class
        self.encoder = ConvEncoderWithPE(enc_dim=enc_dim)
        enc_dim = self.encoder.enc_dim
        self.num_layers = num_layers
        self.decoder = Decoder(
            n_class=n_class,
            emb_dim=emb_dim,
            dec_dim=dec_dim,
            enc_dim=enc_dim,
            attn_dim=attn_dim,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=bidirectional,
            sos_id=sos_id,
            eos_id=eos_id,
        )
        self.init_h = nn.Linear(enc_dim, dec_dim)
        self.init_c = nn.Linear(enc_dim, dec_dim)
        assert decode_type in ["greedy", "beamsearch"]
        self.decode_type = decode_type
        self.text = text
        self.beam_width = beam_width

    def init_decoder_hidden_state(self, V: Tensor):
        """
            return (h, c)
        """
        # V has size (bs, -1, d)
        
        encoder_mean = V.mean(dim=1)
        h = torch.tanh(self.init_h(encoder_mean))
        c = torch.tanh(self.init_c(encoder_mean))
        return h, c

    def forward(self, x: Tensor, y: Tensor, y_len: Tensor):
        encoder_out = self.encoder(x)

        hidden_state = self.init_decoder_hidden_state(encoder_out)

        predictions = []
        for t in range(y_len.max().item()):
            dec_input = y[:, t].unsqueeze(1)
            out, hidden_state = self.decoder(dec_input, encoder_out, hidden_state)
            predictions.append(out.squeeze(1))

        predictions = torch.stack(predictions, dim=1)
        return predictions

    def decode(self, x: Tensor, max_length: int = 150):
        predict = self.decode_beam_search(x, max_length)
        return self.text.int2text(predict)

    def decode_beam_search(self, x: Tensor, max_length: int = 150):
        """
            default: batch size equal to 1
        """
        encoder_out = self.encoder(x)
        bs = encoder_out.size(0)  # 1

        hidden_state = self.init_decoder_hidden_state(encoder_out)

        list_candidate = [
            ([self.decoder.sos_id], hidden_state, 0)
        ]  # (input, hidden_state, log_prob)
        for t in range(max_length):
            new_candidates = []
            for inp, state, log_prob in list_candidate:
                # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
                y = torch.LongTensor([inp[-1]]).view(bs, -1).to(device=x.device)
                out, hidden_state = self.decoder(y, encoder_out, state)

                topk = out.topk(self.beam_width)
                
                new_log_prob = topk.values.view(-1).tolist()
                
                new_idx = topk.indices.view(-1).tolist()
                for val, idx in zip(new_log_prob, new_idx):
                    new_inp = inp + [idx]
                    new_candidates.append((new_inp, hidden_state, log_prob + val))

            new_candidates = sorted(new_candidates, key=lambda x: x[2], reverse=True)
            list_candidate = new_candidates[: self.beam_width]

        return list_candidate[0][0]


总体模型

定义优化、训练、验证、测试、预测过程

In [None]:
class Image2LatexModel(pl.LightningModule):
    def __init__(
        self,
        lr,
        total_steps,
        n_class: int,
        enc_dim: int = 512,
        enc_type: str = "conv_row_encoder",
        emb_dim: int = 80,
        dec_dim: int = 512,
        attn_dim: int = 512,
        num_layers: int = 1,
        dropout: float = 0.1,
        bidirectional: bool = False,
        decode_type: str = "greedy",
        text: Text = None,
        beam_width: int = 5,
        sos_id: int = 1,
        eos_id: int = 2,
        log_step: int = 100,
        log_text: bool = False,
    ):
        super().__init__()
        self.model = Image2Latex(
            n_class,
            enc_dim,
            enc_type,
            emb_dim,
            dec_dim,
            attn_dim,
            num_layers,
            dropout,
            bidirectional,
            decode_type,
            text,
            beam_width,
            sos_id,
            eos_id,
        )
        self.criterion = nn.CrossEntropyLoss()
        self.lr = lr
        self.total_steps = total_steps
        self.text = text
        self.max_length = 150
        self.log_step = log_step
        self.log_text = log_text
        self.exact_match = load("exact_match")
        self.save_hyperparameters()

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr, betas=(0.9, 0.98))
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer, max_lr=self.lr, total_steps=self.total_steps, verbose=False,
        )
        scheduler = {
            "scheduler": scheduler,
            "interval": "step",  # or 'epoch'
            "frequency": 1,
        }
        return [optimizer], [scheduler]

    def forward(self, images, formulas, formula_len):
        return self.model(images, formulas, formula_len)

    def training_step(self, batch, batch_idx):
        if batch_idx % 10 == 0:
            torch.cuda.empty_cache()

        images, formulas, formula_len, images_name = batch

        formulas_in = formulas[:, :-1]
        formulas_out = formulas[:, 1:]

        outputs = self.model(images, formulas_in, formula_len)

        bs, t, _ = outputs.size()
        _o = outputs.reshape(bs * t, -1)
        _t = formulas_out.reshape(-1)
        loss = self.criterion(_o, _t)

        self.log("train loss", loss, sync_dist=True)

        return loss

    def validation_step(self, batch, batch_idx):
        images, formulas, formula_len, images_name = batch

        formulas_in = formulas[:, :-1]
        formulas_out = formulas[:, 1:]

        outputs = self.model(images, formulas_in, formula_len)

        bs, t, _ = outputs.size()
        _o = outputs.reshape(bs * t, -1)
        _t = formulas_out.reshape(-1)

        loss = self.criterion(_o, _t)

        predicts = [
            self.text.tokenize(self.model.decode(i.unsqueeze(0), self.max_length))
            for i in images
        ]
        truths = [self.text.tokenize(self.text.int2text(i)) for i in formulas]

        bleu4 = torch.mean(
            torch.Tensor(
                [bleu_score([pre], [[tru]]) for pre, tru in zip(predicts, truths)]
            )
        )

        em = torch.mean(
            torch.Tensor(
                [
                    torch.tensor(
                        self.exact_match.compute(
                            predictions=[" ".join(pre)], references=[" ".join(tru)]
                        )["exact_match"]
                    )
                    for pre, tru in zip(predicts, truths)
                ]
            )
        )

        if self.log_text and batch_idx % self.log_step == 0:
            for truth, pred in zip(truths, predicts):
                print("=" * 20)
                print(f"Truth: [{' '.join(truth)}] | Predict: [{' '.join(pred)}]")
                print("=" * 20)
            print()

        self.log("val_loss", loss, sync_dist=True)
        self.log("val_bleu4", bleu4, sync_dist=True)
        self.log("val_exact_match", em, sync_dist=True)

        return bleu4, em, loss

    def test_step(self, batch, batch_idx):
        images, formulas, formula_len, images_name = batch

        formulas_in = formulas[:, :-1]
        formulas_out = formulas[:, 1:]

        outputs = self.model(images, formulas_in, formula_len)

        bs, t, _ = outputs.size()
        _o = outputs.reshape(bs * t, -1)
        _t = formulas_out.reshape(-1)

        loss = self.criterion(_o, _t)

        predicts = [
            self.text.tokenize(self.model.decode(i.unsqueeze(0), self.max_length))
            for i in images
        ]
        truths = [self.text.tokenize(self.text.int2text(i)) for i in formulas]

        bleu4 = torch.mean(
            torch.Tensor(
                [bleu_score([pre], [[tru]]) for pre, tru in zip(predicts, truths)]
            )
        )

        em = torch.mean(
            torch.Tensor(
                [
                    torch.tensor(
                        self.exact_match.compute(
                            predictions=[" ".join(pre)], references=[" ".join(tru)]
                        )["exact_match"]
                    )
                    for pre, tru in zip(predicts, truths)
                ]
            )
        )

        if True and batch_idx % self.log_step == 0:
            for truth, pred in zip(truths, predicts):
                print("=" * 20)
                print(f"Truth: [{' '.join(truth)}] | Predict: [{' '.join(pred)}]")
                print("=" * 20)
            print()

        self.log("test_loss", loss, sync_dist=True)
        self.log("test_bleu4", bleu4, sync_dist=True)
        self.log("test_exact_match", em, sync_dist=True)

        return bleu4, em, loss

    # Do things u want here at predict step
    def predict_step(self, batch, batch_idx):
        image = batch

        latex = self.model.decode(image, self.max_length)

        print("Predicted:", latex)

        return latex



训练代码

In [None]:
emb_dim = 80
dec_dim = 256
enc_dim = 512
attn_dim = 256

num_layers = 1
drop_out = 0.2
decode = "beamsearch"
beam_width=5
accumulate_batch = 64

text = Text100k()

predict_set = LatexPredictDataset('./samples')

dm = DataModule(None, None, None, predict_set, text=Text100k())

random_state=12
torch.manual_seed(random_state)
np.random.seed(random_state)

data_path = './data/input/im2latex-sorted-by-size'
img_path = './data/input/image2latex-100k/formula_images_processed/formula_images_processed/'

train_set = LatexDataset(
    data_path=data_path,
    img_path=img_path,
    data_type="train",
    n_sample=None,
    dataset="100k",
)
val_set = LatexDataset(
    data_path=data_path,
    img_path=img_path,
    data_type="validate",
    n_sample=None,
    dataset="100k",
)
test_set = LatexDataset(
    data_path=data_path,
    img_path=img_path,
    data_type="test",
    n_sample=None,
    dataset="100k",
)

# Change predict_set to a single image (one at a time)
predict_set = LatexPredictDataset(predict_img_path=img_path + '/6968dfca15.png')

lr = 0.001
max_length = 150
log_idx = 300

max_epochs = 5
batch_size = 16

steps_per_epoch = round(len(train_set) / batch_size)
total_steps = steps_per_epoch * max_epochs

num_workers = 0

dm = DataModule(
    train_set,
    val_set,
    test_set,
    predict_set,
    num_workers,
    batch_size,
    text,
)

model = Image2LatexModel(
    total_steps=total_steps,
    lr=lr,
    n_class=text.n_class,
    enc_dim=enc_dim,
    enc_type="conv_encoder",
    emb_dim=emb_dim,
    dec_dim=dec_dim,
    attn_dim=attn_dim,
    num_layers=num_layers,
    dropout=drop_out,
    sos_id=text.sos_id,
    eos_id=text.eos_id,
    decode_type="beamsearch",
    text=text,
    beam_width=beam_width,
    log_step=100,
    log_text="store_true",
    # logger=pl.loggers.TensorBoardLogger('logs/')
)


lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval="step")

max_epoch=15
# ckpt_path = './data/input/models/best_model.ckpt'
# ckpt_path = "D:/Work/VScode/AI/image-2-latex-main/image-2-latex-main/data/input/models/lightning_logs/version_56/checkpoints/epoch=2-step=3504.ckpt"
# ckpt_path = "D:/Work/VScode/AI/image-2-latex-main/image-2-latex-main/epoch=7-step=9344.ckpt"

accumulate_grad_batches = accumulate_batch //batch_size



Using the latest cached version of the module from C:\Users\16096\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--exact_match\9d3b67e0c429cd7460b2b05aab53419b48eea369b73e1d9f185a56ca90c373d4 (last modified on Sun Dec 17 16:26:11 2023) since it couldn't be found locally at evaluate-metric--exact_match, or remotely on the Hugging Face Hub.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4080 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision




Restoring states from the checkpoint path at D:/Work/VScode/AI/image-2-latex-main/image-2-latex-main/data/input/models/lightning_logs/version_56/checkpoints/epoch=2-step=3504.ckpt
d:\WorkSpace\Anaconda\envs\i2l-torch\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:347: The dirpath has changed from './data/input/models/lightning_logs\\version_56\\checkpoints' to './data/input/models/lightning_logs\\version_57\\checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | Image2Latex      | 6.3 M 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
6.3 M     Trainable params
0         Non-trainable params
6.3 M     Total params
25.370    Total estimated model params size (

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

d:\WorkSpace\Anaconda\envs\i2l-torch\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Truth: [{ \partial } _ { \mu } G ^ { { \mu } { \nu } { \sigma } } = 0 ,] | Predict: [\partial _ { \mu } F ^ { \mu \nu \nu } = 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 .]
Truth: [{ \cal X } _ { \alpha } L = D \Omega _ { \alpha } \, ,] | Predict: [{ \cal L } _ { a } { \cal L } = { \cal D } _ { \alpha } , { \cal L } = { \cal D } _ { \alpha } , \quad { \cal D } _ { \alpha } , { \cal D } _ { \alpha } , \quad { \cal D } _ { a } , \quad { \cal D } _ { \alpha } , \quad { \cal D } _ { a } ,]
Truth: [\Delta _ { \omega } ~ + ~ \Delta _ { \dot { \omega } } ~ = ~ 0] | Predict: [\Delta _ { a } \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \end { a r r a y } { c } { 0 } \ \ \ { 0 } & { a } \ \ \ { 0 } & { a } \ \ \end { a r r a y } { c } { 0 } \ \ { 0 } & { a } \ \ { 0 } & { a } \ \ { 0 } & { a } \ \ \end { a r r a y } \right )]
Truth: [( \partial _ { k _ { 1 } k _ { 2 } } h _ { i j } ) I _ { h } ^ { i j ( k _ { 1 } k _ { 2 } ) }] | Predict: [( \

d:\WorkSpace\Anaconda\envs\i2l-torch\lib\site-packages\pytorch_lightning\utilities\data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 16. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
d:\WorkSpace\Anaconda\envs\i2l-torch\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Truth: [{ \partial } _ { \mu } G ^ { { \mu } { \nu } { \sigma } } = 0 ,] | Predict: [\partial _ { \mu } G ^ { \mu \nu \sigma } = 0 ,]
Truth: [{ \cal X } _ { \alpha } L = D \Omega _ { \alpha } \, ,] | Predict: [{ \cal R } _ { \alpha } L = D \Omega _ { \alpha } ,]
Truth: [\Delta _ { \omega } ~ + ~ \Delta _ { \dot { \omega } } ~ = ~ 0] | Predict: [\Delta _ { c } \ + ~ \Delta _ { c } \ = \ 0]
Truth: [( \partial _ { k _ { 1 } k _ { 2 } } h _ { i j } ) I _ { h } ^ { i j ( k _ { 1 } k _ { 2 } ) }] | Predict: [( \partial _ { k _ { k } k _ { i } } \eta _ { i j } ^ { i ( k _ { 1 } , k _ { 2 } ) }]
Truth: [\chi _ { 0 0 } = - 2 i M \varepsilon _ { 0 }] | Predict: [\chi _ { 0 0 } = - 2 i M _ { 0 }]
Truth: [{ \vec { e } } _ { 3 } = \beta { \vec { B } } _ { 3 } + { \vec { e } } _ { 3 \perp }] | Predict: [\begin { a r r a y } { c c } { e _ { 3 } = \beta \bar { E } _ { 3 } + { \vec { S } _ { 3 } } ^ { \dagger } = \hat { E } _ { 3 } ^ { \dagger } + { \bar { s } _ { 3 } } ^ { \dagger } = \hat { E } _ { 3

d:\WorkSpace\Anaconda\envs\i2l-torch\lib\site-packages\pytorch_lightning\utilities\data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 4. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Validation: |          | 0/? [00:00<?, ?it/s]

Truth: [{ \partial } _ { \mu } G ^ { { \mu } { \nu } { \sigma } } = 0 ,] | Predict: [\partial _ { \mu } G ^ { \mu \nu \sigma } = 0 ,]
Truth: [{ \cal X } _ { \alpha } L = D \Omega _ { \alpha } \, ,] | Predict: [{ \cal R } _ { \alpha } L = D \Omega _ { \alpha } \, ,]
Truth: [\Delta _ { \omega } ~ + ~ \Delta _ { \dot { \omega } } ~ = ~ 0] | Predict: [\Delta _ { c } ~ ~ ~ ~ ~ \Delta _ { c } \; = \; 0]
Truth: [( \partial _ { k _ { 1 } k _ { 2 } } h _ { i j } ) I _ { h } ^ { i j ( k _ { 1 } k _ { 2 } ) }] | Predict: [( \partial _ { k _ { i } k _ { 2 } } h _ { i j } h _ { i j } h _ { i j } ^ { i j } h _ { i j } ) I _ { h } ^ { i j ( k , k _ { 2 } ) } .]
Truth: [\chi _ { 0 0 } = - 2 i M \varepsilon _ { 0 }] | Predict: [\chi _ { 0 0 } = - 2 i M \varepsilon _ { 0 }]
Truth: [{ \vec { e } } _ { 3 } = \beta { \vec { B } } _ { 3 } + { \vec { e } } _ { 3 \perp }] | Predict: [{ \cal E } _ { 3 } = \beta \hat { B } _ { 3 } + \bar { E } _ { 3 } + \varepsilon _ { 3 1 }]
Truth: [\mid { \bf \cdot } \, \rang

d:\WorkSpace\Anaconda\envs\i2l-torch\lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


训练

In [21]:

ckpt_path = "D:/Work/VScode/AI/image-2-latex-main/image-2-latex-main/epoch=7-step=9344.ckpt"

if __name__ == '__main__':
    trainer = pl.Trainer(
        callbacks=[lr_monitor],
        accelerator="gpu",
        devices = 1,
        log_every_n_steps=1,
        gradient_clip_val=0,
        accumulate_grad_batches=accumulate_grad_batches,
        max_epochs=max_epoch,
        default_root_dir="./data/input/models/"
    )
    
    # trainer.predict(datamodule=dm, model=model, ckpt_path="./model.ckpt")
    print("=" * 10 + "[Train]" + "=" * 10)
    # trainer.fit(datamodule=dm, model=model, ckpt_path=ckpt_path)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs




验证

In [22]:
trainer.validate(datamodule=dm, model=model, ckpt_path=ckpt_path)

d:\WorkSpace\Anaconda\envs\i2l-torch\lib\site-packages\pytorch_lightning\utilities\parsing.py:43: attribute 'text' removed from hparams because it cannot be pickled
Restoring states from the checkpoint path at D:/Work/VScode/AI/image-2-latex-main/image-2-latex-main/epoch=7-step=9344.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at D:/Work/VScode/AI/image-2-latex-main/image-2-latex-main/epoch=7-step=9344.ckpt


Validation: |          | 0/? [00:00<?, ?it/s]

Truth: [{ \partial } _ { \mu } G ^ { { \mu } { \nu } { \sigma } } = 0 ,] | Predict: [\partial _ { \mu } G ^ { \mu \sigma \sigma } = 0 ,]
Truth: [{ \cal X } _ { \alpha } L = D \Omega _ { \alpha } \, ,] | Predict: [{ \cal X } _ { \alpha } L = D \Omega _ { \alpha } \, , \qquad]
Truth: [\Delta _ { \omega } ~ + ~ \Delta _ { \dot { \omega } } ~ = ~ 0] | Predict: [\Delta _ { \omega } ~ + ~ \Delta _ { \omega } \ = \ 0]
Truth: [( \partial _ { k _ { 1 } k _ { 2 } } h _ { i j } ) I _ { h } ^ { i j ( k _ { 1 } k _ { 2 } ) }] | Predict: [( \partial _ { k _ { i j } } h _ { i j } ) I _ { i j } ^ { i j ( k , k _ { 2 } ) }]
Truth: [\chi _ { 0 0 } = - 2 i M \varepsilon _ { 0 }] | Predict: [\chi _ { 0 0 } = - 2 i M \varepsilon _ { 0 }]
Truth: [{ \vec { e } } _ { 3 } = \beta { \vec { B } } _ { 3 } + { \vec { e } } _ { 3 \perp }] | Predict: [\vec { e } _ { 3 } = \beta \bar { B } _ { 3 } + \vec { e } _ { 3 1 }]
Truth: [\mid { \bf \cdot } \, \rangle = \; \mid \psi _ { 0 } ( x ) \, \rangle _ { R }] | Predict:

测试

In [23]:
trainer.test(datamodule=dm, model=model, ckpt_path=ckpt_path)

Restoring states from the checkpoint path at D:/Work/VScode/AI/image-2-latex-main/image-2-latex-main/epoch=7-step=9344.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at D:/Work/VScode/AI/image-2-latex-main/image-2-latex-main/epoch=7-step=9344.ckpt
d:\WorkSpace\Anaconda\envs\i2l-torch\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

Truth: [\hat { \omega } _ { \bar { s } | 2 } ^ { \phantom { \mu | A } 1 } = 0 .] | Predict: [\hat { \omega } _ { x _ { j } } ^ { \ \ \ \ 1 } = 0 .]
Truth: [\Phi \stackrel { \widehat { \cal T } } { \longrightarrow } \widetilde \Phi .] | Predict: [\Phi \stackrel { \tilde { \tau } } { \tilde { \tau } } _ { - } \, \tilde { \bar { \Phi } } .]
Truth: [( \widetilde { a \ast b } ) _ { \ell } = \widetilde { a } _ { \ell } \widetilde { b }] | Predict: [( { } ^ { - } { \bf x } } b ) _ { \ell } = { \bar { a } } _ { \bar { q } } \bar { b }]
Truth: [{ \tilde { E } } _ { \alpha } = A _ { \alpha } ^ { \beta } E _ { \beta }] | Predict: [\tilde { E } _ { \alpha } = A _ { \beta } ^ { \beta } E _ { \beta }]
Truth: [V _ { i j } ~ = ~ \mathrm { d i a g } \{ \delta _ { i } \} \, .] | Predict: [V _ { i j } \ = \ \mathrm { d i a g } \{ \delta _ { i } \} \, .]
Truth: [S = -] | Predict: [S = -]
Truth: [g _ { \mu \nu } = y ^ { - 2 } \delta _ { \mu \nu } .] | Predict: [g _ { \mu \nu } = y ^ { - 2 } \delta _ { \mu 

d:\WorkSpace\Anaconda\envs\i2l-torch\lib\site-packages\pytorch_lightning\utilities\data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 10. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


[{'test_loss': 0.24313154816627502,
  'test_bleu4': 0.828191876411438,
  'test_exact_match': 0.2262944132089615}]