In [None]:
from torch.utils.tensorboard import SummaryWriter
# TensorBoard summary writer, constructed with default arguments.
writer = SummaryWriter()

Неоптимизированный код

In [None]:
class Trainer():
    """Simple training loop for a token-level language model with optional TensorBoard logging.

    ``params`` is a mapping read for the keys ``'N_EPOCHS'``, ``'verbose'``,
    ``'verbose_freq'`` and ``'gradients'``.
    """

    def __init__(self, optimizer, params, device):
        """Store the collaborators used by :meth:`train_model`.

        Args:
            optimizer: Optimizer whose ``zero_grad``/``step`` are called once per batch.
            params: Mapping with the hyper-parameters listed in the class docstring.
            device: Device that every batch (and the sampling prompt) is moved to.
        """
        self.optimizer = optimizer
        self.params = params
        self.device = device

    def train_model(self, model, tokenizer, train_dataloader, val_dataloader, writer=None):
        """Train ``model`` and periodically report a text sample, gradients and losses.

        A report is produced whenever the running token count crosses another multiple of
        ``params['verbose_freq']`` (only if ``params['verbose'] is True``). Reports print
        the last training loss with a generated sample and, when ``writer`` is given, log
        scalars/text to it and run a validation pass.

        Args:
            model: Module called as ``model(x)``; its output is flattened over the first
                two dimensions before the cross-entropy against ``y.flatten()``.
            tokenizer: Callable returning a mapping with ``'input_ids'`` and offering
                ``decode``; used only for the text sample.
            train_dataloader: Iterable of ``(x, y)`` training batches.
            val_dataloader: Iterable of ``(x, y)`` validation batches.
            writer: Optional object with ``add_scalar``, ``add_text`` and ``close``
                (e.g. a TensorBoard ``SummaryWriter``). It is closed once, after the
                last epoch.

        Returns:
            None.
        """
        # Plain floats are stored so no autograd graph is kept alive by the history.
        train_loss = []
        tokens_get = 0
        reported_windows = 0  # how many verbose_freq-sized token windows were already reported

        for epoch in range(self.params['N_EPOCHS']):
            for x, y in train_dataloader:
                # A previous report leaves the model in eval mode; switch back lazily.
                if not model.training:
                    model.train()
                x, y = x.to(self.device), y.to(self.device)
                self.optimizer.zero_grad()
                logits = model(x)
                loss = nn.functional.cross_entropy(logits.flatten(0, 1), y.flatten())
                loss.backward()
                self.optimizer.step()
                step_loss = loss.item()
                train_loss.append(step_loss)
                tokens_get += x.numel()

                if self.params['verbose'] is True:
                    # Comparing window indices (instead of `tokens % freq == 0`) keeps the
                    # reports coming even when a batch does not land exactly on a multiple.
                    windows_seen = tokens_get // self.params['verbose_freq']
                    if windows_seen > reported_windows:
                        reported_windows = windows_seen
                        self._report(model, tokenizer, val_dataloader, writer,
                                     epoch, step_loss, train_loss)

        # Close exactly once, after all epochs, and only if a writer was supplied.
        if writer is not None:
            writer.close()

    def _report(self, model, tokenizer, val_dataloader, writer, epoch, step_loss, train_loss):
        """Print the current loss with a sample and, if ``writer`` is set, log metrics."""
        sample = self._generate_sample(model, tokenizer)
        print(f'Epoch {epoch}: Train loss = {step_loss}, sample: {sample}')
        if writer is None:
            # Without a writer nothing would consume the validation result, so skip it.
            return

        # NOTE(review): `epoch` is used as the global step, so several reports inside one
        # epoch share the same x-coordinate in TensorBoard.
        writer.add_scalar("Loss/train in step", step_loss, epoch)
        writer.add_text("Sample", str(sample), epoch)
        if self.params['gradients'] is True:
            mean_grad = self._mean_weight_grad(model)
            if mean_grad is not None:
                writer.add_scalar("train/gradients", mean_grad, epoch)

        val_mean = self._validation_loss(model, val_dataloader)
        writer.add_scalar("Loss/train in check", sum(train_loss) / len(train_loss), epoch)
        if val_mean is not None:
            writer.add_scalar("Loss/val in check", val_mean, epoch)

    def _generate_sample(self, model, tokenizer):
        """Decode a short continuation of a fixed prompt, in eval mode and without autograd.

        Relies on a module-level ``generate`` helper that is defined outside this class.
        """
        prompt_ids = torch.tensor(
            tokenizer('Я большая языковая модель и ')['input_ids'], device=self.device
        ).unsqueeze(0)
        model.eval()
        with torch.no_grad():
            generated = generate(model=model, idx=prompt_ids, max_new_tokens=25, context_size=1024)
        return tokenizer.decode(generated.squeeze(0).tolist())

    @staticmethod
    def _mean_weight_grad(model):
        """Average, over parameters named ``*weight*``, of the mean absolute gradient.

        Parameters without a gradient are skipped; returns ``None`` if none qualifies.
        """
        per_param = [
            param.grad.detach().abs().mean().item()
            for name, param in model.named_parameters()
            if 'weight' in name and param.grad is not None
        ]
        return sum(per_param) / len(per_param) if per_param else None

    def _validation_loss(self, model, val_dataloader):
        """Mean cross-entropy over one pass of ``val_dataloader`` (``None`` if it is empty).

        Leaves the model in eval mode; the training loop switches it back.
        """
        model.eval()
        batch_losses = []
        with torch.no_grad():
            for val_x, val_y in val_dataloader:
                val_x, val_y = val_x.to(self.device), val_y.to(self.device)
                val_logits = model(val_x)
                val_batch_loss = nn.functional.cross_entropy(val_logits.flatten(0, 1), val_y.flatten())
                batch_losses.append(val_batch_loss.item())
        return sum(batch_losses) / len(batch_losses) if batch_losses else None
            

Код обучения (train), сгенерированный ChatGPT

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.checkpoint import checkpoint
from torch.utils.data import DataLoader, Dataset

# --- Model Definition ---
class GPT(nn.Module):
    """Stub GPT-like model: token embedding, a stack of MLP blocks, layer norm and LM head.

    The blocks contain no attention; they are placeholders used to demonstrate the
    optimization code around the model.

    Args:
        hidden_size: Embedding width and width of every block.
        num_layers: Number of MLP blocks.
        context_size: Stored on the instance; not used by the layers defined here.
        vocab_size: Number of embedding rows and of output logits per position.
    """

    def __init__(self, hidden_size=768, num_layers=12, context_size=2048, vocab_size=50000):
        super().__init__()
        self.hidden_size = hidden_size
        self.context_size = context_size
        self.vocab_size = vocab_size
        # Placeholder for embedding
        self.token_emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_size)
        # Stub transformer blocks without attention
        self.blocks = nn.ModuleList([
            # Stub used only to demonstrate the optimization code everywhere
            # except multi-head self-attention.
            nn.Sequential(
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, hidden_size)
            ) for _ in range(num_layers)
        ])
        # Final layer norm & head
        self.ln_f = nn.LayerNorm(hidden_size)
        self.head = nn.Linear(hidden_size, vocab_size, bias=False)

    def forward(self, x):
        """Map token ids ``x`` of shape (batch, seq_len) to logits (batch, seq_len, vocab_size)."""
        x = self.token_emb(x)  # (B, T, H)
        # Recomputing activations only saves memory when a backward pass follows;
        # in eval mode or under no_grad it is pure overhead, so call the block directly.
        recompute = self.training and torch.is_grad_enabled()
        for block in self.blocks:
            if recompute:
                # use_reentrant is passed explicitly; the non-reentrant variant is the one
                # recommended by the PyTorch documentation.
                x = checkpoint(block, x, use_reentrant=False)
            else:
                x = block(x)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits

# --- Dummy Dataset ---
class TextDataset(Dataset):
    """Placeholder dataset: every method is an unimplemented stub that returns ``None``."""

    def __init__(self, txt_files):
        """Accept ``txt_files`` without using it; loading/preprocessing is still to be written."""

    def __len__(self):
        """Stub for the number of samples; currently yields ``None``."""
        return None

    def __getitem__(self, idx):
        """Stub for the tokenized input/target at ``idx``; currently yields ``None``."""
        return None

# --- Training Loop ---
def train(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer,
    device,
    epochs: int = 3,
    accum_steps: int = 4
):
    """Train ``model`` with gradient accumulation and, on CUDA, automatic mixed precision.

    Args:
        model: Module called as ``model(inputs)``; moved to ``device`` in place and
            wrapped with ``torch.compile`` for the duration of training.
        dataloader: Iterable of ``(inputs, targets)`` batches.
        optimizer: Optimizer over ``model``'s parameters.
        device: ``torch.device`` or device string to train on.
        epochs: Number of passes over ``dataloader``.
        accum_steps: Number of micro-batches whose gradients are summed per optimizer step.

    Raises:
        ValueError: If ``accum_steps`` is smaller than 1.
    """
    if accum_steps < 1:
        raise ValueError(f"accum_steps must be >= 1, got {accum_steps}")

    device = torch.device(device)
    # Autocast + loss scaling are only used on CUDA; elsewhere both are disabled so the
    # loop runs as plain full-precision training without spurious warnings.
    use_amp = device.type == "cuda"

    model.to(device)
    # Compile model for speed/memory optimizations
    model = torch.compile(model)

    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    criterion = nn.CrossEntropyLoss()

    def apply_accumulated_grads():
        # One optimizer update from the gradients accumulated so far, then reset them.
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    model.train()
    # Drop any gradients left over from before this call so they are not applied.
    optimizer.zero_grad()
    for epoch in range(epochs):
        running_loss = 0.0
        pending_micro_batches = 0
        for step, (inputs, targets) in enumerate(dataloader):
            inputs = inputs.to(device)
            targets = targets.to(device)

            with torch.autocast(device_type="cuda", enabled=use_amp):
                logits = model(inputs)
                loss = criterion(
                    logits.view(-1, logits.size(-1)),
                    targets.view(-1)
                )
                # Scale so the accumulated gradient is the mean over the window.
                loss = loss / accum_steps

            scaler.scale(loss).backward()
            pending_micro_batches += 1

            if pending_micro_batches == accum_steps:
                apply_accumulated_grads()
                pending_micro_batches = 0

            running_loss += loss.item() * accum_steps
            if (step + 1) % (accum_steps * 100) == 0:
                avg = running_loss / (accum_steps * 100)
                print(f"Epoch {epoch+1}, Step {step+1}, Loss: {avg:.4f}")
                running_loss = 0.0

        # Flush a trailing, incomplete accumulation window; otherwise its gradients are
        # never applied and leak into the first window of the next epoch.
        if pending_micro_batches:
            apply_accumulated_grads()

# --- Main Script ---
if __name__ == "__main__":
    # Prefer the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Hyperparameters: model shape, batching, optimisation schedule.
    hidden_size, num_layers, context_size = 768, 8, 2048
    batch_size, accum_steps = 4, 8
    lr, epochs = 5e-4, 5

    # Data pipeline: dataset over the text files, wrapped in a shuffling loader.
    dataset = TextDataset(txt_files=["data1.txt", "data2.txt"])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                            pin_memory=True, num_workers=4)

    # Model built from the hyperparameters above.
    model = GPT(hidden_size=hidden_size, num_layers=num_layers, context_size=context_size)

    # Optimizer: plain AdamW here; an 8-bit Adam (bitsandbytes) or Adafactor could be swapped in.
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    # For 8-bit: from bitsandbytes.optim import AdamW as AdamW8bit
    # optimizer = AdamW8bit(model.parameters(), lr=lr)

    # Run the training loop.
    train(model=model, dataloader=dataloader, optimizer=optimizer,
          device=device, epochs=epochs, accum_steps=accum_steps)

Ещё одна версия: с выбором dtype и настройкой точности matmul

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.checkpoint import checkpoint
from torch.utils.data import DataLoader, Dataset

# --- Precision and MatMul Settings ---
# Control the internal precision of float32 matrix multiplications.
# Accepted values are 'highest' (the default, full float32), 'high' and 'medium';
# the lower settings let supporting backends use faster reduced-precision math (e.g. TF32).
torch.set_float32_matmul_precision('high')

# --- Model Definition ---
class GPT(nn.Module):
    """Stub GPT-like model: token embedding, a stack of MLP blocks, layer norm and LM head.

    The blocks contain no attention; they are placeholders used to demonstrate the
    optimization code around the model.

    Args:
        hidden_size: Embedding width and width of every block.
        num_layers: Number of MLP blocks.
        context_size: Stored on the instance; not used by the layers defined here.
        dtype: Dtype the module is cast to at the end of construction via ``self.to``.
        vocab_size: Number of embedding rows and of output logits per position.
    """

    def __init__(self, hidden_size=768, num_layers=12, context_size=2048, dtype=torch.float32,
                 vocab_size=50000):
        super().__init__()
        self.hidden_size = hidden_size
        self.context_size = context_size
        self.vocab_size = vocab_size
        # Placeholder for embedding
        self.token_emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_size)
        # Stub transformer blocks without attention
        self.blocks = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, hidden_size)
            ) for _ in range(num_layers)
        ])
        # Final layer norm & head
        self.ln_f = nn.LayerNorm(hidden_size)
        self.head = nn.Linear(hidden_size, vocab_size, bias=False)
        # Cast the freshly created weights and buffers to the requested dtype
        self.to(dtype)

    def forward(self, x):
        """Map token ids ``x`` of shape (batch, seq_len) to logits (batch, seq_len, vocab_size)."""
        x = self.token_emb(x)
        # Recomputing activations only saves memory when a backward pass follows;
        # in eval mode or under no_grad it is pure overhead, so call the block directly.
        recompute = self.training and torch.is_grad_enabled()
        for block in self.blocks:
            if recompute:
                # use_reentrant is passed explicitly; the non-reentrant variant is the one
                # recommended by the PyTorch documentation.
                x = checkpoint(block, x, use_reentrant=False)
            else:
                x = block(x)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits

# --- Dummy Dataset ---
class TextDataset(Dataset):
    """Placeholder dataset: every method is an unimplemented stub that returns ``None``."""

    def __init__(self, txt_files):
        """Accept ``txt_files`` without using it; loading/preprocessing is still to be written."""

    def __len__(self):
        """Stub for the number of samples; currently yields ``None``."""
        return None

    def __getitem__(self, idx):
        """Stub for the tokenized input/target at ``idx``; currently yields ``None``."""
        return None

# --- Training Loop ---
def train(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer,
    device,
    dtype,
    epochs: int = 3,
    accum_steps: int = 4
):
    """Train ``model`` with gradient accumulation and optional fp16 mixed precision.

    With ``dtype=torch.float16`` on a CUDA device the parameters are kept in float32 and
    the forward pass runs under fp16 autocast with loss scaling. In every other case the
    model is cast to ``dtype`` and trained without autocast or loss scaling.

    Args:
        model: Module called as ``model(inputs)``; moved/cast in place and wrapped with
            ``torch.compile`` for the duration of training.
        dataloader: Iterable of ``(inputs, targets)`` batches.
        optimizer: Optimizer over ``model``'s parameters.
        device: ``torch.device`` or device string to train on.
        dtype: Requested compute dtype (see above).
        epochs: Number of passes over ``dataloader``.
        accum_steps: Number of micro-batches whose gradients are summed per optimizer step.

    Raises:
        ValueError: If ``accum_steps`` is smaller than 1.
    """
    if accum_steps < 1:
        raise ValueError(f"accum_steps must be >= 1, got {accum_steps}")

    device = torch.device(device)
    use_amp = device.type == "cuda" and dtype == torch.float16

    if use_amp:
        # GradScaler refuses to unscale fp16 gradients ("Attempting to unscale FP16
        # gradients"), so mixed precision needs float32 master weights; autocast then
        # picks fp16 per-op. The cast is in place, so the optimizer keeps valid references.
        model.to(device=device, dtype=torch.float32)
    else:
        # Move model to device and target dtype
        model.to(device).to(dtype)
    # Compile model for speed/memory optimizations
    model = torch.compile(model, backend='inductor')

    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    criterion = nn.CrossEntropyLoss()

    def apply_accumulated_grads():
        # One optimizer update from the gradients accumulated so far, then reset them.
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    model.train()
    # Drop any gradients left over from before this call so they are not applied.
    optimizer.zero_grad()
    for epoch in range(epochs):
        running_loss = 0.0
        pending_micro_batches = 0
        for step, (inputs, targets) in enumerate(dataloader):
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Use autocast only for fp16 mixed precision on CUDA
            with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=use_amp):
                logits = model(inputs)
                loss = criterion(
                    logits.view(-1, logits.size(-1)),
                    targets.view(-1)
                )
                # Scale so the accumulated gradient is the mean over the window.
                loss = loss / accum_steps

            scaler.scale(loss).backward()
            pending_micro_batches += 1

            if pending_micro_batches == accum_steps:
                apply_accumulated_grads()
                pending_micro_batches = 0

            running_loss += loss.item() * accum_steps
            if (step + 1) % (accum_steps * 100) == 0:
                avg = running_loss / (accum_steps * 100)
                print(f"Epoch {epoch+1}, Step {step+1}, Loss: {avg:.4f}")
                running_loss = 0.0

        # Flush a trailing, incomplete accumulation window; otherwise its gradients are
        # never applied and leak into the first window of the next epoch.
        if pending_micro_batches:
            apply_accumulated_grads()

# --- Main Script ---
if __name__ == "__main__":
    # Prefer the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Hyperparameters: model shape, batching, optimisation schedule.
    hidden_size, num_layers, context_size = 768, 8, 2048
    batch_size, accum_steps = 4, 8
    lr, epochs = 5e-4, 5

    # Compute dtype: torch.float16, or torch.float32 with the matmul precision set above.
    dtype = torch.float16  # or torch.float32

    # Data pipeline: dataset over the text files, wrapped in a shuffling loader.
    dataset = TextDataset(txt_files=["data1.txt", "data2.txt"])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                            pin_memory=True, num_workers=4)

    # Model built from the hyperparameters above, in the chosen dtype.
    model = GPT(hidden_size=hidden_size, num_layers=num_layers,
                context_size=context_size, dtype=dtype)

    # Optimizer over all model parameters.
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    # Run the training loop.
    train(model=model, dataloader=dataloader, optimizer=optimizer, device=device,
          dtype=dtype, epochs=epochs, accum_steps=accum_steps)
