In [None]:
import sys
import os
import random
import numpy as np
from tqdm import tqdm
from datetime import datetime

import wandb

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

import timm

sys.path.append("../ML710-Project")
sys.path.append("../ML710-Project/model_and_hyperpam")
sys.path.append("../ML710-Project/dataset")

from dataset.classification_dataset import (
    train_datasets,
)

from model_and_hyperpam import (
    SEED,
    BATCH_SIZE,
    EPOCHS,
    WANDB_API,
)

from parallel.model_parallel_class import ViTModelParallel

# This notebook spreads one model over several GPUs, so stop right away without CUDA.
if not torch.cuda.is_available():
    raise RuntimeError('CUDA is not available')

# Seed every random source in use: Python hashing, `random`, NumPy, and PyTorch (CPU + CUDA).
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Run with half of the imported batch size and half of the imported epoch count.
BATCH_SIZE //= 2
EPOCHS //= 2

# Pretrained ViT-L/16 with a 5-class head, handed to the imported model-parallel wrapper.
model_base = timm.create_model(
    'vit_large_patch16_224',
    pretrained=True,
    num_classes=5,
)

device_ids = [0, 1]
model_parallel = ViTModelParallel(model_base, device_ids)

# Authenticate with Weights & Biases and open a run whose name ends in a timestamp.
wandb.login(key=WANDB_API)
run = wandb.init(
    project='ml710_project',
    entity='arcticfox',
    name=f"vit_model_parallel__{datetime.now():%Y%m%d_%H%M%S}",
    job_type="training",
    reinit=True,
)

# Shuffled training batches, loaded by 4 workers into pinned host memory.
train_dataloader = DataLoader(
    train_datasets,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
# test_dataloader = DataLoader(test_datasets, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)


def train(model, train_loader, optimizer, criterion, epoch, device=f'cuda:{device_ids[0]}'):
    """Run a plain supervised training loop and log the mean loss of every epoch.

    Args:
        model: Module to train. If all of its parameters live on one device it
            is moved to ``device``; if its parameters already span several
            devices (a model-parallel module) the placement is left untouched.
        train_loader: Iterable yielding ``(data, target)`` batches; must
            support ``len()``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Callable ``criterion(output, target)`` returning a scalar loss.
        epoch: Number of epochs to run.
        device: Device the input batches are sent to (the first stage of a
            model-parallel module).

    Returns:
        The trained model.
    """
    # Calling .to(device) on a model that is sharded across GPUs would pull every
    # shard onto one device and break its forward pass, so only move models that
    # sit on a single device (or have no parameters at all).
    param_devices = {param.device for param in model.parameters()}
    if len(param_devices) <= 1:
        model = model.to(device)
    model.train()
    for ep in tqdm(range(epoch)):
        this_epoch_loss = 0
        for data, target in train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            output = model(data)
            # The output may live on a different device than the input (last
            # pipeline stage), so the labels follow the output.
            target = target.to(output.device)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            this_epoch_loss += loss.item()
        # Average of the per-batch losses over the epoch.
        this_epoch_loss /= len(train_loader)
        wandb.log({"train_loss": this_epoch_loss})
    return model

# AdamW over all parameters of the model-parallel ViT, learning rate 1e-3.
optimizer = optim.AdamW(
    model_parallel.parameters(),
    lr=1e-3,
)
# Cross-entropy loss between the model output and the class targets.
criterion = nn.CrossEntropyLoss()

In [1]:
import torch
import torch.nn as nn
import timm

class ViTModelParallel(nn.Module):
    """A timm Vision Transformer whose layers are spread over several devices.

    The patch embedding, class token and positional embedding stay on the first
    device, the transformer blocks are divided into contiguous stages (one per
    device, in order), and the final norm and classification head sit on the
    last device.

    Args:
        model_name: timm model name passed to ``timm.create_model``.
        num_classes: Size of the classification head.
        device_ids: Ordered sequence of devices (e.g. ``['cuda:0', 'cuda:1']``).

    NOTE(review): the forward pass assumes class-token pooling (``x[:, 0]``) and
    that any extra pre/post layers of the timm model are identities, as the
    original implementation did -- confirm for models other than plain ViTs.
    """

    def __init__(self, model_name, num_classes, device_ids):
        super().__init__()

        self.device_ids = device_ids

        self.model = timm.create_model(model_name, pretrained=True, num_classes=num_classes)

        # Start with everything on the first device so that bare parameters such
        # as cls_token / pos_embed (which are not sub-modules) are placed too.
        self.model.to(device_ids[0])

        # Assign the layers to the different devices.
        self.model.patch_embed = self.model.patch_embed.to(device_ids[0])
        self.model.pos_drop = self.model.pos_drop.to(device_ids[0])

        # Contiguous stages: block i goes to device floor(i * n_devices / n_blocks),
        # so activations change device once per stage instead of once per block.
        blocks = list(self.model.blocks)
        self._block_devices = [
            device_ids[i * len(device_ids) // len(blocks)] for i in range(len(blocks))
        ]
        self.model.blocks = nn.ModuleList(
            [block.to(dev) for block, dev in zip(blocks, self._block_devices)]
        )

        self.model.norm = self.model.norm.to(device_ids[-1])
        self.model.head = self.model.head.to(device_ids[-1])
        # Kept for backward compatibility; not used by forward().
        self.glob_pool = nn.AdaptiveAvgPool2d((1, 1)).to(self.device_ids[-1])

    def forward(self, x):
        """Return the head output for the image batch ``x``.

        The result lives on the last device in ``device_ids``.
        """
        x = x.to(self.device_ids[0])
        x = self.model.patch_embed(x)

        if hasattr(self.model, '_pos_embed'):
            # Newer timm releases bundle class-token concat, positional
            # embedding and pos_drop in this helper.
            x = self.model._pos_embed(x)
        else:
            # Prepend the class token and add the positional embedding; without
            # this, x[:, 0] below would be the first patch, not the class token.
            cls_token = self.model.cls_token.expand(x.shape[0], -1, -1)
            x = torch.cat((cls_token, x), dim=1)
            x = self.model.pos_drop(x + self.model.pos_embed)

        for block, dev in zip(self.model.blocks, self._block_devices):
            x = block(x.to(dev))

        x = x.to(self.device_ids[-1])
        x = self.model.norm(x)
        x = x[:, 0]  # class-token representation
        x = self.model.head(x)

        return x

def main():
    """Smoke test: build the model-parallel ViT on two GPUs and print its output shape."""
    gpus = ['cuda:0', 'cuda:1']  # set the GPU devices to use here
    arch, n_classes = 'vit_large_patch16_224', 5

    model_parallel = ViTModelParallel(arch, n_classes, gpus)

    # Test the model on a random batch of 8 images of size 3x224x224.
    dummy_batch = torch.randn(8, 3, 224, 224).to(gpus[0])
    print(model_parallel(dummy_batch).shape)

# Run the smoke test as soon as the cell executes.
main()

torch.Size([8, 5])


In [None]:
import torch.distributed as dist

In [None]:
import torch
import torch.nn as nn
# Scratch inspection: a bare expression as the last line of the cell, so the
# notebook just displays `nn.parallel`; nothing is assigned.
nn.parallel

In [None]:
def main():
    """Build ViTModelParallel on two GPUs, push one random batch through it and print the shape.

    This repeats the `main` defined in an earlier cell; running this cell
    rebinds the name.
    """
    device_ids = ['cuda:0', 'cuda:1']  # set the GPU devices to use here

    model_parallel = ViTModelParallel(
        'vit_large_patch16_224',  # model_name
        5,                        # num_classes
        device_ids,
    )

    # Test the model.
    x = torch.randn(8, 3, 224, 224)
    x = x.to(device_ids[0])
    output = model_parallel(x)
    print(output.shape)

# Execute the smoke test defined in this cell.
main()

In [None]:
# Single-GPU baseline: the same timm model, placed entirely on the first device.
device_ids = ['cuda:0', 'cuda:1']
model_no_parallel = timm.create_model(
    'vit_large_patch16_224',
    pretrained=True,
    num_classes=5,
).to(device_ids[0])

# Random batch of 8 images (3x224x224), generated on the CPU and then moved to the GPU.
x = torch.randn(8, 3, 224, 224).to(device_ids[0])
output_no_parallel = model_no_parallel(x)
print(output_no_parallel.shape)

In [1]:
import timm

# Pretrained ViT-L/16 (224px input) with a 5-class head.
model = timm.create_model(
    'vit_large_patch16_224',
    pretrained=True,
    num_classes=5,
)

In [2]:
# Split model.blocks into two halves: blocks_1 gets the first len // 2 blocks,
# blocks_2 gets the rest. model.blocks itself is left in place.
model.blocks_1, model.blocks_2 = (
    nn.ModuleList(model.blocks[part])
    for part in (
        slice(None, len(model.blocks) // 2),
        slice(len(model.blocks) // 2, None),
    )
)

Sequential(
  (0): Block(
    (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
    (attn): Attention(
      (qkv): Linear(in_features=1024, out_features=3072, bias=True)
      (attn_drop): Dropout(p=0.0, inplace=False)
      (proj): Linear(in_features=1024, out_features=1024, bias=True)
      (proj_drop): Dropout(p=0.0, inplace=False)
    )
    (drop_path): Identity()
    (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
    (mlp): Mlp(
      (fc1): Linear(in_features=1024, out_features=4096, bias=True)
      (act): GELU(approximate='none')
      (drop1): Dropout(p=0.0, inplace=False)
      (fc2): Linear(in_features=4096, out_features=1024, bias=True)
      (drop2): Dropout(p=0.0, inplace=False)
    )
  )
  (1): Block(
    (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
    (attn): Attention(
      (qkv): Linear(in_features=1024, out_features=3072, bias=True)
      (attn_drop): Dropout(p=0.0, inplace=False)
      (proj): Linear(in_features=1

In [None]:
# Display the shape of what forward_features returns for the batch `x`
# (bare last-line expression, so the notebook shows the value).
model_no_parallel.forward_features(x).shape