# load model


In [1]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

# cav-mae 的整体模型定义并导入预训练权重

In [4]:
import os
import torch
import timm
from src.models import CAVMAE

# Fail fast when a different timm release is installed.
assert timm.__version__ == '0.4.5' # it is important to have right version of timm

model_path = 'pretrained_model/audio_model.pth'

# Restrict this process to physical GPUs 6 and 7 (seen by torch as cuda:0 and cuda:1).
# NOTE: this only takes effect if CUDA has not been initialised yet in this kernel,
# so run this cell right after a kernel restart.
os.environ["CUDA_VISIBLE_DEVICES"] = "6,7" # select GPU

# Make the second visible GPU the current device, but only when it actually exists.
# An unconditional set_device(1) raises on CPU-only hosts or when fewer than two GPUs
# are visible, which would defeat the CPU fallback used for `device` below.
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    torch.cuda.set_device(1)

# CAV-MAE model with decoder
audio_model = CAVMAE(audio_length=1024,
                     modality_specific_depth=11,
                     norm_pix_loss=True, tr_pos=False) # most models are trained with pixel normalization and non-trainable positional embedding

# "cuda" resolves to the current CUDA device; fall back to CPU when no GPU is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SECURITY NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
mdl_weight = torch.load(model_path, map_location=device)

# it is important to convert the model to dataparallel object as all weights are saved
# in dataparallel format (i.e., in module.xxx)
audio_model = torch.nn.DataParallel(audio_model)

# strict=False tolerates key mismatches; the two returned lists report them.
miss, unexpected = audio_model.load_state_dict(mdl_weight, strict=False)
print('miss=',miss, 'unexpected=', unexpected) # check if all weights are correctly loaded

A CAV-MAE Model
Use norm_pix_loss:  True
Learnable Positional Embedding:  False
Number of Audio Patches: 512, Visual Patches: 196
Audio Positional Embedding Shape: torch.Size([1, 512, 768])
Visual Positional Embedding Shape: torch.Size([1, 196, 768])
miss= ['module.blocks_maedfer.patch_embed.proj.weight', 'module.blocks_maedfer.patch_embed.proj.bias', 'module.blocks_maedfer.blocks.0.norm1.weight', 'module.blocks_maedfer.blocks.0.norm1.bias', 'module.blocks_maedfer.blocks.0.attn.q_bias', 'module.blocks_maedfer.blocks.0.attn.v_bias', 'module.blocks_maedfer.blocks.0.attn.qkv.weight', 'module.blocks_maedfer.blocks.0.attn.proj.weight', 'module.blocks_maedfer.blocks.0.attn.proj.bias', 'module.blocks_maedfer.blocks.0.norm2.weight', 'module.blocks_maedfer.blocks.0.norm2.bias', 'module.blocks_maedfer.blocks.0.mlp.fc1.weight', 'module.blocks_maedfer.blocks.0.mlp.fc1.bias', 'module.blocks_maedfer.blocks.0.mlp.fc2.weight', 'module.blocks_maedfer.blocks.0.mlp.fc2.bias', 'module.blocks_maedfer.block

# 定义cav-mae模型，查看其中的video模块输出维度

In [1]:
import os
import torch
import timm
from src.models import CAVMAE

# Build a CAV-MAE instance (no checkpoint is loaded in this cell) so the
# shapes flowing through its visual branch can be inspected.
cav_mae_kwargs = {
    "audio_length": 1024,
    "modality_specific_depth": 11,
    "norm_pix_loss": True,
    "tr_pos": False,
}
cav_mae_model = CAVMAE(**cav_mae_kwargs)

A CAV-MAE Model
Use norm_pix_loss:  True
Learnable Positional Embedding:  False
Number of Audio Patches: 512, Visual Patches: 196
Audio Positional Embedding Shape: torch.Size([1, 512, 768])
Visual Positional Embedding Shape: torch.Size([1, 196, 768])


In [18]:
# Random dummy inputs sharing one batch size, used only to run a forward pass.
# NOTE(review): confirm the axis order expected for the audio tensor (128 x 1024 here)
# against CAVMAE.forward.
batch_size = 7
audio_data = torch.rand((batch_size, 128, 1024))
video_data = torch.rand((batch_size, 3, 224, 224))

# Single forward pass through the model with both modalities.
output = cav_mae_model(audio_data, video_data)

before visual encoder torch.Size([7, 49, 768])
after visual encoder torch.Size([7, 49, 768])


# 定义mae dfer模型

In [2]:
from src.models import modeling_finetune
import torch
from src.models.modeling_finetune import vit_base_patch16_160

# Define the model: call vit_base_patch16_160 with its default arguments.
model1 = vit_base_patch16_160()


# 用maedfer 进行一次前向

In [4]:

# Create random input data.
# NOTE(review): presumably (batch, channels, frames, height, width) — confirm against the model.
input_data = torch.rand((1, 3, 16, 160, 160))

# Run one forward pass; with save_feature=True the call returns two values,
# unpacked here as the output and a feature tensor.
output1, output1_feature = model1(input_data, save_feature=True)

# The classifier is reset only after the forward pass above, so the tensors
# printed below were produced before the reset.
model1.reset_classifier(num_classes=10)

# Print the feature shape first, then the output shape, one per line.
print(output1_feature.shape, output1.shape, sep="\n")


torch.Size([1, 3, 16, 160, 160])
torch.Size([1, 3, 16, 160, 160])
torch.Size([1, 768])
torch.Size([1, 1000])
