In [None]:
import torch
from torchvision.models.detection import fcos_resnet50_fpn,FCOS_ResNet50_FPN_Weights
from torchvision.transforms import ToTensor
from torchvision.datasets import VOCDetection
from torch.utils.data import DataLoader
import torchvision.transforms.functional as F
import matplotlib.pyplot as plt

In [None]:
# Training / evaluation hyper-parameters
num_classes = 21  # number of object classes in VOC, background included
batch_size = 2
num_epochs = 10

# Load the PASCAL VOC 2007 detection splits; ToTensor converts each image to a float tensor
train_dataset = VOCDetection(root='./', year='2007', image_set='train', transform=ToTensor())
test_dataset = VOCDetection(root='./', year='2007', image_set='test', transform=ToTensor())


def detection_collate(batch):
    """Regroup a list of (image, annotation) samples into (images, annotations) tuples.

    The default collate function stacks the images into a single tensor, which
    fails as soon as two images in a batch differ in size. Keeping the samples
    in plain tuples avoids the stacking step altogether.
    """
    return tuple(zip(*batch))


# Build the data loaders.
# The training loader honours `batch_size` (previously declared but ignored) and
# reshuffles the samples every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          collate_fn=detection_collate)
# The test loader keeps batch_size=1 with the default collate, so each step
# yields a single (1, C, H, W) image tensor.
test_loader = DataLoader(test_dataset, batch_size=1)

In [None]:
# Load the FCOS detector (ResNet-50 + FPN backbone) with its default pretrained weights
pretrained_weights = FCOS_ResNet50_FPN_Weights.DEFAULT
model = fcos_resnet50_fpn(weights=pretrained_weights)
# Switch to inference mode; as the last expression this also displays the model
model.eval()

In [None]:
# Use the GPU when one is available, otherwise the CPU, and move the model there
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

In [None]:
# Set up the optimizer and a loss object
sgd_settings = {'lr': 0.001, 'momentum': 0.9}
optimizer = torch.optim.SGD(model.parameters(), **sgd_settings)
# Note: the training loop in this notebook sums the losses returned by the
# model itself and does not reference `criterion`.
criterion = torch.nn.CrossEntropyLoss()

In [None]:
# Train the model

# PASCAL VOC class names; the 1-based position in this tuple is used as the
# label id (0 is left for background).
# NOTE(review): the model loaded in this notebook keeps its pretrained
# classification head, so these ids only need to be valid class indices for it;
# confirm whether the head should be replaced to match `num_classes`.
VOC_CLASSES = (
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
    'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
)
VOC_CLASS_TO_LABEL = {name: idx for idx, name in enumerate(VOC_CLASSES, start=1)}


def _unwrap(value):
    """Return the sole element of a one-item list/tuple, otherwise the value itself.

    With batch_size=1 the default DataLoader collate wraps every annotation
    string in a one-element list; this undoes that wrapping so the same parsing
    code works on raw and on collated annotations.
    """
    if isinstance(value, (list, tuple)) and len(value) == 1:
        return value[0]
    return value


def voc_annotation_to_target(annotation, device):
    """Convert one VOCDetection annotation into a torchvision detection target.

    Args:
        annotation: the ``{'annotation': {...}}`` dict produced by VOCDetection,
            either raw or passed through the default collate at batch size 1.
        device: device on which the target tensors are created.

    Returns:
        A dict with ``'boxes'`` (float32, shape (N, 4), [xmin, ymin, xmax, ymax])
        and ``'labels'`` (int64, shape (N,)).

    Raises:
        KeyError: if an object name is not one of ``VOC_CLASSES``.
    """
    objects = annotation['annotation'].get('object', [])
    if isinstance(objects, dict):  # tolerate a single object that is not wrapped in a list
        objects = [objects]

    boxes, labels = [], []
    for obj in objects:
        bndbox = obj['bndbox']
        boxes.append([float(_unwrap(bndbox[key])) for key in ('xmin', 'ymin', 'xmax', 'ymax')])
        labels.append(VOC_CLASS_TO_LABEL[_unwrap(obj['name'])])

    return {
        # reshape keeps the (0, 4) shape when an image has no objects
        'boxes': torch.tensor(boxes, dtype=torch.float32, device=device).reshape(-1, 4),
        'labels': torch.tensor(labels, dtype=torch.int64, device=device),
    }


for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    num_batches = 0
    for images, targets in train_loader:
        images = [image.to(device) for image in images]
        # The default collate (batch_size=1) yields a single annotation dict,
        # while a tuple-style collate yields a sequence of dicts; accept both.
        if isinstance(targets, dict):
            targets = [targets]
        targets = [voc_annotation_to_target(t, device) for t in targets]

        optimizer.zero_grad()
        # In training mode the detection model returns a dict of named losses
        loss_dict = model(images, targets)

        # Total loss is the sum of the individual loss terms
        loss = sum(loss_dict.values())

        # Back-propagate and update the weights
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch rather than only the last batch's loss
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss / max(num_batches, 1):.4f}")

In [None]:
# Run the detector on the first few test images and draw the predicted boxes.
# The training cell leaves the model in training mode, in which torchvision
# detection models require targets; switch to inference mode before predicting.
model.eval()

for i, (image, target) in enumerate(test_loader):
    # Drop the batch dimension (the loader yields a (1, C, H, W) tensor) and move the image to the device
    images = [image.squeeze(0).to(device)]

    with torch.no_grad():
        predictions = model(images)  # no targets are passed at inference time

    # Predicted boxes for the single image in this batch
    boxes = predictions[0]['boxes']

    # Show the image with the predicted boxes overlaid
    image = F.to_pil_image(image.squeeze(0).cpu())
    plt.imshow(image)
    ax = plt.gca()

    for box in boxes:
        xmin, ymin, xmax, ymax = box.tolist()
        rect = plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, fill=False, edgecolor='r', linewidth=2)
        ax.add_patch(rect)

    plt.axis('off')
    plt.show()

    if i == 3:  # only visualize four test images
        break

In [None]:
# 结果可视化
import matplotlib.pyplot as plt
def rescale(box,h,w):
    """Divide box coordinates by the resize factor implied by an image of size (h, w).

    The factor is the one that brings the shorter of ``h``/``w`` to 512, unless
    that would push the longer side past 800, in which case it is the factor
    that brings the longer side to 800. Presumably this maps boxes predicted on
    a resized image back to the original image -- confirm against the resize
    step used by the caller.

    Columns 0-3 of ``box`` are overwritten in place and the same object is
    returned.
    """
    target_short, target_long = 512, 800
    short_edge, long_edge = sorted((h, w))

    factor = target_short / short_edge
    if factor * long_edge > target_long:
        # Scaling by the short side would overshoot the long-side limit
        factor = target_long / long_edge

    # x (columns 0, 2) and y (columns 1, 3) coordinates share the same factor
    coord_cols = [0, 1, 2, 3]
    box[:, coord_cols] = box[:, coord_cols] / factor
    return box
def draw(s):
    """Run the detector on one sample image, overlay confident detections, and save the figure.

    Args:
        s: base name (without ".jpg") of the image under
            /kaggle/input/nn-hw2-inference/. It is also the text label written
            next to every drawn box, the colour lookup key, and the stem of the
            saved file; predicted class ids are not consulted.

    Side effects:
        Creates a matplotlib figure and writes it to
        /kaggle/working/{s}_fcos.jpg. Puts the global ``model`` in eval mode.
    """
    c_d = {'person':'blue','cat':'red','dog':'brown'}
    color = c_d.get(s, 'green')  # fixed fallback colour for names outside the table

    # Read the image with matplotlib (already imported by the notebook)
    img = plt.imread(f'/kaggle/input/nn-hw2-inference/{s}.jpg')
    plt.figure()
    plt.imshow(img)

    # copy(): the decoded array may be read-only, which torch warns about
    img_tensor = F.to_tensor(img.copy())

    # Run on whichever device the model currently lives on instead of assuming CUDA
    model_device = next(model.parameters()).device
    model.eval()  # the training cell leaves the model in training mode
    with torch.no_grad():
        out = model([img_tensor.to(model_device)])

    # torchvision detection models return one dict per input image; the boxes
    # are already expressed in the input image's coordinates, so no manual
    # rescaling is applied here.
    detections = out[0]
    scores = detections['scores'].cpu().tolist()
    boxes = detections['boxes'].cpu().tolist()

    ax = plt.gca()
    for score, (x1, y1, x2, y2) in zip(scores, boxes):
        if score > 0.5:
            ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                       color=color, fill=False, linewidth=1))
            ax.text(x1, y1, f"{s} {round(score,2)}", bbox={'facecolor': color, 'alpha':0.5})
    plt.savefig(f"/kaggle/working/{s}_fcos.jpg")
# Produce and save the detection figure for each sample image
for sample_name in ('person', 'cat', 'dog'):
    draw(sample_name)