In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('./'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./best_resnet50_biomass.pth
./notebookaaee1b289c.ipynb
./sample_submission.csv
./submission.csv
./test.csv
./train.csv
./.git\COMMIT_EDITMSG
./.git\config
./.git\description
./.git\HEAD
./.git\index
./.git\hooks\applypatch-msg.sample
./.git\hooks\commit-msg.sample
./.git\hooks\fsmonitor-watchman.sample
./.git\hooks\post-update.sample
./.git\hooks\pre-applypatch.sample
./.git\hooks\pre-commit.sample
./.git\hooks\pre-merge-commit.sample
./.git\hooks\pre-push.sample
./.git\hooks\pre-rebase.sample
./.git\hooks\pre-receive.sample
./.git\hooks\prepare-commit-msg.sample
./.git\hooks\push-to-checkout.sample
./.git\hooks\sendemail-validate.sample
./.git\hooks\update.sample
./.git\info\exclude
./.git\logs\HEAD
./.git\logs\refs\heads\main
./.git\logs\refs\remotes\origin\main
./.git\objects\00\1bd38740cab369c072e9becbae7a46d7265039
./.git\objects\01\9d86259807a1ed5e18bcea7f624899f9ae4114
./.git\objects\01\af8d4c00513f5679ead628362b1f378bb5a98a
./.git\objects\01\f0cfc8716cb0bea1391e4fd07fd42b1f0fa1

In [5]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# -------------------------- 1. 配置全局参数 --------------------------
# 设备配置（自动识别GPU/CPU）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

"""
# 数据路径（Kaggle竞赛数据默认路径，无需修改）
TRAIN_IMAGE_DIR = "/kaggle/input/csiro-biomass/train"
TRAIN_CSV_PATH = "/kaggle/input/csiro-biomass/train.csv"
TEST_CSV_PATH = "/kaggle/input/csiro-biomass/test.csv"
SUBMISSION_CSV_PATH = "/kaggle/input/csiro-biomass/sample_submission.csv"
"""
TRAIN_IMAGE_DIR = "./train"
TEST_IMAGE_DIR = "./test"
TRAIN_CSV_PATH = "./train.csv"
TEST_CSV_PATH = "./test.csv"
SUBMISSION_CSV_PATH = "./sample_submission.csv"
# 训练参数
BATCH_SIZE = 32  # 根据GPU显存调整（Kaggle免费GPU建议32/64）
EPOCHS = 20  # 可根据验证集性能调整
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-5  # 防止过拟合
NUM_TARGETS = 5  # 竞赛需预测的5类生物量

# 定义目标变量名称列表（需与竞赛要求一致）
target_names = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']    

使用设备: cpu


In [None]:
# -------------------------- 2. 数据预处理与Dataset定义 --------------------------
# 图像预处理（训练集添加数据增强，测试集仅基础预处理）
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # ResNet默认输入尺寸
    transforms.RandomHorizontalFlip(p=0.5),  # 随机水平翻转
    transforms.RandomVerticalFlip(p=0.3),  # 随机垂直翻转
    transforms.RandomRotation(degrees=15),  # 随机旋转±15°
    transforms.ColorJitter(brightness=0.2, contrast=0.2),  # 亮度/对比度调整
    transforms.ToTensor(),  # 转为Tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet预训练均值
                         std=[0.229, 0.224, 0.225])    # ImageNet预训练方差
])

test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# 自定义Dataset类（适配竞赛数据格式）
class BiomassDataset(Dataset):
    def __init__(self, csv_path, image_dir, transform=None, is_test=False):
        self.df = pd.read_csv(csv_path)
        self.df = self.df.fillna(0.0)
        self.image_dir = image_dir
        self.transform = transform
        self.is_test = is_test
        
        if is_test:
            # self.sample_ids = self.df['sample_id'].tolist()
            # combine test samples with same actual_sample_id
            self.df["actual_sample_id"] = self.df["sample_id"].str.split("_").str[0]
            self.df = self.df.drop_duplicates(subset="actual_sample_id", keep="first")
            self.sample_ids = self.df["actual_sample_id"].tolist()
        else: 
        # 训练集：将5类目标变量整理为矩阵（sample_id为key，5个target为value）
        # 测试集：无需目标值，仅返回sample_id
        # pivot_table： 透视表函数，按指定列分组，将某列的唯一值作为新列，对应的值填充到新列中
            self.df['sample_id'] = self.df['sample_id'].str.split('_').str[0]
            self.target_map = self.df.pivot_table(
                index="sample_id", # 按sample_id分组，每个sample_id对应一行
                columns="target_name", # 将target_name的唯一值（5个生物量名称）作为新列
                values="target" # 透视值：用target列的数值填充到新列对应的单元格
            ).reset_index() # 把sample_id从索引列转换为普通列，确保后续能通过iloc[idx]按索引获取到sample_id
            self.target_cols = self.target_map.columns[1:]  # 5类生物量列名
    
    def __len__(self):
        # 返回数据集的总样本数
        return len(self.target_map) if not self.is_test else len(self.sample_ids)
    
    def __getitem__(self, idx):
        # 按索引idx读取单个样本
        if not self.is_test:
            # 训练集：读取图像+对应的5个目标值
            sample_id = self.target_map.iloc[idx]["sample_id"]
            image_path = os.path.join(self.image_dir, f"{sample_id}.jpg")

            target_values = self.target_map.iloc[idx][self.target_cols].values.astype(np.float32)
            targets = torch.tensor(target_values, dtype=torch.float32)
        
            # 读取图像（确保为RGB格式）
            image = Image.open(image_path).convert("RGB")
            if self.transform:
                image = self.transform(image)
            
            return (image, targets, sample_id)

        else:
            # 测试集：仅读取图像和sample_id（用于后续提交）
            sample_id = self.sample_ids[idx]
            sample_id = sample_id.split('_')[0]
            image_path = os.path.join(self.image_dir, f"{sample_id}.jpg")
            image = Image.open(image_path).convert("RGB")
            if self.transform:
                image = self.transform(image)
            return (image, sample_id)


    

In [None]:
# 构建训练集/验证集（按8:2分割）
train_full_dataset = BiomassDataset(TRAIN_CSV_PATH, TRAIN_IMAGE_DIR, train_transform, is_test=False)
print(f"训练集样本数: {len(train_full_dataset)}")
train_dataset, val_dataset = train_test_split(
    train_full_dataset, test_size=0.2, random_state=42, shuffle=True
)
print(f"训练集样本数: {len(train_dataset)}")
print(f"验证集样本数: {len(val_dataset)}")

# 构建测试集
test_dataset = BiomassDataset(TEST_CSV_PATH, TEST_IMAGE_DIR, test_transform, is_test=True)
print(f"测试集样本数: {len(test_dataset)}")
print(test_dataset.sample_ids)
# DataLoader（批量加载数据）
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)



训练集样本数: 357
训练集样本数: 285
验证集样本数: 72
测试集样本数: 1
['ID1001187975']


In [10]:
# -------------------------- 3. 模型定义（ResNet50适配多目标回归） --------------------------
class ResNet50Biomass(nn.Module):
    def __init__(self, num_targets=NUM_TARGETS):
        super().__init__()
        # 加载预训练ResNet50（冻结初始层，仅微调顶层）
        self.resnet50 = models.resnet50(pretrained=True)
        
        # 冻结主干网络（可选：小数据集场景下避免过拟合，后续可解冻微调）
        for param in self.resnet50.parameters():
            param.requires_grad = False
        
        # 替换分类头为多目标回归头（ResNet50最后一层是1000维分类层）
        in_features = self.resnet50.fc.in_features
        self.resnet50.fc = nn.Sequential(
            nn.Linear(in_features, 512),
            nn.ReLU(),
            nn.Dropout(0.3),  # 防止过拟合
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_targets)  # 输出5个目标值
        )
    
    def forward(self, x):
        return self.resnet50(x)

# 初始化模型并移至设备
model = ResNet50Biomass(num_targets=NUM_TARGETS).to(device)





In [60]:
# -------------------------- 4. 损失函数与优化器定义 --------------------------
criterion = nn.MSELoss()  # 回归任务常用损失函数
optimizer = optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY
)
# 学习率调度器（可选：根据验证集性能调整学习率）
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5, verbose=True)





In [66]:
# -------------------------- 5. 训练与验证流程 --------------------------
def train_one_epoch(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    for images, targets, _ in tqdm(loader, desc="Training"):
        images, targets = images.to(device), targets.to(device)
        
        # 前向传播
        outputs = model(images)
        loss = criterion(outputs, targets)
        
        # 反向传播+优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * images.size(0)
    
    avg_loss = total_loss / len(loader.dataset)
    return avg_loss

def validate_one_epoch(model, loader, criterion, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():  # 验证时禁用梯度计算
        for images, targets, _ in tqdm(loader, desc="Validation"):
            images, targets = images.to(device), targets.to(device)
            outputs = model(images)
            loss = criterion(outputs, targets)
            total_loss += loss.item() * images.size(0)
    
    avg_loss = total_loss / len(loader.dataset)
    return avg_loss

# 开始训练
best_val_loss = float("inf")
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = validate_one_epoch(model, val_loader, criterion, device)
    
    # 学习率调度
    scheduler.step(val_loss)
    
    # 保存最优模型（基于验证集损失）
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        print(f"验证集损失改进：{best_val_loss:.4f}")
        torch.save(model.state_dict(), "best_resnet50_biomass.pth")
        print(f"保存最优模型，当前验证集损失：{best_val_loss:.4f}")
    
    print(f"训练集损失：{train_loss:.4f} | 验证集损失：{val_loss:.4f}")




Epoch 1/20


Training: 100%|██████████| 9/9 [00:12<00:00,  1.34s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.64s/it]


验证集损失改进：353.4712
保存最优模型，当前验证集损失：353.4712
训练集损失：449.9516 | 验证集损失：353.4712

Epoch 2/20


Training: 100%|██████████| 9/9 [00:12<00:00,  1.44s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.44s/it]


训练集损失：450.1906 | 验证集损失：353.7505

Epoch 3/20


Training: 100%|██████████| 9/9 [00:12<00:00,  1.40s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.61s/it]


验证集损失改进：352.6447
保存最优模型，当前验证集损失：352.6447
训练集损失：437.1217 | 验证集损失：352.6447

Epoch 4/20


Training: 100%|██████████| 9/9 [00:13<00:00,  1.48s/it]
Validation: 100%|██████████| 3/3 [00:05<00:00,  1.73s/it]


验证集损失改进：350.3970
保存最优模型，当前验证集损失：350.3970
训练集损失：421.8287 | 验证集损失：350.3970

Epoch 5/20


Training: 100%|██████████| 9/9 [00:13<00:00,  1.47s/it]
Validation: 100%|██████████| 3/3 [00:05<00:00,  1.70s/it]


验证集损失改进：349.6241
保存最优模型，当前验证集损失：349.6241
训练集损失：414.0498 | 验证集损失：349.6241

Epoch 6/20


Training: 100%|██████████| 9/9 [00:12<00:00,  1.41s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.43s/it]


训练集损失：428.9830 | 验证集损失：351.5719

Epoch 7/20


Training: 100%|██████████| 9/9 [00:12<00:00,  1.43s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.37s/it]


验证集损失改进：346.6562
保存最优模型，当前验证集损失：346.6562
训练集损失：424.4240 | 验证集损失：346.6562

Epoch 8/20


Training: 100%|██████████| 9/9 [00:11<00:00,  1.30s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.38s/it]


训练集损失：404.6816 | 验证集损失：347.4932

Epoch 9/20


Training: 100%|██████████| 9/9 [00:11<00:00,  1.28s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]


训练集损失：403.3181 | 验证集损失：347.1377

Epoch 10/20


Training: 100%|██████████| 9/9 [00:12<00:00,  1.34s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.38s/it]


训练集损失：383.8875 | 验证集损失：347.0687

Epoch 11/20


Training: 100%|██████████| 9/9 [00:11<00:00,  1.25s/it]
Validation: 100%|██████████| 3/3 [00:03<00:00,  1.31s/it]


训练集损失：395.5000 | 验证集损失：349.4248

Epoch 12/20


Training: 100%|██████████| 9/9 [00:11<00:00,  1.30s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.36s/it]


验证集损失改进：343.4409
保存最优模型，当前验证集损失：343.4409
训练集损失：392.9222 | 验证集损失：343.4409

Epoch 13/20


Training: 100%|██████████| 9/9 [00:11<00:00,  1.29s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.36s/it]


验证集损失改进：342.4153
保存最优模型，当前验证集损失：342.4153
训练集损失：385.6656 | 验证集损失：342.4153

Epoch 14/20


Training: 100%|██████████| 9/9 [00:11<00:00,  1.30s/it]
Validation: 100%|██████████| 3/3 [00:03<00:00,  1.32s/it]


训练集损失：385.9091 | 验证集损失：342.5421

Epoch 15/20


Training: 100%|██████████| 9/9 [00:11<00:00,  1.29s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]


训练集损失：394.1371 | 验证集损失：343.3767

Epoch 16/20


Training: 100%|██████████| 9/9 [00:11<00:00,  1.32s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.56s/it]


训练集损失：371.6560 | 验证集损失：343.8906

Epoch 17/20


Training: 100%|██████████| 9/9 [00:12<00:00,  1.43s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.54s/it]


训练集损失：367.2535 | 验证集损失：344.2161

Epoch 18/20


Training: 100%|██████████| 9/9 [00:12<00:00,  1.36s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.64s/it]


训练集损失：373.4237 | 验证集损失：343.5097

Epoch 19/20


Training: 100%|██████████| 9/9 [00:12<00:00,  1.40s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.65s/it]


验证集损失改进：342.1362
保存最优模型，当前验证集损失：342.1362
训练集损失：379.0205 | 验证集损失：342.1362

Epoch 20/20


Training: 100%|██████████| 9/9 [00:12<00:00,  1.38s/it]
Validation: 100%|██████████| 3/3 [00:04<00:00,  1.65s/it]

训练集损失：379.1552 | 验证集损失：343.9475





In [14]:
# -------------------------- 6. 测试集预测与提交文件生成 --------------------------
def predict_test(model, loader, device, target_names):
    model.eval()
    predictions = []
    sample_ids = []
    with torch.no_grad():
        for images, sample_id in tqdm(loader, desc="Testing"):
            images = images.to(device)
            outputs = model(images)
            predictions.extend(outputs.cpu().numpy())
            sample_ids.extend(sample_id)
    
        print("predictions:",predictions)
        print("sample_ids:",sample_ids)

    # 整理提交格式：sample_id__target_name, target
    submission_list = []
    for idx, sid in enumerate(sample_ids):
        pred = predictions[idx]
        for i, tn in enumerate(target_names):
            original_sample_id = sid + "__" + tn
            submission_list.append({
                'sample_id': original_sample_id,
                'target': pred[i]
            })
    
    submission_df = pd.DataFrame(submission_list)
    # 按sample_id排序
    submission_df = submission_df.sort_values("sample_id").reset_index(drop=True)
    return submission_df



# # 加载最优模型进行预测
model.load_state_dict(torch.load("best_resnet50_biomass.pth"))
print("加载最优模型完成")
# 生成提交文件
print(len(test_loader.dataset))
submission_df = predict_test(model, test_loader, device, target_names)
submission_df.to_csv("./submission.csv", index=False)
# print("提交文件生成完成！")
print(submission_df.head())

  model.load_state_dict(torch.load("best_resnet50_biomass.pth"))


加载最优模型完成
1


Testing: 100%|██████████| 1/1 [00:00<00:00,  7.93it/s]

predictions: [array([ 6.114049, 14.239446, 33.508583, 52.216293, 39.088394],
      dtype=float32)]
sample_ids: ['ID1001187975']
                    sample_id     target
0  ID1001187975__Dry_Clover_g   6.114049
1    ID1001187975__Dry_Dead_g  14.239446
2   ID1001187975__Dry_Green_g  33.508583
3   ID1001187975__Dry_Total_g  52.216293
4         ID1001187975__GDM_g  39.088394





In [1]:
import pandas as pd

test_df = pd.read_csv("test.csv")

# 步骤1：提取纯样本ID
test_df["actual_sample_id"] = test_df["sample_id"].str.split("__").str[0]

# 步骤2：按纯样本ID去重，保留每组第一行（keep="last" 保留最后一行，效果一致）
test_df_merged = test_df.drop_duplicates(subset="actual_sample_id", keep="first")

# 验证
print(f"合并后行数：{len(test_df_merged)}")

合并后行数：1
