<a href="https://colab.research.google.com/github/6V836sX/DATA425-Labs/blob/main/Assignment_2_Summe_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# from google.colab import drive

# # 挂载 Google Drive
# drive.mount('/content/drive')

# import os, shutil, pathlib

# src = "/content/drive/My Drive/Colab Notebooks/DATA425A2/data/CUB_200_2011/images"
# dst = "/content/images"
# if not pathlib.Path(dst).exists():  # 避免重复复制
#     print("Copying images to local VM...")
#     shutil.copytree(src, dst)
# else:
#     print("Images already copied.")


In [2]:
import os
# 设置基础路径
base_dir = './'

# 子路径
# data_dir = '/content/images'
data_dir = '../data/CUB_200_2011/images'
base_ckpt_dir = os.path.join(base_dir, 'checkpoints')
base_log_dir = os.path.join(base_dir, 'logs')
base_result_dir = os.path.join(base_dir, 'results')
notebook_dir = os.path.join(base_dir, 'notebook')

# === 路径检查 ===
# required_paths = {
#     "✅ Dataset path": data_dir,
#     "📁 Checkpoint dir": base_ckpt_dir,
#     "📁 Log dir": base_log_dir,
#     "📁 Result dir": base_result_dir,
# }

# # 检查 data_dir 是否存在（必须）
# if not os.path.exists(data_dir):
#     raise FileNotFoundError(f"❌ 数据集路径不存在：{data_dir}\n请检查是否已上传至 Google Drive 并命名正确。")

# print("🎉 数据集路径存在，开始创建保存目录...")

# # 其余路径：如果不存在则自动创建
# for desc, path in required_paths.items():
#     if not os.path.exists(path):
#         os.makedirs(path, exist_ok=True)
#         print(f"{desc} 创建成功: {path}")
#     else:
#         print(f"{desc} 已存在: {path}")



In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense
import os
def plot_loss_accuracy(history):
    """
    绘制训练过程中的 loss 和 sparse_categorical_accuracy 曲线。
    参数：
        history : tf.keras.callbacks.History 对象
    """
    historydf = pd.DataFrame(history.history, index=history.epoch)

    plt.figure(figsize=(10, 6))
    historydf[["loss", "val_loss", "sparse_categorical_accuracy", "val_sparse_categorical_accuracy"]].plot(
        ylim=(0, max(1.0, historydf.values.max())),
        title="Training and Validation Loss / Accuracy",
        grid=True,
        figsize=(10, 6)
    )

    final_loss = history.history['loss'][-1]
    final_acc = history.history['sparse_categorical_accuracy'][-1]
    plt.title(f'Final Loss: {final_loss:.3f}, Final Accuracy: {final_acc:.3f}')
    plt.xlabel("Epoch")
    plt.ylabel("Metric Value")
    plt.tight_layout()
    plt.show()


def plot_multiple_histories_with_annotations(histories, labels=None, metric="sparse_categorical_accuracy",
                                             figsize=(14, 6), save_path=None, dpi=300, file_format="png"):
    """
    绘制多个 history 对象的训练/验证曲线（支持不同颜色/线型，标注最高点，导出图像）

    参数：
    - histories: list of tf.keras.callbacks.History objects
    - labels: list of str, 用于标注每个模型
    - metric: str, 训练指标名，如 "sparse_categorical_accuracy", "loss"
    - figsize: tuple, 图像尺寸
    - save_path: str, 文件保存路径（无扩展名）
    - dpi: int, 导出图像分辨率
    - file_format: str, 'png' 或 'svg'
    """

    if labels is None:
        labels = [f"Model {i+1}" for i in range(len(histories))]

    colors = plt.cm.get_cmap('tab10', len(histories))  # 不同模型不同颜色

    plt.figure(figsize=figsize)

    # === 子图1：训练曲线（实线）
    plt.subplot(1, 2, 1)
    for i, (hist, label) in enumerate(zip(histories, labels)):
        plt.plot(hist.epoch, hist.history[metric], linestyle='-', color=colors(i), label=f"{label} (train)")
    plt.title(f"Training {metric}")
    plt.xlabel("Epoch")
    plt.ylabel(metric)
    plt.grid(True)
    plt.legend()

    # === 子图2：验证曲线（虚线 + 标注）
    plt.subplot(1, 2, 2)
    for i, (hist, label) in enumerate(zip(histories, labels)):
        val_metric = f"val_{metric}"
        val_values = hist.history[val_metric]
        epochs = hist.epoch
        plt.plot(epochs, val_values, linestyle='--', color=colors(i), label=f"{label} (val)")

        # 自动标注最大 val accuracy 位置
        best_epoch = int(pd.Series(val_values).idxmax())
        best_value = val_values[best_epoch]
        plt.scatter(best_epoch, best_value, color=colors(i), marker='o')
        plt.text(best_epoch, best_value + 0.01, f"{best_value:.3f}", fontsize=9, ha='center', color=colors(i))

    plt.title(f"Validation {metric}")
    plt.xlabel("Epoch")
    plt.ylabel(metric)
    plt.grid(True)
    plt.legend()

    plt.tight_layout()

    # 导出图像
    if save_path:
        full_path = f"{save_path}.{file_format}"
        plt.savefig(full_path, dpi=dpi, format=file_format)
        print(f"✅ 图像已保存为 {full_path}")

    plt.show()



# Data Preparation

In [4]:
# ImageNet normalization stats
IMAGENET_MEAN = tf.constant([0.485, 0.456, 0.406], dtype=tf.float32)
IMAGENET_STD = tf.constant([0.229, 0.224, 0.225], dtype=tf.float32)

# Custom preprocessing function
def preprocess_train(image, label):
    image = tf.image.resize_with_pad(image, 256, 256)  # 短边缩放到256，pad长边
    image = tf.image.random_crop(image, size=(224, 224, 3))  # 随机裁剪
    image = tf.image.random_flip_left_right(image)  # 随机水平翻转
    image = tf.image.random_brightness(image, max_delta=0.1)  # 明亮度抖动
    image = tf.cast(image, tf.float32) / 255.0  # 归一化到0~1
    image = (image - IMAGENET_MEAN) / IMAGENET_STD  # 使用ImageNet均值标准化
    return image, label

def preprocess_val(image, label):
    image = tf.image.resize_with_pad(image, 256, 256)
    image = tf.image.central_crop(image, central_fraction=0.875)  # 近似224 crop
    image = tf.cast(image, tf.float32) / 255.0
    image = (image - IMAGENET_MEAN) / IMAGENET_STD
    return image, label

# Load raw dataset
raw_dataset = tf.keras.utils.image_dataset_from_directory(
    '../data/CUB_200_2011/CUB_200_2011/images',
    labels='inferred',
    label_mode='int',
    image_size=(256, 256),  # 初步resize到统一尺寸（不作变形）
    batch_size=None,  # 返回未批量化的 (image, label), 保证 map 时传入的是单张图像
    shuffle=True,
    seed=888
)

# Train/Val split
total_size = 11788  # CUB-200-2011 总样本数
train_size = 5994   # 按照官方 split
val_size = total_size - train_size

train_ds = raw_dataset.take(train_size).map(preprocess_train).batch(256).shuffle(1000).prefetch(tf.data.AUTOTUNE)
val_ds = raw_dataset.skip(train_size).map(preprocess_val).batch(256).prefetch(tf.data.AUTOTUNE)


2025-05-22 16:10:44.218810: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-05-22 16:10:44.218831: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-05-22 16:10:44.218837: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-05-22 16:10:44.218852: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-22 16:10:44.218865: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Found 11788 files belonging to 200 classes.


In [5]:
# Check the dataset
for images, labels in train_ds.take(1):
    print(images.shape)  # (256, 224, 224, 3)
    print(labels)

2025-05-22 16:10:56.956513: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:9: Filling up shuffle buffer (this may take a while): 5 of 1000
2025-05-22 16:11:08.329469: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:9: Filling up shuffle buffer (this may take a while): 7 of 1000
2025-05-22 16:11:21.152314: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:9: Filling up shuffle buffer (this may take a while): 9 of 1000
2025-05-22 16:11:39.700644: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:9: Filling up shuffle buffer (this may take a while): 12 of 1000
2025-05-22 16:11:51.067298: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:9: Filling up shuffle buffer (this may take a while): 14 of 1000
2025-05-22 16:12:09.123279: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:9: Filling up shuffle buffer (this may take a while): 17 of 

(256, 224, 224, 3)
tf.Tensor(
[ 59 197 122  26  62 142 180  57 134 135  21 116  41  68  86 198  45  51
   8 198  13  61  90 158  77  28  73 165  63  27 101  87 104  34  45 118
  98  33  75  12  15  81  14   3  69 128  33  14  90 155  54 122 108 188
 128  45  19 194 123 124  29  47  69 126 148  70 127 146 178 159  78 107
 196 147  53  68 121   8  10  44   8   3 170 104  21 189 119  98 128  34
  77  84  76 146 126  28 112  46  58 147  53   8 110 139 112  65  32  92
 111 128   7  37 150   3 138  44 109   2   1  97 101 115 178  24 133 172
  71  75  31   8  13  85  87  41  24  28  81  71  19  72 167 194   3 143
 138  71 118  33  87  45 138   8 129 157  15 124   1  38 197 129  42 163
  27   8 166 110  11  71 154  43  29  26 103  41 122   4 189 151  52 158
  43  85 100  71  37  18  95 123 152  37  58 187  10 168  66  30 140 166
  22  58 188 146   2  94 190  70  39  29  76  10  38 142  51  33 188   5
 111 107 173 137  92 126  20  14 115  86  24  96  53 171 153 144  72 173
 115 136 141  72  34 

2025-05-22 16:12:47.217464: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.
2025-05-22 16:12:47.371731: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# Model Creation

## Model Build

In [6]:
from tensorflow.keras.applications import ResNet101V2
from tensorflow.keras import layers, Model, Input

def build_model():
    inputs = Input(shape=(224, 224, 3))
    base_model = ResNet101V2(weights="imagenet", include_top=False, input_tensor=inputs)
    base_model.trainable = True  # Full fine-tuning

    x = base_model.output
    x = layers.GlobalAveragePooling2D()(x)
    # x = layers.Dropout(0.2)(x)  # optional

    outputs = layers.Dense(200, activation="softmax", name="Predictions")(x)

    model = Model(inputs=inputs, outputs=outputs)
    # model.summary(show_trainable=True)

    return model



## Model Compile

In [7]:
def compile_model(model, lr=0.01, m=0.9, wd=0.0001):
    optimizer = tf.keras.optimizers.SGD(
        learning_rate=lr,
        momentum=m,
        weight_decay=wd  # TF ≥ 2.9 iff available, our tf.__version__ = 2.16
    )

    model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=["sparse_categorical_accuracy"]
    )
    return model



## Model Fitting Control

### step decay

In [8]:
# Step Decay LearningRateScheduler Function（Li et al., 2020）
def step_decay(epoch):
    if epoch < 150:
        return 0.01
    elif epoch < 250:
        return 0.001
    else:
        return 0.0001

lr_callback = tf.keras.callbacks.LearningRateScheduler(step_decay)




### Callbacks

In [9]:
import os
from tensorflow import keras

checkpoint_path = "checkpoints/best_model.keras"
log_dir = "logs/single_run"

callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        monitor="val_loss",
        save_best_only=True,
        mode="min",
        verbose=1
    ),
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=15,
        restore_best_weights=True,
        verbose=1
    ),
    keras.callbacks.TensorBoard(
        log_dir=log_dir,
        histogram_freq=1,
        write_graph=True,
        write_images=True
    ),
    lr_callback
]


## Output Settings

In [10]:
import datetime

# output dir
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M")
base_ckpt_dir = f"checkpoints_grid_{timestamp}"
base_log_dir = f"logs_grid_{timestamp}"
os.makedirs(base_ckpt_dir, exist_ok=True)
os.makedirs(base_log_dir, exist_ok=True)

# result initial
results = []



# Model Fitting

In [11]:
train_ds

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [None]:
# ✅ 实验图 1：Validation Error vs Momentum（对比不同 λ）
def experiment_vary_momentum_vs_weight_decay():
    learning_rate = 0.01
    weight_decays = [0.0001, 0.0]
    momentums = [0.0, 0.8, 0.9, 0.95, 0.99]

    results = []

    for wd in weight_decays:
        for m in momentums:
            tag = f"exp1_lr{learning_rate}_wd{wd}_m{m}"
            print(f"\n🚀 [Exp1] Training: wd={wd}, m={m}")

            result = run_experiment(
                lr=learning_rate,
                m=m,
                wd=wd,
                tag=tag,
                epochs=30,
                use_early_stopping=False,
                train_ds=train_ds,
                val_ds=val_ds
            )
            result['weight_decay'] = wd
            results.append(result)

    return pd.DataFrame(results)


# ✅ 实验图 2：Validation Error vs Learning Rate（对比不同 momentum）
def experiment_vary_lr_vs_momentum():
    weight_decay = 0.0001
    learning_rates = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
    momentums = [0.9, 0.0]

    results = []

    for m in momentums:
        for lr in learning_rates:
            tag = f"exp2_m{m}_lr{lr}_wd{weight_decay}"
            print(f"\n🚀 [Exp2] Training: m={m}, lr={lr}")

            result = run_experiment(
                lr=lr,
                m=m,
                wd=weight_decay,
                tag=tag,
                epochs=30,
                use_early_stopping=False,
                train_ds=train_ds,
                val_ds=val_ds
            )
            result['momentum'] = m
            results.append(result)

    return pd.DataFrame(results)


# ✅ run_experiment 完整定义（支持传入数据）
def run_experiment(lr=0.01, m=0.9, wd=0.0001, tag=None, epochs=30, 
                   use_early_stopping=True, train_ds=None, val_ds=None):
    import tensorflow as tf
    from tensorflow import keras
    import os

    model = build_model()
    optimizer = tf.keras.optimizers.SGD(learning_rate=lr, momentum=m, weight_decay=wd)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    callbacks = []
    if tag:
        ckpt_dir = os.path.join(base_ckpt_dir, tag)
        os.makedirs(ckpt_dir, exist_ok=True)
        callbacks.append(keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(ckpt_dir, "best_model.keras"),
            save_best_only=True,
            monitor="val_loss",
            mode="min",
            verbose=1
        ))
    if use_early_stopping:
        callbacks.append(keras.callbacks.EarlyStopping(
            monitor="val_loss", patience=15, restore_best_weights=True, verbose=0))

    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        callbacks=callbacks,
        verbose=1
    )

    best_val_acc = max(history.history["val_accuracy"])
    best_val_loss = min(history.history["val_loss"])
    best_epoch = history.history["val_loss"].index(best_val_loss)

    return {
        "learning_rate": lr,
        "momentum": m,
        "weight_decay": wd,
        "best_val_accuracy": best_val_acc,
        "best_val_loss": best_val_loss,
        "epoch": best_epoch
    }


# ✅ 绘图函数：Line Plot 并标注 min top-1（用于 df_exp1 或 df_exp2）
def plot_validation_error_line(df, x_var, group_var, title, xlabel, output_path):
    import matplotlib.pyplot as plt

    plt.figure(figsize=(8, 6))
    grouped = df.groupby(group_var)
    for name, group in grouped:
        x = group[x_var]
        y = 100 * (1 - group['best_val_accuracy'])  # 转换为 Validation Error
        plt.plot(x, y, marker='o', label=f"{group_var}={name}")

        # 标出 min point
        min_idx = y.idxmin()
        min_x = x[min_idx]
        min_y = y[min_idx]
        min_top1 = min_y
        plt.annotate(f"min top1={min_top1:.2f}",
                     (min_x, min_y),
                     textcoords="offset points",
                     xytext=(0,10),
                     ha='center')

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Validation Error")
    if x_var == "learning_rate":
        plt.xscale("log")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(output_path, dpi=300)
    plt.show()


# ✅ 运行示例：
df_exp1 = experiment_vary_momentum_vs_weight_decay()
df_exp1.to_csv("exp1.csv", index=False)
plot_validation_error_line(df_exp1, x_var="momentum", group_var="weight_decay",
                            title="birds, imagenet, η=0.01, n=64",
                            xlabel="Momentum m", output_path="exp1_val_error.png")

df_exp2 = experiment_vary_lr_vs_momentum()
df_exp2.to_csv("exp2.csv", index=False)
plot_validation_error_line(df_exp2, x_var="learning_rate", group_var="momentum",
                            title="birds, imagenet, λ=0.0001, n=64",
                            xlabel="learning rate η", output_path="exp2_val_error.png")




🚀 [Exp1] Training: wd=0.0001, m=0.0
Epoch 1/30


2025-05-22 16:12:52.560727: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2025-05-22 16:13:08.739828: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:9: Filling up shuffle buffer (this may take a while): 19 of 1000
2025-05-22 16:13:20.764609: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:9: Filling up shuffle buffer (this may take a while): 23 of 1000
2025-05-22 16:13:22.062016: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.



Epoch 1: val_loss improved from inf to 6.56398, saving model to checkpoints_grid_20250522-1612/exp1_lr0.01_wd0.0001_m0.0/best_model.keras
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m492s[0m 20s/step - val_accuracy: 0.0036 - val_loss: 6.5640
Epoch 2/30
