In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
import psutil
import time
import os
import json

# ==== Configuration ====
# Hyperparameters are read from the experiment's JSON config file.
with open("config/exp1_resnet50_bs32_lr1e-3.json") as f:
    cfg = json.load(f)

# Pull out the required settings in one pass; a missing key raises KeyError.
BATCH_SIZE, LR, EPOCHS, IMG_SIZE, NUM_CLASSES = [
    cfg[key]
    for key in ("batch_size", "learning_rate", "epochs", "img_size", "num_classes")
]

# Every run writes its artifacts to experiments/<EXPERIMENT_NAME>.
EXPERIMENT_NAME = f"TF_{cfg['model_name']}_bs{BATCH_SIZE}_lr{LR}_e{EPOCHS}"
MODEL_DIR = os.path.join("experiments", EXPERIMENT_NAME)

BUFFER_SIZE = 1000      # shuffle buffer size for the training pipeline
DISTRIBUTE = "single"   # distribution mode: "single", "mirror" or "multi"


# ==== Select the distribution strategy ====
# DISTRIBUTE picks how the model is replicated:
#   "single" - one device (first GPU when one is visible, otherwise the CPU)
#   "mirror" - synchronous replication across the local devices
#   "multi"  - synchronous replication across multiple workers
if DISTRIBUTE == "single":
    strategy = tf.distribute.OneDeviceStrategy("/gpu:0" if tf.config.list_physical_devices('GPU') else "/cpu:0")
elif DISTRIBUTE == "mirror":
    strategy = tf.distribute.MirroredStrategy()
elif DISTRIBUTE == "multi":
    # Stable alias; the `tf.distribute.experimental` path is deprecated.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
else:
    # Fail here with a clear message instead of a NameError on `strategy` later.
    raise ValueError(
        f"Unknown DISTRIBUTE value {DISTRIBUTE!r}; expected 'single', 'mirror' or 'multi'"
    )

# Derive steps per epoch from the size of the 80% training subset.
dataset_info = tfds.builder("oxford_flowers102").info
TRAIN_EXAMPLES = int(dataset_info.splits["train"].num_examples * 0.8)
# Floor division yields 0 when the global batch is larger than the training
# subset; clamp to 1 so fit() always runs at least one step per epoch.
STEPS = max(1, TRAIN_EXAMPLES // (BATCH_SIZE * strategy.num_replicas_in_sync))

# ==== Baseline measurements before training ====
process = psutil.Process(os.getpid())  # handle on the current process
memory_before = process.memory_info().rss / 2**20  # resident set size, bytes -> MB
# Wall-clock reference taken before the data pipeline and model are built.
start_time = time.time()

# ==== Preprocessing function ====
def preprocess(image, label):
    """Prepare one (image, label) example for the ResNet50 backbone.

    The image is resized to IMG_SIZE x IMG_SIZE, cast to float32 and passed
    through the ResNet50 ``preprocess_input`` routine; the label is cast to
    int32.

    Args:
        image: image tensor for a single example.
        label: class label for that example.

    Returns:
        A ``(image, label)`` tuple of the transformed tensors.
    """
    resized = tf.image.resize(image, (IMG_SIZE, IMG_SIZE))
    model_input = tf.keras.applications.resnet50.preprocess_input(
        tf.cast(resized, tf.float32)
    )
    return model_input, tf.cast(label, tf.int32)

# ==== Load the Oxford Flowers102 data ====
# The TFDS "train" split is divided 80/20 into training and validation subsets.
ds_train = tfds.load("oxford_flowers102", split="train[:80%]", as_supervised=True)
ds_val = tfds.load("oxford_flowers102", split="train[80%:]", as_supervised=True)

# Training pipeline: preprocess -> shuffle -> repeat indefinitely -> global batch -> prefetch.
ds_train = ds_train.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
ds_train = ds_train.shuffle(BUFFER_SIZE)
ds_train = ds_train.repeat()  # infinite stream; the epoch length is set by steps_per_epoch
ds_train = ds_train.batch(BATCH_SIZE * strategy.num_replicas_in_sync)
ds_train = ds_train.prefetch(tf.data.AUTOTUNE)

# Validation pipeline: preprocess -> batch -> prefetch (single pass, no shuffling).
ds_val = (
    ds_val
    .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# ==== Build the model ====
# Variables are created inside the strategy scope so they are placed
# according to the selected distribution strategy.
with strategy.scope():
    # ImageNet-pretrained ResNet50 without its classification head, frozen
    # so that only the new layers added below are trained.
    base_model = keras.applications.ResNet50(
        weights="imagenet",
        include_top=False,
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    base_model.trainable = False

    # Classification head: pooled features -> 128-unit ReLU -> dropout -> softmax.
    model = keras.Sequential()
    model.add(base_model)
    model.add(keras.layers.GlobalAveragePooling2D())
    model.add(keras.layers.Dense(128, activation='relu'))
    model.add(keras.layers.Dropout(0.5))
    model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))

    # Integer labels paired with a softmax output -> sparse categorical cross-entropy.
    model.compile(
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
        optimizer=keras.optimizers.Adam(learning_rate=LR),
    )

# ==== Train the model ====
# Time only the fit() call. `start_time` is taken before the dataset is
# loaded and the model is built, so measuring from it would also count the
# dataset download/preparation and backbone construction as training time.
# perf_counter() is monotonic, so the interval is immune to wall-clock jumps.
train_start = time.perf_counter()
model.fit(ds_train, epochs=EPOCHS, steps_per_epoch=STEPS)
training_duration = time.perf_counter() - train_start

# Resident set size of this process after training, in MB.
memory_after = process.memory_info().rss / (1024 * 1024)

# ==== Save the model ====
# Export the trained model in SavedModel format into the experiment directory.
os.makedirs(MODEL_DIR, exist_ok=True)
tf.saved_model.save(obj=model, export_dir=MODEL_DIR)

# ==== Evaluate the model ====
# Evaluate on the held-out 20% subset; evaluate() is unpacked into the loss
# and the single 'accuracy' metric configured in compile().
eval_start = time.time()
val_loss, val_acc = model.evaluate(x=ds_val)
eval_end = time.time()

# Wall-clock seconds for the whole evaluation pass (not a per-example figure).
inference_latency = eval_end - eval_start

# ==== Record metrics ====
# Summary of the run. The "test_*" entries hold the loss/accuracy returned by
# model.evaluate(); memory usage is the change in process RSS across training.
metrics = dict(
    training_time_seconds=training_duration,
    memory_usage_mb=memory_after - memory_before,
    inference_latency_seconds=inference_latency,
    test_loss=val_loss,
    test_accuracy=val_acc,
)

# Write the config used and the resulting metrics next to the saved model.
os.makedirs(MODEL_DIR, exist_ok=True)
for filename, payload in (("config.json", cfg), ("metrics.json", metrics)):
    with open(os.path.join(MODEL_DIR, filename), "w") as f:
        json.dump(payload, f, indent=2)

print("Done. Model and metrics saved in:", MODEL_DIR)


2025-05-27 05:09:17.316119: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/x86_64-linux-gnu/:/opt/conda/lib
2025-05-27 05:09:17.318119: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2025-05-27 05:09:17.318285: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (google-summer-01): /proc/driver/nvidia/version does not exist
2025-05-27 05:09:17.334871: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in oth

[1mDownloading and preparing dataset 328.90 MiB (download: 328.90 MiB, generated: 331.34 MiB, total: 660.25 MiB) to /home/jupyter/tensorflow_datasets/oxford_flowers102/2.1.1...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1020 [00:00<?, ? examples/s]

Shuffling /home/jupyter/tensorflow_datasets/oxford_flowers102/2.1.1.incomplete5JFH4K/oxford_flowers102-train.t…

Generating test examples...:   0%|          | 0/6149 [00:00<?, ? examples/s]

Shuffling /home/jupyter/tensorflow_datasets/oxford_flowers102/2.1.1.incomplete5JFH4K/oxford_flowers102-test.tf…

Generating validation examples...:   0%|          | 0/1020 [00:00<?, ? examples/s]

Shuffling /home/jupyter/tensorflow_datasets/oxford_flowers102/2.1.1.incomplete5JFH4K/oxford_flowers102-validat…

[1mDataset oxford_flowers102 downloaded and prepared to /home/jupyter/tensorflow_datasets/oxford_flowers102/2.1.1. Subsequent calls will reuse this data.[0m


2025-05-27 05:10:23.933920: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:549] The `assert_cardinality` transformation is currently not handled by the auto-shard rewrite and will be removed.


Epoch 1/10


2025-05-27 05:10:24.194995: W tensorflow/core/framework/dataset.cc:769] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: experiments/TF_resnet50_bs32_lr0.001_e10/assets


INFO:tensorflow:Assets written to: experiments/TF_resnet50_bs32_lr0.001_e10/assets
2025-05-27 05:34:21.230939: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:549] The `assert_cardinality` transformation is currently not handled by the auto-shard rewrite and will be removed.
2025-05-27 05:34:21.464950: W tensorflow/core/framework/dataset.cc:769] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.


Done. Model and metrics saved in: experiments/TF_resnet50_bs32_lr0.001_e10


In [2]:
# Display the TensorFlow version loaded in this kernel.
tf.__version__

'2.11.0'