In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# TensorFlow and Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from sklearn.model_selection import KFold
import itertools
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

2025-12-12 22:42:57.683736: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-12 22:42:57.778543: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-12 22:42:59.597741: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


#### Data Preparation and Preprocessing
load and normalize images

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Image geometry and batching shared by the directory iterators below
IMG_SIZE = 224   # images are resized to IMG_SIZE x IMG_SIZE when loaded
batch_size = 32  # batch size used by both directory iterators

# Dataset locations; each directory holds one subfolder per class
train_dir = "/home/zyh/Fruit-Classifier/data/train_augment"
test_dir = "/home/zyh/Fruit-Classifier/data/test"  # served through val_generator below

# Both pipelines only rescale pixel values by 1/255; no augmentation is applied here
train_datagen = ImageDataGenerator(rescale=1.0/255.0)
val_datagen = ImageDataGenerator(rescale=1.0/255.0)

# Training batches are shuffled
train_generator = train_datagen.flow_from_directory(
    directory=train_dir,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=True,
)

# Validation batches keep a fixed order (shuffle=False)
val_generator = val_datagen.flow_from_directory(
    directory=test_dir,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False,
)


Found 311 images belonging to 4 classes.
Found 60 images belonging to 4 classes.


model structure

In [3]:
from tensorflow.keras import layers, models, regularizers

def create_cnn_model(activation='relu', dropout_rate=0.2, l2_rate=0.0,
                     num_classes=4, input_shape=None):
    """Build the (uncompiled) CNN classifier used throughout the notebook.

    Architecture: three Conv2D(3x3, 'same') + MaxPooling2D stages with 32, 64
    and 128 filters, then Flatten, Dense(64), Dropout, Dense(32), Dropout and
    a softmax output layer.

    Args:
        activation: Activation for the hidden layers. Any Keras activation
            identifier is passed through unchanged, except 'leaky_relu',
            which is implemented as a linear layer followed by an explicit
            LeakyReLU layer with negative slope 0.2.
        dropout_rate: Rate for both Dropout layers.
        l2_rate: L2 kernel-regularization factor applied to every Conv2D and
            hidden Dense layer (0.0 adds no penalty).
        num_classes: Number of units in the softmax output layer.
        input_shape: Shape of one input image; defaults to
            (IMG_SIZE, IMG_SIZE, 3).

    Returns:
        An uncompiled ``Sequential`` model.
    """
    if input_shape is None:
        input_shape = (IMG_SIZE, IMG_SIZE, 3)

    use_leaky_relu = activation == 'leaky_relu'
    # With leaky ReLU the layers themselves stay linear and the
    # non-linearity is added as a separate layer right after each of them.
    layer_activation = None if use_leaky_relu else activation

    model = models.Sequential()
    # An explicit Input layer replaces input_shape= on the first Conv2D.
    model.add(layers.Input(shape=input_shape))

    # Convolutional feature extractor
    for filters in (32, 64, 128):
        model.add(layers.Conv2D(filters, (3, 3), padding='same',
                                activation=layer_activation,
                                kernel_regularizer=regularizers.l2(l2_rate)))
        if use_leaky_relu:
            # Slope passed positionally because the keyword name differs
            # between Keras versions.
            model.add(layers.LeakyReLU(0.2))
        model.add(layers.MaxPooling2D(pool_size=2))

    # Fully connected classifier head
    model.add(layers.Flatten())
    for units in (64, 32):
        model.add(layers.Dense(units, activation=layer_activation,
                               kernel_regularizer=regularizers.l2(l2_rate)))
        if use_leaky_relu:
            model.add(layers.LeakyReLU(0.2))
        model.add(layers.Dropout(dropout_rate))

    model.add(layers.Dense(num_classes, activation='softmax'))
    return model

As shown above, we parameterize the activation function, dropout rate, and L2 regularization rate so that these can be tuned. By default, the original model used ReLU activations and a dropout rate of 0.2 in two places. We include the option to use Leaky ReLU as an alternative; if Leaky ReLU is selected, we will insert layers.LeakyReLU() layers after each linear layer (since Keras layers.Dense or Conv2D do not accept leaky_relu string directly, we would use activation=None and add a LeakyReLU layer manually).

Regularization: We have two forms of regularization to consider – dropout and L2 weight decay. Dropout randomly zeros out a fraction of neurons during training to prevent co-adaptation of features, while L2 penalizes large weights. Both are known to help reduce overfitting. In the model, dropout layers are included as shown; L2 regularization is applied to convolutional and dense layers via kernel_regularizer=regularizers.l2(l2_rate). We will tune the dropout_rate (e.g., try values like 0.0 = no dropout vs 0.5) and the l2_rate (e.g., 0.0 = no L2 vs a small value like 0.001) as hyperparameters.

Initially, we keep other aspects constant (the number of layers/units as given). The model will be compiled with a chosen optimizer and learning rate (also to be tuned). For example, using Adam optimizer with a certain learning rate as in the original code:

In [4]:
# Baseline configuration: ReLU activations, dropout rate 0.2, no L2 penalty
model = create_cnn_model(
    activation='relu',
    dropout_rate=0.2,
    l2_rate=0.0,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)  # example learning rate
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1765550581.979449  414352 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7537 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:01:00.0, compute capability: 8.6


#### Hyperparameter Tuning with Grid Search and K-Fold Cross-Validation

To systematically find the best hyperparameter combination, we employ grid search over the specified hyperparameter ranges, coupled with K-fold cross-validation for robust evaluation. Grid search will exhaustively try all combinations of the provided hyperparameters, and K-fold CV means that for each combination, the training data is further split into K folds to evaluate the model's performance across different subsets. This helps ensure the hyperparameter choice generalizes well and is not overfitting to one particular train/validation split.

Hyperparameters to tune:

Learning Rate: We will search values in the range 0.1 to 0.001. Given the prompt's suggestion, we use a logarithmic scale: e.g. [0.1, 0.01, 0.001].

Optimizer: We consider two optimizers – Stochastic Gradient Descent (SGD) and Adam. These represent different update algorithms; SGD could be used with momentum, but here we'll use plain SGD vs Adam.

Batch Size: Try [16, 32, 64]. Batch size affects training stability and speed.

Activation Function: Either ReLU or LeakyReLU. We will implement LeakyReLU with a negative slope (default 0.2) if chosen.

Dropout Rate: Try e.g. [0.0, 0.5] – either no dropout or 50% dropout. (We could also test an intermediate like 0.2 as in the original.)

L2 Regularization (weight decay) factor: Try [0.0, 0.001] – either no L2 penalty or a small penalty.

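Given these choices, the full grid has 3×2×3×2×2×2 = 144 possible combinations. The code cell below searches a reduced grid to keep the run short – learning rates [0.01, 0.001, 0.005], batch sizes [16, 32], and no search over L2 – which gives 3×2×2×2×2 = 48 combinations. For demonstration, let's set K = 5 (5-fold cross-validation) by default, so the reduced search trains 48×5 = 240 models.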


In [None]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV

# Model factory used by the manual grid search below: one call builds and
# compiles a CNN for a single hyperparameter combination.
def build_model(learning_rate=0.01, optimizer_name='Adam', activation='relu',
                dropout_rate=0.2, l2_rate=0.0):
    """Create and compile a CNN for one hyperparameter combination.

    Args:
        learning_rate: Learning rate handed to the optimizer.
        optimizer_name: 'Adam' or 'SGD' (case-insensitive).
        activation: Hidden-layer activation forwarded to ``create_cnn_model``.
        dropout_rate: Dropout rate forwarded to ``create_cnn_model``.
        l2_rate: L2 regularization factor forwarded to ``create_cnn_model``.

    Returns:
        A model compiled with categorical cross-entropy loss and an accuracy
        metric.

    Raises:
        ValueError: If ``optimizer_name`` is neither 'Adam' nor 'SGD'.
    """
    # Validate first: previously every name other than exactly 'Adam'
    # (including 'adam' or a typo) silently trained with SGD.
    optimizer_key = str(optimizer_name).lower()
    if optimizer_key not in ('adam', 'sgd'):
        raise ValueError(
            f"Unsupported optimizer_name {optimizer_name!r}; expected 'Adam' or 'SGD'."
        )

    # Build the CNN model with the given hyperparameters
    model = create_cnn_model(activation=activation, dropout_rate=dropout_rate, l2_rate=l2_rate)

    if optimizer_key == 'adam':
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        opt = tf.keras.optimizers.SGD(learning_rate=learning_rate)

    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model


# In-memory data loading function (the key speed optimization)
def load_data_to_memory(data_dir, img_size):
    """Load every image under ``data_dir`` into memory as normalized arrays.

    ``data_dir`` is expected to contain one subfolder per class. Class
    folders are sorted by name and numbered in that order, so a class always
    maps to the same label index. Files within a class are also visited in
    sorted order, which keeps the sample order (and therefore any seeded
    split of the returned arrays) reproducible.

    Args:
        data_dir: Root directory with one subfolder per class.
        img_size: Images are resized to ``img_size`` x ``img_size``.

    Returns:
        Tuple ``(X, y)``: ``X`` stacks the images after division by 255 and
        ``y`` holds the matching one-hot labels with one column per class.

    Raises:
        ValueError: If no .jpg/.jpeg/.png file is found in any class folder.
    """
    print(f"Ê≠£Âú®Âä†ËΩΩÊï∞ÊçÆÂà∞ÂÜÖÂ≠ò: {data_dir} ...")

    # Sorted class folders give a stable class -> label index mapping
    classes = sorted(
        d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))
    )
    print(f"Ê£ÄÊµãÂà∞ÁöÑÁ±ªÂà´: {classes}")

    X = []
    y = []
    for label_idx, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        # os.listdir returns names in arbitrary order; sort for reproducibility
        for fname in sorted(os.listdir(class_dir)):
            if not fname.lower().endswith(('.jpg', '.png', '.jpeg')):
                continue
            fpath = os.path.join(class_dir, fname)
            # Load and resize, then convert to an array scaled to 0-1
            img = load_img(fpath, target_size=(img_size, img_size))
            X.append(img_to_array(img) / 255.0)
            y.append(label_idx)

    if not X:
        # Fail here with a clear message instead of later inside training code
        raise ValueError(
            f"No .jpg/.jpeg/.png images found in the class subfolders of {data_dir!r}"
        )

    X = np.array(X)
    # Convert integer labels to one-hot encoding (e.g., [0, 1, 0, 0])
    y = tf.keras.utils.to_categorical(np.array(y), num_classes=len(classes))
    print(f"Êï∞ÊçÆÂä†ËΩΩÂÆåÊàê! X shape: {X.shape}, y shape: {y.shape}")
    return X, y



# Read the whole training set into RAM once (about 320 images take roughly 150-200 MB, which is very safe)
X_all, y_all = load_data_to_memory(train_dir, IMG_SIZE)


# Fast version of the grid search: a reduced hyperparameter grid
param_grid = {
    'learning_rate': [0.01, 0.001, 0.005],
    'optimizer_name': ['SGD', 'Adam'],
    'batch_size': [16, 32],
    'activation': ['relu', 'leaky_relu'],
    'dropout_rate': [0.0, 0.5],
    # l2_rate is left out of the search, so build_model's default (0.0) applies
    #'l2_rate': [0.0, 0.001]
}

# shuffle=True is important here; random_state fixes the fold assignment
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

best_acc = 0
best_params = None

# Generate every combination of the grid values
combinations = list(itertools.product(*param_grid.values()))
# One training run per (combination, fold); take the fold count from kfold
# so the progress total stays correct if n_splits is changed.
total_runs = len(combinations) * kfold.get_n_splits()
current_run = 0

print(f"ÂºÄÂßã Grid SearchÔºåÊÄªÂÖ±Ë¶ÅËÆ≠ÁªÉ {len(combinations)} ‰∏™Ê®°ÂûãÈÖçÁΩÆ...")

# Manual grid search: every hyperparameter combination is trained once per
# K-fold split, and the combination with the highest mean validation
# accuracy is kept in best_params / best_acc.
for combo in combinations:
    params = dict(zip(param_grid.keys(), combo))
    
    # Separate batch_size from params because it is not passed to build_model
    build_args = {k: v for k, v in params.items() if k != 'batch_size'}
    current_batch_size = params['batch_size']
    
    acc_list = []
    
    # K-fold loop
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_all)):
        # Clear the old model to free GPU memory (key step!)
        tf.keras.backend.clear_session()
        
        # Split the data (plain in-memory slicing, very fast)
        X_train, X_val = X_all[train_idx], X_all[val_idx]
        y_train, y_val = y_all[train_idx], y_all[val_idx]
        
        # Build the model
        model = build_model(**build_args)
        
        # Callbacks
        callbacks = [
            tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True, monitor='val_loss'),
            # ReduceLROnPlateau may slow the grid search down; for simplicity it is commented out, but it could be kept
            # tf.keras.callbacks.ReduceLROnPlateau(patience=2) 
        ]
        
        # Train (no generator needed, the arrays are passed directly)
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            batch_size=current_batch_size,
            epochs=10, 
            callbacks=callbacks,
            verbose=0  # silent mode, only the result is printed
        )
        
        # Evaluate on this fold's held-out split
        val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
        acc_list.append(val_acc)
        
        current_run += 1
        # Print progress
        print(f"Run {current_run}/{total_runs} - Acc: {val_acc:.4f}")

    # Mean validation accuracy across the folds of this combination
    avg_acc = np.mean(acc_list)
    print(f"Params: {params} | Avg Acc: {avg_acc:.4f}")
    
    # Keep the combination with the best mean accuracy seen so far
    if avg_acc > best_acc:
        best_acc = avg_acc
        best_params = params
        print(f"üî•üî•üî• ÂèëÁé∞Êñ∞ÊúÄ‰Ω≥Á≤æÂ∫¶: {best_acc:.4f}")

# Final summary of the search
print("\n========================================")
print("ÊêúÁ¥¢ÁªìÊùü")
print(f"ÊúÄ‰Ω≥Á≤æÂ∫¶: {best_acc}")
print(f"ÊúÄ‰Ω≥ÂèÇÊï∞: {best_params}")

A few notes on the above:

We defined build_model to accept the hyperparams. If activation='leaky_relu', inside create_cnn_model we would handle that by setting layers with no activation and adding LeakyReLU layers. (This implementation detail can be handled with an if inside create_cnn_model.)


------------------------------------------------------

Ideally, each fold's training could stop early if the model's performance on that fold's validation subset stops improving. The scikit-learn wrapper does not directly use the K-fold partition as a Keras validation in each fit call. A workaround is to use validation_split within each fold's training or to manually perform the cross-validation loop. In our case, we set a relatively small number of epochs (10) for each training, assuming this is sufficient to evaluate performance without severe overfitting. If we wanted to incorporate early stopping in GridSearchCV, we could pass a callback through the fit_params. For example:

In [None]:
# from tensorflow.keras.callbacks import EarlyStopping
# early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# grid_search.fit(X_train, y_train, **{'callbacks': [early_stop], 'validation_split': 0.1})


This would use 10% of each fold's training data as a validation for early stopping. However, this means we're not fully using that 10% for training in each fold (since it's used as a temp validation), and the actual cross-val fold (held-out by GridSearch) isn't directly used for early stopping. Due to these complexities, one might simply keep epochs low or perform manual K-fold training to properly utilize each fold's validation. Given our epoch count is modest (10) and we have early stopping for the final training phase, early stopping could be omitted in a GridSearchCV-based search to simplify; the manual K-fold loop above instead applies it directly on each fold's held-out split.

-------------------------------------------------------

#### Model Training with Best Hyperparameters and Early Stopping

In [None]:
# Build the final model with the best hyperparameters found by the grid search
if best_params is None:
    raise RuntimeError("best_params is None: the grid search did not select a configuration.")

# batch_size belongs to the training loop, not to build_model. Every other
# searched hyperparameter is forwarded; anything that was not searched
# (e.g. l2_rate when it is commented out of param_grid) falls back to
# build_model's default instead of raising KeyError.
final_build_args = {name: value for name, value in best_params.items() if name != 'batch_size'}
best_model = build_model(**final_build_args)

# Re-create the training iterator so the tuned batch size is actually used
final_batch_size = best_params.get('batch_size', batch_size)
final_train_generator = train_datagen.flow_from_directory(
    train_dir, target_size=(IMG_SIZE, IMG_SIZE), batch_size=final_batch_size,
    class_mode='categorical', shuffle=True)

# Callbacks: early stopping, best-model checkpoint, and learning-rate reduction
# NOTE(review): val_generator reads from test_dir, so early stopping and the
# checkpoint are selected on the same data that is later reported as
# validation accuracy - confirm this is intended.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1),
    ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7, verbose=1)
]

# Train the model; epochs=50 is an upper bound, early stopping will likely stop earlier
history = best_model.fit(final_train_generator, epochs=50,
                         validation_data=val_generator,
                         callbacks=callbacks, verbose=1)


Results and Visualization of Training Progress

In [None]:
# Evaluate the trained model on val_generator (the images under test_dir)
# and report accuracy as a percentage alongside the loss.
val_loss, val_acc = best_model.evaluate(val_generator)
print(f"Validation Accuracy: {val_acc:.2%}, Validation Loss: {val_loss:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Per-epoch curves recorded during the final training run
train_loss = history.history['loss']
val_loss = history.history['val_loss']
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
epochs = range(1, len(train_loss) + 1)

# One figure per metric: training curve in blue, validation curve in red
for metric_label, train_series, val_series in (
    ('Loss', train_loss, val_loss),
    ('Accuracy', train_acc, val_acc),
):
    plt.figure(figsize=(6, 4))
    plt.plot(epochs, train_series, 'bo-', label=f'Training {metric_label}')
    plt.plot(epochs, val_series, 'ro-', label=f'Validation {metric_label}')
    plt.title(f'Training vs Validation {metric_label}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()
    plt.show()
