# CELL 1 - Initialization

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import optuna
import random
import kagglehub # Import for the new download method

# --- Constants and Setup ---

IMAGE_SIZE = 100 
BATCH_SIZE = 64
RANDOM_SEED = 42
NUM_EPOCHS_TUNE = 10 # Fewer epochs for quick HPO search
NUM_EPOCHS_FINAL = 50 # Max epochs for final training (with Early Stopping)

# Set random seed for reproducibility
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_SEED)

# Determine device (CUDA for GPU training)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")
if DEVICE.type == 'cuda':
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")


# --- Kaggle Download and DATA_DIR Setup ---
DATASET_ID = "utkarshsaxenadn/fruits-classification"

print(f"\nDownloading Kaggle dataset ID: {DATASET_ID} using kagglehub...")
print("NOTE: This requires your Kaggle API key (kaggle.json) to be set up in ~/.kaggle/.")

try:
    # Use kagglehub to download the dataset
    path = kagglehub.dataset_download(DATASET_ID)
    
    # kagglehub returns the path to the root of the extracted dataset files
    DATA_DIR = str(path)
    
    print("Download complete.")
    print(f"Path to dataset files: {DATA_DIR}")
    
except Exception as e:
    print(f"Kaggle Download Failed. Error: {e}")
    print("Please ensure your Kaggle API key is correctly configured and the kernel is switched.")
    # Fallback path if the download fails for manual placement
    DATA_DIR = './fruits-classification' 
    print(f"Falling back to assumed manual data path: {DATA_DIR}")

Using device: cuda
CUDA device name: NVIDIA GeForce RTX 3060

Downloading Kaggle dataset ID: utkarshsaxenadn/fruits-classification using kagglehub...
NOTE: This requires your Kaggle API key (kaggle.json) to be set up in ~/.kaggle/.
Download complete.
Path to dataset files: C:\Users\Bence\.cache\kagglehub\datasets\utkarshsaxenadn\fruits-classification\versions\1


# CELL 2 - Preprocessing

In [32]:
# --- Data Transforms, Normalization, and Features (Data Augmentation) ---

# 1. PIL Image Transforms (Augmentation applied before Tensor conversion)
pil_transforms = [
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.RandomRotation(15), 
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomPerspective(distortion_scale=0.2, p=0.5),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
]

# 2. Tensor Conversion
tensor_conversion = [
    transforms.ToTensor(),
]

# 3. Tensor Transforms (Augmentation and Normalization applied after Tensor conversion)
tensor_transforms = [
    # FIX: RandomErasing MUST be after ToTensor()
    transforms.RandomErasing(p=0.2, scale=(0.02, 0.1), ratio=(0.3, 3.3)), 
    # Normalization
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 
]

# Training Transforms (All steps)
train_transforms = transforms.Compose(pil_transforms + tensor_conversion + tensor_transforms)

# Validation/Test Transforms (Only resize, ToTensor, Normalization)
test_transforms = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 
])


# --- Adatfolyam (ImageFolder Ã©s DataLoader) ---
TRAIN_DIR = DATA_DIR + '\\Fruits Classification\\train'
TEST_DIR = DATA_DIR + '\\Fruits Classification\\test'

train_data = datasets.ImageFolder(TRAIN_DIR, transform=train_transforms)
test_data = datasets.ImageFolder(TEST_DIR, transform=test_transforms)

# Split training data into training and validation sets (80/20 split)
train_size = int(0.8 * len(train_data))
val_size = len(train_data) - train_size
train_subset, val_subset = torch.utils.data.random_split(train_data, [train_size, val_size])

# Create DataLoaders (Data Streaming)
train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

CLASS_NAMES = train_data.classes
NUM_CLASSES = len(CLASS_NAMES)
print(f"Number of Classes: {NUM_CLASSES}")

Number of Classes: 5


# CELL 3 - CNN Model Buildup

In [33]:
class FruitCNN(nn.Module):
    def __init__(self, num_conv_layers, filters, dense_units, dropout_rate, num_classes):
        super(FruitCNN, self).__init__()
        
        layers = []
        in_channels = 3 # RGB
        
        # Convolutional Block (Each block adds Conv + ReLU + MaxPool = 3 layers)
        for i in range(num_conv_layers):
            out_channels = filters * (2**i)
            # Conv2D Layer
            layers += [
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.ReLU(),
            ]
            # MaxPool2D Layer
            layers += [
                nn.MaxPool2d(kernel_size=2, stride=2)
            ]
            in_channels = out_channels
            
        self.features = nn.Sequential(*layers)
        
        # Calculate the size of the feature map after convolutional layers
        final_dim = IMAGE_SIZE // (2**num_conv_layers)
        flattened_size = out_channels * final_dim * final_dim
        
        # Fully Connected Layers (4 additional hidden/special layers)
        self.classifier = nn.Sequential(
            nn.Flatten(),
            # Dropout Layer (Special Layer/Regularization)
            nn.Dropout(dropout_rate), 
            # Dense Layer
            nn.Linear(flattened_size, dense_units),
            nn.ReLU(),
            # Dropout Layer (Special Layer/Regularization)
            nn.Dropout(dropout_rate), 
            # Output Layer
            nn.Linear(dense_units, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

# CELL 4 - Hyperparameter optimization

In [None]:
# --- Trainer Function for Optuna ---
def train_model_optuna(model, train_loader, val_loader, optimizer, criterion, epochs, device):
    """Simple training loop for HPO."""
    model.train()
    for epoch in range(epochs):
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            # FIX: labels already contains the class indices (1D tensor), no need for argmax
            loss = criterion(outputs, labels) 
            loss.backward()
            optimizer.step()

# --- Evaluator Function for Optuna ---
def evaluate_model_optuna(model, val_loader, device):
    """Simple evaluation loop for HPO."""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            # FIX: labels already contains the class indices
            correct += (predicted == labels).sum().item() 
    return correct / total

# --- Optuna Objective Function ---
def objective(trial):
    # Hyperparameter Search Space
    hp_num_conv_layers = trial.suggest_int('num_conv_layers', 2, 4)
    hp_filters = trial.suggest_categorical('filters', [32, 64, 128])
    hp_dense_units = trial.suggest_categorical('dense_units', [128, 256, 512])
    hp_dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5, step=0.1)
    hp_learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)

    # Model and Setup
    model = FruitCNN(
        num_conv_layers=hp_num_conv_layers,
        filters=hp_filters,
        dense_units=hp_dense_units,
        dropout_rate=hp_dropout_rate,
        num_classes=NUM_CLASSES
    ).to(DEVICE)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=hp_learning_rate)
    
    # Training and Evaluation
    train_model_optuna(model, train_loader, val_loader, optimizer, criterion, NUM_EPOCHS_TUNE, DEVICE)
    accuracy = evaluate_model_optuna(model, val_loader, DEVICE)

    return accuracy

# --- Run Optuna Search ---
print("Starting Hyperparameter Optimization (HPO) with Optuna...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10) 

print("\n--- Best Trial Results ---")
best_trial = study.best_trial
print(f"Best Validation Accuracy: {best_trial.value:.4f}")
print("Best Hyperparameters:")
for key, value in best_trial.params.items():
    print(f"  {key}: {value}")

# Retrieve best hyperparameters
best_hps = best_trial.params

[I 2025-12-11 22:24:07,689] A new study created in memory with name: no-name-3da1f2db-0c2e-4760-88b3-25481f926e6e


Starting Hyperparameter Optimization (HPO) with Optuna...


  hp_learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
[W 2025-12-11 22:24:16,614] Trial 0 failed with parameters: {'num_conv_layers': 2, 'filters': 32, 'dense_units': 256, 'dropout_rate': 0.5, 'learning_rate': 0.0010835199591891294} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\mscmlbead\fruit_venv\Lib\site-packages\optuna\study\_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Bence\AppData\Local\Temp\ipykernel_11084\3661422097.py", line 54, in objective
    train_model_optuna(model, train_loader, val_loader, optimizer, criterion, NUM_EPOCHS_TUNE, DEVICE)
    ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Bence\AppData\Local\Temp\ipykernel_11084\3661422097.py", line 6, in train_model_optuna
    for inputs, labels in train_loader:
                          ^^^^^^^^^^^^
  File "c:\mscmlbead\fruit_venv\Lib\s

KeyboardInterrupt: 

# CELL 5 - Training, Testing, Validation and Final Evaluation

In [None]:
# --- Final Model Initialization ---
final_model = FruitCNN(
    num_conv_layers=best_hps['num_conv_layers'],
    filters=best_hps['filters'],
    dense_units=best_hps['dense_units'],
    dropout_rate=best_hps['dropout_rate'],
    num_classes=NUM_CLASSES
).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(final_model.parameters(), lr=best_hps['learning_rate'])

# --- Custom Early Stopping Logic (Low-Level Logic) ---
def train_and_validate(model, train_loader, val_loader, criterion, optimizer, epochs, device, patience=10):
    
    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}
    best_val_loss = float('inf')
    epochs_no_improve = 0
    best_model_weights = None
    
    print("Starting Final Training with Early Stopping...")
    
    for epoch in range(1, epochs + 1):
        # Training Phase
        model.train()
        total_train_loss = 0
        for inputs, labels in train_loader:
            # FIX: labels already contains the class indices
            inputs, labels = inputs.to(device), labels.to(device) 
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
        
        avg_train_loss = total_train_loss / len(train_loader)

        # Validation Phase
        model.eval()
        total_val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                total_val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                # FIX: labels already contains the class indices
                correct += (predicted == labels).sum().item()
        
        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = correct / total
        
        # Log History
        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(avg_val_loss)
        history['val_accuracy'].append(val_accuracy)
        
        print(f'Epoch {epoch}/{epochs}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}')
        
        # Early Stopping Logic 
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
            # Save the best weights
            best_model_weights = model.state_dict()
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                print(f"Early stopping at epoch {epoch}. Validation loss did not improve for {patience} epochs.")
                # Restore best weights
                model.load_state_dict(best_model_weights)
                break

    return model, history

# --- Run Final Training ---
final_model, history = train_and_validate(
    final_model, 
    train_loader, 
    val_loader, 
    criterion, 
    optimizer, 
    NUM_EPOCHS_FINAL, 
    DEVICE
)

# CELL 6 - Visualization

In [None]:
# --- Model Evaluation on Test Set ---
def evaluate_test(model, test_loader, device, class_names):
    model.eval()
    y_true = []
    y_pred = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Labels need to be indices
            inputs, labels = inputs.to(device), labels.to(device).argmax(dim=1)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
            
    # Calculate Overall Accuracy Score
    overall_accuracy = accuracy_score(y_true, y_pred)
    print(f"\nOverall Test Accuracy Score: {overall_accuracy*100:.2f}%")

    # Classification Report (F-Score, Recall, Precision)
    print("\n--- Classification Report ---")
    print(classification_report(y_true, y_pred, target_names=class_names, digits=4))

    # Confusion Matrix (Visualization)
    print("\n--- Confusion Matrix ---")
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()
    
    return overall_accuracy

# --- Run Test Evaluation ---
test_accuracy = evaluate_test(final_model, test_loader, DEVICE, CLASS_NAMES)


# --- Plot Training History ---
# Plot Training History (Visualization)
def plot_history(history):
    fig, ax = plt.subplots(1, 2, figsize=(14, 5))

    # Plot Loss
    ax[0].plot(history['train_loss'], label='Train Loss')
    ax[0].plot(history['val_loss'], label='Validation Loss')
    ax[0].set_title('Training and Validation Loss')
    ax[0].set_xlabel('Epoch')
    ax[0].set_ylabel('Loss')
    ax[0].legend()

    # Plot Accuracy
    ax[1].plot(history['val_accuracy'], label='Validation Accuracy')
    ax[1].set_title('Validation Accuracy')
    ax[1].set_xlabel('Epoch')
    ax[1].set_ylabel('Accuracy')
    ax[1].legend()

    plt.tight_layout()
    plt.show()

print("\n--- Training History Visualization ---")
plot_history(history)