# In [None]:
# This is the notebook used for training the model.

# Import Libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import cv2
from sklearn.metrics import f1_score
import pickle

# Fix the NumPy and TensorFlow RNG seeds so repeated runs draw the same numbers
tf.random.set_seed(42)
np.random.seed(42)

# Locations of the training images and the label table
BASE_PATH = '/kaggle/input/soil-classification-part-2/soil_competition-2025/'
TRAIN_PATH = os.path.join(BASE_PATH, 'train/')
TRAIN_LABELS = os.path.join(BASE_PATH, 'train_labels.csv')

# Read the label table and attach the on-disk path of every image to it
train_labels = pd.read_csv(TRAIN_LABELS)
train_labels['image_path'] = train_labels['image_id'].map(
    lambda image_id: os.path.join(TRAIN_PATH, image_id)
)

print(f"Training dataset shape: {train_labels.shape}")
print(f"Sample image paths: {train_labels['image_path'].head()}")

# Step 1: Feature Extraction using Pre-trained EfficientNetB0
def create_feature_extractor():
    """Build the EfficientNetB0-based model used to turn images into feature vectors.

    The backbone is loaded with ImageNet weights and without its
    classification head; a global average pooling layer is stacked on the
    backbone output so each 224x224x3 input image maps to a single vector.

    Returns:
        A Keras ``Model`` from the backbone input to the pooled features.
    """
    backbone = EfficientNetB0(
        include_top=False,
        weights='imagenet',
        input_shape=(224, 224, 3),
    )
    pooled_features = GlobalAveragePooling2D()(backbone.output)
    return Model(inputs=backbone.input, outputs=pooled_features)

def load_and_preprocess_image(image_path):
    """Load an image from disk and prepare it for the EfficientNetB0 extractor.

    The image is read with OpenCV, converted from BGR to RGB and resized to
    224x224. Pixel values are returned as float32 in the [0, 255] range:
    Keras' EfficientNet models rescale their input internally and expect raw
    [0, 255] pixels, so dividing by 255 here would feed the backbone
    near-black images.

    NOTE: any inference code must apply exactly this preprocessing so its
    features are comparable with the ones produced during training.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A float32 array of shape (224, 224, 3). If the file cannot be read,
        a warning is printed and an all-zero placeholder of the same shape
        and dtype is returned so the caller's image/label alignment is kept.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure with None instead of raising; report it
        # rather than silently training on a blank image.
        print(f"Warning: could not read image '{image_path}'; using a blank placeholder")
        # float32 keeps the stacked image array from being upcast to float64.
        return np.zeros((224, 224, 3), dtype=np.float32)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    # Keep the [0, 255] range; EfficientNet normalises inside the model.
    return img.astype(np.float32)

# Augmentation settings: random rotation, brightness change, zoom and horizontal flip
datagen = ImageDataGenerator(
    horizontal_flip=True,
    zoom_range=0.2,
    brightness_range=[0.8, 1.2],
    rotation_range=20,
)

# Read every image listed in the label table and stack them into one array
print("Loading training images...")
train_images = np.array(list(map(load_and_preprocess_image, train_labels['image_path'])))
print(f"Training images shape: {train_images.shape}")

# Produce exactly one randomly transformed copy of each training image
print("Applying data augmentation...")
augmented_images = []
for idx, original in enumerate(train_images):
    if idx % 100 == 0:
        print(f"Augmenting image {idx}/{len(train_images)}")
    # ImageDataGenerator.flow works on batches, so add a leading batch axis
    single_batch = np.expand_dims(original, axis=0)
    # Draw a single augmented batch and keep its only image
    augmented_batch = next(iter(datagen.flow(single_batch, batch_size=1)))
    augmented_images.append(augmented_batch[0])
augmented_images = np.array(augmented_images)

# Stack the originals first, followed by their augmented copies
all_train_images = np.concatenate((train_images, augmented_images), axis=0)
print(f"Total training images after augmentation: {all_train_images.shape}")

# Extract features using EfficientNetB0
print("Creating feature extractor...")
feature_extractor = create_feature_extractor()

print("Extracting training features...")
all_train_features = feature_extractor.predict(all_train_images, batch_size=32, verbose=1)
print(f"Training features shape: {all_train_features.shape}")

# Split into train and validation sets (80-20 split)
# all_train_images is [originals, augmented copies] in matching order, so rows
# i and i + n_originals come from the same source image. Taking the tail of the
# array would put only augmented copies in validation while their originals
# stay in training, so split by source image instead and keep both rows of a
# source image on the same side.
n_originals = len(train_images)
if len(all_train_features) != 2 * n_originals:
    raise ValueError(
        "Expected one augmented copy per original image: "
        f"{len(all_train_features)} feature rows for {n_originals} originals"
    )

# Dedicated generator so the split does not depend on how much of the global
# NumPy random state the augmentation step consumed.
split_rng = np.random.default_rng(42)
shuffled_ids = split_rng.permutation(n_originals)
n_val_originals = int(0.2 * n_originals)
val_ids = shuffled_ids[:n_val_originals]
train_ids = shuffled_ids[n_val_originals:]

# Explicit index arrays also behave correctly when n_val_originals is 0
# (a negative-zero slice would have emptied the training set).
train_features = all_train_features[np.concatenate([train_ids, train_ids + n_originals])]
val_features = all_train_features[np.concatenate([val_ids, val_ids + n_originals])]

print(f"Training features shape: {train_features.shape}")
print(f"Validation features shape: {val_features.shape}")

# Step 2: Autoencoder for Anomaly Detection
def create_autoencoder(input_dim):
    """Build and compile a single-hidden-layer autoencoder for feature vectors.

    The network compresses an ``input_dim``-sized vector to 64 ReLU units and
    reconstructs it with a linear output layer of the original size. It is
    compiled with the Adam optimizer and mean-squared-error loss.

    Args:
        input_dim: Length of the feature vectors to reconstruct.

    Returns:
        The compiled Keras ``Model``.
    """
    features_in = Input(shape=(input_dim,))
    bottleneck = Dense(64, activation='relu')(features_in)
    reconstruction = Dense(input_dim, activation='linear')(bottleneck)

    model = Model(inputs=features_in, outputs=reconstruction)
    model.compile(loss='mse', optimizer='adam')
    return model

# Build the autoencoder sized to the extracted feature vectors
print("Training autoencoder...")
autoencoder = create_autoencoder(train_features.shape[1])

# Display model architecture
print("Autoencoder Architecture:")
autoencoder.summary()

# Validate on the held-out val_features instead of validation_split:
# validation_split takes the last rows of train_features before shuffling,
# which removes 20% of the training data and leaves the held-out set unused.
# Fall back to validation_split only when there is no held-out data, so that
# history.history['val_loss'] is always available for plotting.
if len(val_features) > 0:
    validation_kwargs = {'validation_data': (val_features, val_features)}
else:
    validation_kwargs = {'validation_split': 0.2}

# Train the autoencoder to reconstruct its own input
history = autoencoder.fit(
    train_features, train_features,
    epochs=30,  # Reduced epochs for faster training
    batch_size=32,
    verbose=1,
    **validation_kwargs
)

# Draw the loss curves twice side by side: linear y-axis on the left,
# logarithmic y-axis on the right
plt.figure(figsize=(12, 4))

# (subplot position, training label, validation label, title, use log scale)
loss_panels = (
    (1, 'Training Loss', 'Validation Loss',
     'Autoencoder Training Loss', False),
    (2, 'Training Loss (Log Scale)', 'Validation Loss (Log Scale)',
     'Autoencoder Training Loss (Log Scale)', True),
)
for position, train_label, val_label, panel_title, use_log_scale in loss_panels:
    plt.subplot(1, 2, position)
    plt.plot(history.history['loss'], label=train_label)
    plt.plot(history.history['val_loss'], label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel('MSE Loss')
    if use_log_scale:
        plt.yscale('log')
    plt.legend()

plt.tight_layout()
plt.show()

# Write both Keras models to HDF5 files
print("Saving trained models...")
model_files = (
    (feature_extractor, 'feature_extractor_model.h5'),
    (autoencoder, 'autoencoder_model.h5'),
)
for keras_model, model_file in model_files:
    keras_model.save(model_file)

# Store the extracted feature splits so inference can reuse them
feature_files = (
    (train_features, 'train_features.npy'),
    (val_features, 'val_features.npy'),
)
for feature_array, feature_file in feature_files:
    np.save(feature_file, feature_array)

# Summarise what was written to disk
print("Training completed successfully!")
print("Saved files:")
for _, saved_name in model_files + feature_files:
    print(f"- {saved_name}")