# In [None]:
# This notebook runs inference with the previously trained models.

# Import Libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import load_model
import cv2
from sklearn.metrics import f1_score

# Seed NumPy and TensorFlow with the same value so repeated runs are reproducible
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Dataset locations (BASE_PATH keeps its trailing slash, so the joined
# paths are the same strings plain concatenation would produce)
BASE_PATH = '/kaggle/input/soil-classification-part-2/soil_competition-2025/'
TEST_PATH = os.path.join(BASE_PATH, 'test/')
TEST_IDS = os.path.join(BASE_PATH, 'test_ids.csv')

# Read the table of test image ids and resolve each id to a file path
test_ids = pd.read_csv(TEST_IDS)
test_ids['image_path'] = test_ids['image_id'].map(
    lambda image_name: os.path.join(TEST_PATH, image_name)
)

print(f"Test dataset shape: {test_ids.shape}")
print(f"Sample test image paths: {test_ids['image_path'].head()}")

# Files written by the training notebook, resolved relative to the working directory
FEATURE_EXTRACTOR_FILE = 'feature_extractor_model.h5'
AUTOENCODER_FILE = 'autoencoder_model.h5'
TRAIN_FEATURES_FILE = 'train_features.npy'
VAL_FEATURES_FILE = 'val_features.npy'

# Restore the two saved Keras models
print("Loading pre-trained models...")
feature_extractor = load_model(FEATURE_EXTRACTOR_FILE)
autoencoder = load_model(AUTOENCODER_FILE)

# Saved training/validation features, used further down to compute the threshold
train_features = np.load(TRAIN_FEATURES_FILE)
val_features = np.load(VAL_FEATURES_FILE)

print("Models loaded successfully!")

def load_and_preprocess_image(image_path):
    """Load an image from disk and prepare it for feature extraction.

    The file is read with OpenCV, converted from BGR to RGB channel order,
    resized to 224x224, cast to float32 and divided by 255.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A float32 array of shape (224, 224, 3). If the file cannot be read,
        a warning is printed and an all-zero array of the same shape and
        dtype is returned, so the caller still gets one entry per path.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        # Make the failure visible, and keep the placeholder float32 so it
        # matches loaded images: float64 zeros would promote an entire
        # stacked batch to float64.
        print(f"Warning: could not read image '{image_path}'; using a blank placeholder.")
        return np.zeros((224, 224, 3), dtype=np.float32)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    return img.astype(np.float32) / 255.0

# Load and preprocess every test image into a single batch.
# The dtype is pinned explicitly so that a stray float64 entry (for example a
# placeholder standing in for an unreadable file) cannot silently promote the
# whole batch to float64 and double its memory footprint.
print("Loading test images...")
test_images = np.array(
    [load_and_preprocess_image(path) for path in test_ids['image_path']],
    dtype=np.float32,
)
print(f"Test images shape: {test_images.shape}")

# Run the batch through the feature extractor
print("Extracting test features...")
test_features = feature_extractor.predict(test_images, batch_size=32, verbose=1)
print(f"Test features shape: {test_features.shape}")

# Compute reconstruction errors
print("Computing reconstruction errors...")


def _reconstruct(features):
    """Run `features` through the autoencoder.

    Returns a pair: the reconstruction, and the per-row error obtained by
    averaging the squared input/reconstruction difference over axis 1.
    """
    reconstruction = autoencoder.predict(features, batch_size=32, verbose=1)
    squared_diff = np.square(features - reconstruction)
    return reconstruction, squared_diff.mean(axis=1)


# Same computation for each split, in the original train -> val -> test order
train_recon, train_errors = _reconstruct(train_features)
val_recon, val_errors = _reconstruct(val_features)
test_recon, test_errors = _reconstruct(test_features)

print(f"Training reconstruction errors - Mean: {np.mean(train_errors):.6f}, Std: {np.std(train_errors):.6f}")
print(f"Validation reconstruction errors - Mean: {np.mean(val_errors):.6f}, Std: {np.std(val_errors):.6f}")
print(f"Test reconstruction errors - Mean: {np.mean(test_errors):.6f}, Std: {np.std(test_errors):.6f}")

# Step 3: Dynamic Threshold Selection
# Simulate non-soil images in the validation set: samples whose error exceeds
# the 90th percentile (the top 10%) are labelled 0 (pseudo non-soil), and all
# remaining samples are labelled 1 (soil).
error_threshold_for_pseudo = np.percentile(val_errors, 90)
val_pseudo_labels = np.where(val_errors > error_threshold_for_pseudo, 0.0, 1.0)

print(f"Validation pseudo-labeling threshold: {error_threshold_for_pseudo:.6f}")
print(f"Pseudo soil labels: {np.sum(val_pseudo_labels)} ({np.mean(val_pseudo_labels)*100:.1f}%)")
print(f"Pseudo non-soil labels: {len(val_pseudo_labels) - np.sum(val_pseudo_labels)} ({(1 - np.mean(val_pseudo_labels))*100:.1f}%)")

# Candidate thresholds are the 50th/75th/90th percentiles of the training
# reconstruction errors; the one with the highest F1-score against the
# validation pseudo-labels is kept.
thresholds = [np.percentile(train_errors, p) for p in [50, 75, 90]]
best_threshold = None
# Start below any attainable F1 (valid scores lie in [0, 1]) so the first
# candidate is always accepted. Starting at 0 with a strict ">" comparison
# left best_threshold as None whenever every candidate scored 0, which made
# the summary print at the end of this section raise TypeError.
best_f1 = -1.0

print("Selecting best threshold based on validation F1-score...")
print("Threshold\tF1-Score")
print("-" * 25)

for threshold in thresholds:
    # Errors at or below the threshold are predicted as soil (1)
    val_preds = np.where(val_errors <= threshold, 1, 0)
    # zero_division=0 keeps the score at 0, without a warning, when a
    # threshold ends up predicting no soil samples at all.
    f1 = f1_score(val_pseudo_labels, val_preds, zero_division=0)
    print(f"{threshold:.6f}\t{f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"\nBest threshold: {best_threshold:.6f} with F1-score: {best_f1:.4f}")

# Classify the test set: an error at or below the chosen threshold means soil (1),
# anything above it means non-soil (0)
test_preds = (test_errors <= best_threshold).astype(int)

# Build the submission table (image ids plus predicted labels) and write it out
submission = test_ids[['image_id']].assign(label=test_preds)
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

# Summarise how the predictions split between the two classes
print("\nPrediction Statistics:")
print(f"Soil predictions: {np.sum(test_preds)} ({np.mean(test_preds)*100:.1f}%)")
print(f"Non-soil predictions: {len(test_preds) - np.sum(test_preds)} ({(1 - np.mean(test_preds))*100:.1f}%)")

# Visualize the reconstruction error distribution of each split side by side
def _plot_error_histogram(position, errors, color, hist_label, title, extra_lines=()):
    """Draw one error histogram in slot `position` of the 1x3 subplot grid.

    The selected threshold is always marked with a red dashed line. Each
    entry of `extra_lines` is an (x, color, label) triple drawn as a further
    dashed vertical line after it.
    """
    plt.subplot(1, 3, position)
    sns.histplot(errors, bins=50, alpha=0.7, color=color, label=hist_label)
    plt.axvline(best_threshold, color='red', linestyle='--', label='Best Threshold')
    for x_value, line_color, line_label in extra_lines:
        plt.axvline(x_value, color=line_color, linestyle='--', label=line_label)
    plt.title(title)
    plt.xlabel('Mean Squared Error')
    plt.ylabel('Frequency')
    plt.legend()


plt.figure(figsize=(15, 5))

_plot_error_histogram(1, train_errors, 'blue', 'Train Errors', 'Training Reconstruction Errors')
_plot_error_histogram(
    2, val_errors, 'green', 'Validation Errors', 'Validation Reconstruction Errors',
    # Only the validation panel also shows the pseudo-labeling cut-off
    extra_lines=[(error_threshold_for_pseudo, 'orange', 'Pseudo-labeling Threshold')],
)
_plot_error_histogram(3, test_errors, 'purple', 'Test Errors', 'Test Reconstruction Errors')

plt.tight_layout()
plt.show()

# Show the first few predictions
print("\nFirst 10 predictions:")
print("Image ID\t\tReconstruction Error\tPrediction")
print("-" * 55)
# Zip over the leading slices instead of indexing range(10), so a test set
# with fewer than 10 images prints what is available rather than raising
# IndexError.
preview_rows = zip(test_ids['image_id'].head(10), test_errors[:10], test_preds[:10])
for image_id, recon_error, pred in preview_rows:
    pred_label = "Soil" if pred == 1 else "Non-Soil"
    print(f"{image_id}\t{recon_error:.6f}\t\t{pred_label}")

print("\nInference completed successfully!")