In [10]:
import os

# Silence TensorFlow's native (C++) logging. This has to be in the environment
# *before* TensorFlow is imported: the native library can emit messages (and
# latch its log level) during import, so setting it afterwards may be too late.
# The deliberate "statement before imports" ordering is why this sits here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

# Dataset locations (everything lives under the Kaggle input mount):
DATA_DIR = "/kaggle/input/soil-classification/soil_classification-2025"

# Image folders for the two splits.
TRAIN_DIR, TEST_DIR = (
    os.path.join(DATA_DIR, split_name) for split_name in ("train", "test")
)

# CSV files listing the training labels and the test image ids.
LABELS_FILE, TEST_IDS_FILE = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("train_labels.csv", "test_ids.csv")
)


# Training hyperparameters:
IMG_SIZE = 128    # images are resized to IMG_SIZE x IMG_SIZE pixels
BATCH_SIZE = 32   # samples per gradient step
EPOCHS = 15       # full passes over the training split

# Read the training label table and resolve each image id to a path in TRAIN_DIR:
df = pd.read_csv(LABELS_FILE)
df["filepath"] = df["image_id"].map(
    lambda image_id: os.path.join(TRAIN_DIR, image_id)
)

# Treat every training row as a positive example (soil = 1).
# NOTE(review): this makes the target constant, so the classifier below is
# trained and validated on a single class and can minimise the loss by always
# predicting 1. Negative (non-soil) examples or a one-class / anomaly-detection
# approach would be needed for the "not soil" output to be meaningful.
df = df.assign(label=1)

# Image loading function:
def load_image(path):
    """Load an image file as a normalized RGB array.

    Args:
        path: Location of the image file to read.

    Returns:
        A float32 NumPy array of shape (IMG_SIZE, IMG_SIZE, 3) whose values
        are the 8-bit RGB channel values divided by 255.
    """
    # Image.open is lazy and holds the file handle; the context manager
    # releases it deterministically instead of relying on garbage collection.
    with Image.open(path) as img:
        # convert()/resize() return new, fully loaded images, so the result
        # stays valid after the source file is closed.
        resized = img.convert("RGB").resize((IMG_SIZE, IMG_SIZE))
    # Convert straight to float32 (avoids an intermediate uint8 copy).
    return np.asarray(resized, dtype=np.float32) / 255.0

# Decode every training image into one in-memory array, plus the label vector:
X = np.array([load_image(image_path) for image_path in df["filepath"]])
y = df["label"].to_numpy(copy=True)

# Hold out 15% of the samples for validation (stratified, fixed seed):
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    stratify=y,
)

# Build CNN model: two convolution/pooling stages feeding a small dense head.
model = tf.keras.Sequential()
model.add(tf.keras.layers.InputLayer(shape=(IMG_SIZE, IMG_SIZE, 3)))
model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu'))
model.add(tf.keras.layers.MaxPooling2D())
model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'))
model.add(tf.keras.layers.MaxPooling2D())
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(units=64, activation='relu'))
# Single sigmoid unit: the output is the predicted probability of class 1.
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train model, reporting loss/accuracy on the validation split after each epoch:
model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    verbose=1,
)

# Evaluate model:
# The network ends in a single sigmoid unit, so predict() yields one
# probability per sample in a column of shape (n, 1). Threshold at 0.5 and
# flatten to 1-D so the predictions have the same shape as y_val, matching how
# the test-set predictions are post-processed further down.
y_val_pred_prob = model.predict(X_val)
y_val_pred = (y_val_pred_prob > 0.5).astype(int).flatten()

# NOTE(review): if the labels contain a single class, these metrics only show
# that the model reproduces that constant — confirm the label distribution.
print("\nClassification Report:\n")
print(classification_report(y_val, y_val_pred))

f1 = f1_score(y_val, y_val_pred)
print(f"\n F1 Score on validation: {f1:.4f}")

# Persist the trained model to the working directory and confirm the file name:
model_path = "soil_binary_classifier.h5"
model.save(model_path)
print(f"Model saved as '{model_path}'")

# Predict on test set: resolve ids to paths, load the images, threshold at 0.5.
test_df = pd.read_csv(TEST_IDS_FILE)
test_df["filepath"] = test_df["image_id"].map(
    lambda image_id: os.path.join(TEST_DIR, image_id)
)
X_test = np.array([load_image(image_path) for image_path in test_df["filepath"]])
y_test_pred_prob = model.predict(X_test)
# predict() returns one column of probabilities; collapse it to a 1-D 0/1 vector.
y_test_labels = (y_test_pred_prob > 0.5).astype(int).ravel()

# Generate submission file: one row per test image id with its binary label
# (1 = soil, 0 = not soil).
submission = test_df[["image_id"]].assign(is_soil=y_test_labels)
submission.to_csv("submission.csv", index=False)
print(" submission.csv created and ready for submission")


Epoch 1/15
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - accuracy: 0.8788 - loss: 0.0999 - val_accuracy: 1.0000 - val_loss: 1.1602e-16
Epoch 2/15
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 1.0000 - loss: 1.1619e-14 - val_accuracy: 1.0000 - val_loss: 8.0215e-18
Epoch 3/15
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 1.0000 - loss: 1.8610e-15 - val_accuracy: 1.0000 - val_loss: 7.1651e-18
Epoch 4/15
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 1.0000 - loss: 2.0574e-15 - val_accuracy: 1.0000 - val_loss: 7.1350e-18
Epoch 5/15
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 1.0000 - loss: 1.5746e-15 - val_accuracy: 1.0000 - val_loss: 7.1343e-18
Epoch 6/15
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 1.0000 - loss: 1.9089e-15 - val_accuracy: 1.0000 - val_loss: 7