In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# The competition download in the next cell uses the `kagglehub` module
# imported here, so this cell has to run first.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# NOTE(review): the returned path is stored but the later cells read from
# hard-coded '/kaggle/input/...' locations instead - confirm both point to the
# same place when running outside Kaggle.
soil_classification_path = kagglehub.competition_download('soil-classification')

print('Data source import complete.')


In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt


In [None]:
# Input geometry and batch size shared by the data pipelines.
IMG_HEIGHT, IMG_WIDTH = 224, 224
BATCH_SIZE = 32

# Competition data layout: <BASE_DIR>/train and <BASE_DIR>/test.
BASE_DIR = '/kaggle/input/soil-classification/soil_classification-2025'
TRAIN_DIR, TEST_DIR = (os.path.join(BASE_DIR, split) for split in ('train', 'test'))


In [None]:
import tensorflow as tf
import os
from glob import glob

# Training directory - point to the folder containing the training images
train_dir = '/kaggle/input/soil-classification/soil_classification-2025/train'  # Update if needed

# Collect every .jpg / .jpeg / .webp file directly inside train_dir and sort
# the combined list by path so the ordering is reproducible between runs.
image_paths = sorted(
    glob(os.path.join(train_dir, "*.jpg")) +
    glob(os.path.join(train_dir, "*.jpeg")) +
    glob(os.path.join(train_dir, "*.webp"))
)

# This cell scans the training folder, so report training images (the message
# previously said "test images").
print(f"Found {len(image_paths)} training images")

def load_and_preprocess_image(path):
    """Load one image file as a float32 tensor of shape (224, 224, 3) scaled to [0, 1].

    The extension check needs ordinary Python string methods, which are not
    available on the symbolic string tensor that ``tf.data.Dataset.map`` passes
    in. The decoding therefore runs eagerly inside ``tf.py_function``.

    Args:
        path: File path as a Python ``str`` or a scalar ``tf.string`` tensor.

    Returns:
        A ``tf.float32`` tensor with static shape ``(224, 224, 3)``.

    Raises:
        ValueError: If ``path`` is a ``str`` whose extension is not ``.jpg``,
            ``.jpeg`` or ``.webp``. For tensor paths the same check runs inside
            ``tf.py_function`` and surfaces when the element is evaluated.
    """
    supported_exts = ('.jpg', '.jpeg', '.webp')

    # Keep the immediate ValueError for callers that pass a plain string.
    if isinstance(path, str) and not path.lower().endswith(supported_exts):
        raise ValueError(f"Unsupported image format: {path}")

    def _decode(path_tensor):
        # Executed eagerly, so the path is a concrete bytes value here.
        path_str = path_tensor.numpy().decode("utf-8")
        lowered = path_str.lower()
        if not lowered.endswith(supported_exts):
            raise ValueError(f"Unsupported image format: {path_str}")

        raw = tf.io.read_file(path_str)
        if lowered.endswith('.webp'):
            try:
                # Works when the installed TensorFlow can decode this file.
                img = tf.image.decode_image(raw, channels=3, expand_animations=False)
            except tf.errors.InvalidArgumentError:
                # Fallback through the optional `webp` package (extra dependency).
                import numpy as np
                import webp
                img = tf.convert_to_tensor(
                    np.asarray(webp.load_image(path_str, mode='RGB')), dtype=tf.uint8
                )
        else:
            img = tf.image.decode_jpeg(raw, channels=3)  # channels=3 -> RGB

        img = tf.image.resize(img, [224, 224])
        return img / 255.0  # Normalize pixel values from [0, 255] to [0, 1]

    img = tf.py_function(_decode, [path], tf.float32)
    # tf.py_function drops static shape information; restore it so batching
    # and downstream layers see a fully defined shape.
    img.set_shape([224, 224, 3])
    return img

# Training image pipeline: file paths -> decoded images -> batches.
#   * map(..., num_parallel_calls=AUTOTUNE) lets tf.data pick how many images
#     are decoded in parallel.
#   * batch(32) groups 32 images per element.
#   * prefetch(AUTOTUNE) prepares upcoming batches in the background while the
#     current one is being consumed.
train_ds = (
    tf.data.Dataset.from_tensor_slices(image_paths)
    .map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

# Sanity check: shape and value range of the first batch only.
for i, image in enumerate(train_ds.take(1)):
    print(f"Batch {i} shape: {image.shape}")
    print(f"Pixel range: {tf.reduce_min(image):.2f} to {tf.reduce_max(image):.2f}")

In [None]:
import tensorflow as tf
import os
from glob import glob

# Folder holding the test images.
test_dir = '/kaggle/input/soil-classification/soil_classification-2025/test'  # Update if needed

# Gather every .jpg / .jpeg / .webp file directly inside test_dir, sorted by path.
image_paths = sorted(
    match
    for pattern in ("*.jpg", "*.jpeg", "*.webp")
    for match in glob(os.path.join(test_dir, pattern))
)

print(f"Found {len(image_paths)} test images")

def load_and_preprocess_image(path):
    """Load one image file as a float32 tensor of shape (224, 224, 3) scaled to [0, 1].

    The extension check needs ordinary Python string methods, which are not
    available on the symbolic string tensor that ``tf.data.Dataset.map`` passes
    in. The decoding therefore runs eagerly inside ``tf.py_function``.

    Args:
        path: File path as a Python ``str`` or a scalar ``tf.string`` tensor.

    Returns:
        A ``tf.float32`` tensor with static shape ``(224, 224, 3)``.

    Raises:
        ValueError: If ``path`` is a ``str`` whose extension is not ``.jpg``,
            ``.jpeg`` or ``.webp``. For tensor paths the same check runs inside
            ``tf.py_function`` and surfaces when the element is evaluated.
    """
    supported_exts = ('.jpg', '.jpeg', '.webp')

    # Keep the immediate ValueError for callers that pass a plain string.
    if isinstance(path, str) and not path.lower().endswith(supported_exts):
        raise ValueError(f"Unsupported image format: {path}")

    def _decode(path_tensor):
        # Executed eagerly, so the path is a concrete bytes value here.
        path_str = path_tensor.numpy().decode("utf-8")
        lowered = path_str.lower()
        if not lowered.endswith(supported_exts):
            raise ValueError(f"Unsupported image format: {path_str}")

        raw = tf.io.read_file(path_str)
        if lowered.endswith('.webp'):
            try:
                # Works when the installed TensorFlow can decode this file.
                img = tf.image.decode_image(raw, channels=3, expand_animations=False)
            except tf.errors.InvalidArgumentError:
                # Fallback through the optional `webp` package (extra dependency).
                import numpy as np
                import webp
                img = tf.convert_to_tensor(
                    np.asarray(webp.load_image(path_str, mode='RGB')), dtype=tf.uint8
                )
        else:
            img = tf.image.decode_jpeg(raw, channels=3)

        img = tf.image.resize(img, [224, 224])
        return img / 255.0  # Normalize to [0,1]

    img = tf.py_function(_decode, [path], tf.float32)
    # tf.py_function drops static shape information; restore it so batching
    # and downstream layers see a fully defined shape.
    img.set_shape([224, 224, 3])
    return img

# Test image pipeline: file paths -> decoded images -> batches of 32, with
# background prefetching.
test_ds = (
    tf.data.Dataset.from_tensor_slices(image_paths)
    .map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

# Sanity check: shape and value range of the first batch only.
for i, image in enumerate(test_ds.take(1)):
    print(f"Batch {i} shape: {image.shape}")
    print(f"Pixel range: {tf.reduce_min(image):.2f} to {tf.reduce_max(image):.2f}")

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# cache() keeps the decoded elements after the first pass; prefetch() overlaps
# input preparation with consumption.
# NOTE(review): train_ds is already batched at this point, so shuffle(1000)
# reorders whole batches rather than individual images.
train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
# Removed the stray `v=` chained assignment, which only created an unused
# variable `v`.
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)


In [None]:
# An earlier draft of this cell (the same label loading and image pipeline,
# but without a train/validation split) was kept here inside a string literal.
# It was never executed and has been removed; the code below supersedes it.


import pandas as pd
import tensorflow as tf
import os
from sklearn.model_selection import train_test_split

# Load labels
df = pd.read_csv("/kaggle/input/soil-classification/soil_classification-2025/train_labels.csv")

# Label mapping: class name -> integer index used for one-hot encoding
label_to_index = {
    'Alluvial soil': 0,
    'Black Soil': 1,
    'Clay soil': 2,
    'Red soil': 3
}
df['label'] = df['soil_type'].map(label_to_index)

# Series.map leaves NaN for any name missing from label_to_index. Fail early
# with a clear message instead of letting NaN labels reach the split and the
# one-hot encoding.
unmapped = df.loc[df['label'].isna(), 'soil_type'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unmapped soil_type values in train_labels.csv: {list(unmapped)}")
df['label'] = df['label'].astype('int64')

# File paths and labels
image_paths = [os.path.join("/kaggle/input/soil-classification/soil_classification-2025/train", img_id) for img_id in df['image_id']]
labels = df['label'].values  # Keep numeric for stratified split

# Split into training and validation sets (70:30), preserving class
# proportions in both parts; random_state makes the split reproducible.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    image_paths,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=42
)

# Convert labels to one-hot float32 vectors, e.g. label 2 -> [0, 0, 1, 0]
num_classes = len(label_to_index)
train_labels_cat = tf.keras.utils.to_categorical(train_labels, num_classes=num_classes).astype('float32')
val_labels_cat = tf.keras.utils.to_categorical(val_labels, num_classes=num_classes).astype('float32')

# --- Image loading ---
def decode_image_safe(filename, label):
    """Load one labelled image, substituting a blank sample if loading fails.

    Args:
        filename: Scalar string tensor holding the image path.
        label: Label vector of length 4; it is cast to float32.

    Returns:
        ``(image, label)`` where ``image`` is a float32 tensor of static shape
        ``(224, 224, 3)`` scaled to [0, 1] and ``label`` is a float32 tensor of
        static shape ``(4,)``. When the file cannot be read or decoded, both
        are all zeros and a warning is logged.
    """
    def _load_image(filename_str, label_tensor):
        try:
            image = tf.io.read_file(filename_str.numpy().decode())
            # decode_image detects the format from the file content, matching
            # the decoder used by safe_load_and_preprocess_image at inference.
            image = tf.image.decode_image(image, channels=3, expand_animations=False)
            image = tf.image.resize(image, [224, 224])
            image = image / 255.0
        except Exception as exc:
            # Best effort: keep the pipeline running, but make the failure
            # visible instead of silently training on a blank sample.
            tf.get_logger().warning(
                "Could not load %s (%s); substituting a blank sample", filename_str, exc
            )
            image = tf.zeros([224, 224, 3])
            label_tensor = tf.zeros([4])
        return image, tf.cast(label_tensor, tf.float32)

    image, label = tf.py_function(_load_image, [filename, label], [tf.float32, tf.float32])
    # tf.py_function outputs have no static shape; restore it for batching.
    image.set_shape([224, 224, 3])
    label.set_shape([4])
    return image, label

# --- Dataset creation ---
AUTOTUNE = tf.data.AUTOTUNE
BATCH_SIZE = 32

# Training pipeline: shuffle the (path, label) pairs, decode, batch, prefetch.
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_paths, train_labels_cat))
    .shuffle(1000)
    .map(decode_image_safe, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

# Validation pipeline: the same steps without shuffling.
val_ds = (
    tf.data.Dataset.from_tensor_slices((val_paths, val_labels_cat))
    .map(decode_image_safe, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

print(f"Train samples: {len(train_paths)}")
print(f"Validation samples: {len(val_paths)}")




In [None]:
# example simple model






from tensorflow.keras import layers, models
from tensorflow.keras import optimizers

def _conv_block(filters, dropout_rate):
    """Return the layers of one convolutional block.

    Two same-padded 3x3 ReLU convolutions, each followed by batch
    normalization, then 2x2 max pooling and dropout.
    """
    return [
        layers.Conv2D(filters, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.Conv2D(filters, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(dropout_rate),
    ]


# Three convolutional blocks (32, 64, 128 filters) followed by a dense head
# that outputs a softmax over the 4 soil classes.
model = models.Sequential(
    [layers.Input(shape=(224, 224, 3))]
    + _conv_block(32, 0.25)
    + _conv_block(64, 0.25)
    + _conv_block(128, 0.4)
    + [
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(4, activation='softmax'),  # assuming 4 soil classes
    ]
)

# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 4 epochs, evaluating on the validation split after each epoch.
history = model.fit(train_ds, validation_data=val_ds, epochs=4)






In [None]:
import os
import pandas as pd

# Test image folder and the CSV that lists the test image ids.
test_dir = '/kaggle/input/soil-classification/soil_classification-2025/test'
test_df = pd.read_csv('/kaggle/input/soil-classification/soil_classification-2025/test_ids.csv')  # or however you're loading test image list

# Build the full path of every test image from its image_id.
test_df['image_path'] = [os.path.join(test_dir, image_id) for image_id in test_df['image_id']]

# Now create your test dataset using test_df['image_path']
def load_and_preprocess_image(path):
    """Read a JPEG file and return it resized to 224x224 and divided by 255."""
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [224, 224])
    return resized / 255.0

# JPEG-only pipeline over the test paths, 32 images per batch.
test_ds = (
    tf.data.Dataset.from_tensor_slices(test_df['image_path'].values)
    .map(load_and_preprocess_image)
    .batch(32)
)

def safe_load_and_preprocess_image(path):
    """Load one image, returning a blank image if it cannot be read or decoded.

    Args:
        path: Image path as a scalar string tensor (or a Python ``str``).

    Returns:
        A float32 tensor of static shape ``(224, 224, 3)`` scaled to [0, 1].
        It is all zeros when loading fails, and a warning is logged.
    """
    def _load_image(path_str):
        try:
            image = tf.io.read_file(path_str)
            image = tf.image.decode_image(image, channels=3, expand_animations=False)
            image = tf.image.resize(image, [224, 224])
            return image / 255.0
        except Exception as exc:
            # `except Exception` (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate; log so blank inputs are not silent.
            tf.get_logger().warning(
                "Could not load %s (%s); substituting a blank image", path_str, exc
            )
            return tf.zeros([224, 224, 3], dtype=tf.float32)

    image = tf.py_function(_load_image, [path], tf.float32)
    # tf.py_function outputs have no static shape; restore it so the batched
    # dataset presents a defined (None, 224, 224, 3) shape to the model, as
    # decode_image_safe does for the training data.
    image.set_shape([224, 224, 3])
    return image

# Rebuild test_ds from the "image_path" column using the fault-tolerant
# loader, 32 images per batch.
test_ds = (
    tf.data.Dataset.from_tensor_slices(test_df['image_path'].values)
    .map(safe_load_and_preprocess_image)
    .batch(32)
)




In [None]:
# Class probabilities for every image in test_ds.
pred_probs = model.predict(test_ds, verbose=1)

# Index of the highest-probability class for each image.
pred_labels = tf.math.argmax(pred_probs, axis=1).numpy()



In [None]:
import pandas as pd
import os

# Class index -> soil type name, in the same order as the training label mapping.
label_map = dict(enumerate(['Alluvial soil', 'Black Soil', 'Clay soil', 'Red soil']))

# Step 1: keep only the file name of each test image path.
test_df['filename'] = test_df['image_path'].map(os.path.basename)

# Step 2: translate each predicted class index into its soil type name.
pred_soil_names = [label_map[class_index] for class_index in pred_labels]

# Step 3: assemble the submission table and write it without the index column.
results_df = pd.DataFrame({'image_id': test_df['filename'], 'soil_type': pred_soil_names})

output_file = '/kaggle/working/submission.csv'
results_df.to_csv(output_file, index=False)




In [None]:

import pandas as pd
# Read the saved submission back from disk and preview its first rows.
# NOTE(review): this rebinds `df`, which an earlier cell uses for the training
# labels table - re-run that cell before reusing `df` for training data.
df = pd.read_csv('/kaggle/working/submission.csv')
df.head()











In [None]:
import numpy as np

# Optional sanity check: how many test images were assigned to each class index.
# Relies on pred_labels from the prediction cell.
import pandas as pd
prediction_counts = pd.Series(pred_labels).value_counts()
print("Prediction distribution:", prediction_counts, sep="\n")


In [None]:
import numpy as np

# Collect ground-truth labels from test_ds, if it yields (images, labels) tuples.
label_batches = []
for batch in test_ds:
    if isinstance(batch, tuple):  # If batch contains (images, labels)
        # Use a dedicated name so the global `labels` array is not overwritten.
        _, batch_labels = batch
        label_batches.append(batch_labels.numpy())
    else:  # If batch contains only images
        print("Warning: No labels found in test_ds")
        break

# Join along the batch axis. Concatenating onto a 1-D empty seed array would
# reject 2-D one-hot label batches, so gather first and concatenate once.
y_true = np.concatenate(label_batches, axis=0) if label_batches else np.array([])

if len(y_true) > 0:
    # Convert if one-hot encoded
    if len(y_true.shape) > 1 and y_true.shape[1] > 1:
        y_true_labels = np.argmax(y_true, axis=1)
    else:
        y_true_labels = y_true

    # Compute F1 score
    from sklearn.metrics import f1_score
    f1 = f1_score(y_true_labels, pred_labels, average='weighted')
    print(f"F1 Score: {f1:.4f}")
else:
    print("Cannot compute F1 - no labels found in test_ds")

In [None]:
# Pull one batch from test_ds and show it.
sample_batch = next(iter(test_ds))
print("Batch structure:", sample_batch)

# A tuple means (images, labels); anything else is images only.
has_labels = isinstance(sample_batch, tuple)
print(
    "Dataset contains labels (but might be empty)"
    if has_labels
    else "Dataset contains only images - no labels available"
)

In [None]:
import pandas as pd

import matplotlib.pyplot as plt

# Count how many test images were predicted for each class index.
predicted_counts = pd.Series(pred_labels).value_counts()
print("Predicted class distribution:")
print(predicted_counts)

# Bar chart of the same counts, one bar per predicted class.
predicted_counts.plot(kind='bar')
plt.title("Predicted Class Distribution")
plt.show()

In [None]:
import matplotlib.pyplot as plt

def plot_training_history(history):
    """Plot training (and, when present, validation) accuracy and loss curves.

    Args:
        history: Object whose ``history`` attribute is a dict of per-epoch
            metric lists containing at least ``'accuracy'`` and ``'loss'``
            (for example the value returned by ``model.fit``). The
            ``'val_accuracy'`` and ``'val_loss'`` curves are drawn only if
            those keys exist.

    Raises:
        KeyError: If ``'accuracy'`` or ``'loss'`` is missing.
    """
    metrics = history.history
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # One panel per metric: (key in history, text used for title/labels).
    # The accuracy title previously read 'AccuraCy'.
    for ax, (key, name) in zip(axes, (('accuracy', 'Accuracy'), ('loss', 'Loss'))):
        ax.plot(metrics[key], label=f'Train {name}')
        val_key = f'val_{key}'
        if val_key in metrics:
            ax.plot(metrics[val_key], label=f'Val {name}')
        ax.set_title(name)
        ax.set_xlabel('Epochs')
        ax.set_ylabel(name)
        ax.legend()

    fig.tight_layout()
    plt.show()


import matplotlib.pyplot as plt

# Stand-alone accuracy curve: training vs. validation accuracy per epoch.
for history_key, curve_label in (('accuracy', 'Train Acc'), ('val_accuracy', 'Val Acc')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.title("Model Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.show()

# Two-panel accuracy/loss figure; requires `history` from model.fit().
plot_training_history(history)


In [None]:
from sklearn.metrics import f1_score
import numpy as np

# Step 1: walk val_ds batch by batch, collecting actual and predicted class indices.
y_true, y_pred = [], []

for image_batch, label_batch in val_ds:
    batch_probs = model.predict(image_batch, verbose=0)
    y_true.extend(tf.argmax(label_batch, axis=1).numpy())   # one-hot -> class index
    y_pred.extend(tf.argmax(batch_probs, axis=1).numpy())   # probabilities -> class index

# Step 2: macro-averaged F1 over the whole validation split
# ('weighted' or 'micro' are the alternatives).
f1 = f1_score(y_true, y_pred, average='macro')
print(f"F1 Score (macro): {f1:.4f}")


import pandas as pd
import os

# ---------------------------------------------
# Step 1: Define a mapping from numeric labels to soil type names
# ---------------------------------------------
# The names must be spelled exactly like the training labels (the keys of
# label_to_index), including letter case: 'Alluvial soil', 'Black Soil',
# 'Clay soil', 'Red soil'.
label_map = {
    0: 'Alluvial soil',
    1: 'Black Soil',
    2: 'Clay soil',
    3: 'Red soil'
}

# ---------------------------------------------
# Step 2: Extract filenames from image paths
# ---------------------------------------------
# The image_path column contains full paths like "/path/to/image.jpg"
# We only need the filename, like "image.jpg", for submission or display.
test_df['filename'] = test_df['image_path'].apply(lambda x: os.path.basename(x))

# ---------------------------------------------
# Step 3: Convert predicted numeric labels to soil names
# ---------------------------------------------
# pred_labels holds class indices 0-3; map each one to its soil type name.
pred_soil_names = [label_map[label] for label in pred_labels]

# ---------------------------------------------
# Step 4: Create the final results DataFrame
# ---------------------------------------------
# This DataFrame contains the image filenames and their predicted soil types.
results_df = pd.DataFrame({
    'image_id': test_df['filename'],     # Just the filename, not full path
    'soil_type': pred_soil_names         # Human-readable predicted soil type
})

# ---------------------------------------------
# Step 5: Save the results to a CSV file
# ---------------------------------------------
# to_csv does not create missing directories, so make sure the 'output'
# folder exists before writing.
output_path = '/kaggle/working/output/submission.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
results_df.to_csv(output_path, index=False)

# Confirmation
print(f"✅ Results saved to: {output_path}")
