In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# mount (walking directories in os.walk order), then print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Root of the competition data; the CSVs and both image folders live under it.
# Kept in one place so the notebook can be pointed at another copy of the data.
DATA_DIR = '/kaggle/input/soil-classification/soil_classification-2025'

# Load train CSV with labels, then turn each image_id into a full file path.
# Vectorized string concatenation replaces the per-row apply(lambda ...);
# astype(str) mirrors the str() conversion the original f-string performed.
df = pd.read_csv(f'{DATA_DIR}/train_labels.csv')
df['image_id'] = f'{DATA_DIR}/train/' + df['image_id'].astype(str)

# Load test CSV (only image_ids) and expand the ids the same way.
test_df = pd.read_csv(f'{DATA_DIR}/test_ids.csv')
test_df['image_id'] = f'{DATA_DIR}/test/' + test_df['image_id'].astype(str)

This part loads the training and test CSVs. Each training row pairs an image ID with its `soil_type` label; the test CSV contains image IDs only.
Then, we replace each image ID with its full file path for easy access during training and prediction.

In [None]:
# Lift the column-width cap so the long image paths are displayed in full,
# then show the first rows of the training frame as the cell's output.
pd.options.display.max_colwidth = None
df.head()

This code removes the column-width limit pandas applies when displaying data in Jupyter (so long values are not truncated) and shows the first few rows of the training dataframe.
It helps confirm that the image paths and labels have been processed correctly.

In [None]:
# Fixed seed for the generator's shuffling so batch order is repeatable
# between runs (weight initialisation is seeded separately, not here).
SEED = 42

# Multiply raw pixel values by 1/255 as each image is loaded.
train_datagen = ImageDataGenerator(rescale=1./255)

# Stream (image batch, one-hot label batch) pairs from the paths listed in `df`.
train_data = train_datagen.flow_from_dataframe(
    dataframe=df,
    x_col='image_id',          # column holding the image file paths
    y_col='soil_type',         # column holding the class label
    target_size=(224, 224),    # every image is resized to 224x224
    batch_size=32,
    class_mode='categorical',  # labels are emitted one-hot encoded
    shuffle=True,
    seed=SEED                  # without a seed the shuffle differs every run
)

This code:

* Normalizes image pixels by multiplying them by 1/255 (`rescale=1./255`).

* Loads images and labels from a DataFrame.

* Resizes them to 224x224.

* Batches them for training (32 images per batch).

* Converts labels to one-hot encoded format (`class_mode='categorical'`).

* Shuffles the data for each epoch.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Size the output layer from the classes the data generator discovered, so
# its width always matches the one-hot labels fed to the model instead of
# relying on a hard-coded 4.
num_classes = len(train_data.class_indices)

model = Sequential([
    # Feature extractor: three Conv2D + MaxPooling2D stages whose filter
    # count doubles each time (32 -> 64 -> 128) while pooling halves the
    # spatial size. Input is a 224x224 image with 3 channels.
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(2, 2),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),

    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),

    # Classifier head: flatten the feature maps, one hidden dense layer,
    # dropout for regularisation, then a softmax over the soil classes.
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  # one probability per soil class
])

This model:

* Is a typical CNN for image classification.

* Has 3 convolutional blocks (each a `Conv2D` followed by `MaxPooling2D`) for feature extraction.

* Ends with fully connected layers for classification.

* Outputs probabilities for 4 soil types.

The classification head, layer by layer:

* `Flatten()`: Converts the 3D feature tensor to a 1D vector for the dense layers.

* `Dense(128)`: Fully connected layer with 128 neurons and ReLU activation.

* `Dropout(0.5)`: Randomly drops 50% of neurons during training to reduce overfitting.

* `Dense(4, activation='softmax')`: Final output layer with 4 units (for 4 classes), using softmax to output class probabilities.

In [None]:
# Adam with a learning rate of 0.001; categorical cross-entropy matches the
# one-hot labels produced by class_mode='categorical'.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model. Keep the returned History object so the per-epoch
# loss/accuracy can be inspected or plotted afterwards; assigning it also
# stops the cell from echoing a bare History repr as its output.
history = model.fit(
    train_data,
    epochs=20
)

* `optimizer=Adam(learning_rate=0.001)`: Adam is an adaptive learning rate optimizer that works well for most deep learning tasks.

* `learning_rate=0.001` is a typical starting point — it controls the size of each weight update, i.e. how fast the model learns.

* `loss='categorical_crossentropy'`: Since the model is doing multi-class classification (4 soil types) and labels are one-hot encoded (`class_mode='categorical'`), this is the correct loss function.

* `metrics=['accuracy']`: Tracks model performance by computing the fraction of predictions that match the true labels. No validation data is passed to `fit`, so the reported value is training accuracy only.

In [None]:
# After training: persist the model to the current working directory.
# The .h5 file name means it is saved in HDF5 format.
# NOTE(review): recent Keras releases appear to treat HDF5 as a legacy format
# and recommend the native .keras format — confirm what downstream loaders
# expect before changing the file name.
model_path = 'soil_classification_model.h5'
model.save(model_path)