In [1]:
#Team: Aditi Goel
#Team Members: 
#1. Aditi Goel (Team Leader)
#2. Dr. Anil Kumar
#3. Sonakshi Mehrotra
#4. Jaya Kamboj

In [2]:
# Import necessary libraries
from PIL import Image
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
# Data Reading and Initial Pre-processing
# Every image is resized to one common size so the per-image arrays can later
# be stacked into a single numpy array.
IMAGE_SIZE = (256, 256)

# Paths
base_dir = "/kaggle/input/soil-classification/soil_classification-2025"
train_img_dir = os.path.join(base_dir, "train")
test_img_dir = os.path.join(base_dir, "test")

# Load CSVs
train_labels_df = pd.read_csv(os.path.join(base_dir, "train_labels.csv"))
test_ids_df = pd.read_csv(os.path.join(base_dir, "test_ids.csv"))


def load_resized_image(img_path, size=IMAGE_SIZE):
    """Read one image file and return it as an RGB numpy array resized to ``size``.

    The file is opened inside a ``with`` block so the underlying file handle is
    released as soon as the pixel data has been copied, instead of relying on
    garbage collection.
    """
    with Image.open(img_path) as image:
        return np.array(image.convert("RGB").resize(size))


def load_image_set(image_filenames, img_dir, split_name):
    """Load every file from ``image_filenames`` that exists inside ``img_dir``.

    Parameters:
        image_filenames: list of file names, looked up relative to ``img_dir``.
        img_dir: directory that is expected to contain the files.
        split_name: word used in the "Missing ... image" message
            (e.g. "training" or "test").

    Returns:
        (images, loaded_positions): the loaded arrays and, for each of them,
        the position of its file name in ``image_filenames``. Files that do not
        exist are reported with ``print`` and skipped, so callers must use
        ``loaded_positions`` to keep labels / ids aligned with ``images``.
    """
    images = []
    loaded_positions = []
    for position, image_filename in enumerate(image_filenames):
        img_path = os.path.join(img_dir, image_filename)
        if os.path.exists(img_path):
            images.append(load_resized_image(img_path))
            loaded_positions.append(position)
        else:
            print(f"Missing {split_name} image: {img_path}")
    return images, loaded_positions


# Fetch training data with resizing; keep only the labels of images that were found
train_filenames = train_labels_df["image_id"].tolist()
train_label_column = train_labels_df["soil_type"].tolist()
train_images, train_positions = load_image_set(train_filenames, train_img_dir, "training")
train_labels = [train_label_column[position] for position in train_positions]

# Fetch test data with resizing; keep only the ids of images that were found
test_filenames = test_ids_df["image_id"].tolist()
test_images, test_positions = load_image_set(test_filenames, test_img_dir, "test")
test_image_ids = [test_filenames[position] for position in test_positions]

# The size in the message is derived from IMAGE_SIZE so it cannot go stale
print(f"Loaded {len(train_images)} training images and {len(test_images)} test images (resized to {IMAGE_SIZE[0]}x{IMAGE_SIZE[1]}).")


Loaded 1222 training images and 341 test images (resized to 256x256).


In [4]:
# Pre-processing
# Step 1: turn the Python lists of per-image arrays into single numpy arrays
X_train = np.asarray(train_images)
X_test = np.asarray(test_images)

# Step 2: collapse everything after the first axis so each image becomes one
# 1-D feature row (for 256x256 RGB images: 256*256*3 = 196608 values per row)
X_train_flat = X_train.reshape(len(X_train), -1)
X_test_flat = X_test.reshape(len(X_test), -1)

# Step 3: map the string soil-type labels to integer class ids; the fitted
# encoder is kept so predictions can be mapped back to names later
label_encoder = LabelEncoder().fit(train_labels)
y_train_encoded = label_encoder.transform(train_labels)

In [5]:
# Train-Validation Split
# 90% of the labelled images are used for fitting and 10% are held out for
# validation. Stratifying on the encoded labels keeps the class proportions
# the same in both parts; the fixed random_state makes the split repeatable.
split_options = {
    "test_size": 0.1,
    "random_state": 42,
    "stratify": y_train_encoded,
}
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_flat, y_train_encoded, **split_options
)

print(f"Train size: {len(X_train_split)}")
print(f"Validation size: {len(X_val_split)}")

Train size: 1099
Validation size: 123


In [6]:
# CLASSIFICATION USING RANDOM FOREST MODEL

# Build a 25-tree forest (seeded for repeatable results) and fit it on the
# training part of the split; fit() returns the fitted estimator itself
rf_model = RandomForestClassifier(n_estimators=25, random_state=42).fit(
    X_train_split, y_train_split
)

# Predict on the held-out validation split (not the competition test images)
y_pred_rf = rf_model.predict(X_val_split)

# Evaluate the validation predictions with each metric in turn
for heading, metric in (
    ("Random Forest Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Accuracy Score:", accuracy_score),
):
    print(heading, metric(y_val_split, y_pred_rf))


Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        53
           1       1.00      0.96      0.98        23
           2       1.00      1.00      1.00        20
           3       1.00      0.96      0.98        27

    accuracy                           0.98       123
   macro avg       0.99      0.98      0.99       123
weighted avg       0.98      0.98      0.98       123

Confusion Matrix:
 [[53  0  0  0]
 [ 1 22  0  0]
 [ 0  0 20  0]
 [ 1  0  0 26]]
Accuracy Score: 0.983739837398374


In [7]:
# PREDICTION ON NEW DATA (TEST DATA)
# Step 1: predict encoded class ids for the flattened test images
test_predictions_encoded = rf_model.predict(X_test_flat)

# Step 2: translate the class ids back to the original soil-type names
test_predictions_labels = label_encoder.inverse_transform(test_predictions_encoded)

# Step 3: pair every loaded test image id with its predicted soil type
submission_columns = {
    "image_id": test_image_ids,
    "soil_type": test_predictions_labels,
}
submission_df = pd.DataFrame(submission_columns)

# Show a preview of the submission table
print("\nSubmission DataFrame (first 5 rows):")
print(submission_df.head())

# Write the submission table to disk without the index column
submission_path = "submission.csv"
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission file created: {submission_path}")


Submission DataFrame (first 5 rows):
            image_id      soil_type
0  img_cdf80d6f.jpeg  Alluvial soil
1   img_c0142a80.jpg  Alluvial soil
2   img_91168fb0.jpg  Alluvial soil
3   img_9822190f.jpg  Alluvial soil
4  img_e5fc436c.jpeg  Alluvial soil

Submission file created: submission.csv
