In [4]:
import os
import sys

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_distances
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.layers import (
    Input,
    Conv2D,
    MaxPooling2D,
    GlobalAveragePooling2D,
    Dense,
    Flatten,
    Dropout,
)
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam

# Project-local modules live in ../src relative to the notebook's working
# directory, so that directory must be on sys.path before importing them.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../src")))
from preprocessing import load_metadata, load_images

# Dataset root, resolved against the current working directory (abspath
# anchors relative paths at os.getcwd() and normalises the "..").
BASE_DIR = os.path.abspath("../data")

# load_metadata yields the train and test metadata tables, in that order.
train_df, test_df = load_metadata(BASE_DIR)

# Load the images for each split, train first and then test.
# NOTE(review): the models below consume (128, 128, 3) inputs, so load_images
# presumably returns arrays of that per-image shape -- confirm in preprocessing.
train_images, test_images = map(load_images, (train_df, test_df))

# Feature extractor
# A small convolutional network used purely as a fixed embedding function: it
# is never compiled or trained, so its output depends entirely on the initial
# weights. The kernels are therefore seeded explicitly; without a seed every
# run produces different features, and hence different clusters and
# pseudo-labels, even though KMeans itself uses a fixed random_state.
# GlorotUniform is the Conv2D default initializer, so only the seeding changes.
# Each layer gets its own seed so that no initializer instance is shared.
FEATURE_SEED = 42

inp = Input(shape=(128, 128, 3))
x = Conv2D(
    32,
    (3, 3),
    activation="relu",
    kernel_initializer=GlorotUniform(seed=FEATURE_SEED),
)(inp)
x = MaxPooling2D()(x)
x = Conv2D(
    64,
    (3, 3),
    activation="relu",
    kernel_initializer=GlorotUniform(seed=FEATURE_SEED + 1),
)(x)
x = MaxPooling2D()(x)
x = Conv2D(
    128,
    (3, 3),
    activation="relu",
    kernel_initializer=GlorotUniform(seed=FEATURE_SEED + 2),
)(x)
# Average each of the 128 feature maps over space -> one 128-d vector per image.
x = GlobalAveragePooling2D()(x)
feature_model = Model(inputs=inp, outputs=x)

# Embed both splits with the same frozen network.
train_features = feature_model.predict(train_images, verbose=1)
test_features = feature_model.predict(test_images, verbose=1)

# KMeans clustering
# Partition the test embeddings into two groups. n_init is set explicitly
# because its default differs between scikit-learn releases; pinning it keeps
# the clustering the same regardless of the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
test_clusters = kmeans.fit_predict(test_features)

# Mean training embedding, kept 2-D (1, n_features) for cosine_distances.
# NOTE(review): the naming implies every training image is soil -- confirm.
soil_centroid = np.mean(train_features, axis=0, keepdims=True)

# Cosine distance from each cluster centre to the training centroid, computed
# in one call; the (2, 1) result is flattened into the two scalars.
d0, d1 = cosine_distances(kmeans.cluster_centers_, soil_centroid).ravel()

# The cluster closer to the training centroid is treated as "soil"; a tie (or
# a NaN distance) falls through to cluster 1, as in the original comparison.
soil_cluster = 0 if d0 < d1 else 1

# Pseudo-labels for the test set: 1 for the soil cluster, 0 otherwise.
pseudo_labels = (test_clusters == soil_cluster).astype(int)

# CNN Classifier
# Binary classifier trained on the test images against the KMeans-derived
# pseudo-labels (1 = soil cluster, 0 = other cluster).
# The input shape is declared with an explicit Input layer rather than an
# `input_shape=` argument on the first Conv2D: Keras 3 warns against passing
# `input_shape` to layers inside a Sequential model. The architecture itself
# is unchanged.
classifier = Sequential(
    [
        Input(shape=(128, 128, 3)),
        Conv2D(32, (3, 3), activation="relu"),
        MaxPooling2D(),
        Conv2D(64, (3, 3), activation="relu"),
        MaxPooling2D(),
        Flatten(),
        Dense(64, activation="relu"),
        Dropout(0.3),
        # Single sigmoid unit: probability that the image belongs to the soil cluster.
        Dense(1, activation="sigmoid"),
    ]
)
classifier.compile(optimizer=Adam(), loss="binary_crossentropy", metrics=["accuracy"])
# NOTE(review): there is no validation split or early stopping, so the reported
# accuracy only measures agreement with the pseudo-labels on the data the
# model was fitted to.
classifier.fit(test_images, pseudo_labels, epochs=100, batch_size=32, verbose=1)

# Save model
# The trained classifier is written into the dataset directory as an HDF5
# file, and the destination is echoed so it is visible in the notebook output.
_MODEL_FILENAME = "kmeans_cnn_model.h5"
model_path = os.path.join(BASE_DIR, _MODEL_FILENAME)
classifier.save(model_path)
print(f"✅ Model saved at {model_path}")

Loading images: 100%|██████████| 1222/1222 [00:02<00:00, 416.95it/s]
Loading images: 100%|██████████| 967/967 [00:00<00:00, 1983.12it/s]


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step
Epoch 1/100


  ret = a @ b
  ret = a @ b
  ret = a @ b
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.7594 - loss: 0.6201
Epoch 2/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 53ms/step - accuracy: 0.9415 - loss: 0.1381
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.9270 - loss: 0.1806
Epoch 4/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 60ms/step - accuracy: 0.9764 - loss: 0.0666
Epoch 5/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.9541 - loss: 0.0905
Epoch 6/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - accuracy: 0.9872 - loss: 0.0384
Epoch 7/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 0.9805 - loss: 0.0708
Epoch 8/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 0.9692 - loss: 0.0761
Epoch 9/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━



✅ Model saved at /Users/sagnikdey/Downloads/FINAL/challenge-2/data/kmeans_cnn_model.h5
