In [None]:
import os
import sys
import json
import numpy as np
from tensorflow.keras.models import load_model
from sklearn.metrics import f1_score
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_distances
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.models import Model

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../src")))
from preprocessing import load_metadata, load_images

# ✅ Paths
# Both directories are resolved relative to the current working directory, so
# this cell expects to be launched from a folder that sits next to data/ and docs/.
_PARENT_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
BASE_DIR = os.path.join(_PARENT_DIR, "data")
DOCS_DIR = os.path.join(_PARENT_DIR, "docs", "cards")
os.makedirs(DOCS_DIR, exist_ok=True)  # create the output folder if it is missing

# ✅ Load metadata and images
# load_metadata / load_images come from the project's preprocessing module;
# the train split is loaded first, then the test split.
train_df, test_df = load_metadata(BASE_DIR)
train_images, test_images = (load_images(frame) for frame in (train_df, test_df))

# ✅ Feature extractor (same as training)
# Three 3x3 conv stages (32 -> 64 -> 128 filters); the first two are each
# followed by max pooling, and global average pooling collapses the last
# feature map into one 128-value vector per image.
# NOTE(review): no weights are loaded into this extractor in this cell, so it
# runs with Keras' default layer initialisation — confirm this matches how the
# features were produced at training time.
inp = Input(shape=(128, 128, 3))
x = inp
for n_filters in (32, 64):
    x = Conv2D(n_filters, (3, 3), activation="relu")(x)
    x = MaxPooling2D()(x)
x = Conv2D(128, (3, 3), activation="relu")(x)
x = GlobalAveragePooling2D()(x)
feature_model = Model(inputs=inp, outputs=x)

# Embed both splits with the same extractor (train first, then test).
train_features = feature_model.predict(train_images, verbose=1)
test_features = feature_model.predict(test_images, verbose=1)

# ✅ KMeans clustering to generate pseudo-ground-truth
# The test features are split into two clusters. The cluster whose centre has
# the smaller cosine distance to the mean training feature vector (used here as
# the "soil" reference) is labelled 1; the other cluster is labelled 0.
# n_init is pinned explicitly: scikit-learn's default changed between releases
# (10 -> "auto"), so relying on it makes the clustering version-dependent.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
test_clusters = kmeans.fit_predict(test_features)
soil_centroid = np.mean(train_features, axis=0, keepdims=True)
# A single call returns a (2, 1) matrix: one distance per cluster centre.
d0, d1 = cosine_distances(kmeans.cluster_centers_, soil_centroid)[:, 0]
soil_cluster = 0 if d0 < d1 else 1  # a tie resolves to cluster 1
# Vectorised mapping: 1 where the sample falls in the soil cluster, else 0.
pseudo_labels = (test_clusters == soil_cluster).astype(int)
print("\n🔎 Pseudo-ground-truth labels generated.")

# ✅ Load trained model
# The saved model is expected inside the data directory.
model_path = os.path.join(BASE_DIR, "kmeans_cnn_model.h5")
model = load_model(model_path)

# ✅ Predict with model
# Threshold the raw model outputs at 0.5, then flatten to a 1-D vector of 0/1.
raw_scores = model.predict(test_images)
above_threshold = raw_scores > 0.5
preds = np.reshape(above_threshold.astype(int), -1)

# ✅ Compute F1 scores
# Each class is scored in turn as the positive label (1 first, then 0),
# comparing the model predictions against the KMeans-derived pseudo labels.
f1_soil, f1_not_soil = (
    f1_score(pseudo_labels, preds, pos_label=positive_class)
    for positive_class in (1, 0)
)
f1_summary = f"\n✅ F1 Scores (pseudo-ground-truth): soil={f1_soil:.4f}, not_soil={f1_not_soil:.4f}"
print(f1_summary)

# ✅ Save metrics
# Assemble the metrics card. The scores are rounded to 4 decimals and cast to
# built-in float so the payload only contains JSON-native types.
ml_metrics = {
    "_comment": "This JSON file contains the ml-metrics for Challenge-2 (using pseudo-ground-truth)",
    "Name": "Sagnik Dey",
    "Kaggle Username": "sagnikd7",
    "Team Name": "NA (Individual)",
    "f1 scores": {
        "_comment": "Here are the class-wise F1 scores for binary classification (1=Soil, 0=Not Soil)",
        "soil": float(round(f1_soil, 4)),
        "not_soil": float(round(f1_not_soil, 4)),
    },
}

ml_metric_path = os.path.join(DOCS_DIR, "ml-metric.json")
# Explicit encoding keeps the written file independent of the platform locale.
with open(ml_metric_path, "w", encoding="utf-8") as f:
    json.dump(ml_metrics, f, indent=4)
print(f"\n📊 Metrics saved to {ml_metric_path}")

Loading images: 100%|██████████| 1222/1222 [00:02<00:00, 421.59it/s]
Loading images: 100%|██████████| 967/967 [00:00<00:00, 1906.17it/s]


39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step
31/31 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step

🔎 Pseudo-ground-truth labels generated.


  ret = a @ b
  ret = a @ b
  ret = a @ b


31/31 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step

✅ F1 Scores (pseudo-ground-truth): soil=0.9581, not_soil=0.9592

📊 Metrics saved to /Users/sagnikdey/Downloads/FINAL/challenge-2/docs/cards/ml-metric.json
