### Agri Sathi — AI Model Training Notebook
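### Run this entire notebook on Google Colab (a CPU runtime is fine for the soil model; use a GPU runtime for the pest model). Switching the runtime type resets the session, so re-run the install and download cells afterwards.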
### After training, download the model files and add them to the ml-service/weights/ folder

In [None]:
# CELL 1 — Install dependencies
!pip install tensorflow==2.13.0 scikit-learn xgboost kaggle Pillow numpy pandas matplotlib seaborn

In [None]:
# CELL 2 — Download datasets from Kaggle
# First upload your kaggle.json API key to Colab
from google.colab import files
files.upload()  # Upload kaggle.json

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Dataset 1: PlantVillage for pest detection
!kaggle datasets download -d abdallahalidev/plantvillage-dataset
!unzip -q plantvillage-dataset.zip -d plantvillage

# Dataset 2: Crop Recommendation
!kaggle datasets download -d atharvaingle/crop-recommendation-dataset
!unzip -q crop-recommendation-dataset.zip

In [None]:
# CELL 3 — SOIL MODEL (Train this first — it's fast, CPU-friendly, takes ~2 minutes)

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
# CELL 2 unzips the crop-recommendation archive straight into the working
# directory, so the CSV is read from there. A Windows-style 'model\...' path
# would be treated as one literal (non-existent) file name on Colab's Linux
# filesystem and also contains an invalid string escape.
df = pd.read_csv('Crop_recommendation.csv')
print(f"Dataset shape: {df.shape}")
print(f"Crops: {df['label'].unique()}")
print(f"Class distribution:\n{df['label'].value_counts()}")

# Input columns and the column to predict
FEATURES = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
TARGET = 'label'

X, y = df[FEATURES], df[TARGET]

# Turn the crop names into integer class ids
le = LabelEncoder()
y_encoded = le.fit_transform(y)
label_ids = le.transform(le.classes_)
label_mapping = dict(zip(le.classes_, label_ids))
print(f"\nLabel mapping: {label_mapping}")

# Hold out 20% of the rows for testing, keeping class proportions (stratified)
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y_encoded}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)
print(f"\nTrain size: {len(X_train)}, Test size: {len(X_test)}")

# Train XGBoost (primary model)
# Labels are already integer-encoded by LabelEncoder above, so the deprecated
# `use_label_encoder` argument is not passed (recent XGBoost releases only
# warn that it is unused).
print("\nTraining XGBoost...")
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    random_state=42
)
# No early stopping is configured, so eval_set only records the metric on the
# held-out split; it does not change which trees are kept.
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

# Train Random Forest (backup model)
print("Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Score both models on the held-out split
xgb_pred = xgb_model.predict(X_test)
rf_pred = rf_model.predict(X_test)

xgb_accuracy = accuracy_score(y_test, xgb_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"\nXGBoost Accuracy: {xgb_accuracy:.4f}")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

# Per-class precision / recall / F1 for the primary model
print("\nXGBoost Classification Report:")
xgb_report = classification_report(y_test, xgb_pred, target_names=le.classes_)
print(xgb_report)

# Rank the input columns by the importance XGBoost assigned to them
importance_table = pd.DataFrame(
    dict(feature=FEATURES, importance=xgb_model.feature_importances_)
)
importance_df = importance_table.sort_values('importance', ascending=False)
print(f"\nFeature Importances:\n{importance_df}")

# 5-fold cross-validation over the full dataset
cv_scores = cross_val_score(xgb_model, X, y_encoded, cv=5, scoring='accuracy')
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_mean:.4f} (+/- {cv_spread:.4f})")

# Bundle everything the service needs to serve the soil model
holdout_accuracy = float(accuracy_score(y_test, xgb_pred))

soil_artifacts = dict(
    model=xgb_model,
    label_encoder=le,
    features=FEATURES,
    accuracy=holdout_accuracy,
    n_classes=len(le.classes_),
    class_names=list(le.classes_),
)

with open('soil_model.pkl', 'wb') as pkl_file:
    pickle.dump(soil_artifacts, pkl_file)

# Plain-JSON companion file (class names, feature order, accuracy) for the API
soil_summary = {
    'classes': list(le.classes_),
    'features': FEATURES,
    'accuracy': holdout_accuracy,
}
with open('soil_classes.json', 'w') as json_file:
    json_file.write(json.dumps(soil_summary, indent=2))

print("\n✅ Soil model saved as soil_model.pkl")
print(f"✅ soil_classes.json saved with {len(le.classes_)} crop classes")

: 

In [None]:
# CELL 4 — PEST DETECTION MODEL (Requires GPU runtime for reasonable speed)
# Runtime → Change runtime type → GPU before running this cell

import json
import os

import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

# Dataset path — adjust if PlantVillage extracted to different folder
DATASET_PATH = 'plantvillage/plantvillage dataset/color'
IMG_SIZE = (224, 224)
BATCH_SIZE = 32

# Every sub-directory of DATASET_PATH is counted as one class
classes = []
for entry in os.listdir(DATASET_PATH):
    if os.path.isdir(os.path.join(DATASET_PATH, entry)):
        classes.append(entry)
NUM_CLASSES = len(classes)

first_ten_classes = sorted(classes)[:10]
print(f"\nFound {NUM_CLASSES} disease classes")
print(f"Classes: {first_ten_classes}... (showing first 10)")

# Data generators
# Training images are augmented. Validation images are only rescaled, so the
# val_accuracy / val_loss values that drive checkpointing, early stopping and
# LR reduction are measured on unmodified images rather than on random
# augmentations.
# NOTE(review): rescale=1/255 feeds inputs in [0, 1]; confirm the inference
# service applies the same scaling to uploaded images.
VALIDATION_SPLIT = 0.2

train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=25,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=False,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest',
    validation_split=VALIDATION_SPLIT
)

# Same rescale and split fraction as above, but no augmentation.
# flow_from_directory takes the split by position in each class's sorted file
# list, so using the same validation_split on both generators keeps the
# training and validation subsets disjoint.
val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT
)

train_generator = train_datagen.flow_from_directory(
    DATASET_PATH,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='training',
    shuffle=True,
    seed=42
)

val_generator = val_datagen.flow_from_directory(
    DATASET_PATH,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='validation',
    shuffle=False,  # fixed order so evaluation is repeatable
    seed=42
)

print(f"\nTraining samples: {train_generator.samples}")
print(f"Validation samples: {val_generator.samples}")
print(f"Classes found: {len(train_generator.class_indices)}")

# Save class indices mapping (critical for inference)
class_indices = train_generator.class_indices
idx_to_class = {v: k for k, v in class_indices.items()}

# The softmax layer below is sized with NUM_CLASSES, so the label map written
# here must describe exactly that many classes.
assert len(class_indices) == NUM_CLASSES, (
    f"Generator found {len(class_indices)} classes but NUM_CLASSES is {NUM_CLASSES}"
)

# JSON object keys are always strings: the integer keys of idx_to_class are
# written as "0", "1", ... and must be converted back by whoever loads the file.
with open('pest_class_labels.json', 'w') as f:
    json.dump({
        'class_indices': class_indices,
        'idx_to_class': idx_to_class,
        'num_classes': NUM_CLASSES
    }, f, indent=2)

print(f"\n✅ pest_class_labels.json saved")

# Assemble the network: ImageNet-pretrained MobileNetV2 backbone + new classifier head
print("\nBuilding MobileNetV2 model...")
base_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3)
)
base_model.trainable = False  # Freeze base initially

# Head layers, applied in order on top of the backbone's feature map
head_layers = [
    GlobalAveragePooling2D(),
    BatchNormalization(),
    Dense(256, activation='relu'),
    Dropout(0.4),
    Dense(128, activation='relu'),
    Dropout(0.2),
]
x = base_model.output
for head_layer in head_layers:
    x = head_layer(x)
predictions = Dense(NUM_CLASSES, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)

top3_metric = tf.keras.metrics.TopKCategoricalAccuracy(k=3, name='top_3_accuracy')
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy', top3_metric]
)

model.summary()

# Callbacks: keep the best checkpoint, stop when val_accuracy stalls,
# and halve the learning rate when val_loss plateaus
checkpoint_cb = ModelCheckpoint(
    'pest_model_best.h5', monitor='val_accuracy', save_best_only=True, verbose=1
)
early_stop_cb = EarlyStopping(
    monitor='val_accuracy', patience=5, restore_best_weights=True, verbose=1
)
reduce_lr_cb = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1
)
callbacks = [checkpoint_cb, early_stop_cb, reduce_lr_cb]

# Phase 1: only the new head is trainable; the MobileNetV2 base stays frozen
print("\n=== PHASE 1: Training classification head (frozen base) ===")
history_phase1 = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=15,
    callbacks=callbacks,
    verbose=1
)

best_phase1_val_accuracy = max(history_phase1.history['val_accuracy'])
print(f"\nPhase 1 best validation accuracy: {best_phase1_val_accuracy:.4f}")

# Phase 2: unfreeze the base, then re-freeze everything except its last 30 layers
print("\n=== PHASE 2: Fine-tuning top 30 layers of MobileNetV2 ===")
base_model.trainable = True
layers_kept_frozen = base_model.layers[:-30]
for frozen_layer in layers_kept_frozen:
    frozen_layer.trainable = False

trainable_count = len([layer for layer in model.layers if layer.trainable])
print(f"Trainable layers: {trainable_count}")

# Recompile so the new trainable flags take effect
fine_tune_optimizer = Adam(learning_rate=1e-5)  # Much lower LR for fine-tuning
fine_tune_top3 = tf.keras.metrics.TopKCategoricalAccuracy(k=3, name='top_3_accuracy')
model.compile(
    loss='categorical_crossentropy',
    optimizer=fine_tune_optimizer,
    metrics=['accuracy', fine_tune_top3]
)

history_phase2 = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=10,
    callbacks=callbacks,
    verbose=1
)

# Score the trained model on the validation generator
print("\n=== Final Evaluation ===")
# evaluate() returns the loss followed by the compiled metrics, in order
evaluation_results = model.evaluate(val_generator, verbose=0)
val_loss, val_accuracy, val_top3 = evaluation_results
print(f"Validation Accuracy: {val_accuracy:.4f} ({val_accuracy*100:.2f}%)")
print(f"Top-3 Accuracy: {val_top3:.4f}")

# Persist the model in its end-of-training state
model.save('pest_model.h5')
print("\n✅ pest_model.h5 saved")

# Write a small JSON description of the trained model next to it
metadata = dict(
    model_type='MobileNetV2',
    input_size=224,
    num_classes=NUM_CLASSES,
    val_accuracy=float(val_accuracy),
    val_top3_accuracy=float(val_top3),
    training_date=pd.Timestamp.now().isoformat(),
)
with open('pest_model_metadata.json', 'w') as metadata_file:
    metadata_file.write(json.dumps(metadata, indent=2))

In [None]:
# CELL 5 — Download all model files
from google.colab import files

# Artifacts produced by CELL 3 and CELL 4, downloaded in this order
MODEL_FILES = [
    'pest_model.h5',             # ~14MB
    'soil_model.pkl',            # ~2MB
    'pest_class_labels.json',
    'soil_classes.json',
    'pest_model_metadata.json',
]
for model_file in MODEL_FILES:
    files.download(model_file)

print("\n✅ All model files downloaded. Place them in ml-service/weights/ folder.")