# CEA System - ML Model Training with Visualization

This notebook trains a Random Forest model for the CEA hydroponic system.

## **Rule-based Logic**

The training data uses a **priority system** to prevent conflicting actions:

1. **Priority 1:** Critical water level (< 1.2) → Refill ONLY
2. **Priority 2:** High PPM (> 840) → Dilute (if water < 2.5)
3. **Priority 3:** pH out of range → Adjust pH
4. **Priority 4:** Low PPM (< 560) → Add nutrient
5. **Priority 5:** Micro-adjustments for fine-tuning

This ensures the ML model learns **chemically efficient** actions without conflicts.

In [None]:
# 1. Install Dependencies
# Notebook shell escape (the leading "!" is IPython/Colab syntax, not Python):
# installs the libraries imported by the cells below; -q keeps pip output quiet.
!pip install pandas numpy scikit-learn joblib matplotlib seaborn -q

In [None]:
# 2. Upload BOTH CSV Files
from google.colab import files
import os

print("Upload training_telemetry.csv and training_actuator_event.csv")
print("(Select BOTH files when the upload dialog appears)\n")

# Blocks until the Colab upload dialog is completed; the result is keyed by
# the name of each uploaded file.
uploaded = files.upload()

# Classify every upload by a case-insensitive substring of its name.
# "telemetry" is tested first, and a later match replaces an earlier one.
telemetry_file = None
actuator_file = None

for uploaded_name in uploaded:
    lowered = uploaded_name.lower()
    if 'telemetry' in lowered:
        telemetry_file = uploaded_name
    elif 'actuator' in lowered:
        actuator_file = uploaded_name

# Both inputs are required by the merge step, so stop here if either is absent.
if not (telemetry_file and actuator_file):
    raise ValueError("Please upload BOTH files: training_telemetry.csv and training_actuator_event.csv")

print(f"✅ Telemetry: {telemetry_file}")
print(f"✅ Actuator:  {actuator_file}")

In [None]:
# 3. Load and Merge Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plotting style (applies to every figure drawn in later cells)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("Loading datasets...")
telemetry_df = pd.read_csv(telemetry_file)
actuator_df = pd.read_csv(actuator_file)

print(f"  Telemetry rows: {len(telemetry_df):,}")
print(f"  Actuator rows:  {len(actuator_df):,}")

# Inner-join telemetry readings with actuator events on (deviceId, ingestTime).
# Any other column present in both files (presumably a per-table `id` - verify
# against the CSV headers) is NOT part of the key; it is kept twice and
# disambiguated with the _telemetry / _actuator suffixes.
df = pd.merge(
    telemetry_df,
    actuator_df,
    on=['deviceId', 'ingestTime'],
    how='inner',
    suffixes=('_telemetry', '_actuator')
)

# Fail fast: an inner join with no matching keys yields an empty frame, which
# would otherwise only surface later as an opaque error when splitting the data.
if df.empty:
    raise ValueError(
        "Merged dataset is empty: no rows share the same deviceId and "
        "ingestTime in the telemetry and actuator files."
    )

print(f"\n✅ Merged dataset: {len(df):,} rows")
print(f"\nColumns: {list(df.columns)}")

In [None]:
# 4. Data Analysis & Visualization
print("Dataset Statistics:\n")
print(df.describe())

# Boolean frame: True where an actuator fired (value > 0) for that event.
actuator_cols = ['phUp', 'phDown', 'nutrientAdd', 'refill']
active = df[actuator_cols] > 0

# Action distribution: an event counts as an "action" when any actuator fired.
total = len(df)
actions = active.any(axis=1)
action_count = actions.sum()

print(f"\n Action Distribution:")
print(f"  Total events: {total:,}")
print(f"  Action events: {action_count:,} ({action_count/total*100:.1f}%)")
print(f"  No action: {total-action_count:,} ({(total-action_count)/total*100:.1f}%)")

# Check for conflicting actions (should be ZERO with priority system).
# A conflict is any event in which more than one actuator fired. Counting the
# active actuators per row covers every pair, including phUp together with
# phDown, which a hand-written list of pairs can easily leave out.
conflicts = (active.sum(axis=1) > 1).sum()

print(f"\n Conflicting Actions: {conflicts} ({conflicts/total*100:.2f}%)")
if conflicts == 0:
    print("✅ No conflicts! Priority system working correctly.")
else:
    print(f"WARNING: {conflicts} events have conflicting actions!")

# Visualize action distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Pie chart: share of events with at least one action vs. none
axes[0].pie([action_count, total-action_count],
           labels=['Action', 'No Action'],
           autopct='%1.1f%%',
           colors=['#ff6b6b', '#51cf66'])
axes[0].set_title('Action vs No-Action Events', fontsize=14, fontweight='bold')

# Bar chart: how many events activated each actuator
actuator_counts = {name: active[name].sum() for name in actuator_cols}
axes[1].bar(list(actuator_counts.keys()), list(actuator_counts.values()),
            color=['#4dabf7', '#ff8787', '#fcc419', '#51cf66'])
axes[1].set_title('Actuator Activation Frequency', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Count')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Feature distributions: one histogram per sensor feature on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
features = ['ppm', 'ph', 'tempC', 'humidity', 'waterLevel', 'waterTemp']

for ax, feature in zip(axes.ravel(), features):
    ax.hist(df[feature], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'{feature} Distribution', fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# 5. Prepare Training Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Sensor readings used as model inputs and actuator columns used as outputs.
FEATURES = ['ppm', 'ph', 'tempC', 'humidity', 'waterTemp', 'waterLevel']
TARGETS = ['phUp', 'phDown', 'nutrientAdd', 'refill']

X = df[FEATURES].copy()
y = df[TARGETS].copy()

# Handle missing values: carry the previous row's reading forward, then fall
# back to 0.0 for gaps at the very start. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill') spelling and gives the same result.
X = X.ffill().fillna(0.0)
# A missing actuator value is treated as "no action".
y = y.fillna(0)

# Split data (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features: fit on the training split only, then apply to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"✅ Training set: {len(X_train):,} samples")
print(f"✅ Test set:     {len(X_test):,} samples")

In [None]:
# 6. Train Model
import datetime
import json

import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor

print("Training Random Forest Model...\n")

# Hyperparameters of the forest that is cloned once per target column.
rf_params = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'n_jobs': -1,
    'random_state': 42,
    'verbose': 0,
}
base_model = RandomForestRegressor(**rf_params)

# MultiOutputRegressor fits one independent copy of base_model per target.
model = MultiOutputRegressor(base_model)

# Fit on the scaled training features against all actuator targets.
model.fit(X_train_scaled, y_train)

print("\n✅ Training complete!")

In [None]:
# 7. Evaluate Model
print("Evaluating model...\n")

# Predictions for both splits; columns follow the order of TARGETS.
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Per-target MAE / RMSE / R² on the train and test splits.
metrics = {}
for col_idx, target in enumerate(TARGETS):
    actual_train = y_train.iloc[:, col_idx]
    actual_test = y_test.iloc[:, col_idx]
    fitted_train = y_train_pred[:, col_idx]
    fitted_test = y_test_pred[:, col_idx]

    scores = {
        'train_mae': mean_absolute_error(actual_train, fitted_train),
        'test_mae': mean_absolute_error(actual_test, fitted_test),
        'train_rmse': np.sqrt(mean_squared_error(actual_train, fitted_train)),
        'test_rmse': np.sqrt(mean_squared_error(actual_test, fitted_test)),
        'train_r2': r2_score(actual_train, fitted_train),
        'test_r2': r2_score(actual_test, fitted_test),
    }
    metrics[target] = scores

    print(f"   {target.upper()}:")
    print(f"   Train MAE:  {scores['train_mae']:.3f}  |  Test MAE:  {scores['test_mae']:.3f}")
    print(f"   Train RMSE: {scores['train_rmse']:.3f}  |  Test RMSE: {scores['test_rmse']:.3f}")
    print(f"   Train R²:   {scores['train_r2']:.3f}  |  Test R²:   {scores['test_r2']:.3f}")
    print()

# Aggregate test error computed over all target columns at once.
overall_test_mae = mean_absolute_error(y_test, y_test_pred)
overall_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"   Overall Test Performance:")
print(f"   MAE:  {overall_test_mae:.3f}")
print(f"   RMSE: {overall_test_rmse:.3f}")

In [None]:
# 8. Visualize Model Performance

# Metrics comparison: MAE, R² and feature importance side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

targets_list = list(TARGETS)
train_maes = [metrics[t]['train_mae'] for t in targets_list]
test_maes = [metrics[t]['test_mae'] for t in targets_list]
train_r2s = [metrics[t]['train_r2'] for t in targets_list]
test_r2s = [metrics[t]['test_r2'] for t in targets_list]

# MAE comparison (grouped bars: train on the left, test on the right)
x = np.arange(len(targets_list))
width = 0.35
axes[0].bar(x - width/2, train_maes, width, label='Train', color='skyblue')
axes[0].bar(x + width/2, test_maes, width, label='Test', color='salmon')
axes[0].set_ylabel('MAE')
axes[0].set_title('Mean Absolute Error', fontweight='bold')
axes[0].set_xticks(x)
axes[0].set_xticklabels(targets_list, rotation=45)
axes[0].legend()
axes[0].grid(axis='y', alpha=0.3)

# R² comparison
axes[1].bar(x - width/2, train_r2s, width, label='Train', color='lightgreen')
axes[1].bar(x + width/2, test_r2s, width, label='Test', color='orange')
# Draw the reference line BEFORE building the legend; a labelled artist added
# after legend() is called would not be listed in it.
axes[1].axhline(y=0.8, color='red', linestyle='--', alpha=0.5, label='Good (0.8)')
axes[1].set_ylabel('R² Score')
axes[1].set_title('R² Score (Higher is Better)', fontweight='bold')
axes[1].set_xticks(x)
axes[1].set_xticklabels(targets_list, rotation=45)
axes[1].legend()
axes[1].grid(axis='y', alpha=0.3)

# Feature importance (average across all targets): the fitted multi-output
# wrapper exposes one estimator per target, so their importances are averaged.
importances = np.mean([est.feature_importances_ for est in model.estimators_], axis=0)
axes[2].barh(FEATURES, importances, color='mediumpurple')
axes[2].set_xlabel('Importance')
axes[2].set_title('Feature Importance', fontweight='bold')
axes[2].grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

# Prediction vs Actual for each target (2x2 grid, one panel per actuator)
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes = axes.ravel()

for i, target in enumerate(TARGETS):
    actual = y_test.iloc[:, i]
    # Diagonal spanning the observed range marks where predicted == actual.
    value_range = [actual.min(), actual.max()]

    axes[i].scatter(actual, y_test_pred[:, i], alpha=0.3, s=10)
    axes[i].plot(value_range, value_range, 'r--', lw=2, label='Perfect Prediction')
    axes[i].set_xlabel('Actual')
    axes[i].set_ylabel('Predicted')
    axes[i].set_title(f'{target.upper()} - Predicted vs Actual\nR² = {metrics[target]["test_r2"]:.3f}',
                     fontweight='bold')
    axes[i].legend()
    axes[i].grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# 9. Save and Download Model
import shutil

# Version label is a UTC timestamp. datetime.now(timezone.utc) replaces the
# deprecated datetime.utcnow() and formats to the same string.
ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
version = "v" + ts
output_dir = version
os.makedirs(output_dir, exist_ok=True)

# Save model and scaler (the scaler must be applied to inputs at inference
# time, because the model was fitted on scaled features)
joblib.dump(model, os.path.join(output_dir, "model.pkl"))
joblib.dump(scaler, os.path.join(output_dir, "scaler.pkl"))

# Save metadata; numpy scalars are cast to plain float/int so json can
# serialize them.
metadata = {
    "version": version,
    "timestamp": ts,
    "training_samples": len(X_train),
    "test_samples": len(X_test),
    "features": FEATURES,
    "targets": TARGETS,
    "metrics": {
        target: {
            "test_mae": float(metrics[target]['test_mae']),
            "test_rmse": float(metrics[target]['test_rmse']),
            "test_r2": float(metrics[target]['test_r2'])
        } for target in TARGETS
    },
    "overall_test_mae": float(overall_test_mae),
    "overall_test_rmse": float(overall_test_rmse),
    "logic_version": "priority_based_v1",
    "conflicting_actions": int(conflicts)
}

with open(os.path.join(output_dir, "metadata.json"), "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

print(f"Model saved to: {output_dir}/")
print(f"\nFiles created:")
print(f"  - model.pkl")
print(f"  - scaler.pkl")
print(f"  - metadata.json")

# Zip and download. The archive is built with the standard library instead of
# shelling out to `zip`; it keeps the version directory as the top-level entry.
shutil.make_archive(version, "zip", root_dir=".", base_dir=version)
print(f"\n Downloading {version}.zip...")
files.download(f"{version}.zip")
print("\n Done! Extract the zip and copy to services/ml/model_registry/")