# CEA System - Model Training

This notebook trains the Random Forest model for the CEA system using data exported from your local environment.

### Steps:
1. Upload `dataset_pairs.csv`.
2. Run all cells to train the model.
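3. Download the trained model archive (`v<timestamp>.zip`, containing `model.pkl`, `scaler.pkl`, and `metadata.json`) when the final cell offers it.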

In [None]:
# 1. Install Dependencies
!pip install pandas numpy scikit-learn joblib

In [None]:
# 2. Upload Data
from google.colab import files
import os

# Opens Colab's file picker; the returned dict is keyed by uploaded file name.
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the dialog was cancelled). Fail with a clear
    # message instead of the bare StopIteration that next(iter({})) would raise.
    raise RuntimeError("No file was uploaded. Re-run this cell and select dataset_pairs.csv.")

# Only the first uploaded file is used as the training dataset.
filename = next(iter(uploaded))
print(f"Uploaded: {filename}")

In [None]:
# 3. Preprocessing Logic (from services/ml/preprocessing.py)
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Telemetry columns used as model inputs; this order is the column order of X.
TELEMETRY_FEATURES = ["ppm", "ph", "tempC", "humidity", "waterTemp", "waterLevel"]
# Columns the model is trained to predict; this order is the column order of y.
TARGETS = ["phUp", "phDown", "nutrientAdd", "refill"]

def prepare_xy(df):
    """Split a raw dataset frame into a feature frame and a target frame.

    Args:
        df: DataFrame containing every column named in ``TELEMETRY_FEATURES``
            and ``TARGETS``. It is not modified.

    Returns:
        ``(X, y)``: new frames holding the feature and target columns with no
        missing values. Feature gaps are forward-filled from the previous row,
        and gaps that remain (leading rows) become 0.0; target gaps become 0.

    Raises:
        ValueError: if any required column is absent from ``df``.
    """
    # Ensure required columns exist
    required = TELEMETRY_FEATURES + TARGETS
    missing_cols = [c for c in required if c not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in CSV: {missing_cols}")

    # DataFrame.ffill() is the supported spelling of fillna(method="ffill"),
    # which has been deprecated since pandas 2.1.
    X = df[TELEMETRY_FEATURES].ffill().fillna(0.0)
    y = df[TARGETS].fillna(0)
    return X, y

def split_and_scale(X, y, test_size=0.2, random_state=42):
    """Hold out a validation split and standardise the features.

    Args:
        X: feature table passed to ``train_test_split``.
        y: target table, split row-for-row alongside ``X``.
        test_size: forwarded to ``train_test_split`` (default 0.2).
        random_state: seed forwarded to ``train_test_split``.

    Returns:
        ``(X_train_scaled, X_val_scaled, y_train, y_val, scaler)`` where the
        scaled values are the outputs of ``scaler.transform`` and ``scaler`` is
        the fitted ``StandardScaler``.
    """
    X_tr, X_va, y_tr, y_va = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Fit on the training rows only; the validation rows are merely transformed.
    fitted = StandardScaler().fit(X_tr)
    return fitted.transform(X_tr), fitted.transform(X_va), y_tr, y_va, fitted

In [None]:
# 4. Training Logic (from services/ml/trainer.py)
import datetime
import json
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

def train_model(csv_path, n_estimators=100, random_state=42):
    """Train a multi-output random forest on the dataset stored at ``csv_path``.

    Args:
        csv_path: path of the CSV file read with ``pd.read_csv``.
        n_estimators: number of trees in each ``RandomForestRegressor``
            (default 100, the previously hard-coded value).
        random_state: seed used for both the train/validation split and the
            forests (default 42, the previously hard-coded value).

    Returns:
        ``(model, scaler, maes, rmses, feature_names, target_names)``: the
        fitted ``MultiOutputRegressor``, the fitted scaler, per-target MAE and
        RMSE arrays computed on the validation split, and the feature / target
        column names as lists.

    Raises:
        RuntimeError: if the CSV contains no rows.
    """
    print("Loading dataset...")
    df = pd.read_csv(csv_path)

    if df.empty:
        raise RuntimeError("Dataset is empty.")

    X, y = prepare_xy(df)
    X_train_s, X_val_s, y_train, y_val, scaler = split_and_scale(
        X, y, random_state=random_state
    )

    print("Training RandomForest...")
    base = RandomForestRegressor(
        n_estimators=n_estimators, n_jobs=-1, random_state=random_state
    )
    # MultiOutputRegressor fits one copy of `base` per target column.
    model = MultiOutputRegressor(base)
    model.fit(X_train_s, y_train)

    # Evaluation: 'raw_values' keeps one score per target instead of averaging.
    y_pred = model.predict(X_val_s)
    maes = mean_absolute_error(y_val, y_pred, multioutput='raw_values')
    rmses = np.sqrt(mean_squared_error(y_val, y_pred, multioutput='raw_values'))

    print("Training complete.")
    print(f"MAE: {maes}")
    print(f"RMSE: {rmses}")

    return model, scaler, maes, rmses, list(X.columns), list(y.columns)

In [None]:
# 5. Run Training
# Train on the uploaded CSV and keep every returned artefact for the save step.
(
    model,
    scaler,
    maes,
    rmses,
    feature_names,
    target_names,
) = train_model(filename)

In [None]:
# 6. Save and Download Model
import shutil

# Timezone-aware replacement for the deprecated datetime.utcnow(); the
# formatted UTC string has the same shape as before (e.g. 20240131T120000Z).
ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
version = "v" + ts
output_dir = version
os.makedirs(output_dir, exist_ok=True)

# Persist the fitted model and scaler side by side in the version directory.
joblib.dump(model, os.path.join(output_dir, "model.pkl"))
joblib.dump(scaler, os.path.join(output_dir, "scaler.pkl"))

# The metric arrays are converted to plain lists so they are JSON-serialisable.
metadata = {
    "version": version,
    "timestamp": ts,
    "mae": maes.tolist(),
    "rmse": rmses.tolist(),
    "features": feature_names,
    "targets": target_names,
}
with open(os.path.join(output_dir, "metadata.json"), "w") as f:
    json.dump(metadata, f, indent=2)

# Zip and download: writes <version>.zip in the working directory with the
# version directory as its top-level entry, without shelling out to `zip`.
shutil.make_archive(version, "zip", root_dir=os.curdir, base_dir=output_dir)
files.download(f"{version}.zip")