# IoT Hydroponics - Data Exploration & ML Getting Started

This notebook demonstrates how to:
1. Connect to MongoDB and fetch telemetry data
2. Explore and visualize the data
3. Build a simple growth prediction model
4. Detect anomalies in sensor readings

## 1. Setup & Imports

In [None]:
import sys
sys.path.insert(0, '/app/src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# ML imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.metrics import mean_squared_error, r2_score

# Our custom connectors
from data.mongodb_connector import MongoDBConnector

# Visualization settings
# The 'seaborn-v0_8-*' style names only exist in matplotlib >= 3.6. On older
# installs fall back to seaborn's own darkgrid theme instead of raising.
if 'seaborn-v0_8-darkgrid' in plt.style.available:
    plt.style.use('seaborn-v0_8-darkgrid')
else:
    sns.set_theme(style='darkgrid')

# Do not truncate columns when displaying wide telemetry frames
pd.set_option('display.max_columns', None)

print("Imports loaded successfully!")

## 2. Connect to MongoDB & Fetch Data

In [None]:
# Create the project's MongoDB connector (kept around for the rest of the notebook)
connector = MongoDBConnector()

# Report, per collection, how many records exist and the time span they cover
stats = connector.get_collection_stats()
print("Available data:")
for name, summary in stats.items():
    record_count = summary['count']
    print(f"  {name}: {record_count} records")
    if record_count > 0:
        # Only non-empty collections have a meaningful oldest/newest pair to show
        print(f"    From: {summary['oldest']} to {summary['newest']}")

In [None]:
# Load tower telemetry for the last 7 days (168 hours)
tower_df = connector.get_tower_telemetry(hours=168)

if tower_df.empty:
    print("No tower telemetry data found. Make sure your system is collecting data.")
else:
    # Quick summary: size, schema, and the first few rows
    print(f"Tower telemetry: {len(tower_df)} records")
    print(f"Columns: {list(tower_df.columns)}")
    display(tower_df.head())

In [None]:
# Load reservoir telemetry, requesting 168 hours of data
reservoir_df = connector.get_reservoir_telemetry(hours=168)

if reservoir_df.empty:
    print("No reservoir telemetry data found.")
else:
    # Quick summary: size, schema, and the first few rows
    print(f"Reservoir telemetry: {len(reservoir_df)} records")
    print(f"Columns: {list(reservoir_df.columns)}")
    display(reservoir_df.head())

In [None]:
# Load plant height measurements; these feed the growth model in section 4
height_df = connector.get_height_measurements()

if height_df.empty:
    print("No height measurement data found.")
else:
    print(f"Height measurements: {len(height_df)} records")
    display(height_df.head())

## 3. Data Visualization

In [None]:
# Time-series overview of tower environmental conditions, one panel per sensor.
# Skipped entirely when there is no tower telemetry.
if tower_df.empty:
    print("No data to visualize. Generate some test data or wait for real telemetry.")
else:
    fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)

    # Temperature, with dashed guide lines at 20 °C and 25 °C
    if 'air_temp_c' in tower_df.columns:
        axes[0].plot(tower_df['timestamp'], tower_df['air_temp_c'], 'b-', alpha=0.7)
        axes[0].set_ylabel('Temperature (°C)')
        axes[0].axhline(y=25, color='r', linestyle='--', label='Optimal range')
        axes[0].axhline(y=20, color='r', linestyle='--')
        # A label is only rendered once legend() is called
        axes[0].legend(loc='upper right')

    # Humidity
    if 'humidity_pct' in tower_df.columns:
        axes[1].plot(tower_df['timestamp'], tower_df['humidity_pct'], 'g-', alpha=0.7)
        axes[1].set_ylabel('Humidity (%)')

    # Light
    if 'light_lux' in tower_df.columns:
        axes[2].plot(tower_df['timestamp'], tower_df['light_lux'], 'orange', alpha=0.7)
        axes[2].set_ylabel('Light (lux)')

    # The title and the shared x-label describe the whole figure, so set them
    # regardless of which sensor columns happen to be present.
    axes[0].set_title('Environmental Conditions Over Time')
    axes[-1].set_xlabel('Time')

    plt.tight_layout()
    plt.show()

In [None]:
# Reservoir water quality: 2x2 grid of pH, EC, water temperature and water level.
# Each panel is drawn only if its column is present in reservoir_df.
if reservoir_df.empty:
    print("No reservoir data to visualize.")
else:
    fig, axes = plt.subplots(2, 2, figsize=(14, 8))

    # pH over time, with the 5.5-6.5 band shaded
    if 'ph' in reservoir_df.columns:
        axes[0, 0].plot(reservoir_df['timestamp'], reservoir_df['ph'], 'b-')
        axes[0, 0].axhline(y=6.5, color='r', linestyle='--', label='Max optimal')
        axes[0, 0].axhline(y=5.5, color='r', linestyle='--', label='Min optimal')
        axes[0, 0].fill_between(reservoir_df['timestamp'], 5.5, 6.5, alpha=0.2, color='green')
        axes[0, 0].set_ylabel('pH')
        axes[0, 0].set_title('Water pH')
        # Labels on the guide lines are only rendered once legend() is called
        axes[0, 0].legend(loc='best')

    # EC over time
    if 'ec_ms_cm' in reservoir_df.columns:
        axes[0, 1].plot(reservoir_df['timestamp'], reservoir_df['ec_ms_cm'], 'g-')
        axes[0, 1].set_ylabel('EC (mS/cm)')
        axes[0, 1].set_title('Electrical Conductivity')

    # Water temperature
    if 'water_temp_c' in reservoir_df.columns:
        axes[1, 0].plot(reservoir_df['timestamp'], reservoir_df['water_temp_c'], 'r-')
        axes[1, 0].set_ylabel('Water Temp (°C)')
        axes[1, 0].set_title('Water Temperature')

    # Water level, with a dashed line at the 20 % low-water threshold
    if 'water_level_pct' in reservoir_df.columns:
        axes[1, 1].plot(reservoir_df['timestamp'], reservoir_df['water_level_pct'], 'purple')
        axes[1, 1].axhline(y=20, color='r', linestyle='--', label='Low water alert')
        axes[1, 1].set_ylabel('Water Level (%)')
        axes[1, 1].set_title('Reservoir Water Level')
        axes[1, 1].legend(loc='best')

    plt.tight_layout()
    plt.show()

## 4. Growth Prediction Model

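Build a model to predict plant height. The template cell below starts with days since planting as its only feature; the other candidates listed here can be added once they are joined to the height measurements: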
- Days since planting
- Environmental conditions (temp, humidity, light)
- Water quality (pH, EC)

In [None]:
# This is a template - adjust based on your actual data availability
#
# Trains a RandomForestRegressor to predict height_cm from the columns listed
# in `features`, then reports RMSE / R² on a held-out 20% split and plots
# predicted vs. actual heights.

MIN_TRAINING_ROWS = 10  # below this the train/test split is too small to be useful

features = ['days_since_planting']
required_cols = features + ['height_cm']
missing_cols = [c for c in required_cols if c not in height_df.columns]

if height_df.empty:
    print("No height measurement data available for growth prediction.")
    print("Start recording plant heights to enable this feature!")
elif missing_cols:
    # Report missing columns instead of letting dropna(subset=...) raise KeyError
    print(f"Height data is missing required columns: {missing_cols}")
else:
    # Keep only rows where both the target and every feature are present
    ml_df = height_df.dropna(subset=required_cols)

    if len(ml_df) >= MIN_TRAINING_ROWS:
        X = ml_df[features].values
        y = ml_df['height_cm'].values

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Train model
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Evaluate on the held-out rows
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        print("Growth Prediction Model Results:")
        print(f"  RMSE: {rmse:.2f} cm")
        print(f"  R² Score: {r2:.3f}")

        # Plot predictions vs actual; the dashed diagonal marks a perfect prediction
        plt.figure(figsize=(10, 6))
        plt.scatter(y_test, y_pred, alpha=0.5)
        plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
        plt.xlabel('Actual Height (cm)')
        plt.ylabel('Predicted Height (cm)')
        plt.title('Growth Prediction: Actual vs Predicted')
        plt.show()
    else:
        # Covers both "few rows fetched" and "rows dropped for missing values"
        print(f"Not enough data for training. Have {len(ml_df)} records, need at least {MIN_TRAINING_ROWS}.")

## 5. Anomaly Detection

Detect unusual sensor readings that might indicate:
- Sensor malfunction
- Environmental issues
- Equipment failure

In [None]:
def detect_anomalies(df, columns, contamination=0.05):
    """
    Detect anomalies using Isolation Forest.

    Rows that have a value in every analysed column are standardised and
    scored; rows with a missing value in any of those columns are left
    unscored.

    Args:
        df: DataFrame with sensor data
        columns: List of columns to analyze; names not present in ``df`` are
            ignored
        contamination: Expected proportion of anomalies (0.05 = 5%)

    Returns:
        A copy of ``df`` with an added float ``anomaly`` column
        (-1 = anomaly, 1 = normal, NaN = row not scored).
        If none of ``columns`` exist in ``df``, or fewer than 10 complete rows
        are available, a message is printed and ``df`` itself is returned
        unchanged, without an ``anomaly`` column.
    """
    # Filter to available columns
    available_cols = [c for c in columns if c in df.columns]

    if not available_cols:
        print("No matching columns found for anomaly detection.")
        return df

    # Positional mask of complete rows. Labels are written back by position
    # rather than by index label, so a non-unique index cannot misalign them.
    complete_rows = df[available_cols].notna().all(axis=1).to_numpy()
    X = df.loc[complete_rows, available_cols]

    if len(X) < 10:
        print(f"Not enough data for anomaly detection ({len(X)} samples).")
        return df

    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fit Isolation Forest (fixed seed keeps results reproducible across runs)
    iso_forest = IsolationForest(
        contamination=contamination,
        random_state=42,
        n_estimators=100
    )

    predictions = iso_forest.fit_predict(X_scaled)

    # Add predictions back to a copy of the original DataFrame
    labels = np.full(len(df), np.nan)
    labels[complete_rows] = predictions
    result = df.copy()
    result['anomaly'] = labels

    anomaly_count = (predictions == -1).sum()
    print(f"Detected {anomaly_count} anomalies ({anomaly_count/len(predictions)*100:.1f}%)")

    return result

In [None]:
# Run the Isolation Forest detector over the tower sensor channels
if not tower_df.empty:
    sensor_columns = ['air_temp_c', 'humidity_pct', 'light_lux', 'vbat_mv']
    tower_with_anomalies = detect_anomalies(tower_df, sensor_columns)

    # detect_anomalies only adds the 'anomaly' column when it actually scored rows
    if 'anomaly' in tower_with_anomalies.columns:
        is_anomalous = tower_with_anomalies['anomaly'] == -1
        anomalies = tower_with_anomalies[is_anomalous]
        if not anomalies.empty:
            # Show the timestamp plus whichever sensor columns exist in the data
            present_sensors = [c for c in sensor_columns if c in anomalies.columns]
            print("\nAnomalous readings:")
            display(anomalies[['timestamp'] + present_sensors].head(10))

In [None]:
# Visualize anomalies: air temperature over time with flagged readings highlighted.
# Every precondition is checked before the figure is created, so a missing
# temperature column no longer leaves an empty figure behind.
if (
    not tower_df.empty
    and 'anomaly' in tower_with_anomalies.columns
    and 'air_temp_c' in tower_with_anomalies.columns
):
    # Split rows on the detector's labels (1 = normal, -1 = anomaly)
    normal = tower_with_anomalies[tower_with_anomalies['anomaly'] == 1]
    anomaly = tower_with_anomalies[tower_with_anomalies['anomaly'] == -1]

    fig, ax = plt.subplots(figsize=(14, 6))

    ax.scatter(normal['timestamp'], normal['air_temp_c'],
               c='blue', alpha=0.5, label='Normal', s=20)
    ax.scatter(anomaly['timestamp'], anomaly['air_temp_c'],
               c='red', alpha=0.8, label='Anomaly', s=50, marker='x')

    ax.set_xlabel('Time')
    ax.set_ylabel('Temperature (°C)')
    ax.set_title('Temperature with Anomaly Detection')
    ax.legend()
    plt.show()

## 6. Real-Time Data Streaming (Example)

Example of how to subscribe to MQTT for real-time inference.

In [None]:
# This is a template for real-time processing.
# The code below is wrapped in a triple-quoted string, so it does not execute.
# Remove the surrounding ''' lines and run the cell to test the MQTT connection.

'''
from data.mqtt_subscriber import MQTTSubscriber, TelemetryMessage

def process_telemetry(msg: TelemetryMessage):
    """Process incoming telemetry in real-time."""
    print(f"Received {msg.message_type} from {msg.farm_id}/{msg.coord_id}")
    print(f"  Payload: {msg.payload}")
    
    # Here you could:
    # - Run anomaly detection
    # - Update growth predictions
    # - Trigger alerts
    # - Store for batch processing

# Create subscriber with callback
subscriber = MQTTSubscriber(buffer_size=100)
subscriber.add_callback(process_telemetry)

# Start in background (non-blocking)
subscriber.start_async()

print("Listening for MQTT messages... (run subscriber.stop() to stop)")
'''

## 7. Next Steps

Ideas for extending the ML capabilities:

1. **Growth Curve Modeling**: Use crop-specific growth curves (sigmoid, logistic) for better predictions

2. **Optimal Condition Recommendation**: Train a model to recommend optimal pH, EC, light schedules

3. **Harvest Date Prediction**: Predict when plants will be ready for harvest

4. **Energy Optimization**: Optimize pump and light schedules to reduce energy usage

5. **Predictive Maintenance**: Detect sensor drift or equipment degradation before failure

6. **Computer Vision**: Add camera integration for visual plant health assessment

In [None]:
# Cleanup: close the MongoDB connector created in section 2.
# Run this last; re-run the connection cell before fetching data again.
connector.close()
print("Done! MongoDB connection closed.")