# Weather Prediction ML Model Training
**Project:** ESP32-S3 Weather Prediction System  
**Sensors:** BME280 (pressure), AHT10 (temp/humidity), BH1750 (light)  
**Target:** Predict weather conditions (hot humid, cool, rainy, etc.)  
**Course:** COE3012 Computer System Engineering

---

# PHASE 1: PROJECT SETUP & ENVIRONMENT

This phase establishes the foundation for our weather prediction model that will be deployed on ESP32-S3 microcontroller.

## 1.1 Import Required Libraries

In [1]:
# Core Machine Learning Libraries
import ydf  # YDF (Yggdrasil Decision Forests) - Google's next-generation library
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Utilities
import os
import sys
import warnings
import datetime
from pathlib import Path
from tqdm import tqdm
import requests
import kaggle

# Scientific Computing
import scipy.stats as stats
from scipy import signal

# Statistical Analysis
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose



# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("‚úÖ All libraries imported successfully!")
print(f"‚úÖ YDF Version: {ydf.__version__}")


‚úÖ All libraries imported successfully!
‚úÖ YDF Version: 0.8.0


## 1.2 Configuration Settings

In [3]:
# Set random seeds for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Matplotlib settings
plt.style.use('seaborn-v0_8')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

print("‚úÖ Configuration settings applied successfully!")
print(f"Random seed set to: {RANDOM_SEED}")
print("‚úÖ YDF configuration is much simpler than TensorFlow Decision Forests!")

‚úÖ Configuration settings applied successfully!
Random seed set to: 42
‚úÖ YDF configuration is much simpler than TensorFlow Decision Forests!


## 1.3 Project Constants

In [4]:
# === SENSOR SPECIFICATIONS ===
# BME280 - Environmental Sensor
BME280_TEMP_MIN = -40.0   # ¬∞C
BME280_TEMP_MAX = 85.0    # ¬∞C
BME280_HUMIDITY_MIN = 0.0 # %
BME280_HUMIDITY_MAX = 100.0 # %
BME280_PRESSURE_MIN = 300.0  # hPa
BME280_PRESSURE_MAX = 1100.0 # hPa

# AHT10 - Temperature & Humidity Sensor
AHT10_TEMP_MIN = -40.0    # ¬∞C
AHT10_TEMP_MAX = 85.0     # ¬∞C
AHT10_HUMIDITY_MIN = 0.0  # %
AHT10_HUMIDITY_MAX = 100.0 # %

# BH1750 - Light Intensity Sensor
BH1750_LIGHT_MIN = 1      # lux
BH1750_LIGHT_MAX = 65535  # lux

# === WEATHER CONDITION CATEGORIES ===
WEATHER_CONDITIONS = [
    'hot_humid',      # High temp, high humidity
    'hot_dry',        # High temp, low humidity  
    'cool',           # Moderate temp, moderate humidity
    'cold',           # Low temp
    'rainy',          # High humidity, moderate light
    'stormy',         # High humidity, low light, variable pressure
    'sunny',          # High light, low humidity
    'cloudy'          # Moderate light, moderate humidity
]

# === YDF MODEL CONFIGURATION ===
YDF_CONFIG = {
    'model_type': 'GradientBoostedTrees',  # Primary choice for weather prediction
    'alternative_models': ['RandomForest', 'CART'],  # Backup options
    'target_label': 'weather_condition',
    'features': ['temperature', 'humidity', 'pressure', 'illuminance'],
    'esp32_deployment': True,
    'performance_targets': {
        'accuracy': 0.85,
        'inference_time_ms': 10,  # Much faster than TF-DF
        'memory_usage_kb': 50     # Much lighter than TF-DF
    }
}

# === FILE PATHS ===
PROJECT_ROOT = Path.cwd()
DATA_RAW = PROJECT_ROOT / 'data' / 'raw'
DATA_PROCESSED = PROJECT_ROOT / 'data' / 'processed'
DATA_EXTERNAL = PROJECT_ROOT / 'data' / 'external'
MODELS_TRAINED = PROJECT_ROOT / 'models' / 'trained'
MODELS_CONVERTED = PROJECT_ROOT / 'models' / 'converted'
RESULTS_PLOTS = PROJECT_ROOT / 'results' / 'plots'
RESULTS_REPORTS = PROJECT_ROOT / 'results' / 'reports'

# === MODEL PARAMETERS ===
TEST_SIZE = 0.2
VALIDATION_SIZE = 0.2
# YDF automatically optimizes these parameters
YDF_PARAMS = {
    'num_trees': 100,
    'max_depth': 6,
    'min_examples': 5,
    'use_hessian_gain': True,  # Better for small datasets
    'growing_strategy': 'BEST_FIRST_GLOBAL'  # Optimized for accuracy
}

print("‚úÖ Project constants defined successfully!")
print(f"Weather conditions to predict: {len(WEATHER_CONDITIONS)}")
print(f"YDF model type: {YDF_CONFIG['model_type']}")
print(f"Target accuracy: {YDF_CONFIG['performance_targets']['accuracy']*100}%")
print("‚úÖ YDF configuration optimized for ESP32-S3 deployment")

‚úÖ Project constants defined successfully!
Weather conditions to predict: 8
YDF model type: GradientBoostedTrees
Target accuracy: 85.0%
‚úÖ YDF configuration optimized for ESP32-S3 deployment


## 1.4 Helper Functions Setup

In [5]:
def create_directories():
    """Create necessary directories if they don't exist."""
    directories = [
        DATA_RAW, DATA_PROCESSED, DATA_EXTERNAL,
        MODELS_TRAINED, MODELS_CONVERTED,
        RESULTS_PLOTS, RESULTS_REPORTS
    ]
    
    for directory in directories:
        directory.mkdir(parents=True, exist_ok=True)
    
    print("‚úÖ All directories verified/created")

def verify_ydf_installation():
    """Verify YDF installation and display available models."""
    print(f"YDF Version: {ydf.__version__}")
    print("Available YDF learners:")
    available_models = ydf.get_all_models()
    for model in available_models[:5]:  # Show first 5 models
        print(f"  - {model}")
    if len(available_models) > 5:
        print(f"  ... and {len(available_models) - 5} more models")
    return True

def test_ydf_basic_functionality():
    """Test basic YDF functionality with sample data."""
    try:
        # Create minimal sample data for testing
        sample_data = pd.DataFrame({
            'temperature': [25.5, 30.2, 15.8, 22.1, 28.9],
            'humidity': [60, 80, 45, 55, 75],
            'pressure': [1013, 995, 1020, 1008, 1002],
            'illuminance': [500, 100, 800, 400, 200],
            'weather_condition': ['sunny', 'rainy', 'cool', 'cloudy', 'rainy']
        })
        
        # Test YDF model creation (API test only)
        learner = ydf.RandomForestLearner(label="weather_condition")
        print("‚úÖ YDF RandomForest learner created successfully")
        
        learner = ydf.GradientBoostedTreesLearner(label="weather_condition") 
        print("‚úÖ YDF GradientBoostedTrees learner created successfully")
        
        print("‚úÖ YDF is ready for weather prediction model training")
        return True
    except Exception as e:
        print(f"‚ùå YDF test failed: {e}")
        return False

def validate_sensor_data(df, sensor_type='BME280'):
    """Validate sensor data ranges."""
    validation_rules = {
        'BME280': {
            'temperature': (BME280_TEMP_MIN, BME280_TEMP_MAX),
            'humidity': (BME280_HUMIDITY_MIN, BME280_HUMIDITY_MAX),
            'pressure': (BME280_PRESSURE_MIN, BME280_PRESSURE_MAX)
        },
        'AHT10': {
            'temperature': (AHT10_TEMP_MIN, AHT10_TEMP_MAX),
            'humidity': (AHT10_HUMIDITY_MIN, AHT10_HUMIDITY_MAX)
        },
        'BH1750': {
            'light_intensity': (BH1750_LIGHT_MIN, BH1750_LIGHT_MAX)
        }
    }
    # Implementation placeholder
    return True

def save_ydf_model(model, model_name, export_cpp=True):
    """Save YDF model and optionally export to C++ for ESP32."""
    model_path = MODELS_TRAINED / f"{model_name}.ydf"
    model.save(str(model_path))
    
    if export_cpp:
        # YDF can export models as C++ code for embedded deployment
        cpp_path = MODELS_CONVERTED / f"{model_name}_esp32.cc"
        try:
            # This will be implemented in later phases
            print(f"Model saved: {model_path}")
            print(f"C++ export planned: {cpp_path}")
        except Exception as e:
            print(f"C++ export not yet implemented: {e}")
    
    return model_path

def generate_synthetic_weather_data(num_samples=10000):
    """Generate synthetic weather data for initial testing."""
    # Implementation placeholder - will be detailed in Phase 2
    pass

print("‚úÖ Helper functions defined successfully!")
print("‚úÖ YDF-specific functions added for model training and ESP32 deployment")

‚úÖ Helper functions defined successfully!
‚úÖ YDF-specific functions added for model training and ESP32 deployment


## 1.5 Environment Verification

In [8]:
# Display versions of key packages
print("üîç YDF ENVIRONMENT VERIFICATION")
print("=" * 50)

packages_to_check = [
    ('Python', sys.version.split()[0]),
    ('YDF (Yggdrasil Decision Forests)', ydf.__version__),
    ('NumPy', np.__version__),
    ('Pandas', pd.__version__),
    ('Matplotlib', plt.matplotlib.__version__),
    ('Seaborn', sns.__version__),
    ('Scikit-learn', __import__('sklearn').__version__)
]


for package, version in packages_to_check:
    print(f"‚úÖ {package:<30}: {version}")

print("\nü§ñ YDF MODEL VERIFICATION")
print("=" * 50)
verify_ydf_installation()

print("\nüß™ YDF FUNCTIONALITY TEST")
print("=" * 50)
test_ydf_basic_functionality()

print("\nüñ•Ô∏è  SYSTEM INFORMATION")
print("=" * 50)
print(f"‚úÖ Working Directory: {PROJECT_ROOT}")

print("‚úÖ TensorFlow not required for YDF training")
print(f"‚úÖ Current Time: {datetime.datetime.now()}")

# Verify directory structure
create_directories()

print("\nüéØ YDF PROJECT SETUP STATUS")
print("=" * 50)
print("‚úÖ YDF environment successfully configured!")
print("‚úÖ Advantages over TensorFlow Decision Forests:")
print("   ‚Ä¢ No TensorFlow dependency conflicts")
print("   ‚Ä¢ Faster training and inference (~microsecond predictions)")
print("   ‚Ä¢ Cleaner, simpler API")
print("   ‚Ä¢ Better compatibility across Python versions")
print("   ‚Ä¢ Lighter weight (~500MB less than TF-DF)")
print("‚úÖ Ready for YDF-based weather prediction model training!")
print("‚úÖ Target: ESP32-S3 deployment with C++ code generation")

üîç YDF ENVIRONMENT VERIFICATION
‚úÖ Python                        : 3.10.0
‚úÖ YDF (Yggdrasil Decision Forests): 0.8.0
‚úÖ NumPy                         : 2.2.6
‚úÖ Pandas                        : 2.3.2
‚úÖ Matplotlib                    : 3.10.6
‚úÖ Seaborn                       : 0.13.2
‚úÖ Scikit-learn                  : 1.7.2

ü§ñ YDF MODEL VERIFICATION
YDF Version: 0.8.0
Available YDF learners:


AttributeError: module 'ydf' has no attribute 'get_all_models'