In [51]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import os
import time

In [52]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint, uniform

In [53]:
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    median_absolute_error,
    mean_absolute_percentage_error,
    explained_variance_score
)

In [54]:


def perform_random_search(X_train, y_train):
    """Tune a random forest with a randomized hyper-parameter search.

    A ``RandomForestRegressor`` is wrapped in ``RandomizedSearchCV``
    (10 sampled candidates, 3-fold CV, scored by negative MAE) and fitted
    on the supplied training data.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, forwarded unchanged to
        ``RandomizedSearchCV.fit``.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted search object.
    """
    print("\n[STEP] Performing RandomizedSearchCV...")

    # Candidate space. Integer ranges come from scipy's ``randint``;
    # the remaining entries are small discrete lists to keep the search cheap.
    search_space = dict(
        n_estimators=randint(20, 40),
        max_depth=[10, 15, 20],
        min_samples_split=randint(2, 6),
        min_samples_leaf=randint(1, 4),
        max_features=[0.4, 0.5, 0.6],
        max_leaf_nodes=[100, 200, None],
    )

    base_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

    # Few iterations and folds on purpose: keeps wall-clock time manageable.
    tuner = RandomizedSearchCV(
        estimator=base_forest,
        param_distributions=search_space,
        n_iter=10,
        scoring="neg_mean_absolute_error",
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)

    print(f"[INFO] Best Parameters: {tuner.best_params_}")
    return tuner.best_estimator_


In [55]:
def evaluation(model, X_test, y_test, model_path, start_time):
    """Score a fitted regressor on hold-out data, print the metrics, and save the model.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test, y_test
        Hold-out features and targets; ``y_test`` is compared against
        ``model.predict(X_test)`` by every metric below.
    model_path : str
        Destination file for ``joblib.dump``. Missing parent directories are
        created; a bare file name is written to the current working directory.
    start_time : float
        ``time.time()`` stamp taken by the caller, used to report elapsed time.

    Returns
    -------
    None
        Results are printed; the model is written to ``model_path``.
    """
    print("\n[STEP] Evaluating model...")
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # Derive RMSE from MSE instead of passing ``squared=False``: that keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises
    # TypeError. For a single-output target the two are equivalent.
    rmse = float(np.sqrt(mse))
    r2 = r2_score(y_test, y_pred)
    medae = median_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    evs = explained_variance_score(y_test, y_pred)

    print(f"Mean Absolute Error (MAE)         : {mae:.4f}")
    print(f"Mean Squared Error (MSE)          : {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE)    : {rmse:.4f}")
    print(f"R² Score                          : {r2:.4f}")
    print(f"Median Absolute Error (MedAE)     : {medae:.4f}")
    print(f"Mean Absolute Percentage Error    : {mape:.4f}")
    print(f"Explained Variance Score (EVS)    : {evs:.4f}")

    print("\n[STEP] Saving model to disk...")
    # os.path.dirname returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(model_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(model, model_path, compress=3)

    # Report the on-disk (compressed) size of the saved model.
    size_mb = os.path.getsize(model_path) / (1024 * 1024)
    print(f"[INFO] Model saved to: {model_path} ({size_mb:.2f} MB)")

    total_time = time.time() - start_time
    print(f"[INFO] Total training + saving time: {round(total_time, 2)} seconds")

In [56]:
def preprocess_temperature_data(csv_path, n_lags=24):
    """Turn a temperature time-series CSV into a lag-feature supervised dataset.

    The CSV must contain a ``time`` column and a ``temperature_2m (°C)``
    column; they are renamed to ``datetime`` and ``temperature``. Rows are
    sorted by ``datetime``, then for each row the previous ``n_lags``
    temperature readings become features (``temp_t-1`` .. ``temp_t-{n_lags}``)
    and the next reading becomes ``target``.

    Lags are built by row position, so this assumes rows are evenly spaced
    in time with no gaps — that is not verified here.

    Parameters
    ----------
    csv_path : str
        Path passed to ``pandas.read_csv``.
    n_lags : int, default 24
        Number of past readings used as features. Must be >= 1.

    Returns
    -------
    (X, y) : (pandas.DataFrame, pandas.Series)
        ``X`` holds the ``n_lags`` lag columns; ``y`` is the ``target`` column.

    Raises
    ------
    ValueError
        If ``n_lags`` is smaller than 1.
    KeyError
        If the expected time/temperature columns are absent from the CSV.
    """
    if n_lags < 1:
        raise ValueError(f"n_lags must be >= 1, got {n_lags}")

    print(f"\n[INFO] Reading CSV: {csv_path}")
    df = pd.read_csv(csv_path)
    print("Original DataFrame shape:", df.shape)

    if df.isnull().sum().sum() > 0:
        print("[DEBUG] Warning: Null values found. Proceeding anyway.")

    # Rename and convert datetime
    df = df.rename(columns={'temperature_2m (°C)': 'temperature', 'time': 'datetime'})
    df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.sort_values('datetime').reset_index(drop=True)
    print("[INFO] Converted datetime and sorted. Shape:", df.shape)

    # Lag features: temp_t-i is the reading i rows earlier.
    feature_cols = [f'temp_t-{i}' for i in range(1, n_lags + 1)]
    for lag, col in enumerate(feature_cols, start=1):
        df[col] = df['temperature'].shift(lag)

    # Target is the reading one row ahead.
    df['target'] = df['temperature'].shift(-1)

    print("[DEBUG] After adding lag + target. Shape:", df.shape)
    print("[DEBUG] Preview of columns:", df.columns.tolist())

    # Drop rows made incomplete by shifting (or by missing temperatures).
    # Restricted to the columns actually used: a null in an unrelated column
    # (humidity, rain, ...) must not discard an otherwise valid sample.
    required_cols = ['datetime', *feature_cols, 'target']
    df = df.dropna(subset=required_cols).reset_index(drop=True)
    print("[INFO] Dropped NaNs. Final usable shape:", df.shape)

    # Define X and y
    X = df[feature_cols]
    y = df['target']

    print("[DEBUG] Features shape:", X.shape)
    print("[DEBUG] Target shape:", y.shape)
    print("[DEBUG] Sample features:\n", X.head(2))
    print("[DEBUG] Sample target:\n", y.head(2))

    return X, y

In [57]:
# %% Model Training
def train_and_save_model(csv_path, model_path="ml/models/temp_next1hr_model.joblib", test_size=0.2):
    """Run the full pipeline: preprocess, split, tune, evaluate and save.

    Parameters
    ----------
    csv_path : str
        CSV file handed to ``preprocess_temperature_data``.
    model_path : str
        Where ``evaluation`` writes the fitted model.
    test_size : float, default 0.2
        Fraction of the samples held out for evaluation, forwarded to
        ``train_test_split``.

    Returns
    -------
    None
    """
    start_time = time.time()
    print("\n[STEP] Loading and preprocessing data...")
    X, y = preprocess_temperature_data(csv_path)

    print("\n[STEP] Splitting dataset into train/test...")
    # Chronological split (no shuffling). Each sample is a sliding window that
    # shares all but one lag value with its neighbours, so a shuffled split
    # places near-duplicates of every test row in the training set and makes
    # the reported metrics look better than real forecasting performance.
    # With shuffle=False the hold-out set is the trailing part of the data,
    # which preprocess_temperature_data has already sorted by datetime.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False
    )
    print(f"[INFO] X_train: {X_train.shape}, X_test: {X_test.shape}")

    print("\n[STEP] Training RandomForest model using RandomizedSearchCV...")
    model = perform_random_search(X_train, y_train)

    evaluation(model, X_test, y_test, model_path, start_time)

In [58]:
# ----------- Step 3: Run ------------
# Runs the whole pipeline (preprocess -> train/test split -> randomized search
# -> evaluation -> save) on the CSV below. No model_path is given, so the model
# is written to train_and_save_model's default location.
# NOTE(review): machine-specific absolute path — replace with your actual file path.
train_and_save_model("/home/aditya/flask/ml/dataset/open-meteo-18.62N74.00E561m.csv")



[STEP] Loading and preprocessing data...

[INFO] Reading CSV: /home/aditya/flask/ml/dataset/open-meteo-18.62N74.00E561m.csv
Original DataFrame shape: (131184, 10)
[INFO] Converted datetime and sorted. Shape: (131184, 10)
[DEBUG] After adding lag + target. Shape: (131184, 35)
[DEBUG] Preview of columns: ['location_id', 'datetime', 'temperature', 'relative_humidity_2m (%)', 'dew_point_2m (°C)', 'rain (mm)', 'pressure_msl (hPa)', 'surface_pressure (hPa)', 'cloud_cover (%)', 'wind_speed_10m (km/h)', 'temp_t-1', 'temp_t-2', 'temp_t-3', 'temp_t-4', 'temp_t-5', 'temp_t-6', 'temp_t-7', 'temp_t-8', 'temp_t-9', 'temp_t-10', 'temp_t-11', 'temp_t-12', 'temp_t-13', 'temp_t-14', 'temp_t-15', 'temp_t-16', 'temp_t-17', 'temp_t-18', 'temp_t-19', 'temp_t-20', 'temp_t-21', 'temp_t-22', 'temp_t-23', 'temp_t-24', 'target']
[INFO] Dropped NaNs. Final usable shape: (131159, 35)
[DEBUG] Features shape: (131159, 24)
[DEBUG] Target shape: (131159,)
[DEBUG] Sample features:
    temp_t-1  temp_t-2  temp_t-3  tem