In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, 
    ConfusionMatrixDisplay, 
    RocCurveDisplay, 
    roc_auc_score
)
import joblib
import os
from imblearn.over_sampling import SMOTE
from matplotlib.backends.backend_pdf import PdfPages
from tqdm import tqdm

## ------------------------------------------------------------------------------------------------------------------ ##
##                                 DATA PREPARATION                                   ##
## ------------------------------------------------------------------------------------------------------------------ ##

def resample_data(df_raw):
    """Aggregate raw meter readings into hourly, daily, and weekly means.

    Args:
        df_raw: DataFrame containing at least the columns ``LocalTimeCol``
            (datetimes, or values ``pd.to_datetime`` can parse),
            ``MeterCode``, ``FR`` and ``FV``. The frame is left untouched.

    Returns:
        Tuple ``(df_hourly, df_daily, df_weekly)``. Each frame holds one row
        per meter and time bucket (pandas aliases ``'h'``, ``'D'``, ``'W'``),
        with ``MeterCode`` and ``LocalTimeCol`` as regular columns and
        ``FR`` / ``FV`` set to the mean of the readings in that bucket.
    """
    print("Resampling raw data into hourly, daily, and weekly formats...")

    # Build the datetime-indexed frame from a derived copy so the caller's
    # DataFrame keeps its original columns and index.
    indexed = df_raw.assign(
        LocalTimeCol=pd.to_datetime(df_raw['LocalTimeCol'])
    ).set_index('LocalTimeCol')

    # Both measurements are averaged within each bucket.
    aggregations = {'FR': 'mean', 'FV': 'mean'}

    def _resample(freq):
        # One groupby/resample pass per requested frequency alias.
        return (
            indexed.groupby('MeterCode')
            .resample(freq)
            .agg(aggregations)
            .reset_index()
        )

    df_hourly = _resample('h')
    df_daily = _resample('D')
    df_weekly = _resample('W')

    print("Resampling complete.")
    return df_hourly, df_daily, df_weekly

def prepare_features(df):
    """Return a copy of ``df`` extended with the idle target and model features.

    The copy is sorted by ``MeterCode`` then ``LocalTimeCol`` and re-indexed.
    Added columns: ``is_idle`` (1 when ``FR <= 0.1``), ``hour``,
    ``day_of_week``, ``is_weekend``, per-meter one-step lags of ``FR``/``FV``,
    a per-meter 4-row rolling mean of ``FR`` and a per-meter 4-row rolling
    standard deviation of ``FV``. All remaining NaNs are replaced with 0.
    """
    frame = df.copy()

    # Normalise the timestamp column and blank out missing measurements
    frame['LocalTimeCol'] = pd.to_datetime(frame['LocalTimeCol'])
    measurement_cols = ["FR", "FV"]
    frame[measurement_cols] = frame[measurement_cols].fillna(0)

    # Chronological order inside each meter is required by shift/rolling
    frame = frame.sort_values(['MeterCode', 'LocalTimeCol']).reset_index(drop=True)

    timestamps = frame['LocalTimeCol']
    weekday = timestamps.dt.dayofweek

    by_meter = frame.groupby('MeterCode')
    rate_by_meter = by_meter['FR']
    volume_by_meter = by_meter['FV']

    # Dropping the MeterCode level re-aligns rolling output with the row index
    rolling_rate_mean = (
        rate_by_meter.rolling(window=4, min_periods=1).mean().reset_index(level=0, drop=True)
    )
    rolling_volume_std = (
        volume_by_meter.rolling(window=4, min_periods=1).std().reset_index(level=0, drop=True)
    )

    derived = {
        # Target: idle is decided from the flow rate alone
        'is_idle': (frame['FR'] <= 0.1).astype(int),
        # Calendar features
        'hour': timestamps.dt.hour,
        'day_of_week': weekday,
        'is_weekend': (weekday >= 5).astype(int),
        # Previous-row values within the same meter
        'FR_lag1': rate_by_meter.shift(1),
        'FV_lag1': volume_by_meter.shift(1),
        # Short-window statistics within the same meter
        'FR_roll_mean4': rolling_rate_mean,
        'FV_roll_std4': rolling_volume_std,
    }
    frame = frame.assign(**derived)

    # Lags and single-row std windows leave NaNs behind; zero them out
    return frame.fillna(0)

## ------------------------------------------------------------------------------------------------------------------ ##
##                               MODEL BUILDING & TRAINING                                ##
## ------------------------------------------------------------------------------------------------------------------ ##

def build_rfc_models(df_features, features, granularity_suffix, min_data_points):
    """Train, evaluate, and persist one RandomForest idle classifier per meter.

    Args:
        df_features: DataFrame with a ``MeterCode`` column, an ``is_idle``
            target column and every column named in ``features``.
        features: Column names used as model inputs.
        granularity_suffix: Suffix (e.g. ``'_hourly'``) appended to the model
            directory and to each saved file name.
        min_data_points: Meters with fewer rows than this are skipped.

    Returns:
        List with one dict per trained meter holding the fitted model and
        scaler, the accuracy, classification report and confusion matrix on
        the held-out split, plus the scaled test features and test labels.

    Meters are also skipped when the target has a single class, when the
    stratified split raises ``ValueError``, or when the smallest class in the
    training split has fewer than two samples.
    """
    granularity_label = granularity_suffix.strip('_')

    model_dir = f"../Models/idle_models/idle_meter_rfc_models{granularity_suffix}"
    os.makedirs(model_dir, exist_ok=True)

    meter_codes = df_features['MeterCode'].unique()
    print(f"\nBuilding models for {len(meter_codes)} unique meters ({granularity_label})...")

    # Fixed label set keeps report/matrix shapes stable across meters
    class_labels = [0, 1]
    collected = []

    meter_iterator = tqdm(
        meter_codes,
        desc=f"Processing Meters ({granularity_label})",
        colour='green',
        bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]',
    )
    for meter_code in meter_iterator:
        subset = df_features[df_features['MeterCode'] == meter_code].copy()

        # Too little history, or nothing to discriminate between
        if len(subset) < min_data_points or subset['is_idle'].nunique() < 2:
            continue

        target = subset['is_idle']
        inputs = subset[features]

        try:
            split = train_test_split(
                inputs, target, test_size=0.2, random_state=42, stratify=target
            )
        except ValueError:
            continue
        X_train, X_test, y_train, y_test = split

        # SMOTE needs at least one neighbour inside the smallest class
        smallest_class = y_train.value_counts().min()
        if smallest_class < 2:
            continue
        neighbours = min(5, smallest_class - 1)

        oversampler = SMOTE(random_state=42, k_neighbors=neighbours)
        X_balanced, y_balanced = oversampler.fit_resample(X_train, y_train)

        scaler = StandardScaler()
        X_balanced_scaled = scaler.fit_transform(X_balanced)
        X_test_scaled = scaler.transform(X_test)

        model = RandomForestClassifier(random_state=42, class_weight='balanced')
        model.fit(X_balanced_scaled, y_balanced)

        predictions = model.predict(X_test_scaled)

        accuracy = accuracy_score(y_test, predictions)
        report = classification_report(
            y_test, predictions, labels=class_labels, output_dict=True, zero_division=0
        )
        matrix = confusion_matrix(y_test, predictions, labels=class_labels)

        collected.append({
            "MeterCode": meter_code,
            "Model": model,
            "Scaler": scaler,
            "Accuracy": accuracy,
            "Classification_Report": report,
            "Confusion_Matrix": matrix,
            "X_test": X_test_scaled,
            "y_test": y_test,
        })

        # Persist the fitted artefacts side by side, one pair per meter
        file_stem = f"{meter_code}{granularity_suffix}"
        joblib.dump(model, os.path.join(model_dir, f"{file_stem}_model.joblib"))
        joblib.dump(scaler, os.path.join(model_dir, f"{file_stem}_scaler.joblib"))

    # Every appended result corresponds to exactly one saved model
    print(f"Successfully trained and saved {len(collected)} models.")
    return collected

## ------------------------------------------------------------------------------------------------------------------ ##
##                                        RESULTS & REPORTING                                         ##
## ------------------------------------------------------------------------------------------------------------------ ##

def save_results_to_csv(results_list, model_type_name,
                        output_dir="../Output/idle_prediction/comparison_results"):
    """Write a per-meter summary of model results to ``<model_type_name>_results.csv``.

    Args:
        results_list: Iterable of result dicts providing the keys
            ``MeterCode``, ``Accuracy`` and ``Classification_Report``.
            Nothing is written when it is empty or ``None``.
        model_type_name: Label stored in the ``Model_Type`` column and used
            as the file-name prefix.
        output_dir: Directory that receives the CSV; created when missing.
            Defaults to the location previously hard-coded here.
    """
    if not results_list:
        return

    os.makedirs(output_dir, exist_ok=True)

    # The report is a nested dict, so it is stored as its string form
    summary_data = [
        {
            "MeterCode": r["MeterCode"],
            "Model_Type": model_type_name,
            "Accuracy": r["Accuracy"],
            "Classification_Report": str(r["Classification_Report"]),
        }
        for r in results_list
    ]

    csv_path = os.path.join(output_dir, f"{model_type_name}_results.csv")
    pd.DataFrame(summary_data).to_csv(csv_path, index=False)

## ------------------------------------------------------------------------------------------------------------------ ##
##                                 MAIN EXECUTION                                   ##
## ------------------------------------------------------------------------------------------------------------------ ##

if __name__ == '__main__':
    # --- Load and Resample Data ---
    # Only the read itself can raise FileNotFoundError, so only it is guarded.
    try:
        df_raw = pd.read_csv("../../Data/Data1.csv")
    except FileNotFoundError:
        print("Error: Data file not found. Make sure the path '../../Data/Data1.csv' is correct.")
        # `exit()` is an interactive helper injected by `site` and reports
        # success; raise SystemExit with a non-zero status instead.
        raise SystemExit(1) from None

    # Replace the file's header with the column names the pipeline expects
    df_raw.columns = ['Id', 'LocalTimeCol', 'MeterCode', 'FR', 'FV', 'Today', 'NetTotal', 'created_by', 'creation_date', 'modified_date']

    # resample_data receives a copy so df_raw itself stays as loaded
    df_hourly, df_daily, df_weekly = resample_data(df_raw.copy())

    # Define the common feature set
    # NOTE(review): 'FR' is a feature and is also the only input of the
    # 'is_idle' target built in prepare_features — confirm this is intended.
    features = ['FR', 'FV', 'hour', 'day_of_week', 'is_weekend', 'FR_lag1', 'FV_lag1', 'FR_roll_mean4', 'FV_roll_std4']

    # --- PIPELINE 1: HOURLY MODELS ---
    print("\n🚀 STARTING PIPELINE 1: HOURLY MODELS 🚀")
    print("=" * 60)
    df_hourly_features = prepare_features(df_hourly)
    results_hourly = build_rfc_models(df_hourly_features, features, granularity_suffix='_hourly', min_data_points=100)
    if results_hourly:
        # TODO: PDF report (save_results_to_pdf) is currently disabled.
        save_results_to_csv(results_hourly, "RandomForest_Hourly")
    print("\n✅ HOURLY PIPELINE COMPLETE ✅\n")

    # --- PIPELINE 2: DAILY MODELS ---
    print("\n🚀 STARTING PIPELINE 2: DAILY MODELS 🚀")
    print("=" * 60)
    df_daily_features = prepare_features(df_daily)
    results_daily = build_rfc_models(df_daily_features, features, granularity_suffix='_daily', min_data_points=20)
    if results_daily:
        # TODO: PDF report (save_results_to_pdf) is currently disabled.
        save_results_to_csv(results_daily, "RandomForest_Daily")
    print("\n✅ DAILY PIPELINE COMPLETE ✅\n")

    # --- PIPELINE 3: WEEKLY MODELS ---
    print("\n🚀 STARTING PIPELINE 3: WEEKLY MODELS 🚀")
    print("=" * 60)
    df_weekly_features = prepare_features(df_weekly)
    results_weekly = build_rfc_models(df_weekly_features, features, granularity_suffix='_weekly', min_data_points=5)
    if results_weekly:
        # TODO: PDF report (save_results_to_pdf) is currently disabled.
        save_results_to_csv(results_weekly, "RandomForest_Weekly")
    print("\n✅ WEEKLY PIPELINE COMPLETE ✅\n")

Resampling raw data into hourly, daily, and weekly formats...
Resampling complete.

🚀 STARTING PIPELINE 1: HOURLY MODELS 🚀

Building models for 161 unique meters (hourly)...


Processing Meters (hourly): 100%|██████████| 161/161 [03:02<00:00]


Successfully trained and saved 138 models.

✅ HOURLY PIPELINE COMPLETE ✅


🚀 STARTING PIPELINE 2: DAILY MODELS 🚀

Building models for 161 unique meters (daily)...


Processing Meters (daily): 100%|██████████| 161/161 [01:07<00:00]


Successfully trained and saved 71 models.

✅ DAILY PIPELINE COMPLETE ✅


🚀 STARTING PIPELINE 3: WEEKLY MODELS 🚀

Building models for 161 unique meters (weekly)...


Processing Meters (weekly): 100%|██████████| 161/161 [00:19<00:00]

Successfully trained and saved 20 models.

✅ WEEKLY PIPELINE COMPLETE ✅






In [2]:

## 15-minute idle-state prediction using RandomForestClassifier (target column derived from FR only)
## ------------------------------------------------------------------------------------------------------------------ ##



import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, 
    ConfusionMatrixDisplay, 
    RocCurveDisplay, 
    roc_auc_score
)
import joblib
import os
from imblearn.over_sampling import SMOTE
from matplotlib.backends.backend_pdf import PdfPages
from tqdm import tqdm

def prepare_features_for_analysis(df):
    """Build the idle target and engineered features on a sorted copy of ``df``.

    The input is copied, ``LocalTimeCol`` is converted to datetime, missing
    ``FR``/``FV`` values become 0, and rows are ordered by ``MeterCode`` and
    ``LocalTimeCol``. The returned frame gains ``is_idle`` (``FR <= 0.1``),
    calendar columns, per-meter one-step lags, and per-meter 4-row rolling
    mean (``FR``) and standard deviation (``FV``); leftover NaNs are set to 0.
    """
    data = df.copy()

    # Timestamp parsing and zero-filling of the raw measurements
    data['LocalTimeCol'] = pd.to_datetime(data['LocalTimeCol'])
    core_columns = ["FR", "FV"]
    data[core_columns] = data[core_columns].fillna(0)

    # Order rows so that "previous row" means "previous reading of this meter"
    data = data.sort_values(['MeterCode', 'LocalTimeCol']).reset_index(drop=True)

    # Target depends on the flow rate only
    data['is_idle'] = data['FR'].le(0.1).astype(int)

    # Calendar features taken from the timestamp accessor
    stamp = data['LocalTimeCol'].dt
    data['hour'] = stamp.hour
    data['day_of_week'] = stamp.dayofweek
    data['is_weekend'] = data['day_of_week'].ge(5).astype(int)

    # One-step lag per meter for each core measurement
    for column in ('FR', 'FV'):
        data[f'{column}_lag1'] = data.groupby('MeterCode')[column].shift(1)

    def _rolling(column):
        # 4-row window per meter; a single row is enough to emit a value
        return data.groupby('MeterCode')[column].rolling(window=4, min_periods=1)

    # Dropping the MeterCode index level lines the result up with `data`
    data['FR_roll_mean4'] = _rolling('FR').mean().reset_index(level=0, drop=True)
    data['FV_roll_std4'] = _rolling('FV').std().reset_index(level=0, drop=True)

    # First-row lags and one-sample std values are NaN at this point
    return data.fillna(0)


def build_rfc_models(df_features, features):
    """Train, evaluate, and persist one RandomForest idle classifier per meter.

    Args:
        df_features: DataFrame with a ``MeterCode`` column, an ``is_idle``
            target column and every column named in ``features``.
        features: Column names used as model inputs.

    Returns:
        List with one dict per trained meter: model name, meter code, fitted
        model and scaler, accuracy, classification report, confusion matrix,
        scaled test features, test labels, predictions, split sizes (before
        oversampling) and the feature list.

    A meter is skipped when it has fewer than 100 rows, when its target has a
    single class, when the stratified split raises ``ValueError``, or when
    the smallest class in the training split has fewer than two samples.
    """
    rfc_results = []

    model_dir = "../Models/idle_models/idle_meter_15min_models"
    os.makedirs(model_dir, exist_ok=True)

    unique_meters = df_features['MeterCode'].unique()
    print(f"Building models for {len(unique_meters)} unique meters...")

    # Pin the label set so the report and the confusion matrix keep the same
    # layout even when one class is missing from a meter's test split.
    all_labels = [0, 1]
    successful_models = 0

    for meter_code in tqdm(unique_meters, desc="Processing Meters", colour='green',
                           bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'):

        meter_data = df_features[df_features['MeterCode'] == meter_code].copy()

        if len(meter_data) < 100:
            continue

        y = meter_data['is_idle']
        if y.nunique() < 2:
            continue

        X = meter_data[features]

        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y
            )
        except ValueError:
            # Stratification is impossible for this meter; skip it
            continue

        # SMOTE's default k_neighbors=5 raises when the minority class has
        # five or fewer training samples, which would abort the whole run.
        # Cap k by the smallest class and skip meters that cannot supply
        # even one neighbour.
        min_samples = y_train.value_counts().min()
        if min_samples < 2:
            continue

        smote = SMOTE(random_state=42, k_neighbors=min(5, min_samples - 1))
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_smote)
        X_test_scaled = scaler.transform(X_test)

        model = RandomForestClassifier(random_state=42, class_weight='balanced')
        model.fit(X_train_scaled, y_train_smote)

        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        report = classification_report(
            y_test, y_pred, labels=all_labels, output_dict=True, zero_division=0
        )
        cm = confusion_matrix(y_test, y_pred, labels=all_labels)

        rfc_results.append({
            "Model_name": f"RandomForest_{meter_code}",
            "MeterCode": meter_code,
            "Model": model,
            "Scaler": scaler,
            "Accuracy": accuracy,
            "Classification_Report": report,
            "Confusion_Matrix": cm,
            "X_test": X_test_scaled,
            "y_test": y_test,
            "y_pred": y_pred,
            # Sizes of the real split, i.e. before SMOTE adds synthetic rows
            "Training_samples": len(X_train),
            "Test_samples": len(X_test),
            "Features_used": features
        })

        meter_model_path = os.path.join(model_dir, f"{meter_code}_model.joblib")
        meter_scaler_path = os.path.join(model_dir, f"{meter_code}_scaler.joblib")

        joblib.dump(model, meter_model_path)
        joblib.dump(scaler, meter_scaler_path)

        successful_models += 1

    print(f"\n--- Summary ---")
    print(f"Total unique meters processed: {len(unique_meters)}")
    print(f"Successfully trained and saved models: {successful_models}")
    print(f"Results stored for {len(rfc_results)} models")

    return rfc_results





def save_results_to_csv(results_list, model_type_name, output_dir="../Output/idle_prediction/comparison_results"):
    """Dump a per-meter accuracy/report summary to ``<model_type_name>_results.csv``.

    Args:
        results_list: Result dicts exposing ``MeterCode``, ``Accuracy`` and
            ``Classification_Report``. When empty, a notice is printed and
            nothing is written.
        model_type_name: Value of the ``Model_Type`` column and prefix of the
            CSV file name.
        output_dir: Target directory; created if it does not exist.
    """
    if not results_list:
        print(f"No results to save for {model_type_name}.")
        return

    os.makedirs(output_dir, exist_ok=True)

    # One flat row per model; the nested report is kept as its string form
    rows = [
        {
            "MeterCode": entry["MeterCode"],
            "Model_Type": model_type_name,
            "Accuracy": entry["Accuracy"],
            "Classification_Report": str(entry["Classification_Report"]),
        }
        for entry in results_list
    ]

    csv_path = os.path.join(output_dir, f"{model_type_name}_results.csv")
    pd.DataFrame(rows).to_csv(csv_path, index=False)
    print(f"Comparison results saved to {csv_path}")


if __name__ == '__main__':
    # Load the dataset
    try:
        df = pd.read_csv("../../Data/3mon.csv")
    except FileNotFoundError:
        print("Error: Data file not found. Make sure the path '../../Data/3mon.csv' is correct.")
        # `exit()` is an interactive helper injected by `site` and reports
        # success; raise SystemExit with a non-zero status instead.
        raise SystemExit(1) from None

    # Replace the file's header with the column names the pipeline expects
    df.columns = ['Id', 'LocalTimeCol', 'MeterCode', 'FR', 'FV', 'Today', 'NetTotal', 'created_by', 'creation_date', 'modified_date']

    print("\n🚀 STARTING PIPELINE 2: 15 MIN MODELS 🚀")
    print("=" * 60)
    # Pre-process the raw readings and derive the target and feature columns
    df_features = prepare_features_for_analysis(df)

    # Define the common feature set
    # NOTE(review): 'FR' is a feature and is also the only input of the
    # 'is_idle' target built in prepare_features_for_analysis — confirm
    # this is intended.
    features = ['FR', 'FV', 'hour', 'day_of_week', 'is_weekend', 'FR_lag1', 'FV_lag1', 'FR_roll_mean4', 'FV_roll_std4']

    # Build and save individual models for each meter
    print("--- Building Models using random forest classifier for all Meters ---")
    print("-" * 60)
    rfc_results = build_rfc_models(df_features, features)

    # Save results only when at least one meter produced a model
    if rfc_results:
        print("\n--- Saving Reports ---")
        print("-" * 60)
        save_results_to_csv(rfc_results, "RandomForestClassifier")


🚀 STARTING PIPELINE 2: 15 MIN MODELS 🚀
--- Building Models using random forest classifier for all Meters ---
------------------------------------------------------------
Building models for 161 unique meters...


Processing Meters: 100%|██████████| 161/161 [03:31<00:00]


--- Summary ---
Total unique meters processed: 161
Successfully trained and saved models: 138
Results stored for 138 models

--- Saving Reports ---
------------------------------------------------------------
Comparison results saved to ../Output/idle_prediction/comparison_results\RandomForestClassifier_results.csv



