In [1]:
#Machine Learning Regression Pipeline for predicting Energy Consumption of a machine based on some features.

In [2]:
#Regression = Predicting continuous numeric values
# Example: Energy Consumed = 250 kWh

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import joblib  
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time
import seaborn as sns
from sklearn.metrics import mean_absolute_percentage_error, explained_variance_score, max_error, confusion_matrix, classification_report
import mlflow
from mlflow.tracking import MlflowClient
import time
import os
from datetime import datetime
# STEP 1 : Load the dataset
data = pd.read_csv('energy_dataset.csv')

# STEP 2 : Preprocessing the dataset
label_encoder = LabelEncoder()
    #Label Encoding (For Categorical Data → Numbers)
data['Machine'] = label_encoder.fit_transform(data['Machine'])

    # Drop the 'Date' column as it's not useful for prediction
data.drop(columns=['Date'], inplace=True)

    # Split the dataset into features (X) and target (y)
X = data.drop(columns=['Energy Consumed'])
y = data['Energy Consumed']

    # Scaling the features --> To handle different feature ranges
    #Feature   Before Scaling	  After Scaling
    #Temp	   25	              -0.65
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# STEP 3: Train-Test Split
    # Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


# STEP 4: Model Training
    # Initialize and train the Random Forest model
#model = RandomForestRegressor(n_estimators=100, random_state=42)
#model.fit(X_train, y_train)

# Initialize MLflow (with error handling)
try:
    mlflow.end_run()  # Ensure no active runs
except:
    pass

mlflow.set_tracking_uri('http://localhost:5000')
model_registry_name = "energy_consumption_model"

# Your original models dictionary
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(),
    'KNN': KNeighborsRegressor(),
    'SVR': SVR()
}

results = []
best_mse = float('inf')
best_model = None
best_model_name = None
best_run_id = None

def log_model_run(name, model):
    """Encapsulated model training and logging logic"""
    with mlflow.start_run(run_name=name, nested=True) as run:
        # Training
        start = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start
        
        # Prediction
        start = time.time()
        y_pred = model.predict(X_test)
        pred_time = time.time() - start
        
        # Metrics
        metrics = {
            'MSE': mean_squared_error(y_test, y_pred),
            'MAE': mean_absolute_error(y_test, y_pred),
            'R2': r2_score(y_test, y_pred),
            'MAPE': mean_absolute_percentage_error(y_test, y_pred),
            'Explained Variance': explained_variance_score(y_test, y_pred),
            'Max Error': max_error(y_test, y_pred),
            'Training Time': train_time,
            'Prediction Time': pred_time
        }
        
        # Logging
        mlflow.log_params(model.get_params())
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(model, name.lower().replace(" ", "_"))
        
        return metrics, run.info.run_id

# Main execution
with mlflow.start_run(run_name="Model Comparison") as parent_run:
    for name, model in models.items():
        try:
            metrics, run_id = log_model_run(name, model)
            results.append({'Model': name, **metrics})
            
            if metrics['MSE'] < best_mse:
                best_mse = metrics['MSE']
                best_model = model
                best_model_name = name
                best_run_id = run_id
                print(f"New best: {name} (MSE: {best_mse:.4f})")
                
        except Exception as e:
            print(f"Error with {name}: {str(e)}")
            mlflow.end_run()  # Ensure failed run is closed


client = MlflowClient()
model_backup_dir = "backup_models"
os.makedirs(model_backup_dir, exist_ok=True)

# Backup current production model
try:
    # Get the current production model version
    versions = client.get_latest_versions(model_registry_name, stages=["Production"])
    if versions:
        prod_version = versions[0]
        prod_model_uri = f"models:/{model_registry_name}/{prod_version.version}"
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_path = os.path.join(model_backup_dir, f"{model_registry_name}_v{prod_version.version}_{timestamp}")
        os.makedirs(backup_path, exist_ok=True)

        # Download and store model files
        mlflow.artifacts.download_artifacts(prod_model_uri, dst_path=backup_path)
        print(f"📦 Backed up current Production model v{prod_version.version} to {backup_path}")
    else:
        print("ℹ️ No Production model to backup.")
except Exception as e:
    print(f"❌ Failed to backup current Production model: {e}")

    # Register and promote best model
    if best_run_id:
        try:
            model_uri = f"runs:/{best_run_id}/{best_model_name.lower().replace(' ', '_')}"
            mv = mlflow.register_model(model_uri, model_registry_name)
            
            client = MlflowClient()
            client.transition_model_version_stage(
                name=model_registry_name,
                version=mv.version,
                stage="Production",
                archive_existing_versions=True
            )
            print(f"\n🚀 Promoted {best_model_name} v{mv.version} to Production")
        except Exception as e:
            print(f"\n❌ Model promotion failed: {e}")

    # Save comparison
    comparison_df = pd.DataFrame(results).sort_values('MSE')
    comparison_df.to_csv("comparison.csv", index=False)
    mlflow.log_artifact("comparison.csv")
    print("\nModel Comparison:\n", comparison_df)

# Save final artifacts (original functionality)
joblib.dump(best_model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoder, 'encoder.pkl')


import plotly.express as px

# MSE Comparison (Interactive)
fig = px.bar(comparison_df.sort_values('MSE'), 
             x='Model', y='MSE',
             title='<b>Mean Squared Error (MSE) Comparison</b>',
             color='MSE', color_continuous_scale='Viridis',
             labels={'MSE': 'MSE (Lower → Better)'},
             template='plotly_dark')
fig.update_layout(xaxis_tickangle=-45)
fig.show()

# R² Comparison (Interactive)
fig = px.bar(comparison_df.sort_values('R2', ascending=False), 
             x='Model', y='R2',
             title='<b>R-squared (R²) Comparison</b>',
             color='R2', color_continuous_scale='Plasma',
             labels={'R2': 'R² (Higher → Better)'},
             template='plotly_white')
fig.update_layout(xaxis_tickangle=-45)
fig.show()






best_model = models[comparison_df.iloc[0]['Model']]
y_pred = best_model.predict(X_test)

fig = px.scatter(x=y_test, y=y_pred, 
                 trendline="ols",
                 title=f'<b>{best_model_name}: Actual vs Predicted</b>',
                 labels={'x': 'Actual', 'y': 'Predicted'},
                 template='seaborn')
fig.update_layout(annotations=[
    dict(x=0.1, y=0.9, xref='paper', yref='paper',
         text=f'R² = {r2_score(y_test, y_pred):.3f}', showarrow=False)
])
fig.show()




plt.figure(figsize=(12, 8))
sns.set_style("whitegrid")
heatmap_data = comparison_df.set_index('Model')[['MSE', 'MAE', 'R2']]
heatmap_data = (heatmap_data - heatmap_data.min()) / (heatmap_data.max() - heatmap_data.min())

sns.heatmap(heatmap_data, 
            annot=comparison_df[['MSE', 'MAE', 'R2']], 
            cmap="YlGnBu",
            fmt=".3f",
            linewidths=0.5,
            cbar_kws={'label': 'Normalized Scale'})
plt.title('Model Performance Heatmap', fontsize=14, pad=20)
plt.xticks(rotation=45)
plt.show()



# Static prediction input (change these values to make predictions)
power_rating = 5.83
working_hours = 9
temp = 25.14
humidity = 66
production_output = 491
maintenance = 1
machine_name = 'Sewing Machine'

# Encode the machine name using the same LabelEncoder
machine_encoded = label_encoder.transform([machine_name])[0]

# Prepare the input feature vector with the correct column order
input_features = pd.DataFrame({
    'Power Rating': [power_rating],
    'Working Hours': [working_hours],
    'Temp': [temp],
    'Humidity': [humidity],
    'Production Output': [production_output],
    'Maintenance': [maintenance],
    'Machine': [machine_encoded]
})

# Ensure the input features are in the same order as the original data (before scaling)
input_features = input_features[X.columns]

# Scale the input features using the same scaler used for training
input_features_scaled = scaler.transform(input_features)

# Predict energy consumption using the trained best model
predicted_energy = best_model.predict(input_features_scaled)

print(f"Predicted Energy Consumption: {predicted_energy[0]} kWh")





🏃 View run Linear Regression at: http://localhost:5000/#/experiments/0/runs/27bdd94663c542f692b661fc20da2c34
🧪 View experiment at: http://localhost:5000/#/experiments/0
New best: Linear Regression (MSE: 1098.0764)




🏃 View run Decision Tree at: http://localhost:5000/#/experiments/0/runs/4fcc1ce0ecbd4fe0ba693f173df79e84
🧪 View experiment at: http://localhost:5000/#/experiments/0
New best: Decision Tree (MSE: 25.4435)




🏃 View run Random Forest at: http://localhost:5000/#/experiments/0/runs/09317ededde24c3ea384233875aa0c4a
🧪 View experiment at: http://localhost:5000/#/experiments/0
New best: Random Forest (MSE: 8.2374)




🏃 View run Gradient Boosting at: http://localhost:5000/#/experiments/0/runs/30ec20dd4af847cdac016cfbb87faab3
🧪 View experiment at: http://localhost:5000/#/experiments/0




🏃 View run KNN at: http://localhost:5000/#/experiments/0/runs/ee6e1ac6b7914ea2bd18b02fd0ea65e1
🧪 View experiment at: http://localhost:5000/#/experiments/0




🏃 View run SVR at: http://localhost:5000/#/experiments/0/runs/d29febbc30444cb2b1600c0025b0a841
🧪 View experiment at: http://localhost:5000/#/experiments/0
🏃 View run Model Comparison at: http://localhost:5000/#/experiments/0/runs/ba026e47492a4a9d9a6550f435654b3a
🧪 View experiment at: http://localhost:5000/#/experiments/0


  versions = client.get_latest_versions(model_registry_name, stages=["Production"])
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|█████████████████████████████████████████████████████████████| 5/5 [00:08<00:00,  1.76s/it]


📦 Backed up current Production model v15 to backup_models\energy_consumption_model_v15_20250513_112847


NameError: name 'comparison_df' is not defined