In [7]:
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

def run_full_modeling_process(input_file='final_features_pro.csv'):
    """Compare four baseline regressors, then grid-search a Gradient Boosting model.

    Workflow:
      1. Read ``input_file`` with the ``dt`` column as a parsed date index;
         ``Global_active_power`` is the target and every other column is a feature.
      2. Split positionally: the first 80% of rows train, the remainder test.
         (This is a chronological split only if the rows are ordered by ``dt``.)
      3. Fit a ``StandardScaler`` on the training rows only and save it to
         ``data_scaler.pkl``.
      4. Fit each baseline model and print R2 / MAE / RMSE on the test rows,
         sorted by R2 (best first).
      5. Grid-search ``GradientBoostingRegressor`` with a 5-fold
         ``TimeSeriesSplit``, report the refitted best estimator on the test
         rows and save it to ``final_model_tuned.pkl``.

    Args:
        input_file: Path of the feature CSV. Defaults to the file name that
            was previously hard-coded, so existing no-argument calls behave
            the same.

    Returns:
        pandas.DataFrame: one row per baseline model with the columns
        ``Model``, ``R2``, ``MAE`` and ``RMSE``, sorted by ``R2`` descending.
    """
    print("--- Step 3.1: Model Selection, Cross-Validation & Tuning ---")

    # 1. Load Data
    df = pd.read_csv(input_file, index_col='dt', parse_dates=True)
    X = df.drop(columns=['Global_active_power'])
    y = df['Global_active_power']

    # 2. Split and Scale
    # Positional split (no shuffling) so the test rows are the last 20% of the file.
    split = int(len(df) * 0.8)
    X_train_raw, X_test_raw = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]

    # The scaler only ever sees training rows; the test rows are transformed
    # with the training statistics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    joblib.dump(scaler, 'data_scaler.pkl')

    # 3. Model Comparison (Requirement: "Compare model performance")
    models = {
        "Linear Regression": LinearRegression(),
        "Random Forest": RandomForestRegressor(random_state=42),
        "Gradient Boosting": GradientBoostingRegressor(random_state=42),
        "Neural Network": MLPRegressor(max_iter=1000, random_state=42)
    }

    comparison_results = []

    print("\nComparing all models...")
    for name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        comparison_results.append({
            "Model": name,
            "R2": r2_score(y_test, preds),
            "MAE": mean_absolute_error(y_test, preds),
            "RMSE": np.sqrt(mean_squared_error(y_test, preds))
        })

    # Display comparison
    results_df = pd.DataFrame(comparison_results).sort_values(by='R2', ascending=False)
    print("\n--- Model Comparison Table ---")
    print(results_df)

    # 4. Hyperparameter Tuning on the Winner (Requirement: "Perform hyperparameter tuning")
    # Gradient Boosting is the model that gets tuned regardless of the table
    # above, so say so explicitly when it did not actually rank first.
    tuned_name = "Gradient Boosting"
    top_name = results_df.iloc[0]["Model"]
    print("\nTuning the winner: Gradient Boosting...")
    if top_name != tuned_name:
        print(f"Note: {top_name} had the highest baseline R2; tuning {tuned_name} anyway.")

    tscv = TimeSeriesSplit(n_splits=5)  # Cross-validation

    # Hyperparameter Tuning
    # random_state is a single-value grid entry: it only pins the seed and
    # therefore also shows up in best_params_.
    param_grid = {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
        'random_state': [42]
    }

    # NOTE(review): X_train was scaled with statistics from the whole training
    # slice, so each CV validation fold contributed to the scaling it receives.
    grid_search = GridSearchCV(GradientBoostingRegressor(), param_grid, cv=tscv, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    final_preds = best_model.predict(X_test)

    # 5. Final Results for Best Model
    print("\n--- FINAL TUNED MODEL PERFORMANCE ---")
    print(f"Best Params: {grid_search.best_params_}")
    print(f"R2: {r2_score(y_test, final_preds):.4f}")
    print(f"MAE: {mean_absolute_error(y_test, final_preds):.4f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, final_preds)):.4f}")

    joblib.dump(best_model, 'final_model_tuned.pkl')
    print("\nSaved: final_model_tuned.pkl")

    return results_df

# Entry point: run the full modeling workflow when this code is executed
# directly (as opposed to being imported as a module).
if __name__ == "__main__":
    run_full_modeling_process()

--- Step 3.1: Model Selection, Cross-Validation & Tuning ---

Comparing all models...

--- Model Comparison Table ---
               Model        R2       MAE      RMSE
2  Gradient Boosting  0.529372  0.153693  0.213988
1      Random Forest  0.494986  0.163305  0.221668
0  Linear Regression  0.487000  0.160359  0.223414
3     Neural Network  0.322185  0.204415  0.256807

Tuning the winner: Gradient Boosting...

--- FINAL TUNED MODEL PERFORMANCE ---
Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'random_state': 42}
R2: 0.5363
MAE: 0.1519
RMSE: 0.2124

Saved: final_model_tuned.pkl


In [6]:
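# Alternative version of the modeling code above: takes the input CSV path as an argument and returns the baseline comparison table.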
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import joblib

def run_pro_comparison_with_tuning(input_file):
    """Benchmark four regressors on a hold-out slice and tune Gradient Boosting.

    The CSV at ``input_file`` is read with ``dt`` as a parsed date index.
    ``Global_active_power`` is the target; all remaining columns are features.
    The first 80% of rows (by position) are used for training and the rest
    for testing, so the split follows time order only if the file is ordered
    by ``dt``.

    Side effects:
        * writes the fitted ``StandardScaler`` to ``data_scaler.pkl``
        * writes the refitted best grid-search estimator to
          ``final_model_tuned.pkl``
        * prints the baseline table and the tuned model's test metrics

    Args:
        input_file: Path of the feature CSV to load.

    Returns:
        pandas.DataFrame: baseline results with the columns ``Model``, ``R2``,
        ``MAE`` and ``RMSE``, sorted by ``R2`` in descending order.
    """
    print("--- Step 3: Model Selection & Hyperparameter Tuning ---")

    # 1. Load data
    df = pd.read_csv(input_file, index_col='dt', parse_dates=True)
    X = df.drop(columns=['Global_active_power'])
    y = df['Global_active_power']

    # 2. Positional train/test split (no shuffling, so it is reproducible)
    split = int(len(df) * 0.8)
    X_train_raw, X_test_raw = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]

    # 3. Scaling: statistics come from the training rows only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    joblib.dump(scaler, 'data_scaler.pkl')

    # 4. Initial Model Comparison (Requirement: RMSE, MAE, R-Squared)
    models = {
        "Linear Regression": LinearRegression(),
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
        "Gradient Boosting": GradientBoostingRegressor(random_state=42),
        "Neural Network (MLP)": MLPRegressor(max_iter=1000, random_state=42)
    }

    results = []
    print("\nEvaluating baseline models...")
    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        r2 = r2_score(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))

        results.append({"Model": name, "R2": r2, "MAE": mae, "RMSE": rmse})

    # Display initial results, best R2 first
    results_df = pd.DataFrame(results).sort_values(by='R2', ascending=False)
    print("\n--- Baseline Comparison Table ---")
    print(results_df)

    # 5. Hyperparameter Tuning (Requirement: "Perform hyperparameter tuning")
    # Gradient Boosting is always the model that gets tuned; flag the case
    # where the baseline table above ranks a different model first so the
    # "winner" wording is not taken at face value.
    tuned_name = "Gradient Boosting"
    top_name = results_df.iloc[0]["Model"]
    print("\nRefining the Winner: Tuning Gradient Boosting...")
    if top_name != tuned_name:
        print(f"Note: {top_name} had the highest baseline R2; tuning {tuned_name} anyway.")

    # Requirement: "Validate models using cross-validation techniques"
    tscv = TimeSeriesSplit(n_splits=5)

    # random_state is listed as a single-value grid entry purely to fix the
    # seed; it is therefore reported as part of best_params_.
    param_grid = {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
        'random_state': [42]
    }

    # NOTE(review): the features were scaled before the CV split, so every
    # validation fold contributed to the scaling statistics it is scored with.
    grid_search = GridSearchCV(GradientBoostingRegressor(), param_grid, cv=tscv, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    final_predictions = best_model.predict(X_test)

    # 6. Final Results for Tuned Model
    print("\n--- Final Tuned Model Performance ---")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Final R2: {r2_score(y_test, final_predictions):.4f}")
    print(f"Final MAE: {mean_absolute_error(y_test, final_predictions):.4f}")
    print(f"Final RMSE: {np.sqrt(mean_squared_error(y_test, final_predictions)):.4f}")

    # 7. Save the optimized model
    joblib.dump(best_model, 'final_model_tuned.pkl')
    print("\nSuccess: Tuned model saved as 'final_model_tuned.pkl'")

    return results_df

# Entry point: run the comparison and tuning on the default feature file when
# this code is executed directly (as opposed to being imported as a module).
if __name__ == "__main__":
    run_pro_comparison_with_tuning('final_features_pro.csv')

--- Step 3: Model Selection & Hyperparameter Tuning ---

Evaluating baseline models...

--- Baseline Comparison Table ---
                  Model        R2       MAE      RMSE
2     Gradient Boosting  0.529372  0.153693  0.213988
1         Random Forest  0.494986  0.163305  0.221668
0     Linear Regression  0.487000  0.160359  0.223414
3  Neural Network (MLP)  0.322185  0.204415  0.256807

Refining the Winner: Tuning Gradient Boosting...

--- Final Tuned Model Performance ---
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'random_state': 42}
Final R2: 0.5363
Final MAE: 0.1519
Final RMSE: 0.2124

Success: Tuned model saved as 'final_model_tuned.pkl'
