# Combined Experiments

First, run Experiment 1 on the combined data: split it into 5 folds, then organize the folds into five pairs of training and test sets.

In [None]:
import pandas as pd
from sklearn.model_selection import KFold

# Read the full combined dataset that every fold is drawn from.
combined_data_2024 = pd.read_csv('combined_data_2024.csv')

# Shuffled 5-way splitter; the fixed random_state keeps the partition repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=2024)

# Materialise every (train, test) DataFrame pair up front, in split order.
folds = [
    (combined_data_2024.iloc[train_index], combined_data_2024.iloc[test_index])
    for train_index, test_index in kf.split(combined_data_2024)
]

# Write each pair to its own CSV files; file names number the folds from 1.
for i, fold_pair in enumerate(folds):
    train_data, test_data = fold_pair
    train_data.to_csv(f'combined_train_2024_fold{i+1}.csv', index=False)
    test_data.to_csv(f'combined_test_2024_fold{i+1}.csv', index=False)

print("Training and test data for 5 folds with seed 2024 have been generated and saved.")


Training and test data for 5 folds with seed 2024 have been generated and saved.


Perform feature selection for each fold.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Function to perform feature selection
def feature_selection(train_data, target_column, preselected_features, k=25):
    """Rank features with a random forest and keep ``k`` of them.

    A ``RandomForestRegressor`` (100 trees, ``random_state=100``) is fitted on
    every column of ``train_data`` except ``target_column`` and ``TIMESTAMP``.
    Features named in ``preselected_features`` are always kept when they exist
    in the data; the remaining slots are filled with the highest-importance
    features that are not preselected.

    Args:
        train_data: DataFrame holding the feature columns and the target.
        target_column: Name of the column to predict.
        preselected_features: Feature names to keep regardless of their rank.
        k: Total number of features to return, preselected ones included.

    Returns:
        DataFrame with columns ``Feature`` and ``Importance``: the preselected
        rows first, then the top remaining rows, each group ordered by
        descending importance.

    Raises:
        KeyError: If ``target_column`` is not a column of ``train_data``.
    """
    # Separate features and target
    X = train_data.drop(columns=[target_column, 'TIMESTAMP'], errors='ignore')  # Drop TIMESTAMP
    y = train_data[target_column]

    # Train a RandomForestRegressor model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=100)
    rf_model.fit(X, y)

    # Pair each feature name with its importance score, most important first
    selected_features = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    # Split the ranking into the always-kept features and the candidates
    is_preselected = selected_features['Feature'].isin(preselected_features)
    preselected_df = selected_features[is_preselected]
    remaining_features = selected_features[~is_preselected]

    # Count the preselected features actually found in the data (a requested
    # name may be absent), and never pass a negative count to head(), which
    # would return "all but the last n" rows instead of none.
    n_remaining = max(k - len(preselected_df), 0)
    top_remaining_features = remaining_features.head(n_remaining)

    # Combine preselected features with the top remaining features
    final_features = pd.concat([preselected_df, top_remaining_features])

    return final_features

# Column to predict, and the features that must survive selection in every fold.
target_column = 'WH_RTU_Total'
preselected_features = ['label', 'T_out', 'RH_out']

# Rank features on each fold's training split only and persist the result.
for i in range(1, 6):
    train_data = pd.read_csv(f'combined_train_2024_fold{i}.csv')

    top_features = feature_selection(
        train_data,
        target_column,
        preselected_features,
        k=25,
    )

    # One CSV per fold, holding the selected feature names and importances.
    top_features.to_csv(f'combined_train_2024_fold{i}_top25_features.csv', index=False)

    print(f"Top 25 features for fold {i} with seed 2024 saved.")


Top 25 features for fold 1 with seed 44 saved.
Top 25 features for fold 2 with seed 44 saved.
Top 25 features for fold 3 with seed 44 saved.
Top 25 features for fold 4 with seed 44 saved.
Top 25 features for fold 5 with seed 44 saved.


Next, select the relevant columns from both the training and test sets of each of the five folds, based on the features selected in the previous section (feature selection uses the training data only).

In [None]:
# Name of the column the models will predict.
target_column = 'WH_RTU_Total'

for i in range(1, 6):
    # Raw train/test splits for this fold.
    train_data = pd.read_csv(f'combined_train_2024_fold{i}.csv')
    test_data = pd.read_csv(f'combined_test_2024_fold{i}.csv')

    # Feature names chosen for this fold from its training split.
    top_features = pd.read_csv(f'combined_train_2024_fold{i}_top25_features.csv')['Feature'].tolist()

    # Both splits keep the same columns: the selected features plus the target.
    train_columns = [*top_features, target_column]
    test_columns = list(train_columns)

    # Reduce the training split and save it.
    filtered_train_data = train_data.loc[:, train_columns]
    filtered_train_data.to_csv(f'fold_2024_{i}_filtered_train_data.csv', index=False)
    print(f"Filtered train data for fold {i} with seed 2024 saved.")

    # Reduce the test split, then carve out the label == 1 (heating) and
    # label == 0 (cooling) subsets.
    filtered_test_data = test_data.loc[:, test_columns]
    heating_mask = filtered_test_data['label'] == 1
    cooling_mask = filtered_test_data['label'] == 0
    test_data_heating = filtered_test_data.loc[heating_mask]
    test_data_cooling = filtered_test_data.loc[cooling_mask]

    # Save the full test set followed by its two subsets.
    outputs = [
        (filtered_test_data, f'fold_2024_{i}_filtered_test_data.csv'),
        (test_data_heating, f'fold_2024_{i}_filtered_test_data_heating.csv'),
        (test_data_cooling, f'fold_2024_{i}_filtered_test_data_cooling.csv'),
    ]
    for frame, path in outputs:
        frame.to_csv(path, index=False)
    print(f"Filtered test data, heating data, and cooling data for fold {i} with seed 2024 saved.")


Filtered train data for fold 1 with seed 2024 saved.
Filtered test data, heating data, and cooling data for fold 1 with seed 2024 saved.
Filtered train data for fold 2 with seed 2024 saved.
Filtered test data, heating data, and cooling data for fold 2 with seed 2024 saved.
Filtered train data for fold 3 with seed 2024 saved.
Filtered test data, heating data, and cooling data for fold 3 with seed 2024 saved.
Filtered train data for fold 4 with seed 2024 saved.
Filtered test data, heating data, and cooling data for fold 4 with seed 2024 saved.
Filtered train data for fold 5 with seed 2024 saved.
Filtered test data, heating data, and cooling data for fold 5 with seed 2024 saved.


Generate the MAE and R² values for the four models: linear regression, random forest, XGBoost, and the stacking ensemble regressor.

In [None]:
!pip install pandas numpy scikit-learn xgboost

from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Placeholder for storing results
results = []

# Column every model is trained to predict
target_column = 'WH_RTU_Total'


# Function to evaluate models on a given dataset
def evaluate_model(model, X_train, y_train, X_test, y_test, dataset_type, fit=True):
    """Score ``model`` on one test set and return ``(mae, r2)``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features, used only when ``fit`` is true.
        y_train: Training target, used only when ``fit`` is true.
        X_test: Test features to predict on.
        y_test: True target values for ``X_test``.
        dataset_type: Label of the test subset; accepted for call
            compatibility and not used in the computation.
        fit: When true (the default), fit ``model`` on the training data
            before predicting. Pass ``False`` for an already-fitted model.

    Returns:
        Tuple of mean absolute error and R² on the test set.
    """
    if fit:
        model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mae, r2


# Loop through each fold
for i in range(1, 6):
    # Load filtered training and test datasets for the current fold
    train_data = pd.read_csv(f'fold_2024_{i}_filtered_train_data.csv')
    test_data_combined = pd.read_csv(f'fold_2024_{i}_filtered_test_data.csv')
    test_data_cooling = pd.read_csv(f'fold_2024_{i}_filtered_test_data_cooling.csv')
    test_data_heating = pd.read_csv(f'fold_2024_{i}_filtered_test_data_heating.csv')

    # Separate features and target for training data
    X_train = train_data.drop(columns=[target_column])
    y_train = train_data[target_column]

    # Define models with default parameters (fresh instances for every fold)
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=100),
        'XGBoost': XGBRegressor(random_state=100),
        'Stacking Regressor': StackingRegressor(
            estimators=[
                ('rf', RandomForestRegressor(random_state=100)),  # Random Forest
                ('gb', GradientBoostingRegressor(random_state=100))  # Gradient Boosting
            ],
            final_estimator=LinearRegression()  # Linear Regression as the meta-learner
        )
    }

    test_sets = [('Combined', test_data_combined),
                 ('Cooling', test_data_cooling),
                 ('Heating', test_data_heating)]

    # Evaluate models on combined, cooling, and heating datasets
    for model_name, model in models.items():
        # Fit once per model and fold. The training data is the same for all
        # three test subsets, so refitting per subset only repeats work; with
        # the fixed random_state the refit is expected to yield the same model.
        model.fit(X_train, y_train)

        for dataset_type, test_data in test_sets:
            # Separate features and target for the current test dataset
            X_test = test_data.drop(columns=[target_column])
            y_test = test_data[target_column]

            # Calculate MAE and R² with the already-fitted model
            mae, r2 = evaluate_model(model, X_train, y_train, X_test, y_test,
                                     dataset_type, fit=False)

            # Store results
            results.append({
                'Fold': i,
                'Seed': 2024,  # Seed used for the outer 5-fold split
                'Model': model_name,
                'Dataset': dataset_type,
                'MAE': mae,
                'R²': r2,
                'Experiment': 1  # Experiment identifier
            })

            print(f"Fold {i}, Model {model_name}, Dataset {dataset_type}, MAE: {mae:.4f}, R²: {r2:.4f}")

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Save results to CSV
results_df.to_csv('model_results_2024_mae_r2_detailed.csv', index=False)

# Display overall summary: metrics averaged over the five folds
print("\nSummary Results for All Models:")
summary = results_df.groupby(['Model', 'Dataset']).agg({'MAE': 'mean', 'R²': 'mean'}).sort_values(by=['Model', 'Dataset'])
print(summary)


Fold 1, Model Linear Regression, Dataset Combined, MAE: 4.4104, R²: 0.6901
Fold 1, Model Linear Regression, Dataset Cooling, MAE: 4.8851, R²: 0.4392
Fold 1, Model Linear Regression, Dataset Heating, MAE: 3.9591, R²: 0.2674
Fold 1, Model Random Forest, Dataset Combined, MAE: 2.6508, R²: 0.8534
Fold 1, Model Random Forest, Dataset Cooling, MAE: 2.1840, R²: 0.7885
Fold 1, Model Random Forest, Dataset Heating, MAE: 3.0947, R²: 0.5355
Fold 1, Model XGBoost, Dataset Combined, MAE: 2.7121, R²: 0.8492
Fold 1, Model XGBoost, Dataset Cooling, MAE: 2.2229, R²: 0.7898
Fold 1, Model XGBoost, Dataset Heating, MAE: 3.1773, R²: 0.5061
Fold 1, Model Stacking Regressor, Dataset Combined, MAE: 2.8441, R²: 0.8506
Fold 1, Model Stacking Regressor, Dataset Cooling, MAE: 2.4321, R²: 0.7811
Fold 1, Model Stacking Regressor, Dataset Heating, MAE: 3.2358, R²: 0.5337
Fold 2, Model Linear Regression, Dataset Combined, MAE: 4.7560, R²: 0.6580
Fold 2, Model Linear Regression, Dataset Cooling, MAE: 5.3215, R²: 0.433

## Random Forest

### Random Forest Hyperparameter Tuning



In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Collects one result row per (fold, test subset).
results_rf = []

# Candidate settings explored by the grid search for the random forest.
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
}

for i in range(1, 6):
    # Filtered training split and the three test views for this fold.
    train_data = pd.read_csv(f'fold_2024_{i}_filtered_train_data.csv')
    test_data_combined = pd.read_csv(f'fold_2024_{i}_filtered_test_data.csv')
    test_data_cooling = pd.read_csv(f'fold_2024_{i}_filtered_test_data_cooling.csv')
    test_data_heating = pd.read_csv(f'fold_2024_{i}_filtered_test_data_heating.csv')
    evaluation_sets = (
        ('Combined', test_data_combined),
        ('Cooling', test_data_cooling),
        ('Heating', test_data_heating),
    )

    # Training target and feature matrix.
    y_train = train_data['WH_RTU_Total']
    X_train = train_data.drop(columns=['WH_RTU_Total'])

    # Search the grid with 3-fold cross-validation on the training split,
    # scored by (negated) mean absolute error.
    rf = RandomForestRegressor(random_state=100)
    grid_search_rf = GridSearchCV(
        estimator=rf,
        param_grid=param_grid_rf,
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1,
    )
    grid_search_rf.fit(X_train, y_train)

    # Winning hyperparameters and the estimator built with them.
    best_params = grid_search_rf.best_params_
    best_rf = grid_search_rf.best_estimator_

    # Score the tuned forest on each test view.
    for dataset_type, test_data in evaluation_sets:
        y_test = test_data['WH_RTU_Total']
        X_test = test_data.drop(columns=['WH_RTU_Total'])

        y_pred = best_rf.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        row = {
            'Fold': i,
            'Seed': 2024,  # Seed value recorded with every row
            'Model': 'Random Forest',
            'Dataset': dataset_type,
            'MAE': mae,
            'R²': r2,
            'Best Params': best_params,
            'Experiment': 1,
        }
        results_rf.append(row)

        print(f"Fold {i}, Model Random Forest, Dataset {dataset_type}, MAE: {mae:.4f}, R²: {r2:.4f}, Best Params: {best_params}")

# One row per fold and test subset.
results_rf_df = pd.DataFrame(results_rf)

# Persist the per-fold detail.
results_rf_df.to_csv(f'random_forest_2024_combined_hyperparameter_results_detailed.csv', index=False)

# Average the metrics over folds for each test subset.
print("\nSummary Results for Random Forest:")
summary_rf = results_rf_df.groupby(['Model', 'Dataset'])[['MAE', 'R²']].mean().reset_index()
print(summary_rf)

# Persist the summary.
# NOTE(review): an earlier comment here asked whether this section uses an
# "inner seed". The only seed set in this cell is random_state=100 on the
# forest; cv=3 is given no explicit seed.
summary_rf.to_csv(f'random_forest_2024_combined_hyperparameter_results_summary.csv', index=False)


Fold 1, Model Random Forest, Dataset Combined, MAE: 2.8171, R²: 0.8371, Best Params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 150}
Fold 1, Model Random Forest, Dataset Cooling, MAE: 2.5251, R²: 0.7525, Best Params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 150}
Fold 1, Model Random Forest, Dataset Heating, MAE: 3.1385, R²: 0.5111, Best Params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 150}
Fold 2, Model Random Forest, Dataset Combined, MAE: 2.8385, R²: 0.8380, Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}
Fold 2, Model Random Forest, Dataset Cooling, MAE: 2.4956, R²: 0.7214, Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}
Fold 2, Model Random Forest, Dataset Heating, MAE: 3.1849, R²: 0.5400, Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}
Fold 3, Model Random Forest, Dataset Combined, MAE: 2.7391, R²: 0.8477, Best Params: {'max_depth': 10, 'min_samples_spl

### Random Forest Feature Importance Generation

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

import ast

# Placeholder for feature importance results
feature_importance_results = []

# Tuning results written by the hyperparameter-search cell. The file is the
# same for every fold, so it is read once instead of once per iteration.
results_rf_df = pd.read_csv('random_forest_2024_combined_hyperparameter_results_detailed.csv')

# Section 1: Calculate Feature Importance
for fold in range(1, 6):
    # Load training data
    train_data = pd.read_csv(f'fold_2024_{fold}_filtered_train_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Best hyperparameters for this fold. They are repeated on every row of
    # the fold, so the 'Combined' row is used as the representative.
    best_params = results_rf_df.loc[(results_rf_df['Fold'] == fold) & (results_rf_df['Dataset'] == 'Combined'), 'Best Params'].iloc[0]
    # The CSV stores the dict as its string form. literal_eval parses Python
    # literals only, so unexpected file content cannot execute code as it
    # could with eval().
    best_params_dict = ast.literal_eval(best_params)

    # Rebuild the tuned forest and fit it on the fold's training split
    best_rf = RandomForestRegressor(random_state=100, **best_params_dict)
    best_rf.fit(X_train, y_train)

    # Calculate feature importance
    feature_importances = best_rf.feature_importances_
    feature_names = X_train.columns
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances,
        'Fold': fold,
        'Seed': 2024,  # Seed value recorded with every row
        'Model': 'Random Forest',
        'Experiment': 1  # Experiment identifier
    })
    feature_importance_results.append(importance_df)

# Combine and save feature importance results
feature_importance_combined = pd.concat(feature_importance_results, ignore_index=True)
feature_importance_combined.to_csv('random_forest_2024_combined_feature_importance.csv', index=False)

print(f"Feature importance for seed 2024 has been successfully saved.")


FileNotFoundError: [Errno 2] No such file or directory: 'Combine_Feature_Importance_Summary.csv'

### Generate residuals, predictions, label, T_out and RH_out

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

import ast

merged_data = []  # Placeholder for storing final merged data

# Tuned hyperparameters for all folds. The file does not change between
# iterations, so it is read once up front.
best_params_df = pd.read_csv('random_forest_2024_combined_hyperparameter_results_detailed.csv')

# Loop through each fold
for fold in range(1, 6):
    # Load training data
    train_data = pd.read_csv(f'fold_2024_{fold}_filtered_train_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Best hyperparameters for this fold (taken from its 'Combined' row)
    best_params = best_params_df.loc[(best_params_df['Fold'] == fold) & (best_params_df['Dataset'] == 'Combined'), 'Best Params'].iloc[0]
    # Parse the stored dict string with literal_eval rather than eval so the
    # file content is never executed as code.
    best_params_dict = ast.literal_eval(best_params)

    # Train Random Forest model
    best_rf = RandomForestRegressor(random_state=100, **best_params_dict)
    best_rf.fit(X_train, y_train)

    # Loop through Cooling and Heating datasets
    for dataset_type in ['cooling', 'heating']:
        # Load test data
        test_data = pd.read_csv(f'fold_2024_{fold}_filtered_test_data_{dataset_type}.csv')

        # Features only; the target stays in test_data for the residual below
        X_test = test_data.drop(columns=['WH_RTU_Total'])

        # Predict and calculate residuals (actual minus predicted)
        y_pred = best_rf.predict(X_test)
        test_data['Predicted'] = y_pred
        test_data['Residual'] = test_data['WH_RTU_Total'] - test_data['Predicted']

        # Select and rename required columns
        if 'T_out' in test_data.columns and 'RH_out' in test_data.columns:
            selected_data = test_data[['WH_RTU_Total', 'Predicted', 'Residual', 'T_out', 'RH_out', 'label']].rename(columns={
                'WH_RTU_Total': 'Actual'
            })
        else:
            raise ValueError(f"'T_out' or 'RH_out' is missing in test data for fold {fold} and dataset {dataset_type}")

        # Add fold, seed, model, and experiment columns
        selected_data['Fold'] = fold
        selected_data['Seed'] = 2024
        selected_data['Model'] = 'Random Forest'
        selected_data['Experiment'] = 1

        # Append to final merged data
        merged_data.append(selected_data)
        print(f"Processed fold {fold}, dataset {dataset_type}.")

# Combine all folds and datasets into a single DataFrame
merged_final_data = pd.concat(merged_data, ignore_index=True)
# Save the merged data
merged_final_data.to_csv('random_forest_2024_combined_residual.csv', index=False)

print("All folds processed and merged data saved successfully.")


Processed fold 1, dataset cooling.
Processed fold 1, dataset heating.
Processed fold 2, dataset cooling.
Processed fold 2, dataset heating.
Processed fold 3, dataset cooling.
Processed fold 3, dataset heating.
Processed fold 4, dataset cooling.
Processed fold 4, dataset heating.
Processed fold 5, dataset cooling.
Processed fold 5, dataset heating.
All folds processed and merged data saved successfully.


## XGBoost

### XGBoost Hyperparameter Tuning



In [None]:
!pip install pandas numpy scikit-learn xgboost
!pip install scikit-learn==1.0.2
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score

# Collects one result row per (fold, test subset).
results_xgb = []

# Candidate settings explored by the grid search for XGBoost.
param_grid_xgb = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

for i in range(1, 6):
    # Filtered training split and the three test views for this fold.
    train_data = pd.read_csv(f'fold_2024_{i}_filtered_train_data.csv')
    test_data_combined = pd.read_csv(f'fold_2024_{i}_filtered_test_data.csv')
    test_data_cooling = pd.read_csv(f'fold_2024_{i}_filtered_test_data_cooling.csv')
    test_data_heating = pd.read_csv(f'fold_2024_{i}_filtered_test_data_heating.csv')
    evaluation_sets = (
        ('Combined', test_data_combined),
        ('Cooling', test_data_cooling),
        ('Heating', test_data_heating),
    )

    # Training target and feature matrix.
    y_train = train_data['WH_RTU_Total']
    X_train = train_data.drop(columns=['WH_RTU_Total'])

    # Search the grid with 3-fold cross-validation on the training split,
    # scored by (negated) mean absolute error.
    xgb = XGBRegressor(random_state=100)
    grid_search_xgb = GridSearchCV(
        estimator=xgb,
        param_grid=param_grid_xgb,
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1,
    )
    grid_search_xgb.fit(X_train, y_train)

    # Winning hyperparameters and the estimator built with them.
    best_params = grid_search_xgb.best_params_
    best_xgb = grid_search_xgb.best_estimator_

    # Score the tuned model on each test view.
    for dataset_type, test_data in evaluation_sets:
        y_test = test_data['WH_RTU_Total']
        X_test = test_data.drop(columns=['WH_RTU_Total'])

        y_pred = best_xgb.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        row = {
            'Fold': i,
            'Seed': 2024,
            'Model': 'XGBoost',
            'Dataset': dataset_type,
            'MAE': mae,
            'R²': r2,
            'Best Params': best_params,
            'Experiment': 1,
        }
        results_xgb.append(row)

        print(f"Fold {i}, Model XGBoost, Dataset {dataset_type}, MAE: {mae:.4f}, R²: {r2:.4f}, Best Params: {best_params}")

# One row per fold and test subset.
results_xgb_df = pd.DataFrame(results_xgb)

# Persist the per-fold detail.
results_xgb_df.to_csv(f'xgboost_2024_combined_hyperparameter_results_detailed.csv', index=False)

# Average the metrics over folds for each test subset.
print("\nSummary Results for XGBoost:")
summary_xgb = results_xgb_df.groupby(['Model', 'Dataset'])[['MAE', 'R²']].mean().reset_index()
print(summary_xgb)

# Persist the summary.
summary_xgb.to_csv(f'xgboost_2024_combined_hyperparameter_results_summary.csv', index=False)


Fold 1, Model XGBoost, Dataset Combined, MAE: 2.8112, R²: 0.8414, Best Params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.8}
Fold 1, Model XGBoost, Dataset Cooling, MAE: 2.4370, R²: 0.7708, Best Params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.8}
Fold 1, Model XGBoost, Dataset Heating, MAE: 3.2230, R²: 0.4889, Best Params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.8}
Fold 2, Model XGBoost, Dataset Combined, MAE: 3.1139, R²: 0.8392, Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Fold 2, Model XGBoost, Dataset Cooling, MAE: 2.8816, R²: 0.7372, Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Fold 2, Model XGBoost, Dataset Heating, MAE: 3.3486, R²: 0.5100, Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Fold 3, Model XGBoost, Dataset Combined, MAE: 2.7542, R²: 0.85

### Feature Importance for XGBoost

In [None]:
import pandas as pd
from xgboost import XGBRegressor

import ast

# Placeholder for feature importance results
feature_importance_results = []

# Tuning results written by the hyperparameter-search cell. The file is the
# same for every fold, so it is read once instead of once per iteration.
results_xgb_df = pd.read_csv('xgboost_2024_combined_hyperparameter_results_detailed.csv')

# Section 1: Calculate Feature Importance
for fold in range(1, 6):
    # Load training data
    train_data = pd.read_csv(f'fold_2024_{fold}_filtered_train_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Best hyperparameters for this fold. They are repeated on every row of
    # the fold, so the 'Combined' row is used as the representative.
    best_params = results_xgb_df.loc[
        (results_xgb_df['Fold'] == fold) & (results_xgb_df['Dataset'] == 'Combined'),
        'Best Params'
    ].iloc[0]
    # The CSV stores the dict as its string form. literal_eval parses Python
    # literals only, so unexpected file content cannot execute code as it
    # could with eval().
    best_params_dict = ast.literal_eval(best_params)

    # Initialize and train the model with the best parameters
    best_xgb = XGBRegressor(random_state=100, **best_params_dict)
    best_xgb.fit(X_train, y_train)

    # Calculate feature importance
    feature_importances = best_xgb.feature_importances_
    feature_names = X_train.columns
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances,
        'Fold': fold,
        'Seed': 2024,  # Seed value recorded with every row
        'Model': 'XGBoost',
        'Experiment': 1  # Experiment identifier
    })
    feature_importance_results.append(importance_df)

# Combine all feature importance results
feature_importance_combined = pd.concat(feature_importance_results, ignore_index=True)

# Save the feature importance results to a file
feature_importance_combined.to_csv('xgboost_2024_combined_feature_importance.csv', index=False)

print(f"Feature importance for seed 2024 has been successfully saved.")


Feature importance for seed 42 has been successfully saved.


### Generate residuals, predictions, label, T_out and RH_out

In [None]:
import pandas as pd
from xgboost import XGBRegressor

import ast

merged_data = []  # Placeholder for storing final merged data

# Tuned hyperparameters for all folds. The file does not change between
# iterations, so it is read once up front.
best_params_df = pd.read_csv('xgboost_2024_combined_hyperparameter_results_detailed.csv')

# Loop through each fold
for fold in range(1, 6):
    # Load training data
    train_data = pd.read_csv(f'fold_2024_{fold}_filtered_train_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Best hyperparameters for this fold (taken from its 'Combined' row)
    best_params = best_params_df.loc[(best_params_df['Fold'] == fold) & (best_params_df['Dataset'] == 'Combined'), 'Best Params'].iloc[0]
    # Parse the stored dict string with literal_eval rather than eval so the
    # file content is never executed as code.
    best_params_dict = ast.literal_eval(best_params)

    # Train XGBoost model
    best_xgb = XGBRegressor(random_state=100, **best_params_dict)
    best_xgb.fit(X_train, y_train)

    # Loop through Cooling and Heating datasets
    for dataset_type in ['cooling', 'heating']:
        # Load test data
        test_data = pd.read_csv(f'fold_2024_{fold}_filtered_test_data_{dataset_type}.csv')

        # Features only; the target stays in test_data for the residual below
        X_test = test_data.drop(columns=['WH_RTU_Total'])

        # Predict and calculate residuals (actual minus predicted)
        y_pred = best_xgb.predict(X_test)
        test_data['Predicted'] = y_pred
        test_data['Residual'] = test_data['WH_RTU_Total'] - test_data['Predicted']

        # Select and rename required columns
        if 'T_out' in test_data.columns and 'RH_out' in test_data.columns:
            selected_data = test_data[['WH_RTU_Total', 'Predicted', 'Residual', 'T_out', 'RH_out', 'label']].rename(columns={
                'WH_RTU_Total': 'Actual'
            })
        else:
            raise ValueError(f"'T_out' or 'RH_out' is missing in test data for fold {fold} and dataset {dataset_type}")

        # Add fold, seed, model, and experiment columns
        selected_data['Fold'] = fold
        selected_data['Seed'] = 2024
        selected_data['Model'] = 'XGBoost'
        selected_data['Experiment'] = 1

        # Append to final merged data
        merged_data.append(selected_data)
        print(f"Processed fold {fold}, dataset {dataset_type}.")

# Combine all folds and datasets into a single DataFrame
merged_final_data = pd.concat(merged_data, ignore_index=True)
# Save the merged data
merged_final_data.to_csv('xgboost_2024_combined_residual.csv', index=False)

print("All folds processed and merged data saved successfully.")


Processed fold 1, dataset cooling.
Processed fold 1, dataset heating.
Processed fold 2, dataset cooling.
Processed fold 2, dataset heating.
Processed fold 3, dataset cooling.
Processed fold 3, dataset heating.
Processed fold 4, dataset cooling.
Processed fold 4, dataset heating.
Processed fold 5, dataset cooling.
Processed fold 5, dataset heating.
All folds processed and merged data saved successfully.


## Stacking Regressor

### Stacking Regressor Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import pandas as pd

# Collects one result row per (fold, test subset).
results_stacking = []

# Candidate settings explored by the grid search. Keys use the
# "<estimator name>__<parameter>" form to reach into the stacked model.
param_grid_stacking = {
    'final_estimator__fit_intercept': [True, False],  # Meta-model (LinearRegression)
    'rf__n_estimators': [50, 100],                    # Random forest base model
    'rf__max_depth': [10, 20],                        # Random forest base model
    'gb__learning_rate': [0.01, 0.1],                 # Gradient boosting base model
    'gb__max_depth': [3, 6]                           # Gradient boosting base model
}

for i in range(1, 6):
    # Filtered training split and the three test views for this fold.
    train_data = pd.read_csv(f'fold_2024_{i}_filtered_train_data.csv')
    test_data_combined = pd.read_csv(f'fold_2024_{i}_filtered_test_data.csv')
    test_data_cooling = pd.read_csv(f'fold_2024_{i}_filtered_test_data_cooling.csv')
    test_data_heating = pd.read_csv(f'fold_2024_{i}_filtered_test_data_heating.csv')
    evaluation_sets = (
        ('Combined', test_data_combined),
        ('Cooling', test_data_cooling),
        ('Heating', test_data_heating),
    )

    # Training target and feature matrix.
    y_train = train_data['WH_RTU_Total']
    X_train = train_data.drop(columns=['WH_RTU_Total'])

    # Two seeded tree ensembles feed a linear-regression meta-learner.
    base_models = [
        ('rf', RandomForestRegressor(random_state=100)),
        ('gb', GradientBoostingRegressor(random_state=100))
    ]
    meta_model = LinearRegression()
    stacking = StackingRegressor(estimators=base_models, final_estimator=meta_model)

    # Search the grid with 3-fold cross-validation on the training split,
    # scored by (negated) mean absolute error.
    grid_search_stacking = GridSearchCV(
        estimator=stacking,
        param_grid=param_grid_stacking,
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1,
    )
    grid_search_stacking.fit(X_train, y_train)

    # Winning hyperparameters and the estimator built with them.
    best_params = grid_search_stacking.best_params_
    best_stacking = grid_search_stacking.best_estimator_

    # Score the tuned stack on each test view.
    for dataset_type, test_data in evaluation_sets:
        y_test = test_data['WH_RTU_Total']
        X_test = test_data.drop(columns=['WH_RTU_Total'])

        y_pred = best_stacking.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        row = {
            'Fold': i,
            'Seed': 2024,  # Seed value recorded with every row
            'Model': 'Stacking Regressor',
            'Dataset': dataset_type,
            'MAE': mae,
            'R²': r2,
            'Best Params': best_params,
            'Experiment': 1,  # Experiment identifier
        }
        results_stacking.append(row)

        print(f"Fold {i}, Model Stacking Regressor, Dataset {dataset_type}, MAE: {mae:.4f}, R²: {r2:.4f}, Best Params: {best_params}")

# One row per fold and test subset.
results_stacking_df = pd.DataFrame(results_stacking)

# Persist the per-fold detail.
results_stacking_df.to_csv('stacking_2024_combined_hyperparameter_results_detailed.csv', index=False)

# Average the metrics over folds for each test subset.
print("\nSummary Results for Stacking Regressor:")
summary_stacking = results_stacking_df.groupby(['Model', 'Dataset'])[['MAE', 'R²']].mean().reset_index()
print(summary_stacking)

# Persist the summary.
summary_stacking.to_csv('stacking_2024_combined_hyperparameter_results_summary.csv', index=False)




Fold 1, Model Stacking Regressor, Dataset Combined, MAE: 3.1322, R²: 0.8318, Best Params: {'final_estimator__fit_intercept': False, 'gb__learning_rate': 0.1, 'gb__max_depth': 3, 'rf__max_depth': 10, 'rf__n_estimators': 50}
Fold 1, Model Stacking Regressor, Dataset Cooling, MAE: 2.9098, R²: 0.7520, Best Params: {'final_estimator__fit_intercept': False, 'gb__learning_rate': 0.1, 'gb__max_depth': 3, 'rf__max_depth': 10, 'rf__n_estimators': 50}
Fold 1, Model Stacking Regressor, Dataset Heating, MAE: 3.3771, R²: 0.4724, Best Params: {'final_estimator__fit_intercept': False, 'gb__learning_rate': 0.1, 'gb__max_depth': 3, 'rf__max_depth': 10, 'rf__n_estimators': 50}
Fold 2, Model Stacking Regressor, Dataset Combined, MAE: 2.8176, R²: 0.8462, Best Params: {'final_estimator__fit_intercept': False, 'gb__learning_rate': 0.1, 'gb__max_depth': 6, 'rf__max_depth': 20, 'rf__n_estimators': 50}
Fold 2, Model Stacking Regressor, Dataset Cooling, MAE: 2.4591, R²: 0.7469, Best Params: {'final_estimator__fi

### Feature Importance for the Combined Stacking Regressor

In [None]:
import ast

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor

# One importance table per (fold, base model) is appended here
feature_importance_results = []

# The tuning results file is the same for every fold, so read it once up front
results_stacking_df = pd.read_csv('stacking_2024_combined_hyperparameter_results_detailed.csv')


def _params_for(prefix, params):
    """Return the entries of ``params`` whose key starts with ``prefix``, prefix removed.

    Grid-search keys look like ``'rf__max_depth'``; the estimator constructors
    expect the bare name (``'max_depth'``).
    """
    return {key[len(prefix):]: value for key, value in params.items() if key.startswith(prefix)}


# Section 1: Calculate Feature Importance
for fold in range(1, 6):
    # Load training data
    train_data = pd.read_csv(f'fold_2024_{fold}_filtered_train_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Look up the tuned parameters recorded for this fold on the combined dataset
    best_params = results_stacking_df.loc[
        (results_stacking_df['Fold'] == fold) & (results_stacking_df['Dataset'] == 'Combined'),
        'Best Params'
    ].iloc[0]
    # The CSV stores the dict as its repr string. literal_eval parses Python
    # literals only, so a tampered file cannot run arbitrary code (unlike eval).
    best_params_dict = ast.literal_eval(best_params)

    # Rebuild the base models and meta-model with their share of the tuned parameters
    base_models = [
        ('rf', RandomForestRegressor(random_state=100, **_params_for('rf__', best_params_dict))),
        ('gb', GradientBoostingRegressor(random_state=100, **_params_for('gb__', best_params_dict))),
    ]
    meta_model = LinearRegression(**_params_for('final_estimator__', best_params_dict))

    # Define Stacking Regressor and fit the model
    stacking = StackingRegressor(estimators=base_models, final_estimator=meta_model)
    stacking.fit(X_train, y_train)

    # Collect importances from every fitted base model that exposes them
    for model_name, model in stacking.named_estimators_.items():
        if not hasattr(model, 'feature_importances_'):
            continue
        importance_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': model.feature_importances_,
            'Model': model_name,
            'Fold': fold,
            'Seed': 2024,       # seed used for the fold split
            'Experiment': 1     # experiment number, for reference when merging results
        })
        feature_importance_results.append(importance_df)

# Combine all feature importance results
feature_importance_combined = pd.concat(feature_importance_results, ignore_index=True)

# Save the feature importance results to a file
feature_importance_combined.to_csv('stacking_2024_combined_feature_importance.csv', index=False)

print("Feature importance for seed 2024 has been successfully saved.")


Feature importance for seed 42 has been successfully saved.


### Generate residual, prediction, label, T_out and RH_out

In [None]:
import ast

import pandas as pd
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression



merged_data = []  # One residual table per (fold, dataset) is appended here

# Load the best model parameters for each fold
best_params_df = pd.read_csv('stacking_2024_combined_hyperparameter_results_detailed.csv')

# Loop through each fold
for fold in range(1, 6):
    # Load training data
    train_data = pd.read_csv(f'fold_2024_{fold}_filtered_train_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # The results file has one row per dataset for each fold; select the
    # 'Combined' row explicitly instead of relying on row order.
    best_params = best_params_df.loc[
        (best_params_df['Fold'] == fold) & (best_params_df['Dataset'] == 'Combined'),
        'Best Params'
    ].iloc[0]
    # Parse the stored dict repr with literal_eval (literals only) rather than
    # eval, which would execute whatever the CSV cell contains.
    best_params_dict = ast.literal_eval(best_params)

    # Define base models and Stacking Regressor with best parameters
    base_models = [
        ('rf', RandomForestRegressor(random_state=100,
                                     n_estimators=best_params_dict['rf__n_estimators'],
                                     max_depth=best_params_dict['rf__max_depth'])),
        ('gb', GradientBoostingRegressor(random_state=100,
                                         learning_rate=best_params_dict['gb__learning_rate'],
                                         max_depth=best_params_dict['gb__max_depth']))
    ]
    meta_model = LinearRegression(fit_intercept=best_params_dict['final_estimator__fit_intercept'])
    best_stacking = StackingRegressor(estimators=base_models, final_estimator=meta_model)

    # Train the Stacking Regressor
    best_stacking.fit(X_train, y_train)

    # Evaluate the combined-data model separately on the cooling and heating test sets
    for dataset_type in ['cooling', 'heating']:
        # Load test data
        test_data = pd.read_csv(f'fold_2024_{fold}_filtered_test_data_{dataset_type}.csv')

        # Fail before predicting if the weather columns needed in the output are absent
        if 'T_out' not in test_data.columns or 'RH_out' not in test_data.columns:
            raise ValueError(f"'T_out' or 'RH_out' is missing in test data for fold {fold} and dataset {dataset_type}")

        # Every column except the target is fed to the model
        X_test = test_data.drop(columns=['WH_RTU_Total'])

        # Predict and calculate residuals (actual minus predicted)
        test_data['Predicted'] = best_stacking.predict(X_test)
        test_data['Residual'] = test_data['WH_RTU_Total'] - test_data['Predicted']

        # Keep only the columns needed for residual analysis
        selected_data = test_data[['WH_RTU_Total', 'Predicted', 'Residual', 'T_out', 'RH_out', 'label']].rename(columns={
            'WH_RTU_Total': 'Actual'
        })

        # Add fold, seed, model and experiment identifiers
        selected_data['Fold'] = fold
        selected_data['Seed'] = 2024
        selected_data['Model'] = 'StackingRegressor'
        selected_data['Experiment'] = 1

        # Append to final merged data
        merged_data.append(selected_data)
        print(f"Processed fold {fold}, dataset {dataset_type}.")

# Combine all folds and datasets into a single DataFrame
merged_final_data = pd.concat(merged_data, ignore_index=True)
# Save the merged data
merged_final_data.to_csv('stacking_2024_combined_residual.csv', index=False)

print("All folds processed and merged data saved successfully.")


Processed fold 1, dataset cooling.
Processed fold 1, dataset heating.
Processed fold 2, dataset cooling.
Processed fold 2, dataset heating.
Processed fold 3, dataset cooling.
Processed fold 3, dataset heating.
Processed fold 4, dataset cooling.
Processed fold 4, dataset heating.
Processed fold 5, dataset cooling.
Processed fold 5, dataset heating.
All folds processed and merged data saved successfully.


# Cooling Experiment

Repeat the same model-performance evaluation for the cooling data: first divide the cooling data into 5 folds and generate different training and testing groups.

In [None]:
import pandas as pd
from sklearn.model_selection import KFold

# Load the cooling dataset
cooling_data_with_interactions_2024 = pd.read_csv('cooling_data_with_interactions_2024.csv')

# Shuffled 5-fold splitter with a fixed seed so the splits are reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=2024)

# Materialise every (train, test) pair before writing anything to disk
folds = [
    (
        cooling_data_with_interactions_2024.iloc[train_index],
        cooling_data_with_interactions_2024.iloc[test_index],
    )
    for train_index, test_index in kf.split(cooling_data_with_interactions_2024)
]

# Write each pair to its own CSV files; fold numbers in file names start at 1
for fold_number, (train_data, test_data) in enumerate(folds, start=1):
    train_data.to_csv(f'cooling_train_2024_fold{fold_number}.csv', index=False)
    test_data.to_csv(f'cooling_test_2024_fold{fold_number}.csv', index=False)

print("Training and test data for 5 folds with seed 2024 have been generated and saved.")


Training and test data for 5 folds with seed 42 have been generated and saved.


do the feature selection solely based on the training data

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Function to perform feature selection
def feature_selection(train_data, target_column, preselected_features, k=25):
    """Rank features by random-forest importance and keep at most ``k`` of them.

    A ``RandomForestRegressor`` (100 trees, ``random_state=100``) is fitted on
    every column of ``train_data`` except ``target_column`` and ``'TIMESTAMP'``.
    Features named in ``preselected_features`` are always kept when they exist
    in the data; the remaining slots are filled with the highest-importance
    other features.

    Args:
        train_data: DataFrame holding the feature columns and the target column.
        target_column: Name of the column to predict.
        preselected_features: Feature names that must be kept regardless of
            their importance rank.
        k: Total number of features to return, preselected ones included.

    Returns:
        DataFrame with columns ``'Feature'`` and ``'Importance'``: the
        preselected features first, then the top remaining ones, each group
        ordered by descending importance.

    Raises:
        KeyError: If ``target_column`` is not a column of ``train_data``.
    """
    # Separate features and target ('TIMESTAMP' is dropped only if present)
    X = train_data.drop(columns=[target_column, 'TIMESTAMP'], errors='ignore')
    y = train_data[target_column]

    # Train a RandomForestRegressor model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=100)
    rf_model.fit(X, y)

    # Pair each feature with its importance score, most important first
    ranked = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_model.feature_importances_,
    }).sort_values(by='Importance', ascending=False)

    # Always-kept features (only those actually present in the data)
    is_preselected = ranked['Feature'].isin(preselected_features)
    preselected_df = ranked[is_preselected]

    # Count the preselected rows that were actually found, so a missing
    # preselected name does not shrink the result below k. Clamp at zero
    # because DataFrame.head(-n) returns all but the last n rows rather
    # than an empty frame.
    n_remaining = max(k - len(preselected_df), 0)
    top_remaining_features = ranked[~is_preselected].head(n_remaining)

    # Combine preselected features with the top remaining features
    return pd.concat([preselected_df, top_remaining_features])

# Run the feature selection once per cooling fold, using that fold's training split only
target_column = 'WH_RTU_Total'
preselected_features = ['label', 'T_out', 'RH_out']

for fold_id in range(1, 6):
    # Input and output file names share this per-fold stem
    fold_stem = f'cooling_train_2024_fold{fold_id}'
    fold_train = pd.read_csv(f'{fold_stem}.csv')

    # Rank the features and persist the selection for the later filtering step
    ranked_features = feature_selection(fold_train, target_column, preselected_features, k=25)
    ranked_features.to_csv(f'{fold_stem}_top25_features.csv', index=False)

    print(f"Top 25 features for cooling fold {fold_id} with seed 2024 saved.")


Top 25 features for cooling fold 1 with seed 42 saved.
Top 25 features for cooling fold 2 with seed 42 saved.
Top 25 features for cooling fold 3 with seed 42 saved.
Top 25 features for cooling fold 4 with seed 42 saved.
Top 25 features for cooling fold 5 with seed 42 saved.


prepare the cooling dataset's training and test data

In [None]:
import pandas as pd

# Column the models are trained to predict
target_column = 'WH_RTU_Total'

for fold_id in range(1, 6):
    # Unfiltered per-fold splits written by the KFold step
    raw_train = pd.read_csv(f'cooling_train_2024_fold{fold_id}.csv')
    raw_test = pd.read_csv(f'cooling_test_2024_fold{fold_id}.csv')

    # Feature names selected for this fold, followed by the target column
    feature_table = pd.read_csv(f'cooling_train_2024_fold{fold_id}_top25_features.csv')
    keep_columns = [*feature_table['Feature'].tolist(), target_column]

    # Training split restricted to the selected columns
    raw_train[keep_columns].to_csv(f'fold{fold_id}_cooling_filtered_train_2024_data.csv', index=False)
    print(f"Filtered cooling train data for fold {fold_id} saved.")

    # Test split restricted to exactly the same columns
    raw_test[keep_columns].to_csv(f'fold{fold_id}_cooling_filtered_test_2024_data.csv', index=False)
    print(f"Filtered cooling test data for fold {fold_id} saved.")



Filtered cooling train data for fold 1 saved.
Filtered cooling test data for fold 1 saved.
Filtered cooling train data for fold 2 saved.
Filtered cooling test data for fold 2 saved.
Filtered cooling train data for fold 3 saved.
Filtered cooling test data for fold 3 saved.
Filtered cooling train data for fold 4 saved.
Filtered cooling test data for fold 4 saved.
Filtered cooling train data for fold 5 saved.
Filtered cooling test data for fold 5 saved.


Generate the MAE and R² values for the four models: linear regression, random forest, XGBoost, and the stacking ensemble regressor.

In [None]:
!pip install pandas numpy scikit-learn xgboost
!pip install scikit-learn==1.0.2
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
# One metrics row per (fold, model) is collected here
results = []

# Column the models are trained to predict
target_column = 'WH_RTU_Total'


def build_default_models():
    """Return fresh, untuned estimators keyed by the name used in the results."""
    return {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=100),
        'XGBoost': XGBRegressor(random_state=100),
        # No final_estimator is passed, so scikit-learn's default meta-model is used
        'Stacking Regressor': StackingRegressor(
            estimators=[
                ('lr', LinearRegression()),
                ('rf', RandomForestRegressor(random_state=100)),
                ('xgb', XGBRegressor(random_state=100)),
            ]
        ),
    }


for i in range(1, 6):
    # Filtered cooling splits for this fold
    train_data = pd.read_csv(f'fold{i}_cooling_filtered_train_2024_data.csv')
    test_data = pd.read_csv(f'fold{i}_cooling_filtered_test_2024_data.csv')

    # Everything except the target column is a feature
    X_train, y_train = train_data.drop(columns=[target_column]), train_data[target_column]
    X_test, y_test = test_data.drop(columns=[target_column]), test_data[target_column]

    # Fresh estimators every fold so nothing carries over between folds
    models = build_default_models()

    for model_name, model in models.items():
        # Fit on the training split, then score on the held-out split
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        row = {
            'Fold': i,
            'Seed': 2024,
            'Model': model_name,
            'Dataset': 'Cooling',
            'MAE': mae,
            'R²': r2,
            'Experiment': 2,
        }
        results.append(row)

        print(f"Fold {i}, Model {model_name}, MAE: {mae:.4f}, R²: {r2:.4f}")

# Tabulate and persist the per-fold metrics
results_df = pd.DataFrame(results)
results_df.to_csv('cooling_data_results_mae_r2.csv', index=False)

# Mean metrics per model, lowest MAE first
print("\nSummary Results for All Models (Cooling Data):")
summary = results_df.groupby('Model').agg({'MAE': 'mean', 'R²': 'mean'}).sort_values(by='MAE')
print(summary)


Fold 1, Model Linear Regression, MAE: 5.1285, R²: 0.4158
Fold 1, Model Random Forest, MAE: 2.8695, R²: 0.6677
Fold 1, Model XGBoost, MAE: 2.7304, R²: 0.7010
Fold 1, Model Stacking Regressor, MAE: 2.7941, R²: 0.7029
Fold 2, Model Linear Regression, MAE: 4.6839, R²: 0.4979
Fold 2, Model Random Forest, MAE: 2.2330, R²: 0.7723
Fold 2, Model XGBoost, MAE: 2.3624, R²: 0.7604
Fold 2, Model Stacking Regressor, MAE: 2.2203, R²: 0.7794
Fold 3, Model Linear Regression, MAE: 4.7205, R²: 0.5362
Fold 3, Model Random Forest, MAE: 2.1603, R²: 0.8138
Fold 3, Model XGBoost, MAE: 2.1716, R²: 0.8179
Fold 3, Model Stacking Regressor, MAE: 2.1324, R²: 0.8268
Fold 4, Model Linear Regression, MAE: 4.7932, R²: 0.4408
Fold 4, Model Random Forest, MAE: 2.2995, R²: 0.7602
Fold 4, Model XGBoost, MAE: 2.3938, R²: 0.7565
Fold 4, Model Stacking Regressor, MAE: 2.2701, R²: 0.7700
Fold 5, Model Linear Regression, MAE: 4.7752, R²: 0.4710
Fold 5, Model Random Forest, MAE: 2.1781, R²: 0.8128
Fold 5, Model XGBoost, MAE: 2.

## Random Forest

### hyper parameter tuning for cooling random forest

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# One result row per fold is collected here
results_rf = []

# Candidate Random Forest settings explored by the grid search
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
}

for i in range(1, 6):
    # Filtered cooling splits for this fold
    train_data = pd.read_csv(f'fold{i}_cooling_filtered_train_2024_data.csv')
    test_data = pd.read_csv(f'fold{i}_cooling_filtered_test_2024_data.csv')

    # Training features / target
    X_train, y_train = train_data.drop(columns=['WH_RTU_Total']), train_data['WH_RTU_Total']

    # 3-fold cross-validated grid search on the training split, scored by negative MAE
    grid_search_rf = GridSearchCV(
        RandomForestRegressor(random_state=100),
        param_grid_rf,
        cv=3,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
    )
    grid_search_rf.fit(X_train, y_train)

    # Best estimator and the parameters that produced it
    best_rf = grid_search_rf.best_estimator_
    best_params = grid_search_rf.best_params_

    # Held-out features / target
    X_test, y_test = test_data.drop(columns=['WH_RTU_Total']), test_data['WH_RTU_Total']

    # Score the best estimator on the held-out split
    y_pred = best_rf.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    fold_row = {
        'Fold': i,
        'Seed': 2024,
        'Model': 'Random Forest',
        'Dataset': 'Cooling',
        'MAE': mae,
        'R²': r2,
        'Best Params': best_params,
        'Experiment': 2,
    }
    results_rf.append(fold_row)

    print(f"Fold {i}, Model Random Forest, Dataset Cooling, MAE: {mae:.4f}, R²: {r2:.4f}, Best Params: {best_params}")

# Per-fold details, including the chosen parameters
results_rf_df = pd.DataFrame(results_rf)
results_rf_df.to_csv('random_forest_2024_cooling_hyperparameter_results_detailed.csv', index=False)

# Mean metrics across folds
print("\nSummary Results for Random Forest:")
summary_rf = (
    results_rf_df
    .groupby(['Model', 'Dataset'])
    .agg({'MAE': 'mean', 'R²': 'mean'})
    .reset_index()
)
print(summary_rf)

summary_rf.to_csv('random_forest_2024_cooling_summary_results.csv', index=False)


Fold 1, Model Random Forest, Dataset Cooling, MAE: 2.8370, R²: 0.6721, Best Params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 150}
Fold 2, Model Random Forest, Dataset Cooling, MAE: 2.2280, R²: 0.7719, Best Params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 150}
Fold 3, Model Random Forest, Dataset Cooling, MAE: 2.1651, R²: 0.8123, Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 150}
Fold 4, Model Random Forest, Dataset Cooling, MAE: 2.3034, R²: 0.7583, Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 150}
Fold 5, Model Random Forest, Dataset Cooling, MAE: 2.1723, R²: 0.8089, Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 150}

Summary Results for Random Forest:
           Model  Dataset      MAE        R²
0  Random Forest  Cooling  2.34115  0.764705


### Feature Importance for Random Forest

In [None]:
import ast

import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# One importance table per fold is appended here
feature_importance_results = []

experiment = 2  # Experiment number

# The tuning results file is the same for every fold, so read it once up front
results_rf_df = pd.read_csv('random_forest_2024_cooling_hyperparameter_results_detailed.csv')

# Section 1: Calculate Feature Importance
for fold in range(1, 6):
    # Load training data
    train_data = pd.read_csv(f'fold{fold}_cooling_filtered_train_2024_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Look up the tuned parameters recorded for this fold
    best_params = results_rf_df.loc[
        (results_rf_df['Fold'] == fold) & (results_rf_df['Dataset'] == 'Cooling'),
        'Best Params'
    ].iloc[0]
    # The CSV stores the dict as its repr string. literal_eval parses Python
    # literals only, so a tampered file cannot run arbitrary code (unlike eval).
    best_params_dict = ast.literal_eval(best_params)

    # Train Random Forest model with best parameters
    best_rf = RandomForestRegressor(random_state=100, **best_params_dict)
    best_rf.fit(X_train, y_train)

    # Importances of the refitted model, one row per feature
    importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': best_rf.feature_importances_,
        'Fold': fold,
        'Seed': 2024,  # Add seed column
        'Model': 'Random Forest',
        'Experiment': experiment  # Add experiment column
    })
    feature_importance_results.append(importance_df)

# Combine and save feature importance results
feature_importance_combined = pd.concat(feature_importance_results, ignore_index=True)
feature_importance_combined.to_csv(f'random_forest_2024_cooling_feature_importance_experiment_{experiment}.csv', index=False)

print(f"Feature importance for seed 2024 and experiment {experiment} has been successfully saved.")

Feature importance for seed 42 and experiment 2 has been successfully saved.


### Residual, prediction, label, T_out and RH_out

In [None]:
import ast

import pandas as pd
from sklearn.ensemble import RandomForestRegressor


experiment = 2  # Experiment number
merged_data = []  # One residual table per fold is appended here

# The tuning results file is the same for every fold, so read it once up front
best_params_df = pd.read_csv('random_forest_2024_cooling_hyperparameter_results_detailed.csv')

# Loop through each fold
for fold in range(1, 6):
    # Load training data
    train_data = pd.read_csv(f'fold{fold}_cooling_filtered_train_2024_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Look up the tuned parameters recorded for this fold
    best_params = best_params_df.loc[
        (best_params_df['Fold'] == fold) & (best_params_df['Dataset'] == 'Cooling'),
        'Best Params'
    ].iloc[0]
    # Parse the stored dict repr with literal_eval (literals only) rather than
    # eval, which would execute whatever the CSV cell contains.
    best_params_dict = ast.literal_eval(best_params)

    # Train Random Forest model with best parameters
    best_rf = RandomForestRegressor(random_state=100, **best_params_dict)
    best_rf.fit(X_train, y_train)

    # Load test data and tag every row; this experiment uses cooling data only
    test_data = pd.read_csv(f'fold{fold}_cooling_filtered_test_2024_data.csv')
    test_data['Label'] = 'Cooling'

    # Model inputs: everything except the target and the tag added above
    X_test = test_data.drop(columns=['WH_RTU_Total', 'Label'])

    # Predict and calculate residuals (actual minus predicted)
    test_data['Predicted'] = best_rf.predict(X_test)
    test_data['Residual'] = test_data['WH_RTU_Total'] - test_data['Predicted']

    # The weather columns are required in the output; report exactly which are absent
    missing_columns = [col for col in ['T_out', 'RH_out'] if col not in test_data.columns]
    if missing_columns:
        raise ValueError(f"{', '.join(missing_columns)} is/are missing in test data for fold {fold}")

    # Keep only the columns needed for residual analysis
    selected_data = test_data[['WH_RTU_Total', 'Predicted', 'Residual', 'T_out', 'RH_out', 'Label']].rename(columns={
        'WH_RTU_Total': 'Actual'
    })

    # Add fold, seed, model and experiment identifiers
    selected_data['Fold'] = fold
    selected_data['Seed'] = 2024
    selected_data['Model'] = 'Random Forest'
    selected_data['Experiment'] = experiment

    # Append to final merged data
    merged_data.append(selected_data)
    print(f"Processed fold {fold}.")

# Combine all folds into a single DataFrame
merged_final_data = pd.concat(merged_data, ignore_index=True)

# Save the merged data
output_file = 'random_forest_2024_cooling_residual.csv'
merged_final_data.to_csv(output_file, index=False)

print(f"All folds processed and merged data saved successfully to {output_file}.")


Processed fold 1.
Processed fold 2.
Processed fold 3.
Processed fold 4.
Processed fold 5.
All folds processed and merged data saved successfully to random_forest_42_cooling_residual.csv.


## XGBOOST

### hyper parameter tuning for cooling XGBoost

In [None]:
!pip install pandas numpy scikit-learn xgboost
!pip install scikit-learn==1.0.2
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score

# One result row per fold is collected here
results_xgb = []
experiment = 2  # Experiment number

# Candidate XGBoost settings explored by the grid search
param_grid_xgb = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
}

for i in range(1, 6):
    # Filtered cooling splits for this fold
    train_data = pd.read_csv(f'fold{i}_cooling_filtered_train_2024_data.csv')
    test_data = pd.read_csv(f'fold{i}_cooling_filtered_test_2024_data.csv')

    # Training features / target
    X_train, y_train = train_data.drop(columns=['WH_RTU_Total']), train_data['WH_RTU_Total']

    # 3-fold cross-validated grid search on the training split, scored by negative MAE
    grid_search_xgb = GridSearchCV(
        XGBRegressor(random_state=100),
        param_grid_xgb,
        cv=3,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
    )
    grid_search_xgb.fit(X_train, y_train)

    # Best estimator and the parameters that produced it
    best_xgb = grid_search_xgb.best_estimator_
    best_params = grid_search_xgb.best_params_

    # Held-out features / target
    X_test, y_test = test_data.drop(columns=['WH_RTU_Total']), test_data['WH_RTU_Total']

    # Score the best estimator on the held-out split
    y_pred = best_xgb.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    fold_row = {
        'Fold': i,
        'Seed': 2024,
        'Model': 'XGBoost',
        'Dataset': 'Cooling',
        'MAE': mae,
        'R²': r2,
        'Best Params': best_params,
        'Experiment': experiment,
    }
    results_xgb.append(fold_row)

    print(f"Fold {i}, Model XGBoost, Dataset Cooling, MAE: {mae:.4f}, R²: {r2:.4f}, Best Params: {best_params}")

# Per-fold details, including the chosen parameters
results_xgb_df = pd.DataFrame(results_xgb)
results_xgb_df.to_csv('xgboost_2024_cooling_hyperparameter_results_detailed.csv', index=False)

# Mean metrics across folds
print("\nSummary Results for XGBoost:")
summary_xgb = (
    results_xgb_df
    .groupby(['Model', 'Dataset'])
    .agg({'MAE': 'mean', 'R²': 'mean'})
    .reset_index()
)
print(summary_xgb)

summary_xgb.to_csv('xgboost_2024_cooling_summary_results.csv', index=False)


Fold 1, Model XGBoost, Dataset Cooling, MAE: 2.6462, R²: 0.6911, Best Params: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 150, 'subsample': 0.8}
Fold 2, Model XGBoost, Dataset Cooling, MAE: 2.1649, R²: 0.7814, Best Params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'subsample': 0.8}
Fold 3, Model XGBoost, Dataset Cooling, MAE: 1.9818, R²: 0.8410, Best Params: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 150, 'subsample': 0.8}
Fold 4, Model XGBoost, Dataset Cooling, MAE: 2.2875, R²: 0.7630, Best Params: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 150, 'subsample': 0.8}
Fold 5, Model XGBoost, Dataset Cooling, MAE: 2.1282, R²: 0.8167, Best Params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'subsample': 0.8}

Summary Results for XGBoost:
     Model  Dataset       MAE        R²
0  XGBoost  Cooling  2.241726  0.778636


### Feature Importance

In [None]:
import ast

import pandas as pd
from xgboost import XGBRegressor

# One importance table per fold is appended here
feature_importance_results = []


experiment = 2  # Experiment number

# The tuning results file is the same for every fold, so read it once up front
results_xgb_df = pd.read_csv('xgboost_2024_cooling_hyperparameter_results_detailed.csv')

# Section 1: Calculate Feature Importance
for fold in range(1, 6):
    # Load training data
    train_data = pd.read_csv(f'fold{fold}_cooling_filtered_train_2024_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Look up the tuned parameters recorded for this fold
    best_params = results_xgb_df.loc[
        (results_xgb_df['Fold'] == fold) & (results_xgb_df['Dataset'] == 'Cooling'),
        'Best Params'
    ].iloc[0]
    # The CSV stores the dict as its repr string. literal_eval parses Python
    # literals only, so a tampered file cannot run arbitrary code (unlike eval).
    best_params_dict = ast.literal_eval(best_params)

    # Initialize and train the model with the best parameters
    best_xgb = XGBRegressor(random_state=100, **best_params_dict)
    best_xgb.fit(X_train, y_train)

    # Importances of the refitted model, one row per feature
    importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': best_xgb.feature_importances_,
        'Fold': fold,
        'Seed': 2024,  # Add seed column
        'Model': 'XGBoost',
        'Experiment': experiment  # Add experiment column
    })
    feature_importance_results.append(importance_df)

# Combine all feature importance results
feature_importance_combined = pd.concat(feature_importance_results, ignore_index=True)

# Save the feature importance results to a file
feature_importance_combined.to_csv(f'xgboost_2024_cooling_feature_importance_experiment_{experiment}.csv', index=False)

print(f"Feature importance for seed 2024 and experiment {experiment} has been successfully saved.")

Feature importance for seed 42 and experiment 2 has been successfully saved.


### Residual analysis

In [None]:
import ast

import pandas as pd
from xgboost import XGBRegressor


experiment = 2  # Experiment number
merged_data = []  # One residual table per fold is appended here

# The tuning results file is the same for every fold, so read it once up front
best_params_df = pd.read_csv('xgboost_2024_cooling_hyperparameter_results_detailed.csv')

# Loop through each fold
for fold in range(1, 6):
    # Load training data
    train_data = pd.read_csv(f'fold{fold}_cooling_filtered_train_2024_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Look up the tuned parameters recorded for this fold
    best_params = best_params_df.loc[
        (best_params_df['Fold'] == fold) & (best_params_df['Dataset'] == 'Cooling'),
        'Best Params'
    ].iloc[0]
    # Parse the stored dict repr with literal_eval (literals only) rather than
    # eval, which would execute whatever the CSV cell contains.
    best_params_dict = ast.literal_eval(best_params)

    # Train XGBoost model with best parameters
    best_xgb = XGBRegressor(random_state=100, **best_params_dict)
    best_xgb.fit(X_train, y_train)

    # Load test data and tag every row; this experiment uses cooling data only
    test_data = pd.read_csv(f'fold{fold}_cooling_filtered_test_2024_data.csv')
    test_data['Label'] = 'Cooling'

    # Model inputs: everything except the target and the tag added above
    X_test = test_data.drop(columns=['WH_RTU_Total', 'Label'])

    # Predict and calculate residuals (actual minus predicted)
    test_data['Predicted'] = best_xgb.predict(X_test)
    test_data['Residual'] = test_data['WH_RTU_Total'] - test_data['Predicted']

    # The weather columns are required in the output; report exactly which are absent
    missing_columns = [col for col in ['T_out', 'RH_out'] if col not in test_data.columns]
    if missing_columns:
        raise ValueError(f"{', '.join(missing_columns)} is/are missing in test data for fold {fold}")

    # Keep only the columns needed for residual analysis
    selected_data = test_data[['WH_RTU_Total', 'Predicted', 'Residual', 'T_out', 'RH_out', 'Label']].rename(columns={
        'WH_RTU_Total': 'Actual'
    })

    # Add fold, seed, model and experiment identifiers
    selected_data['Fold'] = fold
    selected_data['Seed'] = 2024
    selected_data['Model'] = 'XGBoost'
    selected_data['Experiment'] = experiment

    # Append to final merged data
    merged_data.append(selected_data)
    print(f"Processed fold {fold}.")

# Combine all folds into a single DataFrame
merged_final_data = pd.concat(merged_data, ignore_index=True)

# Save the merged data
output_file = 'xgboost_2024_cooling_residual.csv'
merged_final_data.to_csv(output_file, index=False)

print(f"All folds processed and merged data saved successfully to {output_file}.")


Processed fold 1.
Processed fold 2.
Processed fold 3.
Processed fold 4.
Processed fold 5.
All folds processed and merged data saved successfully to xgboost_42_cooling_residual.csv.


## Stacking Regressor

### hyper parameter tuning for cooling Stacking Regressor

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Experiment 2 (cooling subset): per-fold grid search over a stacked
# RandomForest + GradientBoosting ensemble with a LinearRegression meta-model.
experiment = 2

# One metrics row is appended per fold.
results_stacking = []

# Search space; the "<name>__" prefixes route each value to the matching
# estimator inside the StackingRegressor.
param_grid_stacking = {
    'final_estimator__fit_intercept': [True, False],
    'rf__n_estimators': [50, 100],
    'rf__max_depth': [10, 20],
    'gb__learning_rate': [0.01, 0.1],
    'gb__max_depth': [3, 6],
}

for fold in range(1, 6):
    # Per-fold train/test tables; 'WH_RTU_Total' is the regression target.
    train_data = pd.read_csv(f'fold{fold}_cooling_filtered_train_2024_data.csv')
    test_data = pd.read_csv(f'fold{fold}_cooling_filtered_test_2024_data.csv')
    X_train, y_train = train_data.drop(columns=['WH_RTU_Total']), train_data['WH_RTU_Total']

    # Fresh, untuned stack for this fold.
    stacking = StackingRegressor(
        estimators=[
            ('rf', RandomForestRegressor(random_state=100)),
            ('gb', GradientBoostingRegressor(random_state=100)),
        ],
        final_estimator=LinearRegression(),
    )

    # 3-fold inner cross-validation on the training split, scored by negated MAE.
    grid_search_stacking = GridSearchCV(
        stacking,
        param_grid_stacking,
        cv=3,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
    )
    grid_search_stacking.fit(X_train, y_train)
    best_stacking = grid_search_stacking.best_estimator_
    best_params = grid_search_stacking.best_params_

    # Evaluate the best estimator on the held-out split.
    X_test, y_test = test_data.drop(columns=['WH_RTU_Total']), test_data['WH_RTU_Total']
    y_pred = best_stacking.predict(X_test)
    mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

    fold_row = {
        'Fold': fold,
        'Seed': 2024,
        'Model': 'Stacking Regressor',
        'Dataset': 'Cooling',
        'MAE': mae,
        'R²': r2,
        'Best Params': best_params,
        'Experiment': experiment,
    }
    results_stacking.append(fold_row)

    print(f"Fold {fold}, Model Stacking Regressor, Dataset Cooling, MAE: {mae:.4f}, R²: {r2:.4f}, Best Params: {best_params}")

# Per-fold details (including the chosen hyperparameters) go to CSV.
results_stacking_df = pd.DataFrame(results_stacking)
results_stacking_df.to_csv('stacking_2024_cooling_hyperparameter_results_detailed.csv', index=False)

# Mean MAE / R² across folds, printed and saved.
print("\nSummary Results for Stacking Regressor:")
summary_stacking = (
    results_stacking_df
    .groupby(['Model', 'Dataset'])[['MAE', 'R²']]
    .mean()
    .reset_index()
)
print(summary_stacking)
summary_stacking.to_csv('stacking_2024_cooling_summary_results.csv', index=False)


Fold 1, Model Stacking Regressor, Dataset Cooling, MAE: 2.7825, R²: 0.6955, Best Params: {'final_estimator__fit_intercept': True, 'gb__learning_rate': 0.1, 'gb__max_depth': 6, 'rf__max_depth': 20, 'rf__n_estimators': 100}
Fold 2, Model Stacking Regressor, Dataset Cooling, MAE: 2.2096, R²: 0.7816, Best Params: {'final_estimator__fit_intercept': True, 'gb__learning_rate': 0.1, 'gb__max_depth': 6, 'rf__max_depth': 10, 'rf__n_estimators': 100}
Fold 3, Model Stacking Regressor, Dataset Cooling, MAE: 2.1008, R²: 0.8251, Best Params: {'final_estimator__fit_intercept': True, 'gb__learning_rate': 0.1, 'gb__max_depth': 6, 'rf__max_depth': 10, 'rf__n_estimators': 100}
Fold 4, Model Stacking Regressor, Dataset Cooling, MAE: 2.2537, R²: 0.7683, Best Params: {'final_estimator__fit_intercept': False, 'gb__learning_rate': 0.1, 'gb__max_depth': 6, 'rf__max_depth': 10, 'rf__n_estimators': 100}
Fold 5, Model Stacking Regressor, Dataset Cooling, MAE: 2.1195, R²: 0.8146, Best Params: {'final_estimator__fit

### Feature Importance

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor

import ast

# Experiment 2 (cooling subset): refit each fold's tuned stacking model and
# record the feature importances exposed by its fitted base estimators.
experiment = 2  # Experiment number

# One importance table is appended per (fold, base model).
feature_importance_results = []

# The tuning results are the same for every fold, so read the file once.
results_stacking_df = pd.read_csv('stacking_2024_cooling_hyperparameter_results_detailed.csv')


def _params_with_prefix(params, prefix):
    """Return the entries of *params* whose key starts with *prefix*, with the prefix removed."""
    return {
        key[len(prefix):]: value
        for key, value in params.items()
        if key.startswith(prefix)
    }


for fold in range(1, 6):
    # Load training data; 'WH_RTU_Total' is the regression target.
    train_data = pd.read_csv(f'fold{fold}_cooling_filtered_train_2024_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Look up the hyperparameters chosen for this fold.
    best_params = results_stacking_df.loc[
        (results_stacking_df['Fold'] == fold) & (results_stacking_df['Dataset'] == 'Cooling'),
        'Best Params'
    ].iloc[0]
    # The column holds the text form of a dict. literal_eval only accepts
    # Python literals, so file contents are parsed without being executed
    # (which eval() would do).
    best_params_dict = ast.literal_eval(best_params)

    # Rebuild the stack with the tuned settings routed to each estimator.
    base_models = [
        ('rf', RandomForestRegressor(random_state=100, **_params_with_prefix(best_params_dict, 'rf__'))),
        ('gb', GradientBoostingRegressor(random_state=100, **_params_with_prefix(best_params_dict, 'gb__'))),
    ]
    meta_model = LinearRegression(**_params_with_prefix(best_params_dict, 'final_estimator__'))
    stacking = StackingRegressor(estimators=base_models, final_estimator=meta_model)
    stacking.fit(X_train, y_train)

    # Collect importances from every fitted base estimator that exposes them.
    for model_name, model in stacking.named_estimators_.items():
        if not hasattr(model, 'feature_importances_'):
            continue
        feature_importance_results.append(pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': model.feature_importances_,
            'Model': model_name,
            'Fold': fold,
            'Seed': 2024,
            'Experiment': experiment,
        }))

# Combine all feature importance results and save them to a file.
feature_importance_combined = pd.concat(feature_importance_results, ignore_index=True)
feature_importance_combined.to_csv(f'stacking_2024_cooling_feature_importance_experiment_{experiment}.csv', index=False)

print(f"Feature importance for seed 2024 and experiment {experiment} has been successfully saved.")

Feature importance for seed 42 and experiment 2 has been successfully saved.


### Residual Analysis

In [None]:
import pandas as pd
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression


import ast

# Experiment 2 (cooling subset): refit each fold's tuned stacking model and
# export actual / predicted / residual values together with T_out and RH_out.
experiment = 2  # Experiment number
merged_data = []  # One residual table per fold

# Load the best model parameters for each fold (same file for every fold).
best_params_df = pd.read_csv('stacking_2024_cooling_hyperparameter_results_detailed.csv')

# Weather columns that must be present to build the residual table.
required_columns = ['T_out', 'RH_out']

for fold in range(1, 6):
    # Load both splits up front so a malformed test file is detected before
    # the (expensive) model fit rather than after it.
    train_data = pd.read_csv(f'fold{fold}_cooling_filtered_train_2024_data.csv')
    test_data = pd.read_csv(f'fold{fold}_cooling_filtered_test_2024_data.csv')

    missing_columns = [col for col in required_columns if col not in test_data.columns]
    if missing_columns:
        raise ValueError(f"{', '.join(missing_columns)} is/are missing in test data for fold {fold}")

    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']
    X_test = test_data.drop(columns=['WH_RTU_Total'])
    y_test = test_data['WH_RTU_Total']

    # Get the best parameters for the current fold.
    best_params = best_params_df.loc[
        (best_params_df['Fold'] == fold) & (best_params_df['Dataset'] == 'Cooling'),
        'Best Params'
    ].iloc[0]
    # The column holds the text form of a dict. literal_eval only accepts
    # Python literals, so file contents are parsed without being executed
    # (which eval() would do).
    best_params_dict = ast.literal_eval(best_params)

    # Define base models and Stacking Regressor with best parameters.
    base_models = [
        ('rf', RandomForestRegressor(random_state=100,
                                     n_estimators=best_params_dict['rf__n_estimators'],
                                     max_depth=best_params_dict['rf__max_depth'])),
        ('gb', GradientBoostingRegressor(random_state=100,
                                         learning_rate=best_params_dict['gb__learning_rate'],
                                         max_depth=best_params_dict['gb__max_depth']))
    ]
    meta_model = LinearRegression(fit_intercept=best_params_dict['final_estimator__fit_intercept'])
    best_stacking = StackingRegressor(estimators=base_models, final_estimator=meta_model)
    best_stacking.fit(X_train, y_train)

    # Predict and assemble the per-row residual table. read_csv already yields
    # a default 0..n-1 index, so the prediction array lines up positionally.
    y_pred = best_stacking.predict(X_test)
    selected_data = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred,
        'Residual': y_test - y_pred,
        'T_out': test_data['T_out'],
        'RH_out': test_data['RH_out'],
        'Label': 'Cooling',
        'Fold': fold,
        'Seed': 2024,
        'Model': 'StackingRegressor',
        'Experiment': experiment,
    })

    merged_data.append(selected_data)
    print(f"Processed fold {fold}.")

# Combine all folds into a single DataFrame and save it.
merged_final_data = pd.concat(merged_data, ignore_index=True)
output_file = 'stacking_2024_cooling_residual.csv'
merged_final_data.to_csv(output_file, index=False)

print(f"All folds processed and merged data saved successfully to {output_file}.")


Processed fold 1.
Processed fold 2.
Processed fold 3.
Processed fold 4.
Processed fold 5.
All folds processed and merged data saved successfully to stacking_42_cooling_residual.csv.


# heating Experiment

Repeat the same model-performance evaluation for the heating data: first divide the heating data into 5 folds, then generate the different training and testing groups.

In [None]:
import pandas as pd
from sklearn.model_selection import KFold

# Read the heating dataset (with interaction features) that will be partitioned.
heating_data_with_interactions_2024 = pd.read_csv('heating_data_with_interactions_2024.csv')

# Shuffled 5-fold splitter; the fixed random_state keeps the partition reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=2024)

# Materialise every (train, test) pair before anything is written to disk.
folds = [
    (
        heating_data_with_interactions_2024.iloc[train_index],
        heating_data_with_interactions_2024.iloc[test_index],
    )
    for train_index, test_index in kf.split(heating_data_with_interactions_2024)
]

# Write each pair to its own pair of CSV files, numbering folds from 1.
for fold_number, (train_data, test_data) in enumerate(folds, start=1):
    train_data.to_csv(f'heating_train_2024_fold{fold_number}.csv', index=False)
    test_data.to_csv(f'heating_test_2024_fold{fold_number}.csv', index=False)

print("Training and test data for 5 folds with seed 2024 have been generated and saved.")


Training and test data for 5 folds with seed 2024 have been generated and saved.


do the feature selection solely based on the training data

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Function to perform feature selection
def feature_selection(train_data, target_column, preselected_features, k=25):
    """Rank features by random-forest importance and keep at most ``k`` of them.

    A ``RandomForestRegressor`` (100 trees, ``random_state=100``) is fitted on
    every column of ``train_data`` except ``target_column`` and ``'TIMESTAMP'``
    (the latter is dropped only if present). Features named in
    ``preselected_features`` are always kept when they exist in the data; the
    remaining slots are filled with the most important other features.

    Args:
        train_data: DataFrame holding the feature columns and the target.
        target_column: Name of the column to predict.
        preselected_features: Feature names to keep regardless of importance.
            Names that are not columns of the data are ignored.
        k: Total number of features to return (fewer if the data has fewer).

    Returns:
        DataFrame with columns ``'Feature'`` and ``'Importance'``: the
        preselected rows first, then the top remaining rows, each group
        ordered by descending importance.
    """
    # Separate features and target.
    X = train_data.drop(columns=[target_column, 'TIMESTAMP'], errors='ignore')
    y = train_data[target_column]

    rf_model = RandomForestRegressor(n_estimators=100, random_state=100)
    rf_model.fit(X, y)

    # Importance table, most important feature first.
    ranked = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_model.feature_importances_,
    }).sort_values(by='Importance', ascending=False)

    is_preselected = ranked['Feature'].isin(preselected_features)
    preselected_df = ranked[is_preselected]

    # Budget the remaining slots from the preselected features actually found
    # in the data, not from the length of the requested list; otherwise a
    # requested-but-absent name silently shrinks the result below k. Clamp at
    # zero because head() with a negative count drops rows from the tail
    # instead of selecting none.
    remaining_slots = max(k - len(preselected_df), 0)
    top_remaining_features = ranked[~is_preselected].head(remaining_slots)

    return pd.concat([preselected_df, top_remaining_features])

# Run the selection once per fold, using only that fold's training split.
target_column = 'WH_RTU_Total'
preselected_features = ['label', 'T_out', 'RH_out']

for fold_number in range(1, 6):
    # Both the input and the output file share this per-fold stem.
    fold_stem = f'heating_train_2024_fold{fold_number}'
    train_data = pd.read_csv(f'{fold_stem}.csv')

    # Rank the features and persist the chosen subset next to the training file.
    top_features = feature_selection(
        train_data,
        target_column,
        preselected_features,
        k=25,
    )
    top_features.to_csv(f'{fold_stem}_top25_features.csv', index=False)

    print(f"Top 25 features for heating fold {fold_number} with seed 2024 saved.")


Top 25 features for cooling fold 1 with seed 42 saved.
Top 25 features for cooling fold 2 with seed 42 saved.
Top 25 features for cooling fold 3 with seed 42 saved.
Top 25 features for cooling fold 4 with seed 42 saved.
Top 25 features for cooling fold 5 with seed 42 saved.


prepare the heating dataset's training and test data

In [None]:
import pandas as pd

# Column holding the regression target; it is kept alongside the selected features.
target_column = 'WH_RTU_Total'

for fold_number in range(1, 6):
    # Read both splits of this fold before anything is written.
    split_frames = {
        split: pd.read_csv(f'heating_{split}_2024_fold{fold_number}.csv')
        for split in ('train', 'test')
    }

    # Feature names chosen from this fold's training split.
    top_features = pd.read_csv(f'heating_train_2024_fold{fold_number}_top25_features.csv')['Feature'].tolist()
    filtered_columns = [*top_features, target_column]

    # Restrict each split to the selected features plus the target and save it.
    for split, frame in split_frames.items():
        frame[filtered_columns].to_csv(f'fold{fold_number}_heating_filtered_{split}_2024_data.csv', index=False)
        print(f"Filtered heating {split} data for fold {fold_number} saved.")



Filtered heating train data for fold 1 saved.
Filtered heating test data for fold 1 saved.
Filtered heating train data for fold 2 saved.
Filtered heating test data for fold 2 saved.
Filtered heating train data for fold 3 saved.
Filtered heating test data for fold 3 saved.
Filtered heating train data for fold 4 saved.
Filtered heating test data for fold 4 saved.
Filtered heating train data for fold 5 saved.
Filtered heating test data for fold 5 saved.


Generate the MAE and R² values for the four models: linear regression, random forest, XGBoost, and the stacking ensemble regressor.

In [None]:
!pip install pandas numpy scikit-learn xgboost
!pip install scikit-learn==1.0.2
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
# Experiment 2 (heating subset): default-parameter baselines for four models.
# One metrics row is collected per (fold, model).
results = []
target_column = 'WH_RTU_Total'

for fold in range(1, 6):
    # Filtered train/test tables for this fold, split into features and target.
    train_data = pd.read_csv(f'fold{fold}_heating_filtered_train_2024_data.csv')
    test_data = pd.read_csv(f'fold{fold}_heating_filtered_test_2024_data.csv')
    X_train, y_train = train_data.drop(columns=[target_column]), train_data[target_column]
    X_test, y_test = test_data.drop(columns=[target_column]), test_data[target_column]

    # Fresh estimators every fold so nothing fitted carries over.
    stacked_estimators = [
        ('lr', LinearRegression()),
        ('rf', RandomForestRegressor(random_state=100)),
        ('xgb', XGBRegressor(random_state=100)),
    ]
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=100),
        'XGBoost': XGBRegressor(random_state=100),
        'Stacking Regressor': StackingRegressor(estimators=stacked_estimators),
    }

    for model_name, model in models.items():
        # Fit on the training split, then score on the held-out split.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

        metrics_row = {
            'Fold': fold,
            'Seed': 2024,
            'Model': model_name,
            'Dataset': 'heating',
            'MAE': mae,
            'R²': r2,
            'Experiment': 2,
        }
        results.append(metrics_row)

        print(f"Fold {fold}, Model {model_name}, MAE: {mae:.4f}, R²: {r2:.4f}")

# Persist the per-fold rows.
results_df = pd.DataFrame(results)
results_df.to_csv('heating_data_results_mae_r2.csv', index=False)

# Mean metrics per model, best (lowest) MAE first.
print("\nSummary Results for All Models (heating Data):")
summary = results_df.groupby('Model')[['MAE', 'R²']].mean().sort_values(by='MAE')
print(summary)


Fold 1, Model Linear Regression, MAE: 5.1285, R²: 0.4158
Fold 1, Model Random Forest, MAE: 2.8695, R²: 0.6677
Fold 1, Model XGBoost, MAE: 2.7304, R²: 0.7010
Fold 1, Model Stacking Regressor, MAE: 2.7941, R²: 0.7029
Fold 2, Model Linear Regression, MAE: 4.6839, R²: 0.4979
Fold 2, Model Random Forest, MAE: 2.2330, R²: 0.7723
Fold 2, Model XGBoost, MAE: 2.3624, R²: 0.7604
Fold 2, Model Stacking Regressor, MAE: 2.2203, R²: 0.7794
Fold 3, Model Linear Regression, MAE: 4.7205, R²: 0.5362
Fold 3, Model Random Forest, MAE: 2.1603, R²: 0.8138
Fold 3, Model XGBoost, MAE: 2.1716, R²: 0.8179
Fold 3, Model Stacking Regressor, MAE: 2.1324, R²: 0.8268
Fold 4, Model Linear Regression, MAE: 4.7932, R²: 0.4408
Fold 4, Model Random Forest, MAE: 2.2995, R²: 0.7602
Fold 4, Model XGBoost, MAE: 2.3938, R²: 0.7565
Fold 4, Model Stacking Regressor, MAE: 2.2701, R²: 0.7700
Fold 5, Model Linear Regression, MAE: 4.7752, R²: 0.4710
Fold 5, Model Random Forest, MAE: 2.1781, R²: 0.8128
Fold 5, Model XGBoost, MAE: 2.

## Random Forest

### hyper parameter tuning for heating random forest

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Experiment 2 (heating subset): per-fold grid search for a RandomForestRegressor.
# One metrics row is appended per fold.
results_rf = []

# Candidate settings explored by the search.
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
}

for fold in range(1, 6):
    # Per-fold train/test tables; 'WH_RTU_Total' is the regression target.
    train_data = pd.read_csv(f'fold{fold}_heating_filtered_train_2024_data.csv')
    test_data = pd.read_csv(f'fold{fold}_heating_filtered_test_2024_data.csv')
    X_train, y_train = train_data.drop(columns=['WH_RTU_Total']), train_data['WH_RTU_Total']

    # 3-fold inner cross-validation on the training split, scored by negated MAE.
    grid_search_rf = GridSearchCV(
        RandomForestRegressor(random_state=100),
        param_grid_rf,
        cv=3,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
    )
    grid_search_rf.fit(X_train, y_train)
    best_rf = grid_search_rf.best_estimator_
    best_params = grid_search_rf.best_params_

    # Evaluate the best estimator on the held-out split.
    X_test, y_test = test_data.drop(columns=['WH_RTU_Total']), test_data['WH_RTU_Total']
    y_pred = best_rf.predict(X_test)
    mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

    fold_row = {
        'Fold': fold,
        'Seed': 2024,
        'Model': 'Random Forest',
        'Dataset': 'heating',
        'MAE': mae,
        'R²': r2,
        'Best Params': best_params,
        'Experiment': 2,
    }
    results_rf.append(fold_row)

    print(f"Fold {fold}, Model Random Forest, Dataset heating, MAE: {mae:.4f}, R²: {r2:.4f}, Best Params: {best_params}")

# Per-fold details (including the chosen hyperparameters) go to CSV.
results_rf_df = pd.DataFrame(results_rf)
results_rf_df.to_csv('random_forest_2024_heating_hyperparameter_results_detailed.csv', index=False)

# Mean MAE / R² across folds, printed and saved.
print("\nSummary Results for Random Forest:")
summary_rf = (
    results_rf_df
    .groupby(['Model', 'Dataset'])[['MAE', 'R²']]
    .mean()
    .reset_index()
)
print(summary_rf)
summary_rf.to_csv('random_forest_2024_heating_summary_results.csv', index=False)


Fold 1, Model Random Forest, Dataset Cooling, MAE: 2.8370, R²: 0.6721, Best Params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 150}
Fold 2, Model Random Forest, Dataset Cooling, MAE: 2.2280, R²: 0.7719, Best Params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 150}
Fold 3, Model Random Forest, Dataset Cooling, MAE: 2.1651, R²: 0.8123, Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 150}
Fold 4, Model Random Forest, Dataset Cooling, MAE: 2.3034, R²: 0.7583, Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 150}
Fold 5, Model Random Forest, Dataset Cooling, MAE: 2.1723, R²: 0.8089, Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 150}

Summary Results for Random Forest:
           Model  Dataset      MAE        R²
0  Random Forest  Cooling  2.34115  0.764705


### Feature Importance for Random Forest

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

import ast

# Experiment 2 (heating subset): refit each fold's tuned random forest and
# record its feature importances.
experiment = 2  # Experiment number

# One importance table is appended per fold.
feature_importance_results = []

# The tuning results are the same for every fold, so read the file once.
results_rf_df = pd.read_csv('random_forest_2024_heating_hyperparameter_results_detailed.csv')

for fold in range(1, 6):
    # Load training data; 'WH_RTU_Total' is the regression target.
    train_data = pd.read_csv(f'fold{fold}_heating_filtered_train_2024_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Look up the hyperparameters chosen for this fold.
    best_params = results_rf_df.loc[
        (results_rf_df['Fold'] == fold) & (results_rf_df['Dataset'] == 'heating'),
        'Best Params'
    ].iloc[0]
    # The column holds the text form of a dict. literal_eval only accepts
    # Python literals, so file contents are parsed without being executed
    # (which eval() would do).
    best_params_dict = ast.literal_eval(best_params)

    # Train Random Forest model with best parameters.
    best_rf = RandomForestRegressor(random_state=100, **best_params_dict)
    best_rf.fit(X_train, y_train)

    # One row per feature, tagged with fold / seed / model / experiment.
    feature_importance_results.append(pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': best_rf.feature_importances_,
        'Fold': fold,
        'Seed': 2024,
        'Model': 'Random Forest',
        'Experiment': experiment,
    }))

# Combine and save feature importance results.
feature_importance_combined = pd.concat(feature_importance_results, ignore_index=True)
feature_importance_combined.to_csv(f'random_forest_2024_heating_feature_importance_experiment_{experiment}.csv', index=False)

print(f"Feature importance for seed 2024 and experiment {experiment} has been successfully saved.")

Feature importance for seed 42 and experiment 2 has been successfully saved.


### Residual, prediction, label, T_out and RH_out

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

import ast

# Experiment 2 (heating subset): refit each fold's tuned random forest and
# export actual / predicted / residual values together with T_out and RH_out.
experiment = 2  # Experiment number
merged_data = []  # One residual table per fold

# The tuning results are the same for every fold, so read the file once.
best_params_df = pd.read_csv('random_forest_2024_heating_hyperparameter_results_detailed.csv')

# Weather columns that must be present to build the residual table.
required_columns = ['T_out', 'RH_out']

for fold in range(1, 6):
    # Load both splits up front so a malformed test file is detected before
    # the model fit rather than after it.
    train_data = pd.read_csv(f'fold{fold}_heating_filtered_train_2024_data.csv')
    test_data = pd.read_csv(f'fold{fold}_heating_filtered_test_2024_data.csv')

    missing_columns = [col for col in required_columns if col not in test_data.columns]
    if missing_columns:
        raise ValueError(f"{', '.join(missing_columns)} is/are missing in test data for fold {fold}")

    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']
    X_test = test_data.drop(columns=['WH_RTU_Total'])
    y_test = test_data['WH_RTU_Total']

    # Look up the hyperparameters chosen for this fold.
    best_params = best_params_df.loc[
        (best_params_df['Fold'] == fold) & (best_params_df['Dataset'] == 'heating'),
        'Best Params'
    ].iloc[0]
    # The column holds the text form of a dict. literal_eval only accepts
    # Python literals, so file contents are parsed without being executed
    # (which eval() would do).
    best_params_dict = ast.literal_eval(best_params)

    # Train Random Forest model with best parameters.
    best_rf = RandomForestRegressor(random_state=100, **best_params_dict)
    best_rf.fit(X_train, y_train)

    # Predict and assemble the per-row residual table. read_csv yields a
    # default 0..n-1 index, so the prediction array lines up positionally.
    y_pred = best_rf.predict(X_test)
    selected_data = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred,
        'Residual': y_test - y_pred,
        'T_out': test_data['T_out'],
        'RH_out': test_data['RH_out'],
        'Label': 'heating',
        'Fold': fold,
        'Seed': 2024,
        'Model': 'Random Forest',
        'Experiment': experiment,
    })

    merged_data.append(selected_data)
    print(f"Processed fold {fold}.")

# Combine all folds into a single DataFrame and save it.
merged_final_data = pd.concat(merged_data, ignore_index=True)
output_file = 'random_forest_2024_heating_residual.csv'
merged_final_data.to_csv(output_file, index=False)

print(f"All folds processed and merged data saved successfully to {output_file}.")


Processed fold 1.
Processed fold 2.
Processed fold 3.
Processed fold 4.
Processed fold 5.
All folds processed and merged data saved successfully to random_forest_42_cooling_residual.csv.


## XGBOOST

### hyper parameter tuning for heating XGBoost

In [None]:
!pip install pandas numpy scikit-learn xgboost
!pip install scikit-learn==1.0.2
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score

# Experiment 2 (heating subset): per-fold grid search for an XGBRegressor.
experiment = 2

# One metrics row is appended per fold.
results_xgb = []

# Candidate settings explored by the search.
param_grid_xgb = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
}

for fold in range(1, 6):
    # Per-fold train/test tables; 'WH_RTU_Total' is the regression target.
    train_data = pd.read_csv(f'fold{fold}_heating_filtered_train_2024_data.csv')
    test_data = pd.read_csv(f'fold{fold}_heating_filtered_test_2024_data.csv')
    X_train, y_train = train_data.drop(columns=['WH_RTU_Total']), train_data['WH_RTU_Total']

    # 3-fold inner cross-validation on the training split, scored by negated MAE.
    grid_search_xgb = GridSearchCV(
        XGBRegressor(random_state=100),
        param_grid_xgb,
        cv=3,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
    )
    grid_search_xgb.fit(X_train, y_train)
    best_xgb = grid_search_xgb.best_estimator_
    best_params = grid_search_xgb.best_params_

    # Evaluate the best estimator on the held-out split.
    X_test, y_test = test_data.drop(columns=['WH_RTU_Total']), test_data['WH_RTU_Total']
    y_pred = best_xgb.predict(X_test)
    mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

    fold_row = {
        'Fold': fold,
        'Seed': 2024,
        'Model': 'XGBoost',
        'Dataset': 'heating',
        'MAE': mae,
        'R²': r2,
        'Best Params': best_params,
        'Experiment': experiment,
    }
    results_xgb.append(fold_row)

    print(f"Fold {fold}, Model XGBoost, Dataset heating, MAE: {mae:.4f}, R²: {r2:.4f}, Best Params: {best_params}")

# Per-fold details (including the chosen hyperparameters) go to CSV.
results_xgb_df = pd.DataFrame(results_xgb)
results_xgb_df.to_csv('xgboost_2024_heating_hyperparameter_results_detailed.csv', index=False)

# Mean MAE / R² across folds, printed and saved.
print("\nSummary Results for XGBoost:")
summary_xgb = (
    results_xgb_df
    .groupby(['Model', 'Dataset'])[['MAE', 'R²']]
    .mean()
    .reset_index()
)
print(summary_xgb)
summary_xgb.to_csv('xgboost_2024_heating_summary_results.csv', index=False)


Fold 1, Model XGBoost, Dataset Cooling, MAE: 2.6462, R²: 0.6911, Best Params: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 150, 'subsample': 0.8}
Fold 2, Model XGBoost, Dataset Cooling, MAE: 2.1649, R²: 0.7814, Best Params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'subsample': 0.8}
Fold 3, Model XGBoost, Dataset Cooling, MAE: 1.9818, R²: 0.8410, Best Params: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 150, 'subsample': 0.8}
Fold 4, Model XGBoost, Dataset Cooling, MAE: 2.2875, R²: 0.7630, Best Params: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 150, 'subsample': 0.8}
Fold 5, Model XGBoost, Dataset Cooling, MAE: 2.1282, R²: 0.8167, Best Params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150, 'subsample': 0.8}

Summary Results for XGBoost:
     Model  Dataset       MAE        R²
0  XGBoost  Cooling  2.241726  0.778636


### Feature Importance

In [None]:
import pandas as pd
from xgboost import XGBRegressor

import ast

# Experiment 2 (heating subset): refit each fold's tuned XGBoost model and
# record its feature importances.
experiment = 2  # Experiment number

# One importance table is appended per fold.
feature_importance_results = []

# The tuning results are the same for every fold, so read the file once.
results_xgb_df = pd.read_csv('xgboost_2024_heating_hyperparameter_results_detailed.csv')

for fold in range(1, 6):
    # Load training data; 'WH_RTU_Total' is the regression target.
    train_data = pd.read_csv(f'fold{fold}_heating_filtered_train_2024_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Look up the hyperparameters chosen for this fold.
    best_params = results_xgb_df.loc[
        (results_xgb_df['Fold'] == fold) & (results_xgb_df['Dataset'] == 'heating'),
        'Best Params'
    ].iloc[0]
    # The column holds the text form of a dict. literal_eval only accepts
    # Python literals, so file contents are parsed without being executed
    # (which eval() would do).
    best_params_dict = ast.literal_eval(best_params)

    # Initialize and train the model with the best parameters.
    best_xgb = XGBRegressor(random_state=100, **best_params_dict)
    best_xgb.fit(X_train, y_train)

    # One row per feature, tagged with fold / seed / model / experiment.
    feature_importance_results.append(pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': best_xgb.feature_importances_,
        'Fold': fold,
        'Seed': 2024,
        'Model': 'XGBoost',
        'Experiment': experiment,
    }))

# Combine all feature importance results and save them to a file.
feature_importance_combined = pd.concat(feature_importance_results, ignore_index=True)
feature_importance_combined.to_csv(f'xgboost_2024_heating_feature_importance_experiment_{experiment}.csv', index=False)

print(f"Feature importance for seed 2024 and experiment {experiment} has been successfully saved.")

Feature importance for seed 42 and experiment 2 has been successfully saved.


### Residual analysis

In [None]:
import ast

import pandas as pd
from xgboost import XGBRegressor


experiment = 2  # Experiment number
merged_data = []  # Per-fold residual tables; concatenated after the loop

# The tuning results are identical for every fold, so the file is read once
# here instead of being re-read on each loop iteration.
best_params_df = pd.read_csv('xgboost_2024_heating_hyperparameter_results_detailed.csv')

# Loop through each fold
for fold in range(1, 6):
    # Load training data and separate the target column from the features
    train_data = pd.read_csv(f'fold{fold}_heating_filtered_train_2024_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Load test data and check the columns needed for the residual table
    # BEFORE training, so a malformed file fails fast instead of after a fit.
    test_data = pd.read_csv(f'fold{fold}_heating_filtered_test_2024_data.csv')
    missing_columns = [col for col in ['T_out', 'RH_out'] if col not in test_data.columns]
    if missing_columns:
        raise ValueError(f"{', '.join(missing_columns)} is/are missing in test data for fold {fold}")

    # Look up the tuned hyperparameters recorded for this fold
    best_params = best_params_df.loc[
        (best_params_df['Fold'] == fold) & (best_params_df['Dataset'] == 'heating'),
        'Best Params'
    ].iloc[0]
    # The cell holds the text of a dict; literal_eval parses Python literals
    # only, so nothing in the CSV can be executed as code (unlike eval).
    # NOTE(review): assumes the stored dict contains plain literals only
    # (numbers, strings, booleans, None) -- confirm against the tuning cell.
    best_params_dict = ast.literal_eval(best_params)

    # Train XGBoost model with best parameters
    best_xgb = XGBRegressor(random_state=100, **best_params_dict)
    best_xgb.fit(X_train, y_train)

    # Split features and target (taken before the bookkeeping columns below
    # are added, so the feature matrix never contains them)
    X_test = test_data.drop(columns=['WH_RTU_Total'])
    y_test = test_data['WH_RTU_Total']

    # Predict and calculate residuals (actual minus predicted)
    y_pred = best_xgb.predict(X_test)
    test_data['Label'] = 'heating'
    test_data['Predicted'] = y_pred
    test_data['Residual'] = test_data['WH_RTU_Total'] - test_data['Predicted']

    # Keep only the columns needed downstream; the target is exported as 'Actual'
    selected_data = test_data[['WH_RTU_Total', 'Predicted', 'Residual', 'T_out', 'RH_out', 'Label']].rename(columns={
        'WH_RTU_Total': 'Actual'
    })

    # Add fold, seed, model and experiment columns
    selected_data['Fold'] = fold
    selected_data['Seed'] = 2024
    selected_data['Model'] = 'XGBoost'
    selected_data['Experiment'] = experiment

    # Append to final merged data
    merged_data.append(selected_data)
    print(f"Processed fold {fold}.")

# Combine all folds into a single DataFrame
merged_final_data = pd.concat(merged_data, ignore_index=True)

# Save the merged data
output_file = 'xgboost_2024_heating_residual.csv'
merged_final_data.to_csv(output_file, index=False)

print(f"All folds processed and merged data saved successfully to {output_file}.")


Processed fold 1.
Processed fold 2.
Processed fold 3.
Processed fold 4.
Processed fold 5.
All folds processed and merged data saved successfully to xgboost_2024_heating_residual.csv.


## Stacking Regressor

### Hyperparameter tuning for the heating Stacking Regressor

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

experiment = 2  # Experiment number

# One record per fold is collected here and turned into a DataFrame at the end
results_stacking = []

# Search space for the stacked model. Keys are prefixed with the name of the
# component they tune: 'rf' / 'gb' are the base models, 'final_estimator' is
# the LinearRegression meta-model.
param_grid_stacking = dict(
    final_estimator__fit_intercept=[True, False],
    rf__n_estimators=[50, 100],
    rf__max_depth=[10, 20],
    gb__learning_rate=[0.01, 0.1],
    gb__max_depth=[3, 6],
)

for i in range(1, 6):
    # Read this fold's train/test split
    train_data = pd.read_csv(f'fold{i}_heating_filtered_train_2024_data.csv')
    test_data = pd.read_csv(f'fold{i}_heating_filtered_test_2024_data.csv')

    # Target vs. features for training
    y_train = train_data['WH_RTU_Total']
    X_train = train_data.drop(columns=['WH_RTU_Total'])

    # Random forest + gradient boosting, combined by a linear meta-model
    stacking = StackingRegressor(
        estimators=[
            ('rf', RandomForestRegressor(random_state=100)),
            ('gb', GradientBoostingRegressor(random_state=100)),
        ],
        final_estimator=LinearRegression(),
    )

    # 3-fold grid search over the grid above, scored by (negated) MAE
    grid_search_stacking = GridSearchCV(
        estimator=stacking,
        param_grid=param_grid_stacking,
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1,
    )
    grid_search_stacking.fit(X_train, y_train)

    # Winning estimator and the parameter combination that produced it
    best_params = grid_search_stacking.best_params_
    best_stacking = grid_search_stacking.best_estimator_

    # Target vs. features for evaluation
    y_test = test_data['WH_RTU_Total']
    X_test = test_data.drop(columns=['WH_RTU_Total'])

    # Score the winning estimator on the held-out fold
    y_pred = best_stacking.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Record this fold's outcome
    fold_record = {
        'Fold': i,
        'Seed': 2024,
        'Model': 'Stacking Regressor',
        'Dataset': 'heating',
        'MAE': mae,
        'R²': r2,
        'Best Params': best_params,
        'Experiment': experiment,
    }
    results_stacking.append(fold_record)

    print(f"Fold {i}, Model Stacking Regressor, Dataset heating, MAE: {mae:.4f}, R²: {r2:.4f}, Best Params: {best_params}")

# Per-fold results table, written out in full
results_stacking_df = pd.DataFrame(results_stacking)
results_stacking_df.to_csv('stacking_2024_heating_hyperparameter_results_detailed.csv', index=False)

# Mean MAE / R² across folds for each (Model, Dataset) pair
print("\nSummary Results for Stacking Regressor:")
summary_stacking = (
    results_stacking_df
    .groupby(['Model', 'Dataset'])[['MAE', 'R²']]
    .mean()
    .reset_index()
)
print(summary_stacking)

# Persist the summary as well
summary_stacking.to_csv('stacking_2024_heating_summary_results.csv', index=False)


Fold 1, Model Stacking Regressor, Dataset heating, MAE: 2.8941, R²: 0.5911, Best Params: {'final_estimator__fit_intercept': False, 'gb__learning_rate': 0.1, 'gb__max_depth': 6, 'rf__max_depth': 10, 'rf__n_estimators': 50}
Fold 2, Model Stacking Regressor, Dataset heating, MAE: 2.8038, R²: 0.5630, Best Params: {'final_estimator__fit_intercept': False, 'gb__learning_rate': 0.1, 'gb__max_depth': 6, 'rf__max_depth': 10, 'rf__n_estimators': 100}
Fold 3, Model Stacking Regressor, Dataset heating, MAE: 3.1387, R²: 0.5293, Best Params: {'final_estimator__fit_intercept': False, 'gb__learning_rate': 0.1, 'gb__max_depth': 6, 'rf__max_depth': 20, 'rf__n_estimators': 50}
Fold 4, Model Stacking Regressor, Dataset heating, MAE: 3.0217, R²: 0.6004, Best Params: {'final_estimator__fit_intercept': True, 'gb__learning_rate': 0.01, 'gb__max_depth': 6, 'rf__max_depth': 20, 'rf__n_estimators': 100}
Fold 5, Model Stacking Regressor, Dataset heating, MAE: 2.9156, R²: 0.6347, Best Params: {'final_estimator__fi

### Feature Importance

In [None]:
import ast

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor

# Per-fold, per-base-model importance tables; concatenated after the loop
feature_importance_results = []


experiment = 2  # Experiment number

# The tuning results are identical for every fold, so the file is read once
# here instead of being re-read on each loop iteration.
results_stacking_df = pd.read_csv('stacking_2024_heating_hyperparameter_results_detailed.csv')

# Section 1: Calculate Feature Importance
for fold in range(1, 6):
    # Load training data and separate the target column from the features
    train_data = pd.read_csv(f'fold{fold}_heating_filtered_train_2024_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Look up the tuned hyperparameters recorded for this fold
    best_params = results_stacking_df.loc[
        (results_stacking_df['Fold'] == fold) & (results_stacking_df['Dataset'] == 'heating'),
        'Best Params'
    ].iloc[0]
    # The cell holds the text of a dict; literal_eval parses Python literals
    # only, so nothing in the CSV can be executed as code (unlike eval).
    best_params_dict = ast.literal_eval(best_params)

    # Build the same stacked model that was tuned: two tree ensembles
    # combined by a linear meta-model
    base_models = [
        ('rf', RandomForestRegressor(random_state=100)),
        ('gb', GradientBoostingRegressor(random_state=100))
    ]
    meta_model = LinearRegression()
    stacking = StackingRegressor(estimators=base_models, final_estimator=meta_model)

    # The stored keys ('rf__...', 'gb__...', 'final_estimator__...') are the
    # nested-parameter names used by the grid search, so set_params can route
    # each value to its component without splitting the keys by hand.
    stacking.set_params(**best_params_dict)
    stacking.fit(X_train, y_train)

    # Collect importances from every fitted base model that exposes them
    for model_name, model in stacking.named_estimators_.items():
        if hasattr(model, 'feature_importances_'):  # Skip models without importance scores
            feature_importances = model.feature_importances_
            feature_names = X_train.columns
            importance_df = pd.DataFrame({
                'Feature': feature_names,
                'Importance': feature_importances,
                'Model': model_name,  # Base-model name: 'rf' or 'gb'
                'Fold': fold,
                'Seed': 2024,
                'Experiment': experiment
            })
            feature_importance_results.append(importance_df)

# Combine all feature importance results
feature_importance_combined = pd.concat(feature_importance_results, ignore_index=True)

# Save the feature importance results to a file
feature_importance_combined.to_csv(f'stacking_2024_heating_feature_importance_experiment_{experiment}.csv', index=False)

print(f"Feature importance for seed 2024 and experiment {experiment} has been successfully saved.")

Feature importance for seed 2024 and experiment 2 has been successfully saved.


### Residual Analysis

In [None]:
import ast

import pandas as pd
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression


experiment = 2  # Experiment number
merged_data = []  # Per-fold residual tables; concatenated after the loop

# Load the best model parameters for each fold
best_params_df = pd.read_csv('stacking_2024_heating_hyperparameter_results_detailed.csv')

# Loop through each fold
for fold in range(1, 6):
    # Load training data and separate the target column from the features
    train_data = pd.read_csv(f'fold{fold}_heating_filtered_train_2024_data.csv')
    X_train = train_data.drop(columns=['WH_RTU_Total'])
    y_train = train_data['WH_RTU_Total']

    # Load test data and check the columns needed for the residual table
    # BEFORE training, so a malformed file fails fast instead of after a fit.
    test_data = pd.read_csv(f'fold{fold}_heating_filtered_test_2024_data.csv')
    missing_columns = [col for col in ['T_out', 'RH_out'] if col not in test_data.columns]
    if missing_columns:
        raise ValueError(f"{', '.join(missing_columns)} is/are missing in test data for fold {fold}")

    # Look up the tuned hyperparameters for this fold. The Dataset filter
    # matches the lookup used in the feature-importance cells, so a results
    # file holding more than one dataset cannot return the wrong row.
    best_params = best_params_df.loc[
        (best_params_df['Fold'] == fold) & (best_params_df['Dataset'] == 'heating'),
        'Best Params'
    ].iloc[0]
    # The cell holds the text of a dict; literal_eval parses Python literals
    # only, so nothing in the CSV can be executed as code (unlike eval).
    best_params_dict = ast.literal_eval(best_params)

    # Build the same stacked model that was tuned: two tree ensembles
    # combined by a linear meta-model
    base_models = [
        ('rf', RandomForestRegressor(random_state=100)),
        ('gb', GradientBoostingRegressor(random_state=100))
    ]
    meta_model = LinearRegression()
    best_stacking = StackingRegressor(estimators=base_models, final_estimator=meta_model)

    # The stored keys ('rf__...', 'gb__...', 'final_estimator__...') are the
    # nested-parameter names used by the grid search, so set_params routes
    # each value to its component.
    best_stacking.set_params(**best_params_dict)

    # Train the Stacking Regressor
    best_stacking.fit(X_train, y_train)

    # Split features and target (taken before the bookkeeping columns below
    # are added, so the feature matrix never contains them)
    X_test = test_data.drop(columns=['WH_RTU_Total'])
    y_test = test_data['WH_RTU_Total']

    # Predict and calculate residuals (actual minus predicted). The frame was
    # just read from CSV, so predictions are assigned row by row in order.
    y_pred = best_stacking.predict(X_test)
    test_data['Label'] = 'heating'
    test_data['Predicted'] = y_pred
    test_data['Residual'] = test_data['WH_RTU_Total'] - test_data['Predicted']

    # Keep only the columns needed downstream; the target is exported as 'Actual'
    selected_data = test_data[['WH_RTU_Total', 'Predicted', 'Residual', 'T_out', 'RH_out', 'Label']].rename(columns={
        'WH_RTU_Total': 'Actual'
    })

    # Add fold, seed, model and experiment columns
    selected_data['Fold'] = fold
    selected_data['Seed'] = 2024
    # NOTE(review): the tuning results file labels this model
    # 'Stacking Regressor' (with a space); confirm which spelling downstream
    # analysis expects before merging the two outputs.
    selected_data['Model'] = 'StackingRegressor'
    selected_data['Experiment'] = experiment

    # Append to final merged data
    merged_data.append(selected_data)
    print(f"Processed fold {fold}.")

# Combine all folds into a single DataFrame
merged_final_data = pd.concat(merged_data, ignore_index=True)

# Save the merged data
output_file = 'stacking_2024_heating_residual.csv'
merged_final_data.to_csv(output_file, index=False)

print(f"All folds processed and merged data saved successfully to {output_file}.")


Processed fold 1.
Processed fold 2.
Processed fold 3.
Processed fold 4.
Processed fold 5.
All folds processed and merged data saved successfully to stacking_2024_heating_residual.csv.
