# Hyperparameter Tuning

## Import packages and data

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Dropout
# from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler

In [2]:
# importlib.reload(src.model)

# Example for loading data (you can skip this if data is already loaded)
train_data_path = '../data/CMaps/train_FD001.txt'
test_data_path = '../data/CMaps/test_FD001.txt'
rul_data_path = '../data/CMaps/RUL_FD001.txt'

# Create column names
col_names = [
    'engine_id', 'time_in_cycles', 
    'operational_setting_1', 'operational_setting_2', 'operational_setting_3'
] + [f'sensor_{i}' for i in range(1, 27)]  # This creates sensor_1 to sensor_21

# Read the data into pandas dataframes
train_df = pd.read_csv(train_data_path, sep=' ', header=None, names=col_names)
test_df = pd.read_csv(test_data_path, sep=' ', header=None, names=col_names)
rul_df = pd.read_csv(rul_data_path, header=None, names=['RUL'])

In [3]:
# Drop NaN values
train_df.dropna(axis=1, how='all', inplace=True)
test_df.dropna(axis=1, how='all', inplace=True)

# Add RUL values to data
test_df['RUL'] = test_df['engine_id'].map(lambda x: rul_df.loc[x - 1, 'RUL'])
train_df['RUL'] = train_df['engine_id'].map(lambda x: rul_df.loc[x - 1, 'RUL'])

# train_df, test_df = preprocess_data(train_df, test_df)

In [4]:
# Define features and target variable
X = train_df.drop(columns=['RUL', 'engine_id', 'time_in_cycles'])  # Features (drop non-feature columns) - all columns except the target column
y = train_df['RUL']  # Target (RUL)

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Tuning Random Forest
Hyperparameter Tuning: Random Forest has several hyperparameters that you can tune to improve performance. Common ones include:

- `n_estimators`: The number of trees in the forest.

- `max_depth`: Controls the depth of each tree (prevent overfitting).

- `min_samples_split`: The minimum number of samples required to split an internal node.

- `min_samples_leaf`: The minimum number of samples required to be at a leaf node.

- `max_features`: The number code of features to consider when looking for the best split.

<div class="alert alert-block alert-info">
<b>Tip:</b> We can use GridSearchCV or RandomizedSearchCV from sklearn.model_selection to perform hyperparameter tuning.
</div>

Here we'll use `GridSearchCV`:

In [5]:
rf_model = RandomForestRegressor(random_state=42)

# Hyperparameters for tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt', 'log2']
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

print("Best Random Forest Parameters:", grid_search.best_params_)


108 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\amurd\miniconda3\envs\pm-env\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\amurd\miniconda3\envs\pm-env\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\amurd\miniconda3\envs\pm-env\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\amurd\miniconda3\envs\pm-env\lib\site-packages\sklearn\utils\_param_validation.py", line 96, in val

Best Random Forest Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [6]:
# Initialize and train Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=20, min_samples_leaf=2, min_samples_split=2)
rf_model.fit(X_train, y_train)
val_predictions = rf_model.predict(X_val)
rf_mae = mean_absolute_error(y_val, val_predictions)
rf_rmse = mean_squared_error(y_val, val_predictions, squared=False)
print(f"Random Forest - MAE: {rf_mae}, RMSE: {rf_rmse}")

Random Forest - MAE: 35.61750525752866, RMSE: 41.09045105150293


In [7]:
# Initialize and train Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=20, min_samples_leaf=2, min_samples_split=2, max_features='sqrt')
rf_model.fit(X_train, y_train)
val_predictions = rf_model.predict(X_val)
rf_mae = mean_absolute_error(y_val, val_predictions)
rf_rmse = mean_squared_error(y_val, val_predictions, squared=False)
print(f"Random Forest - MAE: {rf_mae}, RMSE: {rf_rmse}")

Random Forest - MAE: 35.49806244842466, RMSE: 40.84745326448685


## Tuning XGBoost
Hyperparameter Tuning: XGBoost has several hyperparameters that affect its performance:

- learning_rate: The step size used to update the weights. Smaller values might yield better results, but require more rounds of boosting.

- n_estimators: The number of boosting rounds.

- max_depth: The maximum depth of a tree.

- subsample: Fraction of samples to use for each boosting round.

- colsample_bytree: Fraction of features to use for each tree.

We will again use `GridSearchCV`

In [8]:
param_grid = {
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(xgb.XGBRegressor(), param_grid, cv=3)
grid_search.fit(X_train, y_train)

best_xgb_model = grid_search.best_estimator_

In [9]:
print("Best Random Forest Parameters:", best_xgb_model)

Best Random Forest Parameters: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=1.0, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=200, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


In [10]:
# Hyperparameters for tuning
param_grid = {
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

best_rmse = np.inf
best_params = {}

for learning_rate in param_grid['learning_rate']:
    for n_estimators in param_grid['n_estimators']:
        for max_depth in param_grid['max_depth']:
            for subsample in param_grid['subsample']:
                for colsample_bytree in param_grid['colsample_bytree']:
                    params = {
                        'objective': 'reg:squarederror',
                        'eval_metric': 'rmse',
                        'learning_rate': learning_rate,
                        'n_estimators': n_estimators,
                        'max_depth': max_depth,
                        'subsample': subsample,
                        'colsample_bytree': colsample_bytree
                    }
                    
                    # Train the model
                    dtrain = xgb.DMatrix(X_train, label=y_train)
                    model = xgb.train(params, dtrain, num_boost_round=n_estimators)
                    
                    # Evaluate on validation set
                    dval = xgb.DMatrix(X_val)
                    val_predictions = model.predict(dval)
                    
                    # Calculate MAE and RMSE
                    mae = mean_absolute_error(y_val, val_predictions)
                    rmse = mean_squared_error(y_val, val_predictions, squared=False)
                    
                    # If this model has a lower RMSE, store it
                    if rmse < best_rmse:
                        best_rmse = rmse
                        best_params = params
                        best_model = model

print(f"Best XGBoost Parameters: {best_params}")

# Now we have the best model, we can evaluate it on the validation set
dval = xgb.DMatrix(X_val)
val_predictions = best_model.predict(dval)

# Calculate MAE and RMSE for the best model
best_mae = mean_absolute_error(y_val, val_predictions)
best_rmse = mean_squared_error(y_val, val_predictions, squared=False)

print(f"Best Model MAE: {best_mae}")
print(f"Best Model RMSE: {best_rmse}")

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are

Best XGBoost Parameters: {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'learning_rate': 0.01, 'n_estimators': 200, 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 1.0}
Best Model MAE: 35.70798661860768
Best Model RMSE: 40.8056770622987


## LSTM tuning
Model Complexity: LSTM models can require a deeper network to capture more complex patterns. We consider adding more layers or units to the LSTM.

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler

# Reshape data for LSTM
X_train_lstm = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))

# List of possible hyperparameters to try
epochs_list = [10, 20, 50]
batch_sizes = [16, 32, 64]

best_mae = np.inf
best_rmse = np.inf
best_model = None

for epochs in epochs_list:
    for batch_size in batch_sizes:
        # Build and compile the LSTM model
        model = Sequential()
        model.add(LSTM(units=64, return_sequences=False, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
        model.add(Dropout(0.2))
        model.add(Dense(1))
        
        model.compile(optimizer=Adam(), loss='mean_squared_error')
        
        # Train the model
        model.fit(X_train_lstm, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
        
        # Evaluate the model
        X_val_lstm = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
        val_predictions = model.predict(X_val_lstm)

    # Evaluate the model
        mae = mean_absolute_error(y_train, val_predictions)
        rmse = mean_squared_error(y_train, val_predictions, squared=False)
        # mae, rmse = evaluate_model(model, X_train, y_train)
        
        if rmse < best_rmse:
            best_rmse = rmse
            best_mae = mae
            best_model = model

print(f"Best LSTM Parameters - Epochs: {epochs}, Batch Size: {batch_size}, MAE: {best_mae}, RMSE: {best_rmse}")

Best LSTM Parameters - Epochs: 50, Batch Size: 64, MAE: 37.15786748526458, RMSE: 41.96250422418167
