In [12]:
!pip install xgboost
import pandas as pd
import xgboost



In [13]:
# Read the four CSV files that hold the training/test feature tables (X)
# and the matching target tables (Y).
X_train, X_test = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ('X_train_C.csv', 'X_test_C.csv')
)
Y_train, Y_test = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ('Y_train_C.csv', 'Y_test_C.csv')
)

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Hyperparameter candidates: `alpha` values for Ridge/Lasso and forest sizes
# (10, 20, 30 trees) for the random forest.
alpha_values = np.arange(0.1, 11.0, 0.5)
n_estimators_values = np.arange(10, 31, 10)

# Each entry pairs a base estimator with its search grid. The 'estimator__'
# prefix routes a parameter through MultiOutputRegressor to the wrapped model.
estimators = [
    (LinearRegression(), {'estimator__fit_intercept': [True, False]}),
    (Ridge(), {'estimator__alpha': alpha_values, 'estimator__fit_intercept': [True, False]}),
    (Lasso(), {'estimator__alpha': alpha_values, 'estimator__fit_intercept': [True, False]}),
    (RandomForestRegressor(), {'estimator__n_estimators': n_estimators_values})
]

# Fold counts to try for k-fold cross-validation. Only k=5 is evaluated;
# the wider sweep below is kept for reference.
#cv_values = np.arange(3, 10, 3)
cv_values = [5]

# Running "best so far" state. Every name printed after the loop is initialised
# here so the summary cannot raise NameError when no candidate improves on
# the initial infinite MSE (e.g. if every test MSE is NaN).
best_model = None
best_cv = None
best_params = None
best_mse = float('inf')
all_results = []

# Iterate over the fold counts, running one grid search per estimator family
for cv in cv_values:
    results = []

    for estimator, param_grid in estimators:
        # One independent copy of the estimator is fitted per target column.
        regressor = MultiOutputRegressor(estimator)
        # n_jobs=-1 evaluates candidates on all cores, matching the XGBoost
        # grid searches later in this notebook.
        grid_search = GridSearchCV(
            regressor,
            param_grid,
            cv=cv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
        )
        grid_search.fit(X_train, Y_train)

        # Score the refitted best candidate on the held-out test set.
        Y_pred = grid_search.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)

        results.append({
            'CV': cv,
            'Estimator': type(estimator).__name__,
            'Best Parameters': grid_search.best_params_,
            'Mean Squared Error': mse
        })

        # NOTE(review): the overall winner is picked by *test-set* MSE, so the
        # reported best score is optimistically biased. Selecting on the
        # cross-validated score (grid_search.best_score_) and using the test
        # set only for the final report would avoid this.
        if mse < best_mse:
            best_mse = mse
            best_cv = cv
            best_model = grid_search.best_estimator_
            best_params = grid_search.best_params_

    all_results.extend(results)

# Collect one row per (cv, estimator family) into a table
all_results_df = pd.DataFrame(all_results)

# Display the results
print(all_results_df)

# Display the overall best model and its performance
print("\nOverall Best Model Results:")
print(f"Best CV: {best_cv}")
print(f"Best Mean Squared Error: {best_mse}")
print(f"Best Hyperparameters: {best_params}")
print(f"Best Model: {best_model}")

KeyboardInterrupt: 

CV             Estimator                                              Best Parameters  Mean Squared Error
  5      LinearRegression                          {'estimator__fit_intercept': False}            1.768620
  5                 Ridge  {'estimator__alpha': 2.1, 'estimator__fit_intercept': True}            1.768520
  5                 Lasso {'estimator__alpha': 0.1, 'estimator__fit_intercept': False}            2.180528
  5 RandomForestRegressor                              {'estimator__n_estimators': 30}            0.315175

Overall Best Model Results:
Best CV: 5
Best Mean Squared Error: 0.31517478220480416
Best Hyperparameters: {'estimator__n_estimators': 30}
Best Model: MultiOutputRegressor(estimator=RandomForestRegressor(n_estimators=30))

We can see that the best model is the Random Forest Regressor. This is plausible for our specific case of predicting structural wall designs: models that can capture non-linear relationships (such as Random Forest or gradient boosting machines, GBM) may be particularly effective, because structural engineering data often involves complex interactions between variables.

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

# Search space: 4 depths x 5 alpha values x 10 ensemble sizes = 200 candidates.
# NOTE(review): `alpha` is presumably XGBoost's L1 regularisation term (named
# `reg_alpha` in the sklearn wrapper) -- confirm against the installed version.
param_grid = dict(
    max_depth=range(3, 10, 2),        # 3, 5, 7, 9
    alpha=[0, 0.5, 1, 5, 10],
    n_estimators=range(10, 110, 10),  # 10, 20, ..., 100
)

# Base regressor trained with the squared-error objective
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Exhaustive 3-fold search over the grid, scored by negative MSE and
# parallelised over every available core (n_jobs=-1).
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

# Winning configuration and the estimator refitted with it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Held-out evaluation of the winner
Y_pred = best_model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
print(f"Best Model Mean Squared Error: {mse}")

Fitting 3 folds for each of 200 candidates, totalling 600 fits
Best parameters: {'alpha': 10, 'max_depth': 9, 'n_estimators': 100}
Best Model Mean Squared Error: 0.3062257948552232

From now on, I will re-run only the best configuration found by the grid search above.

In [14]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

# Single-point "grid": one value per hyperparameter, equal to the best
# combination printed by the full search (max_depth=9, alpha=10,
# n_estimators=100), so only one candidate is cross-validated and refitted.
param_grid = dict(
    max_depth=[9],
    alpha=[10],
    n_estimators=[100],
)

# Base regressor trained with the squared-error objective
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# 3-fold cross-validation of the single candidate, scored by negative MSE,
# using every available core (n_jobs=-1).
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

# Selected configuration and the estimator refitted with it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Held-out evaluation
Y_pred = best_model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
print(f"Best Model Mean Squared Error: {mse}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters: {'alpha': 10, 'max_depth': 9, 'n_estimators': 100}
Best Model Mean Squared Error: 0.331871684919854
