In [None]:
!pip install catboost
!pip install pygam


# The code snippet below plots permutation importance graphs for the train and test sets separately, for each model listed in models_to_plot. You can change the models_to_plot variable to match your models, or define a sorting criterion to choose which models are graphed.

In [None]:
import os  # Importing os for operating system dependent functionalities
import warnings  # Importing warnings to manage warnings during the runtime

import matplotlib.pyplot as plt  # Importing matplotlib for plotting graphs
import numpy as np  # Importing numpy library for numerical operations
import pandas as pd  # Importing pandas library for data manipulation and analysis
from lightgbm import LGBMRegressor
# Importing necessary classes and functions from sklearn for model building, preprocessing, and evaluation
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.inspection import permutation_importance
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import (HuberRegressor, RANSACRegressor, TheilSenRegressor,
                                  OrthogonalMatchingPursuit, PoissonRegressor,
                                  TweedieRegressor, RidgeCV, Lasso,
                                  ElasticNet, SGDRegressor, BayesianRidge)
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Loading the dataset
data = pd.read_csv('path_to_your_data')

# Transforming the target variable by applying a logarithmic function to make the distribution more symmetric
data['Target_log'] = np.log(data['Target'] + 1)

# Data preparation by dropping irrelevant columns.
# 'Target_log' was just added to `data`, so both the raw and the transformed
# target are removed from the feature matrix explicitly; otherwise the models
# could read the answer straight from X. errors='ignore' keeps this harmless
# when the list above already names them.
X = data.drop(columns=['List_of_columns_to_drop'])
X = X.drop(columns=['Target', 'Target_log'], errors='ignore')
y = data['Target_log']  # Specifying the target variable

# Encoding categorical variables: every object-typed column is replaced by
# integer codes, using a fresh encoder per column.
for col in X.select_dtypes(include='object').columns:
    X[col] = LabelEncoder().fit_transform(X[col])

# Train-test split (70 % train / 30 % test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scaling: the scaler is fitted on the training split only and then applied
# to the test split. Both results are NumPy arrays, so the column names are
# only available through X.columns from here on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Best parameters
rf_best_params = {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
xgb_best_params = {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
bagging_best_params = {'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 30}
dt_best_params = {'criterion': 'friedman_mse', 'max_depth': 9, 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'best'}

# Create a dictionary of models.
# The tuned models unpack the *_best_params dictionaries defined earlier in
# this cell, so each tuned value is written in exactly one place instead of
# being repeated (and possibly drifting) in the constructor calls.
regressors = {
    'Random Forest': RandomForestRegressor(**rf_best_params, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'Bagging': BaggingRegressor(**bagging_best_params, random_state=42),
    'Decision Tree': DecisionTreeRegressor(**dt_best_params, random_state=42),
    'Support Vector': SVR(),
    'K-Neighbors': KNeighborsRegressor(),
    'RidgeCV': RidgeCV(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'SGD': SGDRegressor(),
    'Gaussian Process': GaussianProcessRegressor(),
    'BayesianRidge': BayesianRidge(),
    'XGBoostRegressor': XGBRegressor(**xgb_best_params, random_state=42),
    'LightGBMRegressor': LGBMRegressor(),
    'SVR_poly': SVR(kernel='poly'),
    'SVR_sigmoid': SVR(kernel='sigmoid'),
    'SVR_rbf': SVR(kernel='rbf'),
    'HuberRegressor': HuberRegressor(),
    'RANSACRegressor': RANSACRegressor(),
    'TheilSenRegressor': TheilSenRegressor(),
    'KernelRidge': KernelRidge(),
    'OMP': OrthogonalMatchingPursuit(),
    'PoissonRegressor': PoissonRegressor(),
    'TweedieRegressor': TweedieRegressor(),
}
# Fit models and extract permutation importances
perm_importances = {}  # Initializing the dictionary
def plot_permutation_importance(model, X_train, y_train, X_test, y_test, title='', save_path='',
                                feature_names=None, dpi=1000):
    """Plot train-set and test-set permutation importances as two stacked box plots.

    Permutation importance is computed separately on each split with 10
    repeats and a fixed seed. No ``scoring`` is passed, so the model's own
    default score is used. Each panel is sorted by its own mean importance,
    least important feature first.

    Parameters
    ----------
    model : fitted estimator
        Model to inspect; it must already be fitted.
    X_train, y_train : array-like
        Training features and target.
    X_test, y_test : array-like
        Test features and target.
    title : str
        Model name shown in the panel titles and used as the file-name prefix.
    save_path : str
        Existing directory in which a PNG and a TIFF copy are written. Nothing
        is saved when empty.
    feature_names : sequence of str, optional
        One label per feature column. When omitted, the labels are taken from
        ``X_train.columns`` if present, otherwise from the module-level frame
        ``X`` (the behaviour of the original implementation).
    dpi : int
        Resolution of the saved images. Lower it if the files become too large.
    """
    if feature_names is None:
        # Scaled inputs are plain arrays without labels, hence the fallback to
        # the module-level feature frame.
        feature_names = getattr(X_train, 'columns', None)
        if feature_names is None:
            feature_names = X.columns
    feature_names = pd.Index(feature_names)

    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 12))

    splits = (('train', X_train, y_train), ('test', X_test, y_test))
    for axis, (split_name, features, target) in zip(axes, splits):
        result = permutation_importance(model, features, target, n_repeats=10, random_state=42, n_jobs=2)
        order = result.importances_mean.argsort()

        # One column per feature, one row per repeat, ready for a box plot.
        importances = pd.DataFrame(result.importances[order].T, columns=feature_names[order])

        importances.plot.box(vert=False, whis=10, ax=axis)
        axis.set_title(f"Permutation Importances ({split_name} set) - {title}")
        axis.axvline(x=0, color="k", linestyle="--")
        # The plotted quantity is the drop in the model's default score, which
        # is not an accuracy for regressors.
        axis.set_xlabel("Decrease in model score")

    plt.tight_layout()

    # Save before showing: some backends release the figure once it has been
    # displayed, which can leave the saved files blank.
    if save_path:
        png_path = os.path.join(save_path, f"{title}_Permutation_Importance.png")
        tiff_path = os.path.join(save_path, f"{title}_Permutation_Importance.tiff")
        fig.savefig(png_path, dpi=dpi)
        fig.savefig(tiff_path, dpi=dpi)

    plt.show()
    plt.close(fig)  # close the figure

# Create the output directory if it doesn't exist.
# exist_ok=True replaces the separate existence check, so there is no window
# between checking and creating.
save_directory = "path_to_save_directory"
os.makedirs(save_directory, exist_ok=True)

# Models to visualize
models_to_plot = ['XGBoostRegressor', 'Random Forest', 'Bagging', 'Gradient Boosting', "LightGBMRegressor"]

# Fit the models and visualize
for name in models_to_plot:
    model = regressors[name]
    model.fit(X_train, y_train)
    plot_permutation_importance(model, X_train, y_train, X_test, y_test, title=name, save_path=save_directory)



# To plot the train-set and test-set permutation importances together in a single graph, you can use the snippet below.

In [None]:
import pandas as pd  # Importing pandas library for data manipulation and analysis
import numpy as np  # Importing numpy library for numerical operations
import warnings  # Importing warnings to manage warnings during the runtime
import os  # Importing os for operating system dependent functionalities

# Importing necessary classes and functions from sklearn for model building, preprocessing, and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from xgboost import XGBRegressor
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt  # Importing matplotlib for plotting graphs

# Loading the dataset
data = pd.read_csv('path_to_your_data')

# Transforming the target variable by applying a logarithmic function to make the distribution more symmetric
data['Target_log'] = np.log(data['Target'] + 1)

# Data preparation by dropping irrelevant columns.
# 'Target_log' was just added to `data`, so both the raw and the transformed
# target are removed from the feature matrix explicitly; otherwise the models
# could read the answer straight from X. errors='ignore' keeps this harmless
# when the list above already names them.
X = data.drop(columns=['List_of_columns_to_drop'])
X = X.drop(columns=['Target', 'Target_log'], errors='ignore')
y = data['Target_log']  # Specifying the target variable

# Encoding categorical variables: every object-typed column is replaced by
# integer codes, using a fresh encoder per column.
for col in X.select_dtypes(include='object').columns:
    X[col] = LabelEncoder().fit_transform(X[col])

# Train-test split (70 % train / 30 % test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scaling: the scaler is fitted on the training split only and then applied
# to the test split. Both results are NumPy arrays, so the column names are
# only available through X.columns from here on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Best parameters
rf_best_params = {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
xgb_best_params = {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
bagging_best_params = {'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 30}
dt_best_params = {'criterion': 'friedman_mse', 'max_depth': 9, 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'best'}

# Create a dictionary of models.
# The tuned models unpack the *_best_params dictionaries defined earlier in
# this cell, so each tuned value is written in exactly one place instead of
# being repeated (and possibly drifting) in the constructor calls.
regressors = {
    'Random Forest': RandomForestRegressor(**rf_best_params, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'Bagging': BaggingRegressor(**bagging_best_params, random_state=42),
    'Decision Tree': DecisionTreeRegressor(**dt_best_params, random_state=42),
    'Support Vector': SVR(),
    'K-Neighbors': KNeighborsRegressor(),
    'RidgeCV': RidgeCV(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'SGD': SGDRegressor(),
    'Gaussian Process': GaussianProcessRegressor(),
    'BayesianRidge': BayesianRidge(),
    'XGBoostRegressor': XGBRegressor(**xgb_best_params, random_state=42),
    'LightGBMRegressor': LGBMRegressor(),
    'SVR_poly': SVR(kernel='poly'),
    'SVR_sigmoid': SVR(kernel='sigmoid'),
    'SVR_rbf': SVR(kernel='rbf'),
    'HuberRegressor': HuberRegressor(),
    'RANSACRegressor': RANSACRegressor(),
    'TheilSenRegressor': TheilSenRegressor(),
    'KernelRidge': KernelRidge(),
    'OMP': OrthogonalMatchingPursuit(),
    'PoissonRegressor': PoissonRegressor(),
    'TweedieRegressor': TweedieRegressor(),
}
# Fit models and extract permutation importances
perm_importances = {}  # Initializing the dictionary
from matplotlib.lines import Line2D

def plot_overlapped_permutation_importance(model, X_train, y_train, X_test, y_test, title='', save_path='',
                                           feature_names=None, dpi=1000):
    """Plot train-set and test-set permutation importances side by side in one chart.

    Each feature gets one row holding a blue box (train) and a red box (test).
    Permutation importance is computed on each split with 10 repeats and a
    fixed seed. No ``scoring`` is passed, so the model's own default score is
    used.

    Both splits share a single feature ordering (ascending mean importance on
    the training set). The y tick labels name the features in that order, so
    every box sits next to the feature it belongs to.

    Parameters
    ----------
    model : fitted estimator
        Model to inspect; it must already be fitted.
    X_train, y_train : array-like
        Training features and target.
    X_test, y_test : array-like
        Test features and target.
    title : str
        Model name shown in the chart title and used as the file-name prefix.
    save_path : str
        Existing directory in which a PNG and a TIFF copy are written. Nothing
        is saved when empty.
    feature_names : sequence of str, optional
        One label per feature column. When omitted, the labels are taken from
        ``X_train.columns`` if present, otherwise from the module-level frame
        ``X`` (the behaviour of the original implementation).
    dpi : int
        Resolution of the saved images. Lower it if the files become too large.
    """
    if feature_names is None:
        # Scaled inputs are plain arrays without labels, hence the fallback to
        # the module-level feature frame.
        feature_names = getattr(X_train, 'columns', None)
        if feature_names is None:
            feature_names = X.columns
    feature_names = pd.Index(feature_names)

    result_train = permutation_importance(model, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2)
    result_test = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)

    # Sorting the test results by their own ranking would pair test boxes with
    # the wrong tick labels whenever the two rankings differ, so the train
    # ordering is applied to both splits.
    order = result_train.importances_mean.argsort()
    ordered_names = feature_names[order]
    importances_train = pd.DataFrame(result_train.importances[order].T, columns=ordered_names)
    importances_test = pd.DataFrame(result_test.importances[order].T, columns=ordered_names)

    fig, ax = plt.subplots(figsize=(12, 12))

    # Rows are spaced 2.0 apart; the train box sits 0.4 below the row centre
    # and the test box 0.4 above it.
    row_centres = np.arange(len(ordered_names)) * 2.0
    series = ((importances_train, -0.4, 'blue'), (importances_test, 0.4, 'red'))
    for frame, offset, colour in series:
        frame.boxplot(ax=ax, vert=False, positions=row_centres + offset, widths=0.4,
                      boxprops=dict(color=colour), medianprops=dict(color=colour),
                      whiskerprops=dict(color=colour), capprops=dict(color=colour))

    # Tweaking the plot appearance
    ax.set_yticks(row_centres)
    ax.set_yticklabels(ordered_names)
    ax.axvline(x=0, color="k", linestyle="--")
    # The plotted quantity is the drop in the model's default score, which is
    # not an accuracy for regressors.
    ax.set_xlabel("Decrease in model score")
    ax.set_title(f"Overlapped Permutation Importances - {title}")
    ax.grid(False)

    # Custom legend
    custom_lines = [Line2D([0], [0], color="blue", lw=4), Line2D([0], [0], color="red", lw=4)]
    ax.legend(custom_lines, ['Train', 'Test'], loc="lower right")

    plt.tight_layout()

    # Save before showing: some backends release the figure once it has been
    # displayed, which can leave the saved files blank.
    if save_path:
        png_path = os.path.join(save_path, f"{title}_Overlapped_Permutation_Importance3.png")
        tiff_path = os.path.join(save_path, f"{title}_Overlapped_Permutation_Importance3.tiff")
        fig.savefig(png_path, dpi=dpi)
        fig.savefig(tiff_path, dpi=dpi)

    plt.show()
    plt.close(fig)  # close the figure
# Create the output directory if it doesn't exist.
# exist_ok=True replaces the separate existence check, so there is no window
# between checking and creating.
save_directory = "path_to_save_directory"
os.makedirs(save_directory, exist_ok=True)

# Models to visualize
models_to_plot = ['XGBoostRegressor', 'Random Forest', 'Bagging', 'Gradient Boosting', "LightGBMRegressor"]

# Fit the models and visualize.
# This cell is about the combined train/test chart, so it calls the overlapped
# plotting function defined in this cell, not the two-panel one.
for name in models_to_plot:
    model = regressors[name]
    model.fit(X_train, y_train)
    plot_overlapped_permutation_importance(model, X_train, y_train, X_test, y_test, title=name, save_path=save_directory)

# You can use the code below to get the hyperparameters of all models that are used for the regression. In addition to the train and test R^2 values, it also reports other regression metrics such as MAE, MSE, RMSE, and MAPE.

In [None]:
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import (HuberRegressor, RANSACRegressor, TheilSenRegressor,
                                  OrthogonalMatchingPursuit, PoissonRegressor,
                                  TweedieRegressor, RidgeCV, Lasso,
                                  ElasticNet, SGDRegressor, BayesianRidge)
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              AdaBoostRegressor, BaggingRegressor, StackingRegressor)
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from pygam import LinearGAM, s, f

# Define a function to calculate Mean Absolute Percentage Error
def mean_absolute_percentage_error(y_true, y_pred, eps=1e-6):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Each absolute error is divided by ``max(|y_true|, eps)``. Clamping the
    magnitude keeps the denominator away from zero for every sign of
    ``y_true``. Adding ``eps`` to the signed value instead would cancel out
    for ``y_true == -eps`` and shrink the denominator for other small
    negative targets.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.
    eps : float
        Smallest denominator magnitude allowed.

    Returns
    -------
    float
        Mean of the per-sample absolute percentage errors, times 100.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denominator = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

# Loading the dataset
data = pd.read_csv('path_to_your_data')

# Transforming the target variable by applying a logarithmic function to make the distribution more symmetric
data['Target_log'] = np.log(data['Target'] + 1)

# Data preparation by dropping irrelevant columns.
# 'Target_log' was just added to `data`, so both the raw and the transformed
# target are removed from the feature matrix explicitly; otherwise the models
# could read the answer straight from X. errors='ignore' keeps this harmless
# when the list above already names them.
X = data.drop(columns=['List_of_columns_to_drop'])
X = X.drop(columns=['Target', 'Target_log'], errors='ignore')
y = data['Target_log']  # Specifying the target variable


# Encode categorical features: every object-typed column is replaced by
# integer codes, using a fresh encoder per column.
for col in X.select_dtypes(include='object').columns:
    X[col] = LabelEncoder().fit_transform(X[col])

# Split the dataset into train and test sets (70 % / 30 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features: the scaler is fitted on the training split only
# and then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Candidate models, keyed by the label that appears in the results table.
# They are listed as ordered (label, estimator) pairs and turned into a dict at
# the end; the listing order is the order in which the models are fitted and
# reported.
_seed = {'random_state': 42}
_model_specs = [
    ('Random Forest', RandomForestRegressor(n_estimators=100, max_depth=None, min_samples_leaf=1,
                                            min_samples_split=2, **_seed)),
    ('Gradient Boosting', GradientBoostingRegressor(**_seed)),
    ('AdaBoost', AdaBoostRegressor(**_seed)),
    ('Bagging', BaggingRegressor(n_estimators=30, max_features=1.0, max_samples=1.0, **_seed)),
    ('Decision Tree', DecisionTreeRegressor(criterion='friedman_mse', max_depth=9, min_samples_leaf=2,
                                            min_samples_split=2, splitter='best', **_seed)),
    ('Support Vector', SVR()),
    ('K-Neighbors', KNeighborsRegressor()),
    ('RidgeCV', RidgeCV()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('SGD', SGDRegressor()),
    ('Gaussian Process', GaussianProcessRegressor()),
    ('BayesianRidge', BayesianRidge()),
    ('XGBoostRegressor', XGBRegressor(colsample_bytree=0.9, learning_rate=0.1, max_depth=7,
                                      n_estimators=200, subsample=0.8, **_seed)),
    ('LightGBMRegressor', LGBMRegressor()),
    ('SVR_poly', SVR(kernel='poly')),
    ('SVR_sigmoid', SVR(kernel='sigmoid')),
    ('SVR_rbf', SVR(kernel='rbf')),
    ('HuberRegressor', HuberRegressor()),
    ('SVR_linear', SVR(kernel='linear')),
    ('RANSACRegressor', RANSACRegressor()),
    ('TheilSenRegressor', TheilSenRegressor()),
    ('KernelRidge', KernelRidge()),
    ('OMP', OrthogonalMatchingPursuit()),
    ('PoissonRegressor', PoissonRegressor()),
    ('TweedieRegressor', TweedieRegressor()),
]
regressors = dict(_model_specs)

# Containers for the result rows (one dict per evaluated model)
results = []
stacked_results = []


def _error_metrics(actual, predicted):
    """Return (MSE, RMSE, MAE, MAPE) for one set of predictions."""
    mse = mean_squared_error(actual, predicted)
    return (
        mse,
        np.sqrt(mse),
        mean_absolute_error(actual, predicted),
        mean_absolute_percentage_error(actual, predicted),
    )


def _result_row(label, hyperparameters, train_pred, test_pred):
    """Build one results row, scoring against the module-level y_train / y_test."""
    train_mse, train_rmse, train_mae, train_mape = _error_metrics(y_train, train_pred)
    test_mse, test_rmse, test_mae, test_mape = _error_metrics(y_test, test_pred)
    return {
        'Model': label,
        'Hyperparameters': hyperparameters,  # kept as a nested dictionary
        'Train R2 Score': r2_score(y_train, train_pred),
        'Test R2 Score': r2_score(y_test, test_pred),
        'Train MSE': train_mse,
        'Train RMSE': train_rmse,
        'Train MAE': train_mae,
        'Train MAPE': train_mape,
        'Test MSE': test_mse,
        'Test RMSE': test_rmse,
        'Test MAE': test_mae,
        'Test MAPE': test_mape,
    }


# Fit every model, predict on both splits and record its metrics together with
# the hyperparameters reported by get_params().
for model_name, model in regressors.items():
    model.fit(X_train, y_train)
    results.append(
        _result_row(model_name, model.get_params(), model.predict(X_train), model.predict(X_test))
    )

# Define and fit a GAM model with one spline term on feature 0 and one on
# feature 1 (the only columns this formula references).
gam = LinearGAM(s(0) + s(1)).fit(X_train, y_train)
gam_train_pred = gam.predict(X_train)
gam_test_pred = gam.predict(X_test)

# The GAM's configuration is stored as the string form of its terms
gam_hyperparameters = {'terms': str(gam.terms)}

results.append(_result_row('GAM', gam_hyperparameters, gam_train_pred, gam_test_pred))

from pathlib import Path

# For stacked models, you can save the names of the models being stacked as hyperparameters.
# NOTE(review): `stacking_regressor` is not defined anywhere in this snippet as
# shown, and `stacked_results` is still empty at this point, so the loop body
# does not run. Define the stacking model before adding rows to stacked_results.
for stacked_model in stacked_results:
    # `estimators` is the list of (name, estimator) pairs passed to the
    # StackingRegressor constructor. The fitted `estimators_` attribute holds
    # bare estimator objects, which cannot be indexed with [0] to get a name.
    model_names = [name for name, _ in stacking_regressor.estimators]
    stacked_model['Hyperparameters'] = {'Models': model_names}

# Save results to DataFrames and then to CSV or Excel
results_df = pd.DataFrame(results)
stacked_results_df = pd.DataFrame(stacked_results)

results_path = Path('path_to_save_your_results/results_with_hyperparameters.csv')
stacked_results_path = Path('path_to_save_your_results_of_stacked_models/stacked_results_with_hyperparameters.csv')

for frame, csv_path in ((results_df, results_path), (stacked_results_df, stacked_results_path)):
    # to_csv does not create missing folders, so make sure the target exists.
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(csv_path, index=False)

