In [1]:
# Librerías necesarias
import os
import re  # Import the regular expression module

import pandas as pd
import numpy as np
import math

import matplotlib
matplotlib.use('TKAgg')
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import seaborn as sns

-------------------------------------------------------------------------------------------------------------------------

## 2. Machine Learning

### 2.1. Librerías

In [2]:
import warnings

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel as C
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.inspection import permutation_importance
from sklearn.exceptions import ConvergenceWarning

### 2.2. Cargar y convertir los datos preprocesados en matrices X, M y P

In [3]:
# Definir las rutas base y de las carpetas
base_path = os.getcwd()  # Se asume que el notebook se ejecuta desde la carpeta 'ML'
db_path = os.path.join(base_path, "DB_ML")
fig_path = os.path.join(base_path, "Figuras_ML")
model_path = os.path.join(base_path, "Modelos_ML")

# Ruta al archivo de la base de datos
data_file = os.path.join(db_path, "design_DB_preprocessed_200_Optimizado.csv")
print(data_file)

# Ruta al archivo de las figuras
figure_path = os.path.join(fig_path, "200_MOT_Optimizado")
print(figure_path)

# Ruta al archivo de los modelos
modelo_path = os.path.join(model_path, "200_MOT_Optimizado")
print(modelo_path)

C:\Users\s00244\Documents\GitHub\MotorDesignDataDriven\Notebooks\2.ML\DB_ML\design_DB_preprocessed_200_Optimizado.csv
C:\Users\s00244\Documents\GitHub\MotorDesignDataDriven\Notebooks\2.ML\Figuras_ML\200_MOT_Optimizado
C:\Users\s00244\Documents\GitHub\MotorDesignDataDriven\Notebooks\2.ML\Modelos_ML\200_MOT_Optimizado


In [4]:
# Lectura del archivo CSV
try:
    df = pd.read_csv(data_file)
    print("Archivo cargado exitosamente.")
except FileNotFoundError:
    print("Error: Archivo no encontrado. Revisa la ruta del archivo.")
except pd.errors.ParserError:
    print("Error: Problema al analizar el archivo CSV. Revisa el formato del archivo.")
except Exception as e:
    print(f"Ocurrió un error inesperado: {e}")

# Función para limpiar nombres de archivo inválidos
def clean_filename(name):
    return re.sub(r'[\\/*?:"<>|]', "_", name)

Archivo cargado exitosamente.


In [5]:
# Separa las columnas en matrices X, M y P
X_cols = [col for col in df.columns if col.startswith('x')]
M_cols = [col for col in df.columns if col.startswith('m')]
P_cols = [col for col in df.columns if col.startswith('p')]

X = df[X_cols].copy()
M = df[M_cols].copy()
P = df[P_cols].copy()

# Transforma todos los datos de X, M y P a numéricos
for col in X.columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')

for col in M.columns:
    M[col] = pd.to_numeric(M[col], errors='coerce')

for col in P.columns:
    P[col] = pd.to_numeric(P[col], errors='coerce')

In [6]:
# Mostrar las primeras filas del DataFrame
print("\nPrimeras filas del DataFrame:")
display(X.head())
display(M.head())
display(P.head())


Primeras filas del DataFrame:


Unnamed: 0,x1::OSD,x2::Dint,x3::L,x4::tm,x5::hs2,x6::wt,x7::Nt,x8::Nh
0,51.69,21.32,25.14,3.04,11.26,3.17,18,4
1,55.385006,21.566435,31.92032,2.741117,10.541636,2.004243,6,6
2,58.660824,24.61077,17.825636,3.236242,11.840792,2.327503,11,8
3,59.74599,22.251184,21.64142,2.75513,13.803262,3.92936,13,5
4,47.005493,25.875113,11.108705,3.487073,5.880454,2.494527,5,8


Unnamed: 0,m1::Drot,m2::Dsh,m3::he,m4::Rmag,m5::Rs,m6::GFF
0,20.32,8.148571,3.925,9.4,21.92,48.841316
1,20.566435,8.922362,6.36765,9.597938,21.324854,21.83812
2,23.61077,10.106636,5.184235,10.996324,24.146177,42.16453
3,21.251184,9.383442,4.944141,9.93681,24.928854,34.987854
4,24.875113,10.508077,4.684736,11.565787,18.81801,47.85282


Unnamed: 0,p1::W,p2::Tnom,p3::nnom,p4::GFF,p5::BSP_T,p6::BSP_n,p7::BSP_Mu,p8::MSP_n,p9::UWP_Mu
0,0.546655,0.11,3960.0,51.624046,0.623453,3276.9807,80.95138,5060.14,91.475174
1,0.626095,0.11,3960.0,23.672848,0.370497,11313.051,89.63181,10000.0,87.564896
2,0.536246,0.11,3960.0,45.401558,0.556664,8573.148,88.79658,10000.0,90.32792
3,0.586261,0.11,3960.0,36.230183,0.516459,5139.8037,85.65474,7697.3105,91.06665
4,0.285555,0.11,3960.0,58.693073,0.182915,30853.463,89.73517,10000.0,84.311806


In [7]:
# Estadísticas descriptivas del DataFrame
print("\nEstadísticas descriptivas:")
display(X.describe())
display(M.describe())
display(P.describe())


Estadísticas descriptivas:


Unnamed: 0,x1::OSD,x2::Dint,x3::L,x4::tm,x5::hs2,x6::wt,x7::Nt,x8::Nh
count,153.0,153.0,153.0,153.0,153.0,153.0,153.0,153.0
mean,57.206561,24.974375,24.535949,2.723073,9.879684,3.582695,10.830065,4.732026
std,3.170857,3.358642,7.364933,0.369546,2.394958,0.75068,5.260067,1.835349
min,47.005493,21.222431,10.159872,2.00033,5.087583,2.004243,5.0,3.0
25%,55.637726,22.560484,19.477915,2.495223,7.960513,3.032784,6.0,3.0
50%,58.381973,23.923805,23.839148,2.719184,9.677506,3.827702,10.0,4.0
75%,59.86078,26.364328,29.865805,2.966022,12.053759,4.121573,13.0,6.0
max,59.998142,35.462166,39.637146,3.491216,14.953374,4.992824,30.0,9.0


Unnamed: 0,m1::Drot,m2::Dsh,m3::he,m4::Rmag,m5::Rs,m6::GFF
count,153.0,153.0,153.0,153.0,153.0,153.0
mean,23.974375,11.392694,6.236409,11.306419,22.366872,35.843304
std,3.358642,2.455982,2.21543,1.674829,2.216387,11.478902
min,20.222431,8.000146,3.500736,9.341373,17.330286,9.223359
25%,21.560484,9.794789,4.355331,10.140299,20.422577,26.337347
50%,22.923805,10.83485,5.817081,10.789428,22.818666,35.45557
75%,25.364328,12.432346,7.597222,12.016282,24.16009,45.94149
max,34.462166,19.30075,12.659933,16.54583,26.144396,52.88478


Unnamed: 0,p1::W,p2::Tnom,p3::nnom,p4::GFF,p5::BSP_T,p6::BSP_n,p7::BSP_Mu,p8::MSP_n,p9::UWP_Mu
count,153.0,153.0,153.0,153.0,153.0,153.0,153.0,153.0,153.0
mean,0.600628,0.11,3960.0,39.211719,0.450224,8013.482863,87.826765,7715.212127,89.414906
std,0.144921,1.3923360000000002e-17,0.0,12.5183,0.182741,4771.283969,3.542545,2102.781898,2.737478
min,0.285555,0.11,3960.0,9.662383,0.118897,2808.8538,71.770645,4181.368,79.74299
25%,0.524043,0.11,3960.0,28.259974,0.321938,4341.959,86.00311,5627.8086,87.564896
50%,0.590717,0.11,3960.0,39.4531,0.439125,6350.7686,88.60971,7656.213,90.09386
75%,0.673055,0.11,3960.0,49.574917,0.555202,11022.882,90.26528,10000.0,91.63532
max,0.992858,0.11,3960.0,60.7227,1.02212,30853.463,93.193,10000.0,92.79634


### 2.3. Entrenamiento

In [8]:
# Concatena las matrices X y M
X_M = pd.concat([X, M], axis=1)
y = P

display(X_M.head())
display(y.head())

Unnamed: 0,x1::OSD,x2::Dint,x3::L,x4::tm,x5::hs2,x6::wt,x7::Nt,x8::Nh,m1::Drot,m2::Dsh,m3::he,m4::Rmag,m5::Rs,m6::GFF
0,51.69,21.32,25.14,3.04,11.26,3.17,18,4,20.32,8.148571,3.925,9.4,21.92,48.841316
1,55.385006,21.566435,31.92032,2.741117,10.541636,2.004243,6,6,20.566435,8.922362,6.36765,9.597938,21.324854,21.83812
2,58.660824,24.61077,17.825636,3.236242,11.840792,2.327503,11,8,23.61077,10.106636,5.184235,10.996324,24.146177,42.16453
3,59.74599,22.251184,21.64142,2.75513,13.803262,3.92936,13,5,21.251184,9.383442,4.944141,9.93681,24.928854,34.987854
4,47.005493,25.875113,11.108705,3.487073,5.880454,2.494527,5,8,24.875113,10.508077,4.684736,11.565787,18.81801,47.85282


Unnamed: 0,p1::W,p2::Tnom,p3::nnom,p4::GFF,p5::BSP_T,p6::BSP_n,p7::BSP_Mu,p8::MSP_n,p9::UWP_Mu
0,0.546655,0.11,3960.0,51.624046,0.623453,3276.9807,80.95138,5060.14,91.475174
1,0.626095,0.11,3960.0,23.672848,0.370497,11313.051,89.63181,10000.0,87.564896
2,0.536246,0.11,3960.0,45.401558,0.556664,8573.148,88.79658,10000.0,90.32792
3,0.586261,0.11,3960.0,36.230183,0.516459,5139.8037,85.65474,7697.3105,91.06665
4,0.285555,0.11,3960.0,58.693073,0.182915,30853.463,89.73517,10000.0,84.311806


In [9]:
# Escalado de datos
scaler_X = StandardScaler()
X_M_scaled = scaler_X.fit_transform(X_M)

scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y)

In [10]:
# División de datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_M_scaled, y_scaled, test_size=0.2, random_state=42)

### 2.4. Modelo subrogado: PLS (Partial Least Squares)

In [11]:
# Determinar número óptimo de componentes usando validación cruzada
mse = []
componentes = np.arange(1, min(len(X.columns), 20))

for n in componentes:
    pls = PLSRegression(n_components=n)
    scores = cross_val_score(pls, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    mse.append(-scores.mean())

# Gráfico para identificar el número óptimo de componentes
plt.figure(figsize=(10, 6))
plt.plot(componentes, mse, marker='o')
plt.xlabel('Número de componentes')
plt.ylabel('MSE (validación cruzada)')
plt.title('Selección del número óptimo de componentes para PLS')
plt.grid()
# Guardar la figura en la carpeta 'Figuras_EDA/(La carpeta que corresponda)'
figure_file = os.path.join(figure_path, 'seleccion_componentes_PLS.png')
plt.savefig(figure_file, dpi =1080)
plt.close()
# plt.show()

# Selección óptima de componentes (menor MSE)
n_componentes_optimos = componentes[np.argmin(mse)]
print(f'El número óptimo de componentes para modelar PLS es: {n_componentes_optimos}')

El número óptimo de componentes para modelar PLS es: 7


In [12]:
# Entrenamiento del modelo final PLS con número óptimo de componentes
pls_final = PLSRegression(n_components = n_componentes_optimos)
pls_final.fit(X_train, y_train)
predicciones_test_pls = pls_final.predict(X_test)

### 2.5. Modelo subrogado: LR (Regresión lineal)

In [13]:
# --- Regresión lineal (LR) ---
model_LR = LinearRegression()
model_LR.fit(X_train, y_train)
predicciones_test_lr = model_LR.predict(X_test)

### 2.6. Modelo subrogado: Kriging

In [14]:
# Ejemplo demostrativo para Kriging (predicciones ficticias)
'''
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
kernel_prueba = DotProduct() + WhiteKernel()
kriging = MultiOutputRegressor(GaussianProcessRegressor(kernel=kernel_prueba, random_state=0))
'''
# --- KRIGING (GPR MULTISALIDA) ---
#kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
kernel = C(1.0, (1e-4, 1e4)) * RBF(1.0, (1e-3, 1e3)) + WhiteKernel(noise_level=1, noise_level_bounds=(1e-4, 1e2))
kriging = MultiOutputRegressor(
    GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=5, normalize_y=True)
)
kriging.fit(X_train, y_train)
predicciones_test_kriging = kriging.predict(X_test)

ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


### 2.7. Modelo subrogado: Support Vector Regression (SVR)

In [15]:
# Modelo Support Vector Regression (SVR)
svr = MultiOutputRegressor(SVR(kernel='rbf', C=1.0, epsilon=0.1))
svr.fit(X_train, y_train)
predicciones_test_svr = svr.predict(X_test)

### 2.8. Modelo subrogado: Random Forest (RF)

In [16]:
# Modelo Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
predicciones_test_rf = rf.predict(X_test)

### 2.9. Análisis de resultados y visualización

In [17]:
# Confirmación de ejecución
display(predicciones_test_pls.shape,
        predicciones_test_lr.shape,
        predicciones_test_kriging.shape, 
        predicciones_test_svr.shape, 
        predicciones_test_rf.shape)

(31, 9)

(31, 9)

(31, 9)

(31, 9)

(31, 9)

In [18]:
# Gráfico de dispersión predicciones vs reales
def plot_predictions(y_true, y_pred, model_name, target_names, max_cols=3):
    """
    Visualiza resultados de regresión: reales vs predichos para cada variable de salida.
    Guarda la figura automáticamente con el nombre del modelo.

    Parámetros:
    - y_true, y_pred: arrays 2D (n_samples, n_targets)
    - target_names: lista con nombres de las columnas objetivo
    - model_name: nombre del modelo (str), usado para título y nombre del archivo
    - max_cols: número máximo de columnas en los subplots
    """
    
    n_targets = y_true.shape[1]
    n_cols = min(n_targets, max_cols)
    n_rows = math.ceil(n_targets / n_cols)

    sns.set(style='whitegrid')

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 4))
    axes = axes.flatten()  # Aplanamos para fácil acceso

    for i in range(n_targets):
        ax = axes[i]
        ax.scatter(y_true[:, i], y_pred[:, i], color='steelblue', edgecolor='black', alpha=0.7)
        ax.plot([y_true[:, i].min(), y_true[:, i].max()], 
                [y_true[:, i].min(), y_true[:, i].max()], 
                color='red', linestyle='--', linewidth=1.5)
        ax.set_xlabel("Valor real")
        ax.set_ylabel("Valor predicho")
        ax.set_title(target_names[i], fontsize=11)
        ax.ticklabel_format(style='scientific', axis='both', scilimits=(0, 0))
    
    # Eliminar subplots vacíos si sobran
    for j in range(n_targets, len(axes)):
        fig.delaxes(axes[j])

    fig.suptitle(f'Predicción_vs_ValorReal_Modelo_{model_name}', fontsize=16, y=1.02)
    plt.tight_layout()
    
    # Guardar imagen
    figure_file = os.path.join(figure_path, f'Predicción_vs_ValorReal_Modelo_{model_name}')
    plt.savefig(figure_file, dpi =1080, bbox_inches='tight')
    plt.close()
    # plt.show()

In [19]:
# Llamada a la función con tus datos ya existentes
plot_predictions(y_test, predicciones_test_pls, 'PLS', target_names=P.columns.tolist())
plot_predictions(y_test, predicciones_test_lr, 'LR', target_names=P.columns.tolist())
plot_predictions(y_test, predicciones_test_kriging, 'Kriging', target_names=P.columns.tolist())
plot_predictions(y_test, predicciones_test_svr, 'SVR', target_names=P.columns.tolist())
plot_predictions(y_test, predicciones_test_rf, 'Random_Forest', target_names=P.columns.tolist())

### 2.9. Comparativa entre modelos subrogados.

In [20]:
# Creación de tabla comparativa de métricas
evaluacion_modelos = pd.DataFrame({
    'Modelo': ['PLS', 'LR', 'Kriging', 'SVR', 'Random Forest'],
    'MSE': [
        mean_squared_error(y_test, predicciones_test_pls),
        mean_squared_error(y_test, predicciones_test_lr),
        mean_squared_error(y_test, predicciones_test_kriging),
        mean_squared_error(y_test, predicciones_test_svr),
        mean_squared_error(y_test, predicciones_test_rf)
    ],
    'RMSE': [
        np.sqrt(mean_squared_error(y_test, predicciones_test_pls)),
        np.sqrt(mean_squared_error(y_test, predicciones_test_lr)),
        np.sqrt(mean_squared_error(y_test, predicciones_test_kriging)),
        np.sqrt(mean_squared_error(y_test, predicciones_test_svr)),
        np.sqrt(mean_squared_error(y_test, predicciones_test_rf))
    ],
    'MAE': [
        mean_absolute_error(y_test, predicciones_test_pls),
        mean_absolute_error(y_test, predicciones_test_lr),
        mean_absolute_error(y_test, predicciones_test_kriging),
        mean_absolute_error(y_test, predicciones_test_svr),
        mean_absolute_error(y_test, predicciones_test_rf)
    ],
    'R2': [
        r2_score(y_test, predicciones_test_pls),
        r2_score(y_test, predicciones_test_lr),
        r2_score(y_test, predicciones_test_kriging),
        r2_score(y_test, predicciones_test_svr),
        r2_score(y_test, predicciones_test_rf)
    ]
})

In [21]:
# Mostrar y guardar tabla comparativa
print("\nTabla comparativa de métricas de evaluación:")
print(evaluacion_modelos)
# Ruta al archivo de los modelos
model_file = os.path.join(modelo_path, 'evaluacion_comparativa_modelos.csv')
evaluacion_modelos.to_csv(model_file, index=False)


Tabla comparativa de métricas de evaluación:
          Modelo       MSE      RMSE       MAE        R2
0            PLS  0.109162  0.330397  0.215843  0.889786
1             LR  0.099195  0.314952  0.204405  0.898817
2        Kriging  0.032613  0.180591  0.093898  0.966442
3            SVR  0.121031  0.347895  0.213282  0.881712
4  Random Forest  0.161536  0.401915  0.266798  0.839453


In [22]:
# Estilo visual
sns.set(style='whitegrid', font_scale=1.0)

# Paleta pastel específica para las barras (4 colores)
palette = sns.color_palette("pastel", 5)

# Crear figura con subplots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Lista de métricas
metrics = ['MSE', 'RMSE', 'MAE', 'R2']
titles = [
    'Error cuadrático medio (MSE)',
    'Raíz del Error cuadrático medio (RMSE)',
    'Error absoluto medio (MAE)',
    'Coeficiente de Determinación (R²)'
]

# Graficar cada métrica
for ax, metric, title in zip(axes.flatten(), metrics, titles):
    sns.barplot(
        data=evaluacion_modelos,
        x='Modelo',
        y=metric,
        hue='Modelo',
        palette=palette,
        ax=ax,
        legend=False  # Quitar leyenda repetida
    )
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('')
    ax.set_ylabel(metric)

# Ajustes finales
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.suptitle('Comparativa científica de métricas entre modelos', fontsize=16, fontweight='bold')
figure_file = os.path.join(figure_path, 'comparativa_r2_modelos.png')
plt.savefig(figure_file, dpi =1080)
plt.close()
#plt.show()