In [1]:
# Librerías necesarias
import os
import re  # Import the regular expression module

import pandas as pd
import numpy as np
import math

import matplotlib
matplotlib.use('TKAgg')
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import seaborn as sns

-------------------------------------------------------------------------------------------------------------------------

## 2. Machine Learning

### 2.1. Librerías

In [2]:
import warnings

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel as C
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.inspection import permutation_importance
from sklearn.exceptions import ConvergenceWarning

### 2.2. Cargar y convertir los datos preprocesados en matrices X, M y P

In [3]:
# Definir las rutas base y de las carpetas
base_path = os.getcwd()  # Se asume que el notebook se ejecuta desde la carpeta 'ML'
db_path = os.path.join(base_path, "DB_ML")
fig_path = os.path.join(base_path, "Figuras_ML")
model_path = os.path.join(base_path, "Modelos_ML")

# Ruta al archivo de la base de datos
data_file = os.path.join(db_path, "design_DB_preprocessed_400_Uniforme.csv")
print(data_file)

# Ruta al archivo de las figuras
figure_path = os.path.join(fig_path, "400_MOT_Uniforme")
print(figure_path)

# Ruta al archivo de los modelos
modelo_path = os.path.join(model_path, "400_MOT_Uniforme")
print(modelo_path)

C:\Users\s00244\Documents\GitHub\MotorDesignDataDriven\Notebooks\2.ML\DB_ML\design_DB_preprocessed_400_Uniforme.csv
C:\Users\s00244\Documents\GitHub\MotorDesignDataDriven\Notebooks\2.ML\Figuras_ML\400_MOT_Uniforme
C:\Users\s00244\Documents\GitHub\MotorDesignDataDriven\Notebooks\2.ML\Modelos_ML\400_MOT_Uniforme


In [4]:
# Lectura del archivo CSV
try:
    df = pd.read_csv(data_file)
    print("Archivo cargado exitosamente.")
except FileNotFoundError:
    print("Error: Archivo no encontrado. Revisa la ruta del archivo.")
except pd.errors.ParserError:
    print("Error: Problema al analizar el archivo CSV. Revisa el formato del archivo.")
except Exception as e:
    print(f"Ocurrió un error inesperado: {e}")

# Función para limpiar nombres de archivo inválidos
def clean_filename(name):
    return re.sub(r'[\\/*?:"<>|]', "_", name)

Archivo cargado exitosamente.


In [5]:
# Separa las columnas en matrices X, M y P
X_cols = [col for col in df.columns if col.startswith('x')]
M_cols = [col for col in df.columns if col.startswith('m')]
P_cols = [col for col in df.columns if col.startswith('p')]

X = df[X_cols].copy()
M = df[M_cols].copy()
P = df[P_cols].copy()

# Transforma todos los datos de X, M y P a numéricos
for col in X.columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')

for col in M.columns:
    M[col] = pd.to_numeric(M[col], errors='coerce')

for col in P.columns:
    P[col] = pd.to_numeric(P[col], errors='coerce')

In [6]:
# Mostrar las primeras filas del DataFrame
print("\nPrimeras filas del DataFrame:")
display(X.head())
display(M.head())
display(P.head())


Primeras filas del DataFrame:


Unnamed: 0,x1::OSD,x2::Dint,x3::L,x4::tm,x5::hs2,x6::wt,x7::Nt,x8::Nh
0,59.4,24.056,29.2,2.121244,10.249868,2.569301,12,3
1,54.72,32.0528,22.960001,2.456926,7.797124,2.123813,18,3
2,48.84,21.9616,25.12,3.032072,6.972909,2.557345,14,3
3,59.76,27.1024,29.680002,3.249535,8.141503,4.802138,10,3
4,55.224,31.55776,17.632002,2.836879,7.859266,3.907924,16,3


Unnamed: 0,m1::Drot,m2::Dsh,m3::he,m4::Rmag,m5::Rs,m6::GFF
0,23.056,11.940368,7.422132,10.997689,22.277868,22.423594
1,31.0528,16.981005,3.536476,14.912168,23.823524,34.12154
2,20.9616,8.622712,6.466291,9.722782,17.953709,49.140854
3,26.1024,11.859788,8.187296,12.238816,21.692703,34.091393
4,30.55776,15.8675,3.973854,14.56966,23.638145,38.995613


Unnamed: 0,p1::W,p2::Tnom,p3::nnom,p4::GFF,p5::BSP_T,p6::BSP_n,p7::BSP_Mu,p8::MSP_n,p9::UWP_Mu
0,0.674799,0.11,3960.0,24.67578,0.412852,4913.548,87.07682,5737.1406,88.79988
1,0.535554,0.11,3960.0,42.65237,0.538189,3806.537,83.929474,4325.1235,83.40234
2,0.487619,0.11,3960.0,57.017277,0.38092,5161.0967,87.04031,6293.4336,91.34349
3,0.749844,0.11,3960.0,37.44487,0.429127,4961.4146,89.36369,5615.511,91.807846
4,0.460805,0.11,3960.0,45.69534,0.487779,4143.4717,86.91331,4911.1543,90.34042


In [7]:
# Estadísticas descriptivas del DataFrame
print("\nEstadísticas descriptivas:")
display(X.describe())
display(M.describe())
display(P.describe())


Estadísticas descriptivas:


Unnamed: 0,x1::OSD,x2::Dint,x3::L,x4::tm,x5::hs2,x6::wt,x7::Nt,x8::Nh
count,315.0,315.0,315.0,315.0,315.0,315.0,315.0,315.0
mean,55.565992,26.791846,24.393259,2.713218,8.752433,3.273535,9.625397,5.288889
std,3.433244,4.000464,8.553884,0.437636,2.151207,0.837226,4.229031,1.846594
min,45.08352,21.236557,10.153601,2.010795,5.045013,2.005389,5.0,3.0
25%,53.87568,23.552582,17.101121,2.323603,6.932843,2.503218,6.0,4.0
50%,56.30112,26.17934,24.2656,2.715153,8.554401,3.228375,8.0,5.0
75%,58.29648,29.322464,31.69984,3.0992,10.195278,3.899526,12.0,7.0
max,59.97312,39.082367,39.9136,3.498405,14.653883,4.913541,30.0,9.0


Unnamed: 0,m1::Drot,m2::Dsh,m3::he,m4::Rmag,m5::Rs,m6::GFF
count,315.0,315.0,315.0,315.0,315.0,315.0
mean,25.791846,12.710597,5.63464,12.217618,22.148356,37.108624
std,4.000464,2.914776,1.723629,1.993692,2.044583,9.376851
min,20.236557,8.020473,3.50941,9.389298,16.757067,20.004059
25%,22.552582,10.464503,4.192027,10.607269,20.672936,29.226268
50%,25.17934,12.286472,5.227414,11.826285,22.321142,37.4416
75%,28.322464,14.478388,6.622397,13.468353,23.689929,44.912174
max,38.082367,21.391977,11.492725,18.198103,26.055086,54.93674


Unnamed: 0,p1::W,p2::Tnom,p3::nnom,p4::GFF,p5::BSP_T,p6::BSP_n,p7::BSP_Mu,p8::MSP_n,p9::UWP_Mu
count,315.0,315.0,315.0,315.0,315.0,315.0,315.0,315.0,315.0
mean,0.57721,0.11,3960.0,42.711075,0.483219,8720.246845,88.909854,8029.193061,88.521106
std,0.155096,0.0,0.0,10.763442,0.223525,4995.163473,2.711216,2034.646487,3.108416
min,0.262054,0.11,3960.0,21.384506,0.123103,2770.4639,77.51245,4001.9104,70.23855
25%,0.450948,0.11,3960.0,34.128508,0.331393,5157.05585,87.359907,6189.2632,87.09829
50%,0.56442,0.11,3960.0,43.603416,0.441778,7422.8433,89.436745,8600.202,89.317856
75%,0.685523,0.11,3960.0,51.378313,0.580932,10446.577,91.050338,10000.0,90.71166
max,0.945518,0.11,3960.0,63.25313,1.402811,34588.57,93.09264,10000.0,92.71753


### 2.3. Entrenamiento

In [8]:
# Concatena las matrices X y M
X_M = pd.concat([X, M], axis=1)
y = P

display(X_M.head())
display(y.head())

Unnamed: 0,x1::OSD,x2::Dint,x3::L,x4::tm,x5::hs2,x6::wt,x7::Nt,x8::Nh,m1::Drot,m2::Dsh,m3::he,m4::Rmag,m5::Rs,m6::GFF
0,59.4,24.056,29.2,2.121244,10.249868,2.569301,12,3,23.056,11.940368,7.422132,10.997689,22.277868,22.423594
1,54.72,32.0528,22.960001,2.456926,7.797124,2.123813,18,3,31.0528,16.981005,3.536476,14.912168,23.823524,34.12154
2,48.84,21.9616,25.12,3.032072,6.972909,2.557345,14,3,20.9616,8.622712,6.466291,9.722782,17.953709,49.140854
3,59.76,27.1024,29.680002,3.249535,8.141503,4.802138,10,3,26.1024,11.859788,8.187296,12.238816,21.692703,34.091393
4,55.224,31.55776,17.632002,2.836879,7.859266,3.907924,16,3,30.55776,15.8675,3.973854,14.56966,23.638145,38.995613


Unnamed: 0,p1::W,p2::Tnom,p3::nnom,p4::GFF,p5::BSP_T,p6::BSP_n,p7::BSP_Mu,p8::MSP_n,p9::UWP_Mu
0,0.674799,0.11,3960.0,24.67578,0.412852,4913.548,87.07682,5737.1406,88.79988
1,0.535554,0.11,3960.0,42.65237,0.538189,3806.537,83.929474,4325.1235,83.40234
2,0.487619,0.11,3960.0,57.017277,0.38092,5161.0967,87.04031,6293.4336,91.34349
3,0.749844,0.11,3960.0,37.44487,0.429127,4961.4146,89.36369,5615.511,91.807846
4,0.460805,0.11,3960.0,45.69534,0.487779,4143.4717,86.91331,4911.1543,90.34042


In [9]:
# Escalado de datos. En el modelo de entrenamiento SOLO se usan las variables de entrada X y las de salida P.
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
display(pd.DataFrame(X_scaled).head())

scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y)
display(pd.DataFrame(y_scaled).head())

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.118507,-0.68497,0.562831,-1.354814,0.697198,-0.84249,0.562394,-1.241492
1,-0.246804,1.317178,-0.167823,-0.586559,-0.444787,-1.375437,1.983416,-1.241492
2,-1.962195,-1.209342,0.085096,0.729742,-0.828538,-0.856794,1.036068,-1.241492
3,1.223531,0.077753,0.619035,1.227435,-0.284446,1.828701,0.08872,-1.241492
4,-0.09977,1.193236,-0.791689,0.283014,-0.415854,0.758933,1.509742,-1.241492


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.630214,0.0,0.0,-1.678273,-0.315307,-0.763289,-0.677169,-1.128304,0.089826
1,-0.269014,0.0,0.0,-0.005463,0.246312,-0.985259,-1.839877,-1.823394,-1.649364
2,-0.578571,0.0,0.0,1.331262,-0.458394,-0.713653,-0.690656,-0.854459,0.909426
3,1.11485,0.0,0.0,-0.490046,-0.242381,-0.753692,0.167659,-1.188178,1.059051
4,-0.751735,0.0,0.0,0.277701,0.02043,-0.917699,-0.737573,-1.53491,0.586218


In [10]:
# División de datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

### 2.4. Modelo subrogado: PLS (Partial Least Squares)

In [11]:
# Determinar número óptimo de componentes usando validación cruzada
mse = []
componentes = np.arange(1, min(len(X.columns), 20))

for n in componentes:
    pls = PLSRegression(n_components=n)
    scores = cross_val_score(pls, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    mse.append(-scores.mean())

# Gráfico para identificar el número óptimo de componentes
plt.figure(figsize=(10, 6))
plt.plot(componentes, mse, marker='o')
plt.xlabel('Número de componentes')
plt.ylabel('MSE (validación cruzada)')
plt.title('Selección del número óptimo de componentes para PLS')
plt.grid()
# Guardar la figura en la carpeta 'Figuras_EDA/(La carpeta que corresponda)'
figure_file = os.path.join(figure_path, 'seleccion_componentes_PLS.png')
plt.savefig(figure_file, dpi =1080)
plt.close()
# plt.show()

# Selección óptima de componentes (menor MSE)
n_componentes_optimos = componentes[np.argmin(mse)]
print(f'El número óptimo de componentes para modelar PLS es: {n_componentes_optimos}')

El número óptimo de componentes para modelar PLS es: 7


In [12]:
# Entrenamiento del modelo final PLS con número óptimo de componentes
pls_final = PLSRegression(n_components = n_componentes_optimos)
pls_final.fit(X_train, y_train)
predicciones_test_pls = pls_final.predict(X_test)

### 2.5. Modelo subrogado: LR (Regresión lineal)

In [13]:
# --- Regresión lineal (LR) ---
model_LR = LinearRegression()
model_LR.fit(X_train, y_train)
predicciones_test_lr = model_LR.predict(X_test)

### 2.6. Modelo subrogado: Kriging

In [14]:
# Ejemplo demostrativo para Kriging (predicciones ficticias)
'''
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
kernel_prueba = DotProduct() + WhiteKernel()
kriging = MultiOutputRegressor(GaussianProcessRegressor(kernel=kernel_prueba, random_state=0))
'''
# --- KRIGING (GPR MULTISALIDA) ---
#kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
kernel = C(1.0, (1e-4, 1e4)) * RBF(1.0, (1e-3, 1e3)) + WhiteKernel(noise_level=1, noise_level_bounds=(1e-4, 1e2))
kriging = MultiOutputRegressor(
    GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=5, normalize_y=True)
)
kriging.fit(X_train, y_train)
predicciones_test_kriging = kriging.predict(X_test)

ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


### 2.7. Modelo subrogado: Support Vector Regression (SVR)

In [15]:
# Modelo Support Vector Regression (SVR)
svr = MultiOutputRegressor(SVR(kernel='rbf', C=1.0, epsilon=0.1))
svr.fit(X_train, y_train)
predicciones_test_svr = svr.predict(X_test)

### 2.8. Modelo subrogado: Random Forest (RF)

In [16]:
# Modelo Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
predicciones_test_rf = rf.predict(X_test)

### 2.9. Análisis de resultados y visualización

In [17]:
# Confirmación de ejecución
display(predicciones_test_pls.shape,
        predicciones_test_lr.shape,
        predicciones_test_kriging.shape, 
        predicciones_test_svr.shape, 
        predicciones_test_rf.shape)

(63, 9)

(63, 9)

(63, 9)

(63, 9)

(63, 9)

In [18]:
# Gráfico de dispersión predicciones vs reales
def plot_predictions(y_true, y_pred, model_name, target_names, max_cols=3):
    """
    Visualiza resultados de regresión: reales vs predichos para cada variable de salida.
    Guarda la figura automáticamente con el nombre del modelo.

    Parámetros:
    - y_true, y_pred: arrays 2D (n_samples, n_targets)
    - target_names: lista con nombres de las columnas objetivo
    - model_name: nombre del modelo (str), usado para título y nombre del archivo
    - max_cols: número máximo de columnas en los subplots
    """
    
    n_targets = y_true.shape[1]
    n_cols = min(n_targets, max_cols)
    n_rows = math.ceil(n_targets / n_cols)

    sns.set(style='whitegrid')

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 4))
    axes = axes.flatten()  # Aplanamos para fácil acceso

    for i in range(n_targets):
        ax = axes[i]
        ax.scatter(y_true[:, i], y_pred[:, i], color='steelblue', edgecolor='black', alpha=0.7)
        ax.plot([y_true[:, i].min(), y_true[:, i].max()], 
                [y_true[:, i].min(), y_true[:, i].max()], 
                color='red', linestyle='--', linewidth=1.5)
        ax.set_xlabel("Valor real")
        ax.set_ylabel("Valor predicho")
        ax.set_title(target_names[i], fontsize=11)
        ax.ticklabel_format(style='scientific', axis='both', scilimits=(0, 0))
    
    # Eliminar subplots vacíos si sobran
    for j in range(n_targets, len(axes)):
        fig.delaxes(axes[j])

    fig.suptitle(f'Predicción_vs_ValorReal_Modelo_{model_name}', fontsize=16, y=1.02)
    plt.tight_layout()
    
    # Guardar imagen
    figure_file = os.path.join(figure_path, f'Predicción_vs_ValorReal_Modelo_{model_name}')
    plt.savefig(figure_file, dpi =1080, bbox_inches='tight')
    plt.close()
    # plt.show()

In [19]:
# Llamada a la función con tus datos ya existentes
plot_predictions(y_test, predicciones_test_pls, 'PLS', target_names=P.columns.tolist())
plot_predictions(y_test, predicciones_test_lr, 'LR', target_names=P.columns.tolist())
plot_predictions(y_test, predicciones_test_kriging, 'Kriging', target_names=P.columns.tolist())
plot_predictions(y_test, predicciones_test_svr, 'SVR', target_names=P.columns.tolist())
plot_predictions(y_test, predicciones_test_rf, 'Random_Forest', target_names=P.columns.tolist())

### 2.9. Comparativa entre modelos subrogados.

In [20]:
# Creación de tabla comparativa de métricas
evaluacion_modelos = pd.DataFrame({
    'Modelo': ['PLS', 'LR', 'Kriging', 'SVR', 'Random Forest'],
    'MSE': [
        mean_squared_error(y_test, predicciones_test_pls),
        mean_squared_error(y_test, predicciones_test_lr),
        mean_squared_error(y_test, predicciones_test_kriging),
        mean_squared_error(y_test, predicciones_test_svr),
        mean_squared_error(y_test, predicciones_test_rf)
    ],
    'RMSE': [
        np.sqrt(mean_squared_error(y_test, predicciones_test_pls)),
        np.sqrt(mean_squared_error(y_test, predicciones_test_lr)),
        np.sqrt(mean_squared_error(y_test, predicciones_test_kriging)),
        np.sqrt(mean_squared_error(y_test, predicciones_test_svr)),
        np.sqrt(mean_squared_error(y_test, predicciones_test_rf))
    ],
    'MAE': [
        mean_absolute_error(y_test, predicciones_test_pls),
        mean_absolute_error(y_test, predicciones_test_lr),
        mean_absolute_error(y_test, predicciones_test_kriging),
        mean_absolute_error(y_test, predicciones_test_svr),
        mean_absolute_error(y_test, predicciones_test_rf)
    ],
    'R2': [
        r2_score(y_test, predicciones_test_pls),
        r2_score(y_test, predicciones_test_lr),
        r2_score(y_test, predicciones_test_kriging),
        r2_score(y_test, predicciones_test_svr),
        r2_score(y_test, predicciones_test_rf)
    ]
})

In [21]:
# Mostrar y guardar tabla comparativa
print("\nTabla comparativa de métricas de evaluación:")
print(evaluacion_modelos)
# Ruta al archivo de los modelos
model_file = os.path.join(modelo_path, 'evaluacion_comparativa_modelos.csv')
evaluacion_modelos.to_csv(model_file, index=False)


Tabla comparativa de métricas de evaluación:
          Modelo       MSE      RMSE       MAE        R2
0            PLS  0.170943  0.413453  0.262652  0.826796
1             LR  0.171390  0.413993  0.262509  0.826334
2        Kriging  0.023520  0.153362  0.082868  0.975229
3            SVR  0.124596  0.352981  0.199394  0.875498
4  Random Forest  0.226633  0.476060  0.314072  0.778343


In [22]:
# Estilo visual
sns.set(style='whitegrid', font_scale=1.0)

# Paleta pastel específica para las barras (4 colores)
palette = sns.color_palette("pastel", 5)

# Crear figura con subplots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Lista de métricas
metrics = ['MSE', 'RMSE', 'MAE', 'R2']
titles = [
    'Error cuadrático medio (MSE)',
    'Raíz del Error cuadrático medio (RMSE)',
    'Error absoluto medio (MAE)',
    'Coeficiente de Determinación (R²)'
]

# Graficar cada métrica
for ax, metric, title in zip(axes.flatten(), metrics, titles):
    sns.barplot(
        data=evaluacion_modelos,
        x='Modelo',
        y=metric,
        hue='Modelo',
        palette=palette,
        ax=ax,
        legend=False  # Quitar leyenda repetida
    )
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('')
    ax.set_ylabel(metric)

# Ajustes finales
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.suptitle('Comparativa científica de métricas entre modelos', fontsize=16, fontweight='bold')
figure_file = os.path.join(figure_path, 'comparativa_r2_modelos.png')
plt.savefig(figure_file, dpi =1080)
plt.close()
#plt.show()