## Libraries

In [None]:
import os
import re
from itertools import combinations

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from matplotlib.patches import Ellipse

import plotly.express as px
import plotly.graph_objects as go

from scipy import stats
from scipy.io import loadmat, whosmat
from scipy.spatial.distance import pdist, squareform, cdist
from scipy.cluster.hierarchy import dendrogram, linkage
import scipy.cluster.hierarchy as sch
from scipy.linalg import inv

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import mutual_info_classif

import xgboost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from statsmodels.multivariate.manova import MANOVA

import pywt

import pycaret.classification as pyc
from pycaret.classification import *

import src
from src import config, loadmatNina
from src.preprocessing_utils import get_envelope


In [None]:
# Load the metrics table from CSV and take a first look at it:
# the full frame first, then just its column index.
metrics_path = "metrics_avg_by_repetition_tesis_3.csv"
summary_by_relabeled_200 = pd.read_csv(metrics_path)
display(summary_by_relabeled_200)
print(summary_by_relabeled_200.columns)


In [None]:
from sklearn.preprocessing import MinMaxScaler

# # Separar columnas que no se deben normalizar
# non_normalized_cols = ['subject', 'relabeled', 're_repetition']
# columns_to_normalize = [col for col in summary_by_relabeled_200.columns if col not in non_normalized_cols]

# # Inicializar el escalador
# scaler = MinMaxScaler()

# # Aplicar MinMaxScaler solo a las columnas deseadas
# normalized_data = scaler.fit_transform(summary_by_relabeled_200[columns_to_normalize])

# # Combinar columnas no normalizadas con las normalizadas
# pivoted_normalized = pd.concat(
#     [summary_by_relabeled_200[non_normalized_cols].reset_index(drop=True),
#      pd.DataFrame(normalized_data, columns=columns_to_normalize)],
#     axis=1
# )

In [None]:
# Columns removed from the table before any analysis: the per-channel
# window counters (channels 1-12) and the SSC metric (channels 1-10).
# NOTE(review): SSC for channels 11 and 12 is not listed here - confirm
# whether leaving those two columns in is intentional.
window_cols = [f'Channel {ch}_window_number' for ch in range(1, 13)]
ssc_cols = [f'Channel {ch}_SSC' for ch in (1, 10, 2, 3, 4, 5, 6, 7, 8, 9)]
cols_to_drop = window_cols + ssc_cols

# errors='ignore' lets the cell be re-run after the columns are already gone.
summary_by_relabeled_200 = summary_by_relabeled_200.drop(
    columns=cols_to_drop,
    errors='ignore',
)
print(list(summary_by_relabeled_200.columns))
display(summary_by_relabeled_200)

In [None]:
# Class balance of the target column: relative frequencies first, then raw counts.
target_values = summary_by_relabeled_200['relabeled']
print(target_values.value_counts(normalize=True))
print(target_values.value_counts())


In [None]:
# Work on a copy so the rescaling below never touches the original table.
df = summary_by_relabeled_200.copy()

# Every column other than the identifier/target columns is treated as a metric.
id_columns = {'subject', 'relabeled', 're_repetition'}
metric_columns = [col for col in df.columns if col not in id_columns]

# Min-max scale the metrics so all panels share a comparable y-range.
scaler = MinMaxScaler()
df[metric_columns] = scaler.fit_transform(df[metric_columns])

n_metrics = len(metric_columns)

# Two panels per row; round up so an odd number of metrics still fits.
n_rows = (n_metrics + 1) // 2
fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(14, 3 * n_rows))
axes = axes.flatten()

# Tilt the category labels only when there are more than five classes.
rotate_ticks = df['relabeled'].nunique() > 5

# One boxplot per metric, grouped by the 'relabeled' class.
for ax, metric in zip(axes, metric_columns):
    sns.boxplot(x='relabeled', y=metric, data=df, ax=ax, palette='viridis')
    ax.set_title(f'{metric} por grasp', fontsize=14)
    ax.set_xlabel('Grasp')
    ax.set_ylabel('Valor normalizado')
    if rotate_ticks:
        ax.tick_params(axis='x', rotation=45)

# Remove the unused trailing panel(s) of the grid.
for spare_ax in axes[n_metrics:]:
    fig.delaxes(spare_ax)

# Overall title and spacing.
plt.suptitle('Comparación de métricas EMG por grasp', fontsize=16)
plt.subplots_adjust(top=0.96, hspace=0.6)
plt.show()


In [None]:
# Bare expression: shows the current metrics table as the cell output.
summary_by_relabeled_200

In [None]:
# Work on a copy so the rescaling below never touches the original table.
df = summary_by_relabeled_200.copy()

# Every column other than the identifier/target columns is treated as a metric.
id_columns = {'subject', 'relabeled', 're_repetition'}
metric_columns = [col for col in df.columns if col not in id_columns]

# Min-max scale the metrics so all panels share a comparable y-range.
scaler = MinMaxScaler()
df[metric_columns] = scaler.fit_transform(df[metric_columns])

n_metrics = len(metric_columns)

# Two panels per row; round up so an odd number of metrics still fits.
n_rows = (n_metrics + 1) // 2
fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(14, 3 * n_rows))
axes = axes.flatten()

# Tilt the category labels only when there are more than five classes.
rotate_ticks = df['relabeled'].nunique() > 5

# One violin plot (with an inner box) per metric, grouped by the 'relabeled' class.
for ax, metric in zip(axes, metric_columns):
    sns.violinplot(x='relabeled', y=metric, data=df, ax=ax, palette='viridis', inner='box')
    ax.set_title(f'{metric} por grasp', fontsize=14)
    ax.set_xlabel('Grasp')
    ax.set_ylabel('Valor normalizado')
    if rotate_ticks:
        ax.tick_params(axis='x', rotation=45)

# Remove the unused trailing panel(s) of the grid.
for spare_ax in axes[n_metrics:]:
    fig.delaxes(spare_ax)

# Overall title and spacing.
plt.suptitle('Distribución de métricas EMG por grasp (Violin Plots)', fontsize=16)
plt.subplots_adjust(top=0.96, hspace=0.6)
plt.show()


## Feature Importance

In [None]:
# Rank every feature by its mutual information with the class label
# and show the ranking as a horizontal bar chart.
df = summary_by_relabeled_200.copy()

# Target is the 'relabeled' class; predictors are all remaining metric columns.
y = df['relabeled']
X = df.drop(columns=['subject', 'relabeled', 're_repetition'])

# All predictors are declared continuous; the seed fixes the estimator's randomness.
mi = mutual_info_classif(X, y, discrete_features=False, random_state=42)

# Most informative features first.
mi_df = (
    pd.DataFrame({'Feature': X.columns, 'Mutual Information': mi})
    .sort_values(by='Mutual Information', ascending=False)
)

# Tall figure: one bar per feature.
plt.figure(figsize=(10, 50))
sns.barplot(data=mi_df, x='Mutual Information', y='Feature', palette='magma')
plt.title('Importancia de características EMG según Información Mutua con grasp', fontsize=14)
plt.xlabel('Información Mutua')
plt.ylabel('Característica')
plt.tight_layout()
plt.show()


In [None]:
# Same mutual-information scores as a channel x metric heatmap.
df = summary_by_relabeled_200.copy()

# Target and predictors.
y = df['relabeled']
X = df.drop(columns=['subject', 'relabeled', 're_repetition'])

mi = mutual_info_classif(X, y, discrete_features=False, random_state=42)
mi_df = pd.DataFrame({'FullName': X.columns, 'Mutual Information': mi})

# Column names are expected to look like "Channel <n>_<metric>";
# split them into the channel part and the metric part.
name_parts = mi_df['FullName'].str.extract(r'(Channel \d+)_(.*)')
mi_df['Channel'] = name_parts[0]
mi_df['Feature'] = name_parts[1]

# Matrix layout: channels as rows, metrics as columns.
heatmap_df = mi_df.pivot(index='Channel', columns='Feature', values='Mutual Information')

# Sort rows by channel number instead of alphabetically: swap the labels
# for their integer, sort, then restore the "Channel <n>" labels.
channel_numbers = heatmap_df.index.str.extract(r'Channel (\d+)')[0].astype(int)
heatmap_df.index = channel_numbers
heatmap_df = heatmap_df.sort_index()
heatmap_df.index = [f'Channel {num}' for num in heatmap_df.index]

plt.figure(figsize=(12, 10))
sns.heatmap(
    heatmap_df,
    cmap='YlOrRd',
    annot=False,
    linewidths=0.5,
    linecolor='black',
    cbar_kws={'label': 'Mutual Information'},
)
plt.title('Heatmap de Importancia de Características EMG\n(mutual_info_classif)', fontsize=16)
plt.xlabel('Característica')
plt.ylabel('Canal EMG')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Feature ranking from a single decision tree fitted on the whole table
# (no train/test split), shown as a horizontal bar chart.
df = summary_by_relabeled_200.copy()

# Target and predictors.
y = df['relabeled']
X = df.drop(columns=['subject', 'relabeled', 're_repetition'])

# Default hyper-parameters; only the seed is fixed.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X, y)

# Importances reported by the fitted tree, largest first.
importances = tree_clf.feature_importances_
features_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
features_df = features_df.sort_values(by='Importance', ascending=False)

# Tall figure: one bar per feature.
plt.figure(figsize=(10, 50))
sns.barplot(data=features_df, x='Importance', y='Feature', palette='crest')
plt.title('Importancia de características EMG según Árbol de Decisión', fontsize=14)
plt.xlabel('Importancia')
plt.ylabel('Característica')
plt.tight_layout()
plt.show()


In [None]:
# Decision-tree feature importances laid out as a channel x metric heatmap.
df = summary_by_relabeled_200.copy()

# Target and predictors.
y = df['relabeled']
X = df.drop(columns=['subject', 'relabeled', 're_repetition'])

# Default hyper-parameters; only the seed is fixed.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X, y)
importances = tree_clf.feature_importances_

# One row per original column, keeping the full column name.
importance_df = pd.DataFrame({'FullName': X.columns, 'Importance': importances})

# Split "Channel <n>_<metric>" into channel and metric (e.g. 'Channel 1', 'RMS').
name_parts = importance_df['FullName'].str.extract(r'(Channel \d+)_(.*)')
importance_df['Channel'] = name_parts[0]
importance_df['Feature'] = name_parts[1]

# Matrix layout: channels as rows, metrics as columns.
heatmap_df = importance_df.pivot(index='Channel', columns='Feature', values='Importance')

# Sort rows by channel number instead of alphabetically: swap the labels
# for their integer, sort, then restore the "Channel <n>" labels.
channel_numbers = heatmap_df.index.str.extract(r'Channel (\d+)')[0].astype(int)
heatmap_df.index = channel_numbers
heatmap_df = heatmap_df.sort_index()
heatmap_df.index = [f'Channel {num}' for num in heatmap_df.index]

plt.figure(figsize=(12, 10))
sns.heatmap(
    heatmap_df,
    cmap='crest',
    annot=False,
    linewidths=0.5,
    linecolor='black',
    cbar_kws={'label': 'Feature Importance'},
)
plt.title('Heatmap de Importancia de Características EMG\n(Árbol de Decisión)', fontsize=16)
plt.xlabel('Característica')
plt.ylabel('Canal EMG')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Cross-validation splitter reused by the classifier cells: 7 shuffled folds, fixed seed.
kf = KFold(n_splits=7, shuffle=True, random_state=42)

# 1. Feature selection: skip the identifier/target columns and every column
#    whose name contains one of the excluded metric tags (case-insensitive).
excluded = ['ZC', 'ZC_STD', 'Kurt', 'Kurt_STD']
non_feature_cols = ['subject', 'relabeled', 'stimulus', 're_repetition']
excluded_tags = [tag.upper() for tag in excluded]

features = []
for col in summary_by_relabeled_200.columns:
    if col in non_feature_cols:
        continue
    col_upper = col.upper()
    if any(tag in col_upper for tag in excluded_tags):
        continue
    features.append(col)

X = summary_by_relabeled_200[features].values
y = summary_by_relabeled_200['relabeled'].values

# 2. Standardise the features (fitted on all rows).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. Fit a PCA with every component to obtain the cumulative explained variance.
pca_full = PCA().fit(X_scaled)
explained_variance = np.cumsum(pca_full.explained_variance_ratio_)

# Cumulative-variance curve with the 90% threshold marked.
component_counts = range(1, len(explained_variance) + 1)
plt.figure(figsize=(8, 5))
plt.plot(component_counts, explained_variance, marker='o')
plt.xlabel('Número de componentes principales')
plt.ylabel('Varianza explicada acumulada')
plt.title('Selección del número óptimo de componentes en PCA')
plt.grid(True)
plt.axhline(0.90, color='r', linestyle='--', label='90% de varianza explicada')
plt.legend()
plt.tight_layout()
plt.show()

# 4. Keep the smallest number of components whose cumulative variance reaches 90%.
#    argmax on the boolean mask returns the position of the first True.
first_hit = np.argmax(explained_variance >= 0.90)
n_components = first_hit + 1
print(f"✅ Número de componentes seleccionados: {n_components}")

pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# 5. Loadings table: one row per feature, one column per retained component.
pc_labels = [f'PC{k}' for k in range(1, n_components + 1)]
loadings = pd.DataFrame(pca.components_.T, index=features, columns=pc_labels)

# Features with the largest absolute loading on PC1.
print("Top 5 features por carga absoluta en PC1:")
display(loadings['PC1'].abs().sort_values(ascending=False).head(5))

# Everything below needs a second component.
if n_components >= 2:
    print("Top 5 features por carga absoluta en PC2:")
    display(loadings['PC2'].abs().sort_values(ascending=False).head(5))

    # 6. 2-D scatter of the first two components, coloured by class and
    #    with one marker symbol per subject.
    pca_df = pd.DataFrame(X_pca[:, :2], columns=['PC1', 'PC2'])
    pca_df['relabeled'] = y
    pca_df['subject'] = summary_by_relabeled_200['subject'].values

    fig = px.scatter(
        pca_df,
        x='PC1',
        y='PC2',
        color='relabeled',
        symbol='subject',
        title='Proyección PCA (PC1 vs PC2)',
        hover_data=['subject', 'relabeled'],
    )

    # Horizontal legend centred below the plot area.
    legend_style = dict(
        orientation='h',
        yanchor='bottom',
        y=-0.1,
        xanchor='center',
        x=0.5,
        font=dict(size=15),
    )
    fig.update_layout(width=1000, height=950, legend=legend_style)
    fig.show()


In [None]:
# Ten features with the largest sum of absolute loadings across all retained
# components (unweighted by explained variance); shown as the cell output.
loadings.abs().sum(axis=1).sort_values(ascending=False).head(10)

In [None]:
# Bar chart of absolute loadings for each of the first principal components.
#
# The number of retained components is data-dependent (chosen by the 90%
# variance rule), so iterate over the columns that actually exist instead of
# hard-coding PC1..PC5; indexing a missing 'PCk' column would raise KeyError.
max_pcs_to_plot = 5

for pc_name in loadings.columns[:max_pcs_to_plot]:
    # Features ordered from the largest to the smallest absolute loading.
    abs_loadings = loadings[pc_name].abs().sort_values(ascending=False)

    fig_pc = px.bar(
        abs_loadings,
        x=abs_loadings.index,
        y=abs_loadings.values,
        labels={'x': 'Feature', 'y': 'Carga absoluta'},
        title=f'Cargas absolutas en {pc_name}',
    )
    # Slanted tick labels keep the feature names readable.
    fig_pc.update_layout(xaxis_tickangle=-45)
    fig_pc.show()

In [None]:
# Global importance per feature: sum over components of
# |loading| * explained-variance ratio of that component.
explained_var = pca.explained_variance_ratio_[:n_components]  # one ratio per retained PC

# Broadcasting multiplies every column (PC) of the loadings by its variance ratio.
weighted_loadings = loadings.abs().values * explained_var  # shape: features x PCs

# Row-wise sum gives one score per feature.
global_importance = weighted_loadings.sum(axis=1)

# Full ranking, largest score first (the complete table stays in importance_df).
importance_df = pd.DataFrame({
    'Feature': loadings.index,
    'Importance': global_importance
}).sort_values(by='Importance', ascending=False)

# Show only the ten best rows so the output matches the "Top 10" heading;
# previously the whole table was displayed under that heading.
print("Top 10 features por importancia global (PCA):")
display(importance_df.head(10))

In [None]:
# Channel x metric heatmap of the variance-weighted PCA importance.

# 1. Split the combined name into channel and metric with a regular expression
#    (the separator may be "_", a space or "-").
#    NOTE: this overwrites 'Feature' with the metric part only, so re-running
#    this cell requires re-creating importance_df first.
importance_df[['Channel', 'Feature']] = (
    importance_df['Feature']
    .str.extract(r'^(Channel \d+)[ _-]*(.+)$')
)

# 2. Rows whose name did not match the pattern cannot be placed in the matrix.
#    Report them instead of dropping them silently.
bad_rows = importance_df[importance_df['Channel'].isna() | importance_df['Feature'].isna()]
if not bad_rows.empty:
    print(f"⚠️ {len(bad_rows)} filas descartadas: el nombre no sigue el patrón 'Channel <n>_<métrica>'")

importance_df = importance_df.dropna(subset=['Channel', 'Feature'])

# 3. Pivot to a channels x metrics matrix.
heatmap_df = importance_df.pivot(index='Channel',
                                 columns='Feature',
                                 values='Importance')

# 4. Sort the channels numerically.
#    The previous .assign(_n=<extracted DataFrame>) aligned the helper column on
#    the index; the extracted frame has a 0..n-1 index while heatmap_df is indexed
#    by "Channel <n>", so the helper was all NaN and the rows stayed in alphabetical
#    order. Sorting on an integer index (as the other heatmap cells do) avoids that.
heatmap_df.index = heatmap_df.index.str.extract(r'(\d+)')[0].astype(int)
heatmap_df = heatmap_df.sort_index()
heatmap_df.index = pd.Index([f'Channel {i}' for i in heatmap_df.index], name='Channel')

# 5. Draw the heatmap.
plt.figure(figsize=(14, 8))
sns.heatmap(
    heatmap_df,
    cmap='viridis',
    linewidths=.5,
    linecolor='black',
    cbar_kws={'label': 'Importancia global (PCA)'}
)
plt.title('Importancia de características EMG – PCA ponderada', fontsize=16)
plt.xlabel('Métrica')
plt.ylabel('Canal')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Print the explained-variance ratio of each retained component,
# both as a fraction and as a percentage.
explained_var = pca.explained_variance_ratio_
for pc_number, ratio in enumerate(explained_var, start=1):
    print(f'PC{pc_number}: {ratio:.4f} varianza explicada ({ratio*100:.2f}%)')


## Clasificadores

- PCA

In [None]:
# Logistic regression on the PCA scores, scored with the shared splitter `kf`.
# NOTE(review): X_pca comes from a scaler and a PCA fitted on all rows, so each
# validation fold has already influenced the projection; the accuracy is
# likely optimistic.
clf = LogisticRegression(max_iter=1000, random_state=42)
scores = cross_val_score(clf, X_pca, y, cv=kf, scoring='accuracy')

# Take the fold count from the splitter: the message used to say "5-fold"
# although `kf` is configured with 7 splits.
print(f"\nAccuracy {kf.get_n_splits()}-fold CV en espacio PCA ({n_components} componentes): {scores.mean():.3f} ± {scores.std():.3f}")


In [None]:
# Random forest (100 trees, fixed seed) on the PCA scores, scored with the
# shared splitter `kf` using the estimator's default score.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_scores = cross_val_score(rf_model, X_pca, y, cv=kf)

# Take the fold count from the splitter: the message used to say "5-fold"
# although `kf` is configured with 7 splits.
print(f"🎯 Accuracy {kf.get_n_splits()}-fold CV con Random Forest: {rf_scores.mean():.3f} ± {rf_scores.std():.3f}")


In [None]:
# k-nearest-neighbours classifier (k=5) on the PCA scores.
knn_model = KNeighborsClassifier(n_neighbors=5)

# Cross-validated evaluation with the shared splitter `kf`.
knn_scores = cross_val_score(knn_model, X_pca, y, cv=kf)

# Take the fold count from the splitter: the message used to say "5-fold"
# although `kf` is configured with 7 splits.
print(f"🎯 Accuracy {kf.get_n_splits()}-fold CV con KNN: {knn_scores.mean():.3f} ± {knn_scores.std():.3f}")

In [None]:
# RBF-kernel SVM on the PCA scores, scored with the shared splitter `kf`.
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_scores = cross_val_score(svm_model, X_pca, y, cv=kf)

# Take the fold count from the splitter: the message used to say "5-fold"
# although `kf` is configured with 7 splits.
print(f"🎯 Accuracy {kf.get_n_splits()}-fold CV con SVM (RBF kernel): {svm_scores.mean():.3f} ± {svm_scores.std():.3f}")

In [None]:
# Encode the class labels as consecutive integers
# (presumably because XGBClassifier needs numeric class labels).
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# XGBoost classifier.
xgb_clf = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)

# Cross-validation with the shared splitter `kf`.
xgb_scores = cross_val_score(xgb_clf, X_pca, y_encoded, cv=kf, scoring='accuracy')

# Take the fold count from the splitter: the message used to say "5-fold"
# although `kf` is configured with 7 splits.
print(f"🎯 Accuracy {kf.get_n_splits()}-fold CV con XGBoost: {xgb_scores.mean():.3f} ± {xgb_scores.std():.3f}")


In [None]:
# Encode the class labels as consecutive integers.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Neural network: a single hidden layer.
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(100,),  # one hidden layer with 100 units
    activation='relu',
    solver='adam',
    max_iter=300,
    random_state=42
)

# Cross-validation with the shared splitter `kf`.
mlp_scores = cross_val_score(mlp_clf, X_pca, y_encoded, cv=kf, scoring='accuracy')

# Take the fold count from the splitter: the message used to say "5-fold"
# although `kf` is configured with 7 splits.
print(f"🧠 Accuracy {kf.get_n_splits()}-fold CV con MLPClassifier: {mlp_scores.mean():.3f} ± {mlp_scores.std():.3f}")

In [None]:
# LightGBM classifier on the PCA scores. The hyper-parameters are spelled out
# explicitly; per the original note they mirror the model reported by PyCaret.
lgbm_params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.1,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=31,
    random_state=42,
    reg_alpha=0.0,
    reg_lambda=0.0,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
)
model = LGBMClassifier(**lgbm_params)

# Same fold settings as passed to PyCaret's setup (7 shuffled folds, seed 42).
kf = KFold(n_splits=7, shuffle=True, random_state=42)

# Cross-validated accuracy.
scores = cross_val_score(model, X_pca, y, cv=kf, scoring='accuracy')
mean_acc, std_acc = scores.mean(), scores.std()

print(f"LGBMClassifier Accuracy 7-fold CV: {mean_acc:.3f} ± {std_acc:.3f}")

In [None]:
# Build the PyCaret input table: one column per principal component plus the target.
pc_names = [f'PC{idx}' for idx in range(1, X_pca.shape[1] + 1)]
data = pd.DataFrame(X_pca, columns=pc_names)
data['relabeled'] = y

# Initialise the PyCaret experiment. The inputs are already PCA scores, so
# PyCaret's own normalisation and PCA steps are switched off.
clf = setup(
    data=data,
    target='relabeled',
    session_id=42,
    fold=7,
    fold_shuffle=True,
    normalize=False,
    feature_selection=True,
    pca=False,
    verbose=True,
)

# Rank the candidate models by accuracy and keep the best one.
best_model = compare_models(sort='Accuracy')

if best_model is None:
    print("No se encontró un modelo válido para tunear.")
else:
    # Tune the winner, refit it with finalize_model, then open the evaluation view.
    tuned_model = tune_model(best_model, optimize='Accuracy', n_iter=50)
    final_model = finalize_model(tuned_model)
    evaluate_model(final_model)


- Without PCA

In [None]:
from pycaret.classification import *

# Make sure the target column is of string type.
summary_by_relabeled_200['relabeled'] = summary_by_relabeled_200['relabeled'].astype(str)

# Configure PyCaret on the raw (non-PCA) metrics table.
clf = setup(
    data=summary_by_relabeled_200,
    target='relabeled',
    # 'subject' and 're_repetition' are bookkeeping columns that every other
    # modelling cell drops from X; without this they would be used as
    # predictors here and make the results incomparable with those cells.
    ignore_features=['subject', 're_repetition'],
    session_id=42,
    fold=7,
    fold_shuffle=True,
    normalize=True,
    feature_selection=False,
    pca=False,
    data_split_stratify=True,
    use_gpu=False  # switch to True if a GPU is available and should be used
)

# Compare a fixed list of models (including the MLP), ranked by accuracy.
models_to_compare = compare_models(
    sort='Accuracy',
    include=['mlp', 'lr', 'rf', 'xgboost', 'lightgbm', 'dt', 'knn', 'et', 'svm']  # add or remove model ids here
)

# Tune, finalise and evaluate the best model, if one was returned.
if models_to_compare is not None:
    best_model = models_to_compare
    tuned_model = tune_model(best_model, optimize='Accuracy', n_iter=80)
    final_model = finalize_model(tuned_model)
    evaluate_model(final_model)
else:
    print("No se encontró un modelo válido para tunear.")



## Independent Models

- Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    make_scorer, accuracy_score, f1_score, recall_score, precision_score,
    cohen_kappa_score, matthews_corrcoef, confusion_matrix
)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Feature matrix and string-typed target.
X = summary_by_relabeled_200.drop(columns=['relabeled', 'subject', 're_repetition'])
y = summary_by_relabeled_200['relabeled'].astype(str)
class_labels = np.unique(y)  # fixed label order for every confusion matrix

# 2. Scaler + logistic regression in one pipeline, so the scaler is
#    re-fitted on the training part of every split.
log_reg = LogisticRegression(
    max_iter=1000,
    random_state=42,
    multi_class='multinomial',
    solver='lbfgs',
)
model = make_pipeline(StandardScaler(), log_reg)

# 3. Stratified 7-fold cross-validation.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# 4. Metrics: four built-in scorers by name plus two wrapped metric functions.
scoring = {name: name for name in ('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')}
scoring['kappa'] = make_scorer(cohen_kappa_score)
scoring['mcc'] = make_scorer(matthews_corrcoef)

# 5. Cross-validated metrics (validation scores only).
results = cross_validate(
    model, X, y, cv=cv, scoring=scoring, return_train_score=False
)

print("✅ Métricas promedio en validación cruzada (7 folds):")
for key, fold_values in results.items():
    # Skip the timing entries; only the "test_*" keys hold scores.
    if "test" not in key:
        continue
    metric_name = key.replace('test_', '').upper()
    print(f"{metric_name:<15}: {np.mean(fold_values):.4f}")

# 6. Confusion matrix per fold, accumulating predictions for a global one.
all_y_true = []
all_y_pred = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    all_y_true += list(y_val)
    all_y_pred += list(y_pred)

    # Matrix for this fold.
    cm = confusion_matrix(y_val, y_pred, labels=class_labels)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Matriz de confusión - Fold {fold}')
    plt.xlabel("Predicho")
    plt.ylabel("Verdadero")
    plt.tight_layout()
    plt.show()

# Matrix over the pooled validation predictions of all folds.
cm_total = confusion_matrix(all_y_true, all_y_pred, labels=class_labels)
plt.figure(figsize=(7, 6))
sns.heatmap(cm_total, annot=True, fmt="d", cmap="Greens",
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Matriz de confusión global (todos los folds)')
plt.xlabel("Predicho")
plt.ylabel("Verdadero")
plt.tight_layout()
plt.show()

# 7. Learning curve: accuracy against training-set size (10 sizes from 10% to 100%).
train_sizes, train_scores, test_scores = learning_curve(
    model, X, y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=cv,
    scoring='accuracy',
    shuffle=True,
    random_state=42,
    n_jobs=-1,
)

# Mean and standard deviation across folds for each training size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Mean curve with a +/- one standard deviation band, training first, then validation.
plt.figure(figsize=(8, 6))
for mean_curve, std_curve, color, label in (
    (train_mean, train_std, 'blue', 'Accuracy Entrenamiento'),
    (test_mean, test_std, 'green', 'Accuracy Validación'),
):
    plt.plot(train_sizes, mean_curve, 'o-', color=color, label=label)
    plt.fill_between(train_sizes, mean_curve - std_curve, mean_curve + std_curve,
                     alpha=0.2, color=color)

plt.title('Curva de Aprendizaje - Logistic Regression')
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, f1_score, recall_score, precision_score,
    cohen_kappa_score, matthews_corrcoef, confusion_matrix, log_loss
)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from copy import deepcopy

# 1. Prepare data: feature matrix, string-typed target and the full class list
#    (partial_fit needs every class declared up front).
X = summary_by_relabeled_200.drop(columns=['relabeled', 'subject', 're_repetition'])
y = summary_by_relabeled_200['relabeled'].astype(str)
classes = np.unique(y)

# 2. Configuration
max_epochs = 500
patience = 3  # epochs without validation-loss improvement before stopping
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# Pooled validation labels/predictions across folds.
all_y_true = []
all_y_pred = []

# Accuracy curves, one list per fold.
all_train_acc = []
all_val_acc = []

# 3. Manual cross-validation with early stopping
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    print(f"\n🔁 Fold {fold}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Scale using statistics from the training part only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # SGD model with logistic loss; it is trained one pass at a time via
    # partial_fit in the epoch loop below.
    model = SGDClassifier(
        loss='log_loss',
        max_iter=1,
        tol=None,
        learning_rate='optimal',
        random_state=42
    )

    best_val_loss = np.inf
    best_model = None  # reset per fold so a previous fold's model can never leak in
    patience_counter = 0
    train_acc_history = []
    val_acc_history = []

    for epoch in range(max_epochs):
        model.partial_fit(X_train_scaled, y_train, classes=classes)

        # Accuracy on both splits after this epoch.
        y_train_pred = model.predict(X_train_scaled)
        y_val_pred = model.predict(X_val_scaled)
        train_acc = accuracy_score(y_train, y_train_pred)
        val_acc = accuracy_score(y_val, y_val_pred)
        train_acc_history.append(train_acc)
        val_acc_history.append(val_acc)

        # Early stopping driven by the validation log-loss.
        val_loss = log_loss(y_val, model.predict_proba(X_val_scaled), labels=classes)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Snapshot the weights. `best_model = model` only stored a reference,
            # so later partial_fit calls kept mutating the "best" model and the
            # fold was evaluated with the last-epoch weights instead.
            best_model = deepcopy(model)
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"⏹️ Early stopping en epoch {epoch+1}")
                break

    # Fallback if the loss never improved (e.g. it was NaN from the first epoch).
    if best_model is None:
        best_model = model

    # Store this fold's predictions from the best snapshot.
    y_pred_fold = best_model.predict(X_val_scaled)
    all_y_true.extend(y_val)
    all_y_pred.extend(y_pred_fold)

    # Confusion matrix for this fold.
    cm = confusion_matrix(y_val, y_pred_fold, labels=classes)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=classes, yticklabels=classes)
    plt.title(f'Matriz de confusión - Fold {fold}')
    plt.xlabel("Predicho")
    plt.ylabel("Verdadero")
    plt.tight_layout()
    plt.show()

    # Keep the curves.
    all_train_acc.append(train_acc_history)
    all_val_acc.append(val_acc_history)

# 4. Confusion matrix over the pooled predictions of all folds
cm_total = confusion_matrix(all_y_true, all_y_pred, labels=classes)
plt.figure(figsize=(7, 6))
sns.heatmap(cm_total, annot=True, fmt="d", cmap="Greens", xticklabels=classes, yticklabels=classes)
plt.title('Matriz de confusión global (todos los folds)')
plt.xlabel("Predicho")
plt.ylabel("Verdadero")
plt.tight_layout()
plt.show()

# 5. Accuracy vs. epoch, averaged over folds. Folds stop at different epochs,
#    so shorter histories are padded with NaN and ignored by nanmean.
max_len = max(len(acc) for acc in all_val_acc)
train_acc_array = np.array([np.pad(acc, (0, max_len - len(acc)), constant_values=np.nan) for acc in all_train_acc])
val_acc_array = np.array([np.pad(acc, (0, max_len - len(acc)), constant_values=np.nan) for acc in all_val_acc])

mean_train = np.nanmean(train_acc_array, axis=0)
mean_val = np.nanmean(val_acc_array, axis=0)

plt.figure(figsize=(8, 6))
plt.plot(range(1, max_len + 1), mean_train, label='Accuracy Entrenamiento', color='blue')
plt.plot(range(1, max_len + 1), mean_val, label='Accuracy Validación', color='green')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Curva de Accuracy vs. Epoch (con early stopping)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# 6. Global metrics (macro-averaged) on the pooled predictions
accuracy = accuracy_score(all_y_true, all_y_pred)
f1 = f1_score(all_y_true, all_y_pred, average='macro')
recall = recall_score(all_y_true, all_y_pred, average='macro')
precision = precision_score(all_y_true, all_y_pred, average='macro')
kappa = cohen_kappa_score(all_y_true, all_y_pred)
mcc = matthews_corrcoef(all_y_true, all_y_pred)

# Report
print("\n📊 Métricas globales (macro):")
print(f"Accuracy          : {accuracy:.4f}")
print(f"F1 Score (macro)  : {f1:.4f}")
print(f"Recall (macro)    : {recall:.4f}")
print(f"Precision (macro) : {precision:.4f}")
print(f"Cohen's Kappa     : {kappa:.4f}")
print(f"Matthews CorrCoef : {mcc:.4f}")



- Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, matthews_corrcoef, confusion_matrix
)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Target and feature matrix.
#    The label is cast to str so every class is handled as a category; the
#    label and the two identifier columns are excluded from the features.
y = summary_by_relabeled_200['relabeled'].astype(str)
X = summary_by_relabeled_200.drop(columns=['relabeled', 'subject', 're_repetition'])

# Sorted class names (lexicographic, since the labels are strings); reused as
# row/column order and tick labels of every confusion matrix below.
class_labels = np.unique(y)


def _draw_confusion(matrix, tick_labels, *, title, cmap, figsize):
    """Show one confusion matrix as an annotated heatmap."""
    plt.figure(figsize=figsize)
    sns.heatmap(matrix, annot=True, fmt="d", cmap=cmap,
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.title(title)
    plt.xlabel("Predicho")
    plt.ylabel("Verdadero")
    plt.tight_layout()
    plt.show()


# 2. Pipeline: standardise the features, then fit the Extra Trees ensemble.
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model = make_pipeline(StandardScaler(), extra_trees)

# 3. Stratified 7-fold cross-validation with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# 4. Metrics: four built-in scorer names plus two wrapped metric functions.
scoring = {
    name: name
    for name in ('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')
}
scoring['kappa'] = make_scorer(cohen_kappa_score)
scoring['mcc'] = make_scorer(matthews_corrcoef)

# 5. Cross-validated evaluation; only the test-side entries are reported.
results = cross_validate(
    model, X, y, cv=cv, scoring=scoring, return_train_score=False
)

print("✅ ExtraTreesClassifier - Métricas promedio en validación cruzada (7 folds):")
for key, fold_scores in results.items():
    if "test" not in key:
        continue
    print(f"{key.replace('test_', '').upper():<15}: {np.mean(fold_scores):.4f}")

# 6. Per-fold confusion matrices. The pipeline is refitted on every training
#    split and the held-out predictions are pooled for the global matrix.
all_y_true, all_y_pred = [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    all_y_true.extend(y_val)
    all_y_pred.extend(y_pred)

    cm = confusion_matrix(y_val, y_pred, labels=class_labels)
    _draw_confusion(
        cm, class_labels,
        title=f'Matriz de confusión - Fold {fold}',
        cmap="Blues", figsize=(6, 5),
    )

# 7. Global confusion matrix over the pooled held-out predictions.
cm_total = confusion_matrix(all_y_true, all_y_pred, labels=class_labels)
_draw_confusion(
    cm_total, class_labels,
    title='Matriz de confusión global (todos los folds)',
    cmap="Greens", figsize=(7, 6),
)

# 8. Learning curve: accuracy as a function of the training-set size.
train_sizes, train_scores, test_scores = learning_curve(
    model, X, y,
    cv=cv,
    scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10),
    n_jobs=-1,
    shuffle=True,
    random_state=42,
)

# Mean and spread across the CV folds for every training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Plot both curves with a +/- one standard deviation band.
plt.figure(figsize=(8, 6))
for curve_mean, curve_std, curve_label, curve_color in (
    (train_mean, train_std, "Accuracy Entrenamiento", 'blue'),
    (test_mean, test_std, "Accuracy Validación", 'green'),
):
    plt.plot(train_sizes, curve_mean, 'o-', label=curve_label, color=curve_color)
    plt.fill_between(
        train_sizes, curve_mean - curve_std, curve_mean + curve_std,
        alpha=0.2, color=curve_color,
    )

plt.title('Curva de Aprendizaje - ExtraTreesClassifier')
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.grid(True)
plt.tight_layout()
plt.show()



In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, matthews_corrcoef, confusion_matrix
)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Feature matrix and target.
#    The label and the two identifier columns are removed from the features;
#    the label is cast to str so every class is handled as a category.
X = summary_by_relabeled_200.drop(columns=['relabeled', 'subject', 're_repetition'])
y = summary_by_relabeled_200['relabeled'].astype(str)

# Sorted class names (lexicographic, since the labels are strings); reused as
# row/column order and tick labels of every confusion matrix below.
class_labels = np.unique(y)

# 2. Pipeline: standardise the features, then fit a regularised Extra Trees
#    (larger minimum split / leaf sizes than the defaults).
#    random_state is pinned so that the cross_validate() metrics and the
#    per-fold confusion matrices below are produced by identical forests and
#    the cell gives the same numbers on every run.
model = make_pipeline(
    StandardScaler(),
    ExtraTreesClassifier(min_samples_split=5, min_samples_leaf=3, random_state=42)
)

# 3. Stratified 7-fold cross-validation with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# 4. Metrics: built-in scorer names plus two wrapped metric functions.
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'recall_macro': 'recall_macro',
    'precision_macro': 'precision_macro',
    'kappa': make_scorer(cohen_kappa_score),
    'mcc': make_scorer(matthews_corrcoef),
}

# 5. Cross-validated evaluation; only the test-side entries are reported.
results = cross_validate(
    model, X, y, cv=cv, scoring=scoring, return_train_score=False
)

print("✅ ExtraTreesClassifier - Métricas promedio en validación cruzada (7 folds):")
for metric, values in results.items():
    if "test" in metric:
        print(f"{metric.replace('test_', '').upper():<15}: {np.mean(values):.4f}")

# 6. Per-fold confusion matrices. The pipeline is refitted on every training
#    split and the held-out predictions are pooled for the global matrix.
all_y_true = []
all_y_pred = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    all_y_true.extend(y_val)
    all_y_pred.extend(y_pred)

    cm = confusion_matrix(y_val, y_pred, labels=class_labels)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Matriz de confusión - Fold {fold}')
    plt.xlabel("Predicho")
    plt.ylabel("Verdadero")
    plt.tight_layout()
    plt.show()

# 7. Global confusion matrix over the pooled held-out predictions.
cm_total = confusion_matrix(all_y_true, all_y_pred, labels=class_labels)
plt.figure(figsize=(7, 6))
sns.heatmap(cm_total, annot=True, fmt="d", cmap="Greens", xticklabels=class_labels, yticklabels=class_labels)
plt.title('Matriz de confusión global (todos los folds)')
plt.xlabel("Predicho")
plt.ylabel("Verdadero")
plt.tight_layout()
plt.show()

# 8. Learning curve: accuracy as a function of the training-set size.
train_sizes, train_scores, test_scores = learning_curve(
    model, X, y, cv=cv, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10), n_jobs=-1, shuffle=True, random_state=42
)

# Mean and standard deviation across the CV folds for every training size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Plot both curves with a +/- one standard deviation band.
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_mean, 'o-', label="Accuracy Entrenamiento", color='blue')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2, color='blue')

plt.plot(train_sizes, test_mean, 'o-', label="Accuracy Validación", color='green')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.2, color='green')

plt.title('Curva de Aprendizaje - ExtraTreesClassifier')
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.grid(True)
plt.tight_layout()
plt.show()



- SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_validate, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    make_scorer, accuracy_score, f1_score, recall_score, precision_score,
    cohen_kappa_score, matthews_corrcoef, confusion_matrix
)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Feature matrix and target.
#    The label and the two identifier columns are removed from the features;
#    the label is cast to str so every class is handled as a category.
X = summary_by_relabeled_200.drop(columns=['relabeled', 'subject', 're_repetition'])
y = summary_by_relabeled_200['relabeled'].astype(str)

# Sorted class names (lexicographic, since the labels are strings); reused as
# row/column order and tick labels of every confusion matrix below.
class_labels = np.unique(y)

# 2. Pipeline: standardise the features, then fit an RBF-kernel SVM with the
#    default C / gamma. Only predict() is used in this cell; probability=True
#    is kept from the original configuration.
model = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', probability=True, random_state=42)
)

# 3. Stratified 7-fold cross-validation with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# 4. Metrics: built-in scorer names plus two wrapped metric functions.
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'recall_macro': 'recall_macro',
    'precision_macro': 'precision_macro',
    'kappa': make_scorer(cohen_kappa_score),
    'mcc': make_scorer(matthews_corrcoef),
}

# 5. Cross-validated evaluation; only the test-side entries are reported.
results = cross_validate(
    model, X, y, cv=cv, scoring=scoring, return_train_score=False
)

print("✅ SVM - Métricas promedio en validación cruzada (7 folds):")
for metric, values in results.items():
    if "test" in metric:
        print(f"{metric.replace('test_', '').upper():<15}: {np.mean(values):.4f}")

# 6. Per-fold and global confusion matrices. The pipeline is refitted on
#    every training split and the held-out predictions are pooled.
all_y_true = []
all_y_pred = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    all_y_true.extend(y_val)
    all_y_pred.extend(y_pred)

    # Confusion matrix of this fold
    cm = confusion_matrix(y_val, y_pred, labels=class_labels)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Matriz de confusión - Fold {fold}')
    plt.xlabel("Predicho")
    plt.ylabel("Verdadero")
    plt.tight_layout()
    plt.show()

# Global confusion matrix over the pooled held-out predictions
cm_total = confusion_matrix(all_y_true, all_y_pred, labels=class_labels)
plt.figure(figsize=(7, 6))
sns.heatmap(cm_total, annot=True, fmt="d", cmap="Greens", xticklabels=class_labels, yticklabels=class_labels)
plt.title('Matriz de confusión global (todos los folds)')
plt.xlabel("Predicho")
plt.ylabel("Verdadero")
plt.tight_layout()
plt.show()

# 7. Learning curve.
#    shuffle=True (with a fixed seed) makes every reduced training set a
#    random sample of the fold's training split. Without it learning_curve
#    takes prefixes of the training indices, i.e. the leading rows of the
#    table, which is not a representative subset if the rows are ordered.
train_sizes, train_scores, val_scores = learning_curve(
    estimator=model,
    X=X,
    y=y,
    cv=cv,
    scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10),
    n_jobs=-1,
    shuffle=True,
    random_state=42
)

# Average over the CV folds for every training size.
train_scores_mean = np.mean(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)

plt.figure(figsize=(8, 5))
plt.plot(train_sizes, train_scores_mean, 'o-', label='Entrenamiento', color='blue')
plt.plot(train_sizes, val_scores_mean, 's--', label='Validación', color='green')
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('Accuracy')
plt.title('Curva de aprendizaje - SVM (kernel RBF)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_validate, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    make_scorer, accuracy_score, f1_score, recall_score, precision_score,
    cohen_kappa_score, matthews_corrcoef, confusion_matrix
)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Feature matrix and target.
#    The label and the two identifier columns are removed from the features;
#    the label is cast to str so every class is handled as a category.
X = summary_by_relabeled_200.drop(columns=['relabeled', 'subject', 're_repetition'])
y = summary_by_relabeled_200['relabeled'].astype(str)

# Sorted class names (lexicographic, since the labels are strings); reused as
# row/column order and tick labels of every confusion matrix below.
class_labels = np.unique(y)

# 2. Pipeline: standardise the features, then fit an RBF-kernel SVM with
#    explicit C and gamma. Only predict() is used in this cell;
#    probability=True is kept from the original configuration.
model = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=0.5, gamma=0.01, probability=True, random_state=42)
)


# 3. Stratified 7-fold cross-validation with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# 4. Metrics: built-in scorer names plus two wrapped metric functions.
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'recall_macro': 'recall_macro',
    'precision_macro': 'precision_macro',
    'kappa': make_scorer(cohen_kappa_score),
    'mcc': make_scorer(matthews_corrcoef),
}

# 5. Cross-validated evaluation; only the test-side entries are reported.
results = cross_validate(
    model, X, y, cv=cv, scoring=scoring, return_train_score=False
)

print("✅ SVM - Métricas promedio en validación cruzada (7 folds):")
for metric, values in results.items():
    if "test" in metric:
        print(f"{metric.replace('test_', '').upper():<15}: {np.mean(values):.4f}")

# 6. Per-fold and global confusion matrices. The pipeline is refitted on
#    every training split and the held-out predictions are pooled.
all_y_true = []
all_y_pred = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    all_y_true.extend(y_val)
    all_y_pred.extend(y_pred)

    # Confusion matrix of this fold
    cm = confusion_matrix(y_val, y_pred, labels=class_labels)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Matriz de confusión - Fold {fold}')
    plt.xlabel("Predicho")
    plt.ylabel("Verdadero")
    plt.tight_layout()
    plt.show()

# Global confusion matrix over the pooled held-out predictions
cm_total = confusion_matrix(all_y_true, all_y_pred, labels=class_labels)
plt.figure(figsize=(7, 6))
sns.heatmap(cm_total, annot=True, fmt="d", cmap="Greens", xticklabels=class_labels, yticklabels=class_labels)
plt.title('Matriz de confusión global (todos los folds)')
plt.xlabel("Predicho")
plt.ylabel("Verdadero")
plt.tight_layout()
plt.show()

# 7. Learning curve.
#    shuffle=True (with a fixed seed) makes every reduced training set a
#    random sample of the fold's training split. Without it learning_curve
#    takes prefixes of the training indices, i.e. the leading rows of the
#    table, which is not a representative subset if the rows are ordered.
train_sizes, train_scores, val_scores = learning_curve(
    estimator=model,
    X=X,
    y=y,
    cv=cv,
    scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10),
    n_jobs=-1,
    shuffle=True,
    random_state=42
)

# Average over the CV folds for every training size.
train_scores_mean = np.mean(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)

plt.figure(figsize=(8, 5))
plt.plot(train_sizes, train_scores_mean, 'o-', label='Entrenamiento', color='blue')
plt.plot(train_sizes, val_scores_mean, 's--', label='Validación', color='green')
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('Accuracy')
plt.title('Curva de aprendizaje - SVM (kernel RBF)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

- KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    make_scorer, accuracy_score, f1_score, recall_score, precision_score,
    cohen_kappa_score, matthews_corrcoef, confusion_matrix
)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Feature matrix and target.
#    The label and the two identifier columns are removed from the features;
#    the label is cast to str so every class is handled as a category.
X = summary_by_relabeled_200.drop(columns=['relabeled', 'subject', 're_repetition'])
y = summary_by_relabeled_200['relabeled'].astype(str)

# Sorted class names (lexicographic, since the labels are strings); reused as
# row/column order and tick labels of every confusion matrix below.
class_labels = np.unique(y)

# 2. Pipeline: standardise the features, then classify with 5 nearest neighbours.
model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=5)
)

# 3. Stratified 7-fold cross-validation with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# 4. Metrics: built-in scorer names plus two wrapped metric functions.
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'recall_macro': 'recall_macro',
    'precision_macro': 'precision_macro',
    'kappa': make_scorer(cohen_kappa_score),
    'mcc': make_scorer(matthews_corrcoef),
}

# 5. Cross-validated evaluation; only the test-side entries are reported.
results = cross_validate(
    model, X, y, cv=cv, scoring=scoring, return_train_score=False
)

print("✅ KNN - Métricas promedio en validación cruzada (7 folds):")
for metric, values in results.items():
    if "test" in metric:
        print(f"{metric.replace('test_', '').upper():<15}: {np.mean(values):.4f}")

# 6. Per-fold and global confusion matrices. The pipeline is refitted on
#    every training split and the held-out predictions are pooled.
all_y_true = []
all_y_pred = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    all_y_true.extend(y_val)
    all_y_pred.extend(y_pred)

    # Confusion matrix of this fold
    cm = confusion_matrix(y_val, y_pred, labels=class_labels)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Matriz de confusión - Fold {fold}')
    plt.xlabel("Predicho")
    plt.ylabel("Verdadero")
    plt.tight_layout()
    plt.show()

# Global confusion matrix over the pooled held-out predictions
cm_total = confusion_matrix(all_y_true, all_y_pred, labels=class_labels)
plt.figure(figsize=(7, 6))
sns.heatmap(cm_total, annot=True, fmt="d", cmap="Greens", xticklabels=class_labels, yticklabels=class_labels)
plt.title('Matriz de confusión global (todos los folds)')
plt.xlabel("Predicho")
plt.ylabel("Verdadero")
plt.tight_layout()
plt.show()

# 7. Learning curve (accuracy vs. training-set size)
train_sizes, train_scores, test_scores = learning_curve(
    model, X, y, cv=cv, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10), shuffle=True, random_state=42
)

# Mean and standard deviation across the CV folds for every training size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# The curve is scored with scoring='accuracy', so it is labelled "Accuracy";
# precision is a different metric that this cell reports separately.
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Accuracy Entrenamiento')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2, color='blue')

plt.plot(train_sizes, test_mean, 'o-', color='green', label='Accuracy Validación')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.2, color='green')

plt.title('Curva de aprendizaje - KNN')
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    make_scorer, accuracy_score, f1_score, recall_score, precision_score,
    cohen_kappa_score, matthews_corrcoef, confusion_matrix
)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Feature matrix and target.
#    The label and the two identifier columns are removed from the features;
#    the label is cast to str so every class is handled as a category.
X = summary_by_relabeled_200.drop(columns=['relabeled', 'subject', 're_repetition'])
y = summary_by_relabeled_200['relabeled'].astype(str)

# Sorted class names (lexicographic, since the labels are strings); reused as
# row/column order and tick labels of every confusion matrix below.
class_labels = np.unique(y)

# 2. Pipeline: standardise the features, then classify with 3 nearest neighbours.
model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=3)
)

# 3. Stratified 7-fold cross-validation with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# 4. Metrics: built-in scorer names plus two wrapped metric functions.
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'recall_macro': 'recall_macro',
    'precision_macro': 'precision_macro',
    'kappa': make_scorer(cohen_kappa_score),
    'mcc': make_scorer(matthews_corrcoef),
}

# 5. Cross-validated evaluation; only the test-side entries are reported.
results = cross_validate(
    model, X, y, cv=cv, scoring=scoring, return_train_score=False
)

print("✅ KNN - Métricas promedio en validación cruzada (7 folds):")
for metric, values in results.items():
    if "test" in metric:
        print(f"{metric.replace('test_', '').upper():<15}: {np.mean(values):.4f}")

# 6. Per-fold and global confusion matrices. The pipeline is refitted on
#    every training split and the held-out predictions are pooled.
all_y_true = []
all_y_pred = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    all_y_true.extend(y_val)
    all_y_pred.extend(y_pred)

    # Confusion matrix of this fold
    cm = confusion_matrix(y_val, y_pred, labels=class_labels)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Matriz de confusión - Fold {fold}')
    plt.xlabel("Predicho")
    plt.ylabel("Verdadero")
    plt.tight_layout()
    plt.show()

# Global confusion matrix over the pooled held-out predictions
cm_total = confusion_matrix(all_y_true, all_y_pred, labels=class_labels)
plt.figure(figsize=(7, 6))
sns.heatmap(cm_total, annot=True, fmt="d", cmap="Greens", xticklabels=class_labels, yticklabels=class_labels)
plt.title('Matriz de confusión global (todos los folds)')
plt.xlabel("Predicho")
plt.ylabel("Verdadero")
plt.tight_layout()
plt.show()

# 7. Learning curve (accuracy vs. training-set size)
train_sizes, train_scores, test_scores = learning_curve(
    model, X, y, cv=cv, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10), shuffle=True, random_state=42
)

# Mean and standard deviation across the CV folds for every training size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# The curve is scored with scoring='accuracy', so it is labelled "Accuracy";
# precision is a different metric that this cell reports separately.
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Accuracy Entrenamiento')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2, color='blue')

plt.plot(train_sizes, test_mean, 'o-', color='green', label='Accuracy Validación')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.2, color='green')

plt.title('Curva de aprendizaje - KNN')
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.grid(True)
plt.tight_layout()
plt.show()


- RNA (Red Neuronal Artificial, MLPClassifier)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, matthews_corrcoef,
    confusion_matrix
)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Feature matrix and target.
#    The label and the two identifier columns are removed from the features;
#    the label is cast to str so every class is handled as a category.
X = summary_by_relabeled_200.drop(columns=['relabeled', 'subject', 're_repetition'])
y = summary_by_relabeled_200['relabeled'].astype(str)

# Sorted class names (lexicographic, since the labels are strings); reused as
# row/column order and tick labels of every confusion matrix below.
class_labels = np.unique(y)

# 2. Pipeline: standardise the features, then fit an MLP with a single hidden
#    layer of 100 units.
model = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
)

# 3. Stratified 7-fold cross-validation with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# 4. Metrics: built-in scorer names plus two wrapped metric functions.
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'recall_macro': 'recall_macro',
    'precision_macro': 'precision_macro',
    'kappa': make_scorer(cohen_kappa_score),
    'mcc': make_scorer(matthews_corrcoef),
}

# 5. Cross-validated evaluation; only the test-side entries are reported.
results = cross_validate(
    model, X, y, cv=cv, scoring=scoring, return_train_score=False
)

print("✅ RNA (MLPClassifier) - Métricas promedio en validación cruzada (7 folds):")
for metric, values in results.items():
    if "test" in metric:
        print(f"{metric.replace('test_', '').upper():<15}: {np.mean(values):.4f}")

# 6. Per-fold and global confusion matrices. The pipeline is refitted on
#    every training split and the held-out predictions are pooled.
all_y_true = []
all_y_pred = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    all_y_true.extend(y_val)
    all_y_pred.extend(y_pred)

    # Confusion matrix of this fold
    cm = confusion_matrix(y_val, y_pred, labels=class_labels)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Matriz de confusión - Fold {fold}')
    plt.xlabel("Predicho")
    plt.ylabel("Verdadero")
    plt.tight_layout()
    plt.show()

# Global confusion matrix over the pooled held-out predictions
cm_total = confusion_matrix(all_y_true, all_y_pred, labels=class_labels)
plt.figure(figsize=(7, 6))
sns.heatmap(cm_total, annot=True, fmt="d", cmap="Greens", xticklabels=class_labels, yticklabels=class_labels)
plt.title('Matriz de confusión global (todos los folds)')
plt.xlabel("Predicho")
plt.ylabel("Verdadero")
plt.tight_layout()
plt.show()

# 7. Learning curve (accuracy vs. training-set size)
train_sizes, train_scores, test_scores = learning_curve(
    model, X, y, cv=cv, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10), shuffle=True, random_state=42
)

# Mean and standard deviation across the CV folds for every training size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# The curve is scored with scoring='accuracy', so it is labelled "Accuracy";
# precision is a different metric that this cell reports separately.
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Accuracy Entrenamiento')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2, color='blue')

plt.plot(train_sizes, test_mean, 'o-', color='green', label='Accuracy Validación')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.2, color='green')

plt.title('Curva de aprendizaje - MLPClassifier')
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.grid(True)
plt.tight_layout()
plt.show()



In [None]:
from sklearn.neural_network import MLPClassifier
# learning_curve and the f1 / recall / precision scorers are used further down
# in this cell, so they are imported here instead of relying on an earlier
# cell having been executed first.
from sklearn.model_selection import StratifiedKFold, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, matthews_corrcoef,
    confusion_matrix, accuracy_score,
    f1_score, recall_score, precision_score
)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Feature matrix and target.
#    The label and the two identifier columns are removed from the features;
#    the label is cast to str so every class is handled as a category.
X = summary_by_relabeled_200.drop(columns=['relabeled', 'subject', 're_repetition'])
y = summary_by_relabeled_200['relabeled'].astype(str)

# Sorted class names (lexicographic, since the labels are strings); reused as
# row/column order and tick labels of every confusion matrix below.
class_labels = np.unique(y)

# 2. Stratified 7-fold cross-validation with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# 3. Per-fold confusion matrices; held-out predictions are pooled for the
#    global matrix and the global metrics.
all_y_true = []
all_y_pred = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    print(f"\n🔁 Fold {fold}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # 4. A fresh MLP for every fold.
    #    NOTE(review): early stopping is switched OFF here, so
    #    validation_fraction has no effect and n_iter_no_change=3 acts on the
    #    training loss only. The original comment announced early stopping;
    #    set early_stopping=True if that is what was intended.
    mlp = MLPClassifier(
        hidden_layer_sizes=(100,),
        max_iter=500,
        early_stopping=False,
        validation_fraction=0.1,
        n_iter_no_change=3,
        random_state=42
    )
    model = make_pipeline(StandardScaler(), mlp)
    model.fit(X_train, y_train)

    # 5. Predict the held-out split
    y_pred = model.predict(X_val)
    all_y_true.extend(y_val)
    all_y_pred.extend(y_pred)

    # 6. Confusion matrix of this fold
    cm = confusion_matrix(y_val, y_pred, labels=class_labels)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Matriz de confusión - Fold {fold}')
    plt.xlabel("Predicho")
    plt.ylabel("Verdadero")
    plt.tight_layout()
    plt.show()

# 7. Global confusion matrix over the pooled held-out predictions
cm_total = confusion_matrix(all_y_true, all_y_pred, labels=class_labels)
plt.figure(figsize=(7, 6))
sns.heatmap(cm_total, annot=True, fmt="d", cmap="Greens", xticklabels=class_labels, yticklabels=class_labels)
plt.title('Matriz de confusión global (todos los folds)')
plt.xlabel("Predicho")
plt.ylabel("Verdadero")
plt.tight_layout()
plt.show()

# 8. Learning curve (accuracy vs. training-set size).
#    `model` is the pipeline built for the last fold; it is used here as the
#    estimator configuration for learning_curve.
train_sizes, train_scores, test_scores = learning_curve(
    model, X, y, cv=cv, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10), shuffle=True, random_state=42
)

# Mean and standard deviation across the CV folds for every training size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# The curve is scored with scoring='accuracy', so it is labelled "Accuracy";
# precision is a different metric, reported separately below.
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Accuracy Entrenamiento')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2, color='blue')

plt.plot(train_sizes, test_mean, 'o-', color='green', label='Accuracy Validación')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.2, color='green')

plt.title('Curva de aprendizaje - MLPClassifier')
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.grid(True)
plt.tight_layout()
plt.show()

# 9. Global metrics on the pooled held-out predictions;
#    F1, recall and precision are macro-averaged over the classes.
accuracy = accuracy_score(all_y_true, all_y_pred)
f1 = f1_score(all_y_true, all_y_pred, average='macro')
recall = recall_score(all_y_true, all_y_pred, average='macro')
precision = precision_score(all_y_true, all_y_pred, average='macro')
kappa = cohen_kappa_score(all_y_true, all_y_pred)
mcc = matthews_corrcoef(all_y_true, all_y_pred)

# Print the results
print("\n📊 Métricas globales (macro):")
print(f"Accuracy          : {accuracy:.4f}")
print(f"F1 Score (macro)  : {f1:.4f}")
print(f"Recall (macro)    : {recall:.4f}")
print(f"Precision (macro) : {precision:.4f}")
print(f"Cohen's Kappa     : {kappa:.4f}")
print(f"Matthews CorrCoef : {mcc:.4f}")
