In [1]:
import os
import sys
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split

import sys
sys.path.append(os.path.abspath(os.path.join('..')))

In [None]:
import wandb
# Open the Weights & Biases run that the logging cells at the end of the
# notebook write to. n_splits is recorded as run metadata only.
run_config = {"n_splits": 7}
wandb.init(
    project="Final-Project-TimeSeries-UTEC",
    name="RandomForest-temp-spec",
    config=run_config,
)

# Models

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# definir clases para random forest, KNN, SVM para clasificación y ser llamado en models/train_classic.py
class RandomForestModel:
    """Thin wrapper around scikit-learn's ``RandomForestClassifier``.

    Keeps the constructor arguments as attributes so the wrapper can be
    re-created from ``get_params()``.

    Parameters
    ----------
    n_estimators : int, default 100
        Forwarded to ``RandomForestClassifier``.
    random_state : int or None, default None
        Forwarded to ``RandomForestClassifier``.
    """

    def __init__(self, n_estimators=100, random_state=None):
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    def fit(self, X, y):
        """Fit the wrapped classifier on ``X``/``y`` and return ``self``.

        Returning ``self`` follows the scikit-learn estimator convention and
        allows chaining such as ``model.fit(X, y).predict(X)``.
        """
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped classifier's predicted labels for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities for ``X``."""
        return self.model.predict_proba(X)

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict.

        ``deep`` is accepted for signature compatibility and is not used.
        """
        return {"n_estimators": self.n_estimators,
                "random_state": self.random_state}

class KNNModel:
    """Thin wrapper around scikit-learn's ``KNeighborsClassifier``.

    Parameters
    ----------
    n_neighbors : int, default 6
        Forwarded to ``KNeighborsClassifier``.
    """

    def __init__(self, n_neighbors=6):
        self.n_neighbors = n_neighbors
        self.model = KNeighborsClassifier(n_neighbors=n_neighbors)

    def fit(self, X, y):
        """Fit the wrapped classifier on ``X``/``y`` and return ``self``.

        Returning ``self`` follows the scikit-learn estimator convention and
        allows chaining such as ``model.fit(X, y).predict(X)``.
        """
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped classifier's predicted labels for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities for ``X``."""
        return self.model.predict_proba(X)

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict.

        ``deep`` is accepted for signature compatibility and is not used.
        """
        return {'n_neighbors': self.n_neighbors}

class SVMModel:
    """Thin wrapper around scikit-learn's ``SVC``.

    The wrapped ``SVC`` is always created with ``random_state=42``.

    Parameters
    ----------
    kernel : str, default 'linear'
        Forwarded to ``SVC``.
    C : float, default 1.0
        Forwarded to ``SVC``.
    probability : bool, default True
        Forwarded to ``SVC``; must be true for ``predict_proba`` to work.
    """

    def __init__(self, kernel='linear', C=1.0, probability=True):
        self.kernel = kernel
        self.C = C
        self.probability = probability
        self.model = SVC(kernel=kernel, C=C, probability=probability, random_state=42)

    def fit(self, X, y):
        """Fit the wrapped classifier on ``X``/``y`` and return ``self``.

        Returning ``self`` follows the scikit-learn estimator convention and
        allows chaining such as ``model.fit(X, y).predict(X)``.
        """
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped classifier's predicted labels for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities for ``X``.

        Raises
        ------
        RuntimeError
            If the wrapper was created with ``probability=False``.
        """
        if not self.probability:
            raise RuntimeError("SVM must be initialized with probability=True to use predict_proba")
        return self.model.predict_proba(X)

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict.

        ``deep`` is accepted for signature compatibility and is not used.
        """
        return {"kernel": self.kernel, "C": self.C, "probability": self.probability}




# Utils

In [3]:
import wandb
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, balanced_accuracy_score, accuracy_score
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_validate

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

from copy import deepcopy
from sklearn.base import clone as sk_clone

import time

In [4]:
def clone_custom_model(model):
    """Return a fresh copy of ``model``.

    The notebook's wrapper classes (RandomForestModel, KNNModel, SVMModel)
    are rebuilt from their ``get_params()`` dict; any other object, including
    objects without ``get_params``, is deep-copied.
    """
    if hasattr(model, 'get_params'):
        # Read the parameters first; they are only used when the model is
        # one of the known wrapper types.
        ctor_kwargs = model.get_params()
        for wrapper_cls in (RandomForestModel, KNNModel, SVMModel):
            if isinstance(model, wrapper_cls):
                return wrapper_cls(**ctor_kwargs)
    # Unrecognised model type: fall back to a deep copy.
    return deepcopy(model)

In [26]:

def kfold_trad_models(models, X, y, n_splits=7):
    """Cross-validate several classifiers and summarise their mean fold metrics.

    Every model is evaluated on the same shuffled ``KFold`` partition
    (``random_state=42``). In each fold a deep copy of the model is fitted on
    the training part and scored on the held-out part.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Display name plus an object exposing ``fit``, ``predict`` and
        ``predict_proba``.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Class labels aligned with ``X`` (``.unique()`` and ``.iloc`` are used).
    n_splits : int, default 7
        Number of folds.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Modelo``, ``F1-score``
        (weighted), ``Balanced accuracy``, ``Accuracy``, ``ROC-AUC``,
        ``Tiempo promedio (s)`` and ``Tiempo total (s)``. The time columns
        cover fitting and prediction only, not metric computation.

    Raises
    ------
    AttributeError, RuntimeError
        Re-raised from ``predict_proba`` when the model has no wrapped
        estimator with a ``probability`` switch to fall back on.
    """
    results = []
    classes = sorted(y.unique())

    # The splitter is seeded, so one instance yields the same folds for every
    # model and there is no need to rebuild it per model.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for model_name, model in models:
        # Per-fold metrics collected for this model.
        f1_scores = []
        bal_acc_scores = []
        acc_scores = []
        roc_auc_scores = []
        fold_times = []

        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Work on a copy so the caller's model object is never fitted.
            current_model = deepcopy(model)

            # Time fitting and prediction (labels and probabilities) only.
            start_time = time.perf_counter()

            current_model.fit(X_train, y_train)
            y_pred = current_model.predict(X_test)

            # ROC-AUC needs class probabilities. SVMModel signals disabled
            # probabilities with RuntimeError, a bare estimator with
            # AttributeError; both lead to the same fallback.
            try:
                y_prob = current_model.predict_proba(X_test)
            except (AttributeError, RuntimeError):
                inner = getattr(current_model, 'model', None)
                if inner is not None and hasattr(inner, 'probability'):
                    # Enable probabilities on the wrapped estimator and refit.
                    inner.probability = True
                    current_model.fit(X_train, y_train)
                    y_prob = inner.predict_proba(X_test)
                else:
                    raise

            fold_times.append(time.perf_counter() - start_time)

            # Classification metrics on the held-out fold.
            f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
            bal_acc_scores.append(balanced_accuracy_score(y_test, y_pred))
            acc_scores.append(accuracy_score(y_test, y_pred))

            # ROC-AUC: one-vs-rest on the binarised labels for more than two
            # classes, otherwise the probability of the second class.
            if len(classes) > 2:
                y_test_bin = label_binarize(y_test, classes=classes)
                roc_auc_scores.append(roc_auc_score(y_test_bin, y_prob, multi_class='ovr'))
            else:
                roc_auc_scores.append(roc_auc_score(y_test, y_prob[:, 1]))

        # Aggregate the folds into one summary row for this model.
        results.append({
            'Modelo': model_name,
            'F1-score': np.mean(f1_scores),
            'Balanced accuracy': np.mean(bal_acc_scores),
            'Accuracy': np.mean(acc_scores),
            'ROC-AUC': np.mean(roc_auc_scores),
            'Tiempo promedio (s)': np.mean(fold_times),
            'Tiempo total (s)': np.sum(fold_times)
        })

    return pd.DataFrame(results)

# Train_rf

## Kfold

In [27]:
# Resolve project folders relative to the parent of the current working
# directory. All paths go through os.path.join rather than '/' concatenation.
project_root = os.path.dirname(os.getcwd())
data_path = os.path.join(project_root, 'data')
processed_path = os.path.join(data_path, 'processed')

save_models_path = os.path.join(project_root, 'models', 'save_models')
save_results_path = os.path.join(project_root, 'models', 'results')

# Load the pre-computed temporal and spectral feature tables.
features_temporal = pd.read_csv(os.path.join(processed_path, 'features_temporales_labelNum_overlap50.csv'))
features_espectrales = pd.read_csv(os.path.join(processed_path, 'features_espectrales_labelNum_overlap50.csv'))

# The last column is used as the target. The first column is skipped
# (presumably an index or identifier column -- TODO confirm against the CSVs).
X_temp = features_temporal.iloc[:, 1:-1]
y_temp = features_temporal.iloc[:, -1]
X_spec = features_espectrales.iloc[:, 1:-1]
y_spec = features_espectrales.iloc[:, -1]

# Hold out 20% of each feature set for the final test evaluation.
X_temp_train, X_temp_test, y_temp_train, y_temp_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)
X_spec_train, X_spec_test, y_spec_train, y_spec_test = train_test_split(X_spec, y_spec, test_size=0.2, random_state=42)

In [None]:
# Models to evaluate, as (display name, wrapper instance) pairs.
# Only the SVM is active; the other candidates are kept for reference:
#   ("RandomForest", RandomForestModel(n_estimators=100, random_state=42))
#   ("KNN", KNNModel(n_neighbors=6))
models = []
models.append(("SVM", SVMModel(kernel='rbf', C=1.0, probability=True)))

In [None]:
# Original author's note on these two paths: "mal perfom en .py"
# (roughly "performs badly in the .py version"; exact meaning unclear).
save_models_path = os.path.join(project_root, 'models', 'save_models')
save_results_path = os.path.join(project_root, 'models', 'results')
os.makedirs(save_models_path, exist_ok=True)
os.makedirs(save_results_path, exist_ok=True)

# Original author's note: probably not necessary. The cells that save results
# already cope with a missing CSV.

# Column layout produced by kfold_trad_models.
cv_columns = ["Modelo", "F1-score", "Balanced accuracy", "Accuracy", "ROC-AUC",
              "Tiempo promedio (s)", "Tiempo total (s)"]
# Column layout produced by the test-evaluation cell. It differs from the
# cross-validation layout, so the placeholder file must use its own header;
# otherwise later appends pick up a spurious empty "Tiempo promedio (s)" column.
test_columns = ["Modelo", "F1-score", "Balanced accuracy", "Accuracy", "ROC-AUC",
                "Tiempo entrenamiento (s)", "Tiempo predicción (s)", "Tiempo total (s)", "Archivo"]

results_cv = pd.DataFrame(columns=cv_columns)
results_test = pd.DataFrame(columns=test_columns)

# Create header-only CSV files the first time, never overwriting existing results.
for csv_name, empty_frame in (
    ('resultados_cv_temporales.csv', results_cv),
    ('resultados_test_temporales.csv', results_test),
):
    csv_path = os.path.join(save_results_path, csv_name)
    if not os.path.isfile(csv_path):
        empty_frame.to_csv(csv_path, index=False)


In [40]:
#rf_model = ("RandomForest", RandomForestModel(n_estimators=100, random_state=42))

# Cross-validate every configured model on both feature sets. kfold_spec must
# be computed here: the display, CSV-saving and wandb cells further down read
# it, and a fresh-kernel run fails with NameError without it.
kfold_temp = kfold_trad_models(models, X_temp_train, y_temp_train)
kfold_spec = kfold_trad_models(models, X_spec_train, y_spec_train)

In [41]:
# Display the cross-validation summary computed on the temporal features.
kfold_temp

Unnamed: 0,Modelo,F1-score,Balanced accuracy,Accuracy,ROC-AUC,Tiempo promedio (s),Tiempo total (s)
0,KNN,0.910487,0.833977,0.912793,0.972734,0.128903,0.902321


In [32]:
# Display the cross-validation summary computed on the spectral features.
kfold_spec

Unnamed: 0,Modelo,F1-score,Balanced accuracy,Accuracy,ROC-AUC,Tiempo promedio (s),Tiempo total (s)
0,RandomForest,0.924124,0.884841,0.927509,0.993042,20.375176,142.62623


In [35]:
# Append this run's K-fold results to the per-feature-set CSV history.
# kfold_temp / kfold_spec are deliberately left untouched: rebinding them to
# the accumulated history made re-running this cell compound duplicate rows
# and made later cells see rows from earlier runs.
csv_cv_temp = os.path.join(save_results_path, 'resultados_cv_temporales.csv')
csv_cv_spec = os.path.join(save_results_path, 'resultados_cv_espectrales.csv')
os.makedirs(save_results_path, exist_ok=True)

for csv_path, run_results in ((csv_cv_temp, kfold_temp), (csv_cv_spec, kfold_spec)):
    if os.path.isfile(csv_path):
        # Existing history: keep it and add the new rows at the end.
        history = pd.concat([pd.read_csv(csv_path), run_results], ignore_index=True)
    else:
        history = run_results
    history.to_csv(csv_path, index=False)

## Training-test

In [36]:
def train_and_test_model(model, X_train, y_train, X_test, y_test, model_name, dataset_name, save_dir=None):
    """Fit ``model``, evaluate it on the hold-out set and persist it with joblib.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and, ideally, ``predict_proba``.
        It is fitted in place.
    X_train, y_train : training features and labels
        ``y_train.unique()`` defines the class list used for ROC-AUC.
    X_test, y_test : hold-out features and labels
    model_name, dataset_name : str
        Lower-cased and joined into the pickle file name
        ``"<model>_<dataset>.pkl"``.
    save_dir : str or None, default None
        Directory for the pickled model. ``None`` uses the notebook-level
        ``save_models_path``. The directory is created if it does not exist.

    Returns
    -------
    dict
        ``F1-score`` (weighted), ``Balanced accuracy``, ``Accuracy``,
        ``ROC-AUC`` (``None`` when no probabilities are available),
        ``Tiempo entrenamiento (s)``, ``Tiempo predicción (s)``,
        ``Tiempo total (s)`` and ``Archivo`` (the pickle file name).
    """
    # Overall wall-clock time for fit, prediction and ROC-AUC.
    start_time = time.perf_counter()

    # Training time.
    train_start = time.perf_counter()
    model.fit(X_train, y_train)
    train_end = time.perf_counter()

    # Prediction time covers labels, label binarisation and probabilities.
    pred_start = time.perf_counter()
    preds = model.predict(X_test)

    classes = sorted(y_train.unique())
    y_test_bin = label_binarize(y_test, classes=classes)

    # SVMModel signals disabled probabilities with RuntimeError, a bare
    # estimator with AttributeError; both lead to the same fallback.
    try:
        probas = model.predict_proba(X_test)
    except (AttributeError, RuntimeError):
        inner = getattr(model, 'model', None)
        if inner is not None and hasattr(inner, 'probability'):
            # Enable probabilities on the wrapped estimator and refit.
            inner.probability = True
            model.fit(X_train, y_train)
            probas = inner.predict_proba(X_test)
        else:
            probas = None

    pred_end = time.perf_counter()

    # ROC-AUC: one-vs-rest on the binarised labels for more than two classes,
    # otherwise the probability of the second class.
    roc_auc = None
    if probas is not None:
        if len(classes) > 2:
            roc_auc = roc_auc_score(y_test_bin, probas, multi_class='ovr')
        else:
            roc_auc = roc_auc_score(y_test, probas[:, 1])

    end_time = time.perf_counter()

    # Persist the fitted model, creating the target directory on demand so the
    # function does not depend on another cell having created it.
    target_dir = save_models_path if save_dir is None else save_dir
    os.makedirs(target_dir, exist_ok=True)
    filename = f"{model_name.lower()}_{dataset_name.lower()}.pkl"
    joblib.dump(model, os.path.join(target_dir, filename))

    return {
        'F1-score': f1_score(y_test, preds, average='weighted'),
        'Balanced accuracy': balanced_accuracy_score(y_test, preds),
        'Accuracy': accuracy_score(y_test, preds),
        'ROC-AUC': roc_auc,
        'Tiempo entrenamiento (s)': train_end - train_start,
        'Tiempo predicción (s)': pred_end - pred_start,
        'Tiempo total (s)': end_time - start_time,
        'Archivo': filename
    }

In [37]:
test_results_temp = []
test_results_spec = []

# One entry per feature set:
# (heading template, dataset tag, (X_train, y_train, X_test, y_test), result list).
evaluation_plan = [
    ("{} - Test Results (Temporales):", "features_temporales",
     (X_temp_train, y_temp_train, X_temp_test, y_temp_test), test_results_temp),
    ("\n{} - Test Results (Espectrales):", "features_espectrales",
     (X_spec_train, y_spec_train, X_spec_test, y_spec_test), test_results_spec),
]

for model_name, model in models:
    for heading, dataset_tag, split, collected in evaluation_plan:
        # Fresh copy per feature set so each fit starts from the unfitted model.
        current_model = deepcopy(model)
        metrics = train_and_test_model(current_model, *split, model_name, dataset_tag)
        collected.append({'Modelo': model_name, **metrics})
        print(heading.format(model_name))
        # The file name is not a metric, so it is left out of the printed summary.
        printable = {key: value for key, value in metrics.items() if key != 'Archivo'}
        print(pd.Series(printable))
    print("\n" + "="*80 + "\n")

# Column order used for presentation.
column_order = ['Modelo', 'F1-score', 'Balanced accuracy', 'Accuracy', 'ROC-AUC',
                'Tiempo entrenamiento (s)', 'Tiempo predicción (s)', 'Tiempo total (s)', 'Archivo']

# Convert the collected rows to DataFrames in that column order.
df_test_temp = pd.DataFrame(test_results_temp)[column_order]
df_test_spec = pd.DataFrame(test_results_spec)[column_order]

RandomForest - Test Results (Temporales):
F1-score                    0.958805
Balanced accuracy           0.934855
Accuracy                    0.959790
ROC-AUC                     0.997051
Tiempo entrenamiento (s)    5.768524
Tiempo predicción (s)       0.080081
Tiempo total (s)            5.852917
dtype: float64

RandomForest - Test Results (Espectrales):
F1-score                     0.937376
Balanced accuracy            0.904040
Accuracy                     0.939977
ROC-AUC                      0.994127
Tiempo entrenamiento (s)    24.673821
Tiempo predicción (s)        0.143382
Tiempo total (s)            24.821681
dtype: float64




In [38]:
# Append this run's hold-out test results to the per-feature-set CSV history.
# df_test_temp / df_test_spec are deliberately left untouched: rebinding them
# to the accumulated history made re-running this cell compound duplicate rows
# and made later cells see rows from earlier runs.
csv_test_temp = os.path.join(save_results_path, 'resultados_test_temporales.csv')
csv_test_spec = os.path.join(save_results_path, 'resultados_test_espectrales.csv')
os.makedirs(save_results_path, exist_ok=True)

for csv_path, run_results in ((csv_test_temp, df_test_temp), (csv_test_spec, df_test_spec)):
    if os.path.isfile(csv_path):
        # Existing history: keep it and add the new rows at the end.
        history = pd.concat([pd.read_csv(csv_path), run_results], ignore_index=True)
    else:
        history = run_results
    history.to_csv(csv_path, index=False)

## wandb

In [None]:
# (wandb key, DataFrame column) pairs shared by every logged row.
metric_fields = (
    ("F1-score", "F1-score"),
    ("Balanced_accuracy", "Balanced accuracy"),
    ("Accuracy", "Accuracy"),
    ("ROC-AUC", "ROC-AUC"),
)
# Timing columns differ between the cross-validation and test tables.
cv_timing_fields = (
    ("Tiempo_promedio(s)", "Tiempo promedio (s)"),
    ("Tiempo_total(s)", "Tiempo total (s)"),
)
test_timing_fields = (
    ("Tiempo_entrenamiento(s)", "Tiempo entrenamiento (s)"),
    ("Tiempo_predicción(s)", "Tiempo predicción (s)"),
    ("Tiempo_total(s)", "Tiempo total (s)"),
)


def _log_result_rows(frame, dataset, stage, timing_fields):
    """Send one wandb.log call per row of ``frame``, tagged with dataset and stage."""
    for _, row in frame.iterrows():
        entry = {"Modelo": row["Modelo"], "Dataset": dataset, "Tipo": stage}
        for log_key, column in (*metric_fields, *timing_fields):
            entry[log_key] = row[column]
        wandb.log(entry)


# 1. K-fold results (after the cross-validation cells).
_log_result_rows(kfold_temp, "temporales", "validación_cruzada", cv_timing_fields)
_log_result_rows(kfold_spec, "espectrales", "validación_cruzada", cv_timing_fields)

# 2. Hold-out test results (after the test-evaluation cell).
_log_result_rows(df_test_temp, "temporales", "evaluación_test", test_timing_fields)
_log_result_rows(df_test_spec, "espectrales", "evaluación_test", test_timing_fields)

In [None]:
# Optional: upload the four result tables as wandb summary tables, then close the run.
summary_frames = {
    "Resumen_KFold_Temporales": kfold_temp,
    "Resumen_KFold_Espectrales": kfold_spec,
    "Resumen_Test_Temporales": df_test_temp,
    "Resumen_Test_Espectrales": df_test_spec,
}
# Not logged at the moment:
#   "Curva_ROC": wandb.plot.roc_curve(y_true, y_probs, labels=class_names)
wandb.log({title: wandb.Table(dataframe=frame) for title, frame in summary_frames.items()})

wandb.finish()