# 🚀 Entrenamiento de Modelos Optimizados - Google Colab

**EcoPrint AI - Sistema de Predicción de Riesgo de Incendios Forestales**

## 🎯 Objetivo:
- Entrenar Random Forest y Extra Trees optimizados
- Alcanzar 95%+ accuracy (como en README_MODELOS.md)
- Usar paralelización de Colab para velocidad
- Exportar modelos para A/B Testing

## 📊 Modelos a entrenar:
- **Random Forest:** Target 95.41%
- **Extra Trees:** Target 95.5-96.0%
- **XGBoost:** ya entrenado previamente (97.07% ✅); no se reentrena en este notebook y solo se usa como referencia en la comparación


## 🔧 Configuración Inicial


In [1]:
# Instalar paquetes necesarios
%pip install ucimlrepo scikit-learn xgboost joblib pandas numpy matplotlib seaborn


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Importar librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import time
from datetime import datetime
import json

# Plot configuration: apply matplotlib's 'default' style sheet, then select
# seaborn's "husl" colour palette.
# NOTE(review): keep this order - presumably applying the style sheet after
# the palette would reset the palette; confirm before reordering.
plt.style.use('default')
sns.set_palette("husl")

# Confirmation message shown once the import cell has run to this point.
print("✅ Librerías importadas correctamente")


✅ Librerías importadas correctamente


## 📊 Cargar Datos


In [3]:
# Load the Forest Cover Type dataset through ucimlrepo (dataset id 31).
from ucimlrepo import fetch_ucirepo

print("📊 Cargando Forest Cover Type Dataset...")
covertype = fetch_ucirepo(id=31)

# Feature table and target(s) exactly as returned by ucimlrepo.
X = covertype.data.features
y = covertype.data.targets

# The cells below call Series methods on `y`, so collapse a one-column
# DataFrame to its first column. The check must be on the type: a Series also
# exposes `.iloc`, so the former `hasattr(y, 'iloc')` test would have let a
# 1-D target through and the two-axis indexer below would then raise.
if isinstance(y, pd.DataFrame):
    y = y.iloc[:, 0]

print(f"✅ Datos cargados: {X.shape[0]} muestras, {X.shape[1]} features")
print(f"✅ Clases: {sorted(y.unique())}")
print(f"✅ Distribución de clases:")
print(y.value_counts().sort_index())

# Quick overview of the feature table: column names, dtype counts and the
# total number of missing values.
print(f"\n📊 Información del dataset:")
print(f"   - Nombres de features: {list(X.columns)}")
print(f"   - Tipos de datos: {X.dtypes.value_counts()}")
print(f"   - Valores nulos: {X.isnull().sum().sum()}")


📊 Cargando Forest Cover Type Dataset...
✅ Datos cargados: 581012 muestras, 54 features
✅ Clases: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7)]
✅ Distribución de clases:
Cover_Type
1    211840
2    283301
3     35754
4      2747
5      9493
6     17367
7     20510
Name: count, dtype: int64

📊 Información del dataset:
   - Nombres de features: ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type2

## 🔄 Preprocesamiento de Datos


In [4]:
# Shift the class labels down by one so they start at 0 (the original notebook
# notes this matches the XGBoost model); the untouched labels stay available
# in `y_original`.
y_original = y.copy()
y = y_original - 1

zero_based_labels = sorted(y.unique())
print(f"✅ Clases convertidas: {zero_based_labels}")
print("✅ Distribución después de conversión:")
label_counts = y.value_counts().sort_index()
print(label_counts)

# Stratified 80/20 train/test split with a fixed seed for reproducibility.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train_rows, n_feature_cols = X_train.shape
n_test_rows = X_test.shape[0]
print("\n📊 División de datos:")
print(f"   - Entrenamiento: {n_train_rows} muestras")
print(f"   - Prueba: {n_test_rows} muestras")
print(f"   - Features: {n_feature_cols}")


✅ Clases convertidas: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]
✅ Distribución después de conversión:
Cover_Type
0    211840
1    283301
2     35754
3      2747
4      9493
5     17367
6     20510
Name: count, dtype: int64

📊 División de datos:
   - Entrenamiento: 464809 muestras
   - Prueba: 116203 muestras
   - Features: 54


## 🌲 Entrenamiento de Random Forest Optimizado


In [5]:
# OPTIMIZED RANDOM FOREST
# Grid-searches a RandomForestClassifier on the training split, then reports
# cross-validation accuracy, test accuracy and the train/test accuracy gap.
import math

print("🌲 ENTRENANDO RANDOM FOREST OPTIMIZADO...")
print("=" * 60)

# Hyper-parameter grid (per the original notebook, based on README_MODELOS.md).
rf_params = {
    'n_estimators': [200, 300, 500],
    'max_depth': [15, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

# Number of candidates = product of the option counts of every grid entry, so
# the figure stays correct if a hyper-parameter is added or removed.
rf_n_candidates = math.prod(len(options) for options in rf_params.values())
print(f"📊 Grid de parámetros: {rf_n_candidates} combinaciones")

# Grid search with 3-fold cross-validation (3 folds to keep the run time down).
# Parallelism is enabled at ONE level only: the forest builds its trees on all
# cores (n_jobs=-1) while the search evaluates candidates one after another
# (n_jobs=1). Requesting all cores at both levels oversubscribes the CPU and
# keeps several large forests alive at the same time. The selected model is
# unaffected because random_state is fixed.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    rf_params,
    cv=3,
    scoring='accuracy',
    n_jobs=1,
    verbose=1
)

# Run the search and time it (wall-clock seconds).
start_time = time.time()
rf_grid.fit(X_train, y_train)
rf_time = time.time() - start_time

# Best candidate found by the search.
rf_best = rf_grid.best_estimator_
rf_best_params = rf_grid.best_params_
rf_best_score = rf_grid.best_score_

print(f"✅ Random Forest entrenado en {rf_time:.1f} segundos")
print(f"📊 Mejor score CV: {rf_best_score:.4f} ({rf_best_score:.2%})")
print(f"📊 Mejores parámetros: {rf_best_params}")

# Accuracy on the held-out test split.
rf_pred = rf_best.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print(f"🎯 Accuracy en test: {rf_accuracy:.4f} ({rf_accuracy:.2%})")

# "Overfitting" here is the gap between training and test accuracy.
rf_train_pred = rf_best.predict(X_train)
rf_train_accuracy = accuracy_score(y_train, rf_train_pred)
rf_overfitting = rf_train_accuracy - rf_accuracy

print(f"📈 Training accuracy: {rf_train_accuracy:.4f} ({rf_train_accuracy:.2%})")
print(f"📉 Overfitting: {rf_overfitting:.4f} ({rf_overfitting:.2%})")

# Flag the gap against the 5% threshold used throughout the notebook.
if rf_overfitting < 0.05:
    print("✅ Overfitting controlado (<5%)")
else:
    print("⚠️ Overfitting alto (>5%)")


🌲 ENTRENANDO RANDOM FOREST OPTIMIZADO...
📊 Grid de parámetros: 72 combinaciones
Fitting 3 folds for each of 72 candidates, totalling 216 fits


KeyboardInterrupt: 

## 🌳 Entrenamiento de Extra Trees Optimizado


In [None]:
# OPTIMIZED EXTRA TREES
# Grid-searches an ExtraTreesClassifier on the training split, then reports
# cross-validation accuracy, test accuracy and the train/test accuracy gap.
import math

print("🌳 ENTRENANDO EXTRA TREES OPTIMIZADO...")
print("=" * 60)

# Hyper-parameter grid for Extra Trees.
et_params = {
    'n_estimators': [200, 300, 500],
    'max_depth': [15, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

# Number of candidates = product of the option counts of every grid entry, so
# the figure stays correct if a hyper-parameter is added or removed.
et_n_candidates = math.prod(len(options) for options in et_params.values())
print(f"📊 Grid de parámetros: {et_n_candidates} combinaciones")

# Grid search with 3-fold cross-validation (3 folds to keep the run time down).
# Parallelism is enabled at ONE level only: the ensemble builds its trees on
# all cores (n_jobs=-1) while the search evaluates candidates one after another
# (n_jobs=1). Requesting all cores at both levels oversubscribes the CPU and
# keeps several large ensembles alive at the same time. The selected model is
# unaffected because random_state is fixed.
et_grid = GridSearchCV(
    ExtraTreesClassifier(random_state=42, n_jobs=-1),
    et_params,
    cv=3,
    scoring='accuracy',
    n_jobs=1,
    verbose=1
)

# Run the search and time it (wall-clock seconds).
start_time = time.time()
et_grid.fit(X_train, y_train)
et_time = time.time() - start_time

# Best candidate found by the search.
et_best = et_grid.best_estimator_
et_best_params = et_grid.best_params_
et_best_score = et_grid.best_score_

print(f"✅ Extra Trees entrenado en {et_time:.1f} segundos")
print(f"📊 Mejor score CV: {et_best_score:.4f} ({et_best_score:.2%})")
print(f"📊 Mejores parámetros: {et_best_params}")

# Accuracy on the held-out test split.
et_pred = et_best.predict(X_test)
et_accuracy = accuracy_score(y_test, et_pred)

print(f"🎯 Accuracy en test: {et_accuracy:.4f} ({et_accuracy:.2%})")

# "Overfitting" here is the gap between training and test accuracy.
et_train_pred = et_best.predict(X_train)
et_train_accuracy = accuracy_score(y_train, et_train_pred)
et_overfitting = et_train_accuracy - et_accuracy

print(f"📈 Training accuracy: {et_train_accuracy:.4f} ({et_train_accuracy:.2%})")
print(f"📉 Overfitting: {et_overfitting:.4f} ({et_overfitting:.2%})")

# Flag the gap against the 5% threshold used throughout the notebook.
if et_overfitting < 0.05:
    print("✅ Overfitting controlado (<5%)")
else:
    print("⚠️ Overfitting alto (>5%)")


## 📊 Comparación de Modelos


In [None]:
# FINAL COMPARISON
# Ranks the two models trained in this notebook against the XGBoost figures,
# which are hard-coded here (taken from the README per the original notebook).
print("📊 COMPARACIÓN DE MODELOS OPTIMIZADOS")
print("=" * 60)

# One tuple per model: (name, accuracy, overfitting, training time in minutes).
model_rows = [
    ('XGBoost', 0.9707, 0.0292, '45'),
    ('Random Forest', rf_accuracy, rf_overfitting, f'{rf_time/60:.1f}'),
    ('Extra Trees', et_accuracy, et_overfitting, f'{et_time/60:.1f}'),
]
column_names = ('Modelo', 'Accuracy', 'Overfitting', 'Tiempo (min)')

# Transpose the rows into the column -> values mapping used for the DataFrame.
comparison_data = {
    column: list(values)
    for column, values in zip(column_names, zip(*model_rows))
}

# Highest accuracy first.
comparison_df = pd.DataFrame(comparison_data).sort_values('Accuracy', ascending=False)

print(comparison_df.to_string(index=False))

# The top row after sorting is the best model.
top_row = comparison_df.iloc[0]
best_model_name = top_row['Modelo']
best_accuracy = top_row['Accuracy']

print(f"\n🏆 MEJOR MODELO: {best_model_name}")
print(f"🎯 Accuracy: {best_accuracy:.4f} ({best_accuracy:.2%})")

# Check the best model against the notebook's two targets.
if best_accuracy > 0.95:
    accuracy_mark = '✅'
else:
    accuracy_mark = '❌'

if top_row['Overfitting'] < 0.05:
    overfitting_mark = '✅'
else:
    overfitting_mark = '❌'

print("\n✅ OBJETIVOS:")
print(f"   - Accuracy >95%: {accuracy_mark}")
print(f"   - Overfitting <5%: {overfitting_mark}")


## 💾 Guardar Modelos para A/B Testing


In [None]:
# SAVE THE OPTIMIZED MODELS
# Writes each fitted model to a .pkl file with joblib and a JSON metadata file
# next to it, then prints a final summary.
print("💾 GUARDANDO MODELOS PARA A/B TESTING...")
print("=" * 60)


def _build_model_metadata(display_name, algorithm, accuracy, overfitting,
                          train_seconds, best_params, cv_score):
    """Return the JSON-serialisable metadata dictionary for one trained model.

    Args:
        display_name: Human-readable model name used in the free-text fields
            (e.g. "Random Forest").
        algorithm: Short algorithm identifier stored under "algorithm".
        accuracy: Accuracy measured on the test split.
        overfitting: Training accuracy minus test accuracy.
        train_seconds: Wall-clock duration of the grid search, in seconds.
        best_params: Best hyper-parameters reported by the grid search.
        cv_score: Best cross-validation score reported by the grid search.

    Returns:
        A dict with the sections "model_info", "performance", "preprocessing"
        and "usage".
    """
    # Metrics are cast to plain floats so json.dump can serialise them.
    accuracy = float(accuracy)
    overfitting = float(overfitting)
    training_minutes = float(train_seconds / 60)

    return {
        "model_info": {
            "name": f"{display_name} Forest Cover Type Classifier (Optimized)",
            "version": "2.0.0",
            "algorithm": algorithm,
            "accuracy": accuracy,
            "overfitting": overfitting,
            "training_time_minutes": training_minutes,
            "created_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "description": f"Modelo {display_name} optimizado para A/B Testing"
        },
        "performance": {
            "accuracy": accuracy,
            "accuracy_percentage": f"{accuracy:.2%}",
            "overfitting": overfitting,
            "overfitting_percentage": f"{overfitting:.2%}",
            "training_time_minutes": training_minutes,
            "best_params": best_params,
            "cv_score": float(cv_score)
        },
        "preprocessing": {
            "class_conversion": "0-based indexing",
            "train_test_split": 0.2,
            "stratify": True,
            "scaling": f"Not required for {display_name}"
        },
        "usage": {
            "input_shape": [54],
            "output_classes": [0, 1, 2, 3, 4, 5, 6],
            "class_names": [
                "Spruce/Fir",
                "Lodgepole Pine",
                "Ponderosa Pine",
                "Cottonwood/Willow",
                "Aspen",
                "Douglas-fir",
                "Krummholz"
            ],
            "requires_scaling": False
        }
    }


# Metadata for Random Forest
rf_metadata = _build_model_metadata(
    "Random Forest", "RandomForest",
    rf_accuracy, rf_overfitting, rf_time, rf_best_params, rf_best_score,
)

# Metadata for Extra Trees
et_metadata = _build_model_metadata(
    "Extra Trees", "ExtraTrees",
    et_accuracy, et_overfitting, et_time, et_best_params, et_best_score,
)

# Persist the fitted estimators.
joblib.dump(rf_best, 'random_forest_optimized.pkl')
joblib.dump(et_best, 'extra_trees_optimized.pkl')

# Persist the metadata. The encoding is explicit so the files do not depend on
# the platform's default text encoding.
with open('random_forest_optimized_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(rf_metadata, f, indent=2)

with open('extra_trees_optimized_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(et_metadata, f, indent=2)

print("✅ Modelos guardados:")
print("   - random_forest_optimized.pkl")
print("   - extra_trees_optimized.pkl")
print("   - random_forest_optimized_metadata.json")
print("   - extra_trees_optimized_metadata.json")

# Final summary; the XGBoost line is a fixed reference value, not computed here.
print(f"\n📊 RESUMEN FINAL:")
print(f"   🌲 Random Forest: {rf_accuracy:.2%} (Overfitting: {rf_overfitting:.2%})")
print(f"   🌳 Extra Trees: {et_accuracy:.2%} (Overfitting: {et_overfitting:.2%})")
print(f"   🚀 XGBoost: 97.07% (Overfitting: 2.92%)")

print(f"\n🎯 ¡LISTO PARA A/B TESTING!")
