In [None]:
import os
import mlflow
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import matplotlib.pyplot as plt


# 1. Configuración de MLflow
mlflow_dir = os.path.join(os.getcwd(), "mlruns")
os.makedirs(mlflow_dir, exist_ok=True)
mlflow.set_tracking_uri(f"file:{mlflow_dir}")

# Configurar experimento
experiment_name = "MSRP_PREDICTION"
if not mlflow.get_experiment_by_name(experiment_name):
    mlflow.create_experiment(experiment_name, artifact_location=mlflow_dir)
mlflow.set_experiment(experiment_name)






ModuleNotFoundError: No module named 'mlflow'

In [3]:
# 2. Cargar datos
try:
    data_path = os.path.join('data', 'msrp_clean.csv')
    msrp_new = pd.read_csv(data_path)
    print(" Datos cargados correctamente")
except FileNotFoundError:
    raise Exception(" Error: No se encontró el archivo de datos")

    # 3. Preprocesamiento
features = ['Make', 'Year', 'Engine Fuel Type', 'Engine HP', 'Engine Cylinders',
               'Vehicle Style', 'highway MPG', 'city mpg', 'Popularity']
target = 'MSRP'

X = msrp_new[features].copy()
y = msrp_new[target]

    # Codificar variables categóricas
X = pd.get_dummies(X, drop_first=True)

    # División del dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


IndentationError: unexpected indent (<ipython-input-3-e94f9ba004d9>, line 14)

In [None]:
# 4. Entrenamiento y tracking
with mlflow.start_run(run_name="ExtraTrees_v1"):
       # Modelo
    model = ExtraTreesRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

        # Métricas
    metrics = {
        "mae": mean_absolute_error(y_test, y_pred),
        "mse": mean_squared_error(y_test, y_pred),
        "r2_score": r2_score(y_test, y_pred)
        }

        # Log de parámetros y métricas
    mlflow.log_params({
        "random_state": 42,
        "model_type": "ExtraTreesRegressor",
           "features": str(features)
        })
    mlflow.log_metrics(metrics)

        # Guardar artefactos
    artifacts = {
        "modelo_MSRP.pkl": model,
        "msrp_logged.csv": msrp_new,
        "predicciones_msrp.csv": pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
        }

for filename, data in artifacts.items():
    if filename.endswith('.pkl'):
       joblib.dump(data, filename)
    else:
        data.to_csv(filename, index=False)
        mlflow.log_artifact(filename)


        # Gráfico de predicción vs real
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.3)
plt.xlabel("Valor real")
plt.ylabel("Predicción")
plt.title("Predicción vs Real (MSRP)")
plt.grid(True)
plt.savefig("pred_vs_real.png")
mlflow.log_artifact("pred_vs_real.png")
plt.close()

In [None]:
 # Registrar modelo
mlflow.sklearn.log_model(model, "modelo_MSRP", input_example=X.iloc[:2])

print(" Entrenamiento completado")
print(f" Métricas: {metrics}")

model = joblib.load("extra_trees_model.joblib")

#para usar el modelo en otro script: predictions = model.predict(new_data)