## Carga de datos

In [1]:
# Librerías principales

import pandas as pd
import numpy as np
import sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

In [2]:
# Load scikit-learn's bundled breast cancer dataset and list the fields it exposes
cancer_data = load_breast_cancer()
print(cancer_data.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [3]:
# Build one frame holding the feature columns and another holding the label
df_features = pd.DataFrame(data=cancer_data.data, columns=cancer_data.feature_names)
df_target = pd.Series(cancer_data.target, name='target').to_frame()

# Place features and label side by side in a single DataFrame
df = pd.concat((df_features, df_target), axis='columns')

In [4]:
# Preview the first three rows of the combined features + target frame
df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


In [5]:
# Summarise column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [8]:
# Count how many rows belong to each class of the target variable
df['target'].value_counts()

target
1    357
0    212
Name: count, dtype: int64

In [7]:
# Show descriptive statistics for every column (the target column is included)
print("\nResumen estadístico de las características:")
df.describe()


Resumen estadístico de las características:


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


## Preprocesamiento

In [9]:
# Split the frame into the label vector (y) and the feature matrix (X)
y = df['target']
X = df.drop(columns=['target'])

print("Dimensiones de X (características):", X.shape)
print("Dimensiones de y (objetivo):", y.shape)

Dimensiones de X (características): (569, 30)
Dimensiones de y (objetivo): (569,)


In [10]:
# Hold out 20% of the rows for testing (80% train); the split is stratified
# on y and seeded so it is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Tamaño del conjunto de entrenamiento:", len(X_train), "muestras")
print("Tamaño del conjunto de prueba:", len(X_test), "muestras")

Tamaño del conjunto de entrenamiento: 455 muestras
Tamaño del conjunto de prueba: 114 muestras


In [11]:
# Standardise the features using statistics learned from the training split only,
# so nothing from the test split influences the preprocessing.
scaler = StandardScaler()

# fit_transform fits the scaler on X_train and returns the scaled values.
# They are wrapped in a DataFrame that keeps X_train's column names AND its row
# index, so the scaled rows stay label-aligned with y_train (which still carries
# the shuffled index produced by train_test_split).
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

# The test split is only transformed: it reuses the scaler fitted above.
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

## Modelamiento

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score,
                             recall_score, f1_score, classification_report)

- **Random Forest con Grid Search**

In [13]:
# Hyperparameter grid: every combination of these values is a candidate
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
)

# Base estimator; the fixed seed keeps the forest reproducible
rf = RandomForestClassifier(random_state=42)

# Grid search with 3-fold cross-validation, ranked by accuracy,
# running on all available cores (n_jobs=-1)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [14]:
# Run the grid search on the scaled training data to find the best hyperparameters.
# NOTE(review): the scaler was fitted on the whole training split before this call,
# so each cross-validation fold is scaled with statistics that include its own
# validation rows. Wrapping scaler + model in a Pipeline would avoid that —
# confirm whether it matters for this model.
grid_search.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [5, 10, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], 'n_estimators': [100, 200, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
# Keep the best estimator found by the search, together with its hyperparameters
best_rf_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(f"\nMejores hiperparámetros encontrados:\n{best_params}")

# Predict labels for the held-out test split
y_pred = best_rf_model.predict(X_test_scaled)


Mejores hiperparámetros encontrados:
{'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}


In [16]:
# Evaluation metrics on the test split; precision, recall and F1 are macro-averaged
macro_avg = {"average": "macro"}
metrics = {
    "accuracy": float(accuracy_score(y_test, y_pred)),
    "precision_macro": float(precision_score(y_test, y_pred, **macro_avg)),
    "recall_macro": float(recall_score(y_test, y_pred, **macro_avg)),
    "f1_macro": float(f1_score(y_test, y_pred, **macro_avg)),
}

print("Métricas en test:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

# Per-class classification report, labelled with the dataset's class names
print("\nReporte de clasificación:")
report = classification_report(y_test, y_pred, target_names=cancer_data.target_names)
print(report)

Métricas en test:
accuracy: 0.9561
precision_macro: 0.9551
recall_macro: 0.9504
f1_macro: 0.9526

Reporte de clasificación:
              precision    recall  f1-score   support

   malignant       0.95      0.93      0.94        42
      benign       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



## Guardar Artefactos del Modelo

In [17]:
# Create the 'artifacts' directory (relative to the working directory) where the
# output files are stored; an already existing directory is left as is
from pathlib import Path

ARTIFACTS_DIR = Path('artifacts')
ARTIFACTS_DIR.mkdir(parents=False, exist_ok=True)

- **Serialización del modelo**

In [18]:
# Serialise the best model into the artifacts directory
model_filename = 'modelo_breastcancer.pkl'
full_model_path = ARTIFACTS_DIR.joinpath(model_filename)
joblib.dump(best_rf_model, full_model_path)

['artifacts\\modelo_breastcancer.pkl']

- **Scaler utilizado**

Las predicciones deben usar datos escalados de la misma manera en que se escalaron los datos con los que se entrenó el modelo. Cuando se envían nuevos datos a la API para hacer una predicción, estos vienen en su escala original; si se pasan directamente al modelo, este los interpretará incorrectamente porque no están en la escala que espera.

In [19]:
# Serialise the fitted scaler next to the model so inference can reuse it
scaler_filename = 'scaler.pkl'
full_scaler_path = ARTIFACTS_DIR.joinpath(scaler_filename)
joblib.dump(scaler, full_scaler_path)

['artifacts\\scaler.pkl']

- **Manifiesto/Model Card**

In [20]:
import time
import platform
import json

In [21]:
# Build the model manifest ("model card") that is saved as JSON next to the artifacts
manifest = {
    "name": "RandomForest-BreastCancer",
    # Local wall-clock time; the string carries no timezone information
    "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    "framework": "scikit-learn",
    "sklearn_version": sklearn.__version__,
    "python_version": platform.python_version(),
    "features": list(X.columns),
    "target": y.name,
    "scaler_info": {
        "type": "StandardScaler",
        "fitted_on_training_data": True,
        # as_posix() always writes forward slashes, so the recorded path is the
        # same whether training ran on Windows or on a POSIX system
        "scaler_path": full_scaler_path.as_posix(),
    },
    "best_params": grid_search.best_params_,
    "cv_metric": grid_search.scoring,
    "cv_best_score": float(grid_search.best_score_),
    "test_metrics": metrics,
}

manifest_path = ARTIFACTS_DIR / "model_manifest.json"

# Write the dictionary to the JSON file in a human-readable format;
# ensure_ascii=False keeps non-ASCII characters unescaped in the UTF-8 file
with open(manifest_path, "w", encoding="utf-8") as f:
    json.dump(manifest, f, indent=2, ensure_ascii=False)

print(f"Manifest del modelo guardado en: {manifest_path.resolve()}")

Manifest del modelo guardado en: C:\Users\HP\api breast cancer\artifacts\model_manifest.json


In [22]:
# Show the manifest the same way it was serialised to disk: ensure_ascii=False
# matches the json.dump call that wrote the file, so non-ASCII characters are
# printed as-is instead of as \uXXXX escapes
print("\nContenido del Manifiesto:")
print(json.dumps(manifest, indent=2, ensure_ascii=False))


Contenido del Manifiesto:
{
  "name": "RandomForest-BreastCancer",
  "created_at": "2025-09-28 20:08:37",
  "framework": "scikit-learn",
  "sklearn_version": "1.7.2",
  "python_version": "3.10.2",
  "features": [
    "mean radius",
    "mean texture",
    "mean perimeter",
    "mean area",
    "mean smoothness",
    "mean compactness",
    "mean concavity",
    "mean concave points",
    "mean symmetry",
    "mean fractal dimension",
    "radius error",
    "texture error",
    "perimeter error",
    "area error",
    "smoothness error",
    "compactness error",
    "concavity error",
    "concave points error",
    "symmetry error",
    "fractal dimension error",
    "worst radius",
    "worst texture",
    "worst perimeter",
    "worst area",
    "worst smoothness",
    "worst compactness",
    "worst concavity",
    "worst concave points",
    "worst symmetry",
    "worst fractal dimension"
  ],
  "target": "target",
  "scaler_info": {
    "type": "StandardScaler",
    "fitted_on_t

In [25]:
# Export this notebook to a plain .py script via the nbconvert CLI
# (IPython shell escape; requires `jupyter` on PATH and the notebook in the working directory)
!jupyter nbconvert --to script entrenamiento_breastcancer.ipynb

[NbConvertApp] Converting notebook entrenamiento_breastcancer.ipynb to script
[NbConvertApp] Writing 5705 bytes to entrenamiento_breastcancer.py
