In [15]:
# Library imports

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, roc_auc_score, confusion_matrix, roc_curve, auc)
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import mlflow
from mlflow.models import infer_signature

In [2]:
# Data loading and output location
#
# Both locations default to the original absolute paths. Set the
# BREAST_CANCER_DATA_PATH / BREAST_CANCER_RESULTS_DIR environment variables to
# run the notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "BREAST_CANCER_DATA_PATH",
    r"C:\Users\abrah\Documents\Maestr√≠a\Segundo_semestre\Programacion_2\Challenge_1\data\breast-cancer-wisconsin.data.csv",
)
RESULTS_DIR = os.environ.get(
    "BREAST_CANCER_RESULTS_DIR",
    r"C:\Users\abrah\Documents\Maestr√≠a\Segundo_semestre\Programacion_2\Challenge_1\Im√°genes",
)
# Make sure the output folder exists before any figure or model is written to it.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Load the table here so the inspection cells that follow (df.head(), df.shape)
# work on a fresh kernel instead of depending on a cell that appears later.
# The `id` and `Unnamed: 32` columns are dropped; `diagnosis` is left as read
# from the file at this point.
df = pd.read_csv(DATA_PATH).drop(columns=["id", "Unnamed: 32"])

In [3]:
# Show the first rows to get to know the structure and the columns.
# Bare last expression, so the notebook renders it as the cell output.

df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
# Check the size of the dataset as a (rows, columns) tuple

df.shape

(569, 31)

In [7]:
# Keep only the columns of `df` whose dtype is already numeric.
# (A coerced copy built with pd.to_numeric used to be assigned to this same
# name first, but it was overwritten immediately and never used.)
df_numeric = df.select_dtypes(include=[np.number])

In [5]:
# Load and prepare the data

# Read the CSV and discard the `id` and `Unnamed: 32` columns in one step.
df = pd.read_csv(DATA_PATH).drop(columns=["id", "Unnamed: 32"])

# Encode the target column as integer class codes
le = LabelEncoder()
diagnosis_codes = le.fit_transform(df['diagnosis'])
df['diagnosis'] = diagnosis_codes

# Sanity check: report the table size and the number of rows per class
n_rows, n_cols = df.shape
class_counts = dict(pd.Series(df['diagnosis']).value_counts())
print("\n Datos cargados correctamente:")
print(f"- Filas: {n_rows}, Columnas: {n_cols}")
print(f"- Distribuci√≥n de clases: {class_counts}")


 Datos cargados correctamente:
- Filas: 569, Columnas: 31
- Distribuci√≥n de clases: {0: 357, 1: 212}


In [6]:
# Preprocessing

# Target vector and feature matrix (every column except the target)
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Standardisation: the scaler is fitted on the training split only and then
# applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Model training

# k-nearest-neighbours classifier with k = 5, fitted on the standardised
# training features.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)


In [9]:
# Model evaluation

# Hard class predictions, plus the predicted probability of class 1
# (second column of predict_proba) for the ROC-based metric.
y_pred = knn.predict(X_test_scaled)
y_probs = knn.predict_proba(X_test_scaled)[:, 1]

# Metrics computed from the hard predictions, in reporting order
label_scorers = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-Score', f1_score),
)
metrics = {name: scorer(y_test, y_pred) for name, scorer in label_scorers}
# AUC is the only metric computed from the probabilities
metrics['AUC-ROC'] = roc_auc_score(y_test, y_probs)

print("\n M√©tricas finales:")
for metric_name, metric_value in metrics.items():
    print(f"- {metric_name}: {metric_value:.4f}")


 M√©tricas finales:
- Accuracy: 0.9649
- Precision: 1.0000
- Recall: 0.9062
- F1-Score: 0.9508
- AUC-ROC: 0.9882


In [10]:
# Visualisations
# Each figure is written to RESULTS_DIR and closed right away, so nothing is
# rendered inline in the notebook.

# Confusion matrix
class_names = ['Benigno', 'Maligno']
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=cm_ax,
)
cm_ax.set_title("Matriz de Confusi√≥n")
cm_fig.savefig(os.path.join(RESULTS_DIR, "confusion_matrix.png"))
plt.close(cm_fig)

# ROC curve
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', label=f'ROC (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1)
roc_ax.plot([0, 1], [0, 1], linestyle='--', color='navy')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Curva ROC')
roc_ax.legend()
roc_fig.savefig(os.path.join(RESULTS_DIR, "roc_curve.png"))
plt.close(roc_fig)


In [11]:
# Save the trained model

# Persist the fitted classifier and the fitted scaler side by side; the model
# was trained on scaled features, so both are needed to score new data.
joblib.dump(knn, os.path.join(RESULTS_DIR, 'knn_model.pkl'))
joblib.dump(scaler, os.path.join(RESULTS_DIR, 'scaler.pkl'))

# Store the feature names as a fixed-width string array. The raw
# `X.columns.values` is an object array, which np.save has to pickle and
# np.load then rejects unless the caller passes allow_pickle=True.
feature_names = X.columns.values.astype(str)
np.save(os.path.join(RESULTS_DIR, 'feature_names.npy'), feature_names)

print(f"\n Proceso completado! Resultados guardados en:\n{RESULTS_DIR}")


 Proceso completado! Resultados guardados en:
C:\Users\abrah\Documents\Maestr√≠a\Segundo_semestre\Programacion_2\Challenge_1\Im√°genes


In [None]:
# MLflow tracking

# 1. Point the client at the tracking server
TRACKING_URI = "http://localhost:5000"
mlflow.set_tracking_uri(TRACKING_URI)

# 2. Select the experiment. set_experiment creates it when no experiment with
#    this name exists yet, so no separate lookup/create step is needed.
EXPERIMENT_NAME = "Breast Cancer - KNN"
mlflow.set_experiment(EXPERIMENT_NAME)

# 3. Log everything to MLflow
with mlflow.start_run(run_name="KNN_Full_Run") as run:
    # Parameters: read from the fitted objects so the logged values cannot
    # drift from what was actually trained.
    mlflow.log_params({
        "model_type": "KNN",
        "n_neighbors": knn.n_neighbors,
        "scaler": type(scaler).__name__,
    })

    # Metrics computed in the evaluation cell
    mlflow.log_metrics(metrics)

    # Artifacts: every file currently in RESULTS_DIR
    mlflow.log_artifacts(RESULTS_DIR)

    # Model with metadata. The input example is taken from the *scaled*
    # training matrix, so the logged model expects inputs that have already
    # been passed through the scaler.
    input_example = X_train_scaled[:1]
    signature = infer_signature(input_example, knn.predict(input_example))

    mlflow.sklearn.log_model(
        sk_model=knn,
        artifact_path="model",
        signature=signature,
        input_example=input_example
    )

    # The run already carries its experiment id; no extra lookup by name.
    print(f"\n ¬°Datos registrados! Accede en: {TRACKING_URI}/#/experiments/{run.info.experiment_id}/runs/{run.info.run_id}")

Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 2236.79it/s]



üîç ¬°Datos registrados! Accede en: http://localhost:5000/#/experiments/1/runs/57dd5b0b6b9e4fdd8f30b029bec86801
üèÉ View run KNN_Full_Run at: http://localhost:5000/#/experiments/1/runs/57dd5b0b6b9e4fdd8f30b029bec86801
üß™ View experiment at: http://localhost:5000/#/experiments/1
