In [1]:
# 1. Imports Generales
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn import metrics

# Configuración opcional para ver todas las columnas en pandas
pd.set_option('display.max_columns', None)

In [2]:
# 2. Carga y División de Datos (Breast Cancer Dataset)
data = load_breast_cancer()

# Ver la estructura de los datos (opcional)
# print(data.DESCR)
df_data = pd.DataFrame(data['data'], columns=data['feature_names'])
df_target = pd.DataFrame(data['target'], columns=['target'])

# División Train/Test (75% Train, 25% Test)
X_train, X_test, y_train, y_test = train_test_split(
    data.data, 
    data.target, 
    random_state=42, 
    test_size=0.25
)

print(f"Dimensiones Train: {X_train.shape}")
print(f"Dimensiones Test: {X_test.shape}")

# Instanciar y Entrenar el Modelo (Decision Tree)
model = DecisionTreeClassifier(criterion='entropy')
model.fit(X_train, y_train)

# Predicción
y_pred = model.predict(X_test)

# Ver resultados preliminares
print("Predicciones (primeros 10):", y_pred[:10])
print("Realidad (primeros 10):    ", y_test[:10])

Dimensiones Train: (426, 30)
Dimensiones Test: (143, 30)
Predicciones (primeros 10): [1 0 0 1 1 0 0 0 0 1]
Realidad (primeros 10):     [1 0 0 1 1 0 0 0 1 1]


In [3]:
# 3. Evaluación Manual (Comparación Visual)
# Reshape para crear DataFrames verticales
pred_2d = y_pred.reshape(len(y_pred), 1)
y_test_2d = y_test.reshape(len(y_test), 1)

# Crear DataFrames
df1 = pd.DataFrame(pred_2d, columns=['pred'])
df2 = pd.DataFrame(y_test_2d, columns=['real'])

# Concatenar para comparar lado a lado
df_concat = pd.concat([df1, df2], axis=1)

# Mostrar solo donde coinciden (Aciertos)
aciertos = df_concat[df_concat['pred'] == df_concat['real']]
print(f"Número de aciertos: {len(aciertos)} de {len(df_concat)}")
print(f"Precisión manual: {len(aciertos)/len(df_concat)}")

Número de aciertos: 137 de 143
Precisión manual: 0.958041958041958


In [4]:
# 4.1. Estandarización (StandardScaler)
scaler = StandardScaler()

# Ajustar (aprender media/desviación) solo con Train
scaler.fit(X_train)

# Transformar (aplicar fórmula) a Train y Test
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test) # OJO: Solo transform, no fit

# Verificación visual (Convertir a DF para ver mejor)
df_scale_after = pd.DataFrame(X_train_scaled, columns=data['feature_names'])
print("Datos escalados (primeras filas):")
display(df_scale_after.head())

Datos escalados (primeras filas):


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,-0.349138,-1.438513,-0.411726,-0.390479,-1.863662,-1.268607,-0.826171,-0.952866,-1.729368,-0.941541,-0.869714,-1.358653,-0.834815,-0.572307,-0.745868,-0.653983,-0.525835,-0.946771,-0.537817,-0.634495,-0.542685,-1.655655,-0.589864,-0.52556,-1.510669,-0.8915,-0.750217,-0.916711,-0.925086,-0.808411
1,-0.204687,0.31264,-0.133673,-0.27588,1.078073,0.863546,0.726314,0.898441,1.17877,1.474377,-0.040223,-0.509623,0.109477,-0.134728,-0.524895,-0.149345,0.0746,0.237472,-0.430283,0.082891,0.041487,0.689899,0.194128,-0.051934,1.129415,0.923942,1.222217,1.43656,1.149559,1.569111
2,-0.329312,-0.215072,-0.317394,-0.364357,-1.57988,-0.457451,-0.59731,-0.764588,0.275343,-0.501024,-0.581453,0.167984,-0.222791,-0.415329,-1.102403,0.644912,0.0746,-0.19099,0.728842,-0.028967,-0.435901,-0.148985,-0.320159,-0.446032,-1.634396,-0.106752,-0.539891,-0.723713,0.53497,-0.619348
3,1.027403,2.089824,1.046922,0.917584,0.316303,0.562037,1.048527,0.930437,-0.325697,-0.477474,-0.043367,-0.240346,0.00445,0.079579,-0.751369,-0.286475,0.156326,-0.119314,-1.015472,-0.457385,1.113515,2.165006,1.165793,0.997696,0.383604,0.860948,1.872819,1.310691,0.152884,0.421636
4,1.828969,0.696001,1.763681,1.783821,-0.333674,0.628175,0.97466,1.26574,-0.131572,-1.713139,1.60024,0.500901,1.988514,1.491962,0.351882,0.588963,0.754093,2.516764,1.409062,-0.528602,1.471556,0.387568,1.556276,1.385595,-0.577759,0.29668,0.595768,1.232995,0.050452,-1.406351


In [5]:
# 4.2. Manejo de Valores Nulos (SimpleImputer)
# Crear datos de ejemplo con nulos
data_null = [[1.0, 2.0, 3.0, 4.0], 
             [5.0, 6.0, np.nan, 8.0], 
             [10.0, 11.0, 12.0, np.nan]]
df_null = pd.DataFrame(data_null, columns=['A', 'B', 'C', 'D'])

print("--- DataFrame con Nulos ---")
print(df_null)

# Opción A: Borrar (dropna)
# df_dropped = df_null.dropna() # Borra filas con cualquier NaN

# Opción B: Imputar (Rellenar con la media)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(df_null.values)
data_imputed = imputer.transform(df_null.values)

print("\n--- Datos Imputados (Rellenados con media) ---")
print(data_imputed)

--- DataFrame con Nulos ---
      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   NaN  8.0
2  10.0  11.0  12.0  NaN

--- Datos Imputados (Rellenados con media) ---
[[ 1.   2.   3.   4. ]
 [ 5.   6.   7.5  8. ]
 [10.  11.  12.   6. ]]


In [6]:
# 4.3. Encoding (LabelEncoder y OneHotEncoder)

# Datos de ejemplo: Camisetas
df_cat = pd.DataFrame([
    ['green', 'M', 10.1, 'class2'],
    ['red', 'L', 13.5, 'class1'],
    ['blue', 'XL', 15.3, 'class2']
], columns=['color', 'size', 'price', 'classlabel'])

# --- Mapeo Manual (Ordinal - Tallas) ---
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df_cat['size_num'] = df_cat['size'].map(size_mapping)

# --- Label Encoder (Para el Target/Clase) ---
enc = LabelEncoder()
df_cat['classlabel_enc'] = enc.fit_transform(df_cat['classlabel'].values)

print("--- DataFrame tras Mapping y LabelEncoder ---")
print(df_cat)

# --- One Hot Encoder (Para Colores - Nominales) ---
# Usando Pandas get_dummies (más fácil para visualización rápida)
dummies = pd.get_dummies(df_cat['color'], prefix='color')
print("\n--- One Hot Encoding (get_dummies) ---")
print(dummies)

--- DataFrame tras Mapping y LabelEncoder ---
   color size  price classlabel  size_num  classlabel_enc
0  green    M   10.1     class2         1               1
1    red    L   13.5     class1         2               0
2   blue   XL   15.3     class2         3               1

--- One Hot Encoding (get_dummies) ---
   color_blue  color_green  color_red
0       False         True      False
1       False        False       True
2        True        False      False


In [11]:
# 1. Imports y Carga de Datos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle

# Cargar Iris
iris = load_iris()
X = iris.data
y = iris.target

# Nombres de clases para referencia
class_names = iris.target_names
print(f"Clases: {class_names}")
print(f"Dimensiones X: {X.shape}")

Clases: ['setosa' 'versicolor' 'virginica']
Dimensiones X: (150, 4)


In [12]:
# 2. Validación Cruzada (K-Fold y Stratified K-Fold)

# Configurar el validador (5 pliegues)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Modelo a evaluar
model_cv = DecisionTreeClassifier(random_state=42)

# Ejecutar Cross Validation
# Esto entrena 5 veces y saca 5 notas distintas
scores = cross_val_score(model_cv, X, y, cv=kfold, scoring='accuracy')

print(f"Puntajes de cada fold: {scores}")
print(f"Promedio de exactitud: {np.mean(scores):.4f}")
print(f"Desviación estándar: {np.std(scores):.4f}")
# Si la desviación es alta, tu modelo es inestable.

Puntajes de cada fold: [1.         0.96666667 0.93333333 0.96666667 0.9       ]
Promedio de exactitud: 0.9533
Desviación estándar: 0.0340


In [13]:
# 2. Validación Cruzada (K-Fold y Stratified K-Fold)

# Configurar el validador (5 pliegues)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Modelo a evaluar
model_cv = DecisionTreeClassifier(random_state=42)

# Ejecutar Cross Validation
# Esto entrena 5 veces y saca 5 notas distintas
scores = cross_val_score(model_cv, X, y, cv=kfold, scoring='accuracy')

print(f"Puntajes de cada fold: {scores}")
print(f"Promedio de exactitud: {np.mean(scores):.4f}")
print(f"Desviación estándar: {np.std(scores):.4f}")
# Si la desviación es alta, tu modelo es inestable.

Puntajes de cada fold: [1.         0.96666667 0.93333333 0.96666667 0.9       ]
Promedio de exactitud: 0.9533
Desviación estándar: 0.0340


In [14]:
# 4. Evaluación Detallada

# Dividimos datos para prueba final (usando el mejor modelo encontrado)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Entrenar el mejor modelo con datos de train
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Matriz de Confusión
cm = confusion_matrix(y_test, y_pred)
print("--- Matriz de Confusión ---")
print(cm)
# Filas: Realidad, Columnas: Predicción

# Reporte de Clasificación (Precision, Recall, F1)
print("\n--- Reporte de Clasificación ---")
print(classification_report(y_test, y_pred, target_names=class_names))

NameError: name 'best_model' is not defined