# **Actividad: Clasificación de Tumores Cerebrales (Machine Learning Supervisado)**

# Preparación y carga de librerías



In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier

import matplotlib.pyplot as plt
!pip -q install xgboost lightgbm catboost



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## Importación de librerías extras

In [None]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)

# Modelos (sklearn)
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    GradientBoostingClassifier, AdaBoostClassifier,
    HistGradientBoostingClassifier
)
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.calibration import CalibratedClassifierCV

# Boosting externo
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


## Carga del dataset: brain_tumor_dataset.xlsx

In [None]:
# Location of the Excel workbook. On Colab, upload the file to the session
# or mount Google Drive before running this cell.
DATA_PATH = "/content/drive/MyDrive/brain_tumor_dataset.xlsx"

df = pd.read_excel(io=DATA_PATH)

# Quick sanity check: dimensions first, then the first five rows.
print("Shape:", df.shape)
display(df.head(5))

Shape: (20000, 19)


Unnamed: 0,Patient_ID,Age,Gender,Tumor_Type,Tumor_Size,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,Follow_Up_Required
0,1,73,Male,Malignant,5.375612,Temporal,Astrocytoma,III,Vision Issues,Seizures,Seizures,No,No,No,51.312579,0.111876,No,Positive,Yes
1,2,26,Male,Benign,4.847098,Parietal,Glioblastoma,II,Headache,Headache,Nausea,Yes,Yes,Yes,46.373273,2.165736,Yes,Positive,Yes
2,3,31,Male,Benign,5.588391,Parietal,Meningioma,I,Vision Issues,Headache,Seizures,No,No,No,47.072221,1.884228,No,Negative,No
3,4,29,Male,Malignant,1.4366,Temporal,Medulloblastoma,IV,Vision Issues,Seizures,Headache,Yes,No,Yes,51.853634,1.283342,Yes,Negative,No
4,5,54,Female,Benign,2.417506,Parietal,Glioblastoma,I,Headache,Headache,Seizures,No,No,Yes,54.708987,2.069477,No,Positive,Yes


## Revisión preliminar (Análisis de Exploración de Datos EDA)

In [None]:
# Column the classifiers will learn to predict.
target = "MRI_Result"

# Class balance, as absolute counts and as relative frequencies.
class_counts = df[target].value_counts()
class_shares = df[target].value_counts(normalize=True)

print("\nDistribución de clases (conteos):")
print(class_counts)

print("\nDistribución de clases (proporciones):")
print(class_shares)

# The ten columns with the most missing values.
nulls_per_column = df.isna().sum()
print("\nNulos por columna (top):")
print(nulls_per_column.sort_values(ascending=False).head(10))



Distribución de clases (conteos):
MRI_Result
Positive    10029
Negative     9971
Name: count, dtype: int64

Distribución de clases (proporciones):
MRI_Result
Positive    0.50145
Negative    0.49855
Name: proportion, dtype: float64

Nulos por columna (top):
Patient_ID    0
Age           0
Gender        0
Tumor_Type    0
Tumor_Size    0
Location      0
Histology     0
Stage         0
Symptom_1     0
Symptom_2     0
dtype: int64


## Eliminación del identificador y separación de X y Y

In [None]:
# Patient_ID is a row identifier rather than a measurement, so it is kept
# out of the feature matrix.
drop_cols = ["Patient_ID"]

# Everything except the target and the identifier becomes a feature.
non_feature_cols = [target, *drop_cols]
X = df.drop(columns=non_feature_cols)

# The target is handled as plain strings.
y = df[target].astype(str)

print("X shape:", X.shape, "| y shape:", y.shape)


X shape: (20000, 17) | y shape: (20000,)


## Partición: Training 50%, Validation 25% y Test 25%

In [None]:
RANDOM_STATE = 42

# Stage 1: half of the rows become the training set, the other half is held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.50, stratify=y, random_state=RANDOM_STATE
)

# Stage 2: the held-out half is cut in two, giving 25% validation and 25% test
# of the full dataset.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=RANDOM_STATE
)

# Report the size and class proportions of every partition.
for split_label, X_part, y_part in (
    ("Train:", X_train, y_train),
    ("Val:  ", X_val, y_val),
    ("Test: ", X_test, y_test),
):
    print(split_label, X_part.shape, y_part.value_counts(normalize=True).to_dict())


Train: (10000, 17) {'Positive': 0.5014, 'Negative': 0.4986}
Val:   (5000, 17) {'Positive': 0.5014, 'Negative': 0.4986}
Test:  (5000, 17) {'Positive': 0.5016, 'Negative': 0.4984}


## Separación de las variables predictoras X y la variable objetivo y.
Se elimina el identificador para eliminar posibles sesgos y evitar el aprendizaje de patrones irrelevantes.

In [None]:
# Identifier column: carries no predictive signal and is removed from the features.
drop_cols = ["Patient_ID"]

# Remove the target first, then the identifier; what is left is the feature matrix.
X = df.drop(columns=[target]).drop(columns=drop_cols)

# Target labels as strings.
y = df[target].astype(str)

print("X shape:", X.shape, "| y shape:", y.shape)


X shape: (20000, 17) | y shape: (20000,)


## Para aplicar un buen procesamiento, es necesario definir las columnas numéricas/categóricas + preprocesamiento

In [None]:
# Split the feature columns by dtype: numeric ones on one side, everything else
# is treated as categorical.
numeric_features = list(X.select_dtypes(include=[np.number]).columns)
categorical_features = [col for col in X.columns if col not in numeric_features]

print("Numéricas:", numeric_features)
print("Categóricas:", categorical_features)

# Numeric branch: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the mode, then one-hot encode.
# Categories unseen during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own branch.
column_branches = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocess = ColumnTransformer(transformers=column_branches)


Numéricas: ['Age', 'Tumor_Size', 'Survival_Rate', 'Tumor_Growth_Rate']
Categóricas: ['Gender', 'Tumor_Type', 'Location', 'Histology', 'Stage', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Radiation_Treatment', 'Surgery_Performed', 'Chemotherapy', 'Family_History', 'Follow_Up_Required']


## Preparación de los datos para que los algoritmos puedan aprender de manera efectiva y sin sesgos (imputación + escalamiento + one-hot).

In [None]:
# Split the feature columns by dtype: numeric ones on one side, everything else
# is treated as categorical.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = [c for c in X.columns if c not in numeric_features]

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical branch: mode imputation followed by one-hot encoding.
# handle_unknown="ignore" encodes categories unseen at fit time as all zeros.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# sparse_threshold=0.0 forces a dense output. With the default (0.3) the
# result is dense only while the overall density stays above the threshold,
# and several estimators used later (GaussianNB, LDA, QDA,
# HistGradientBoosting) do not accept sparse input. Making it explicit keeps
# them working if more high-cardinality categorical columns are added.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    sparse_threshold=0.0,
)

# Label treated as the positive class by precision / recall / F1 / ROC-AUC.
POS_LABEL = "Positive"


## Modelos de Clasificación (con pipeline)

In [None]:
# Helper: ROC-AUC from class probabilities or, failing that, decision scores
def get_auc(model, X, y_true, pos_label=POS_LABEL):
    """Compute ROC-AUC of a fitted classifier for the class ``pos_label``.

    Parameters
    ----------
    model : fitted estimator or pipeline
        Must expose ``predict_proba`` or ``decision_function`` to be scored.
    X : features to score.
    y_true : array-like of true labels (pandas Series, list or ndarray).
    pos_label : label treated as the positive class.

    Returns
    -------
    float
        The ROC-AUC, or ``np.nan`` when it cannot be computed: the model
        exposes neither scoring method, or ``y_true`` does not contain both
        the positive and a non-positive class.
    """
    # np.asarray makes the comparison element-wise for lists as well as Series.
    y_bin = (np.asarray(y_true) == pos_label).astype(int)

    # ROC-AUC is undefined with a single class; report NaN instead of raising.
    if np.unique(y_bin).size < 2:
        return np.nan

    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X)
        pos_idx = list(model.classes_).index(pos_label)
        return roc_auc_score(y_bin, proba[:, pos_idx])

    if hasattr(model, "decision_function"):
        scores = np.asarray(model.decision_function(X))
        classes = getattr(model, "classes_", None)
        if classes is not None:
            pos_idx = list(classes).index(pos_label)
            if scores.ndim == 1:
                # Binary decision scores grow towards classes_[1]; flip the
                # sign when the positive label is classes_[0] so that higher
                # always means "more positive".
                if pos_idx == 0:
                    scores = -scores
            else:
                # One column per class: keep the positive class column.
                scores = scores[:, pos_idx]
        return roc_auc_score(y_bin, scores)

    return np.nan

def evaluate(pipe, X_tr, y_tr, X_va, y_va):
    """Fit ``pipe`` on the training split and score it on the validation split.

    The pipeline is fitted in place. Returns a dict with the keys
    ``Accuracy``, ``Precision``, ``Recall``, ``F1`` and ``ROC_AUC``; the
    label-dependent metrics use ``POS_LABEL`` as the positive class.
    """
    pipe.fit(X_tr, y_tr)
    y_hat = pipe.predict(X_va)

    # Metrics that need to know which label is the positive one.
    label_metrics = {
        "Precision": precision_score,
        "Recall": recall_score,
        "F1": f1_score,
    }

    scores = {"Accuracy": accuracy_score(y_va, y_hat)}
    for metric_name, metric_fn in label_metrics.items():
        scores[metric_name] = metric_fn(y_va, y_hat, pos_label=POS_LABEL)
    scores["ROC_AUC"] = get_auc(pipe, X_va, y_va)
    return scores

class LabelEncodedClassifier(ClassifierMixin, BaseEstimator):
    """Adapter for classifiers that only accept integer class labels.

    XGBClassifier rejects string targets ("Invalid classes inferred from
    unique values of `y`. Expected: [0 1], got ['Negative' 'Positive']").
    This wrapper encodes the labels to 0..n-1 before fitting a clone of
    ``estimator`` and decodes the predictions back, so the wrapped model
    behaves like every other classifier in ``models``.

    Note: a pipeline containing this class can only be un-pickled where the
    class definition is importable.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y):
        self.label_encoder_ = LabelEncoder().fit(y)
        # Exposed so callers can map predict_proba columns to labels.
        self.classes_ = self.label_encoder_.classes_
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X, self.label_encoder_.transform(y))
        return self

    def predict(self, X):
        encoded = np.asarray(self.estimator_.predict(X)).astype(int)
        return self.label_encoder_.inverse_transform(encoded)

    def predict_proba(self, X):
        # Columns follow the encoded order 0..n-1, i.e. the order of classes_.
        return self.estimator_.predict_proba(X)


models = {
    # Linear models
    "LogReg": LogisticRegression(max_iter=2000),
    "RidgeClassifier": RidgeClassifier(),
    "SGDClassifier": SGDClassifier(loss="log_loss", max_iter=2000, tol=1e-3, random_state=RANDOM_STATE),
    "Perceptron": Perceptron(max_iter=2000, tol=1e-3, random_state=RANDOM_STATE),

    # Distance-based
    "kNN": KNeighborsClassifier(n_neighbors=15),

    # SVM (random_state fixes the internal shuffling used by probability=True,
    # so the reported ROC-AUC is reproducible)
    "SVM_RBF": SVC(kernel="rbf", probability=True, random_state=RANDOM_STATE),

    # Naive Bayes
    "GaussianNB": GaussianNB(),
    "BernoulliNB": BernoulliNB(),

    # Trees and bagging ensembles
    "DecisionTree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(n_estimators=400, n_jobs=-1, random_state=RANDOM_STATE),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=400, n_jobs=-1, random_state=RANDOM_STATE),

    # Boosting (sklearn)
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "HistGradientBoosting": HistGradientBoostingClassifier(random_state=RANDOM_STATE),  # modern alternative

    # Discriminant analysis
    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),

    # Gaussian Process: computationally heavy on a training set of this size,
    # kept anyway. It is used as-is (no calibration wrapper is applied).
    "GaussianProcess": GaussianProcessClassifier(random_state=RANDOM_STATE),

    # Neural network (MLP)
    "MLP": MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=400, random_state=RANDOM_STATE),

    # External gradient-boosting libraries
    # XGBoost needs integer labels, hence the label-encoding adapter.
    "XGBoost": LabelEncodedClassifier(XGBClassifier(
        n_estimators=500, max_depth=5, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        eval_metric="logloss", random_state=RANDOM_STATE, n_jobs=-1
    )),
    # subsample_freq=1 is required for subsample=0.8 to take effect (row
    # subsampling is disabled while the frequency is 0); verbose=-1 silences
    # the per-fit [Info] log lines.
    "LightGBM": LGBMClassifier(
        n_estimators=800, learning_rate=0.05,
        subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
        random_state=RANDOM_STATE, verbose=-1
    ),
    "CatBoost": CatBoostClassifier(
        iterations=800, learning_rate=0.05, depth=6,
        loss_function="Logloss", verbose=False, random_seed=RANDOM_STATE
    ),
}

# Fitted pipelines by model name (only those that trained successfully) and
# one metrics row per successful model.
pipes = {}
results = []

for name, clf in models.items():
    # Pipeline stores its steps by reference, so each pipeline gets its own
    # copy of the preprocessor. Otherwise refitting any pipeline would also
    # change the preprocessing of all the others.
    pipe = Pipeline(steps=[("preprocess", clone(preprocess)), ("model", clf)])

    # Best-effort on purpose: one incompatible model must not stop the
    # comparison, but the reason is always printed.
    try:
        metrics = evaluate(pipe, X_train, y_train, X_val, y_val)
    except Exception as e:
        print(f"SKIP -> {name} | Error: {e}")
        continue

    pipes[name] = pipe
    results.append({"Model": name, **metrics})
    print(f"OK -> {name}")

# Explicit columns keep the ranking step valid even if every model failed
# (an empty frame without columns would make sort_values raise KeyError).
result_columns = ["Model", "Accuracy", "Precision", "Recall", "F1", "ROC_AUC"]
results_df = (
    pd.DataFrame(results, columns=result_columns)
    .sort_values(["F1", "ROC_AUC"], ascending=False)
)
display(results_df)


OK -> LogReg
OK -> RidgeClassifier
OK -> SGDClassifier
OK -> Perceptron
OK -> kNN
OK -> SVM_RBF
OK -> GaussianNB
OK -> BernoulliNB
OK -> DecisionTree
OK -> RandomForest
OK -> ExtraTrees
OK -> AdaBoost
OK -> GradientBoosting
OK -> HistGradientBoosting
OK -> LDA




OK -> QDA
OK -> GaussianProcess




OK -> MLP
SKIP -> XGBoost | Error: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got ['Negative' 'Positive']
[LightGBM] [Info] Number of positive: 5014, number of negative: 4986
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001324 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 902
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501400 -> initscore=0.005600
[LightGBM] [Info] Start training from score 0.005600




OK -> LightGBM
OK -> CatBoost


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC_AUC
11,AdaBoost,0.5026,0.503529,0.569206,0.534357,0.510691
12,GradientBoosting,0.5028,0.504046,0.521739,0.51274,0.500754
3,Perceptron,0.5086,0.509889,0.51416,0.512016,0.506939
17,MLP,0.5038,0.505058,0.51775,0.511326,0.506326
14,LDA,0.4918,0.493671,0.528919,0.510687,0.496034
1,RidgeClassifier,0.4918,0.493671,0.528919,0.510687,0.496029
0,LogReg,0.4918,0.493671,0.528919,0.510687,0.496025
5,SVM_RBF,0.4982,0.499617,0.520144,0.509674,0.497078
6,GaussianNB,0.4918,0.493502,0.514958,0.504002,0.497336
19,CatBoost,0.4992,0.500591,0.50698,0.503765,0.495886


## Elección del mejor modelo (por F1 y ROC-AUC) y prueba en TEST

In [None]:
# The ranking is sorted by F1 then ROC-AUC, so the first row is the winner.
if results_df.empty:
    raise RuntimeError("No model was evaluated successfully; nothing to select.")

best_model_name = results_df.iloc[0]["Model"]

print("Mejor modelo en Validation:", best_model_name)

# Final training set: Train + Val
X_trainval = pd.concat([X_train, X_val], axis=0)
y_trainval = pd.concat([y_train, y_val], axis=0)

# Refit an unfitted copy instead of the stored pipeline. Fitting
# pipes[best_model_name] in place would overwrite the validation-fitted model
# and also refit the preprocessor object it shares with the other pipelines.
best_pipe = clone(pipes[best_model_name])
best_pipe.fit(X_trainval, y_trainval)
test_pred = best_pipe.predict(X_test)

print("\n=== TEST ===")
print("Accuracy :", accuracy_score(y_test, test_pred))
print("Precision:", precision_score(y_test, test_pred, pos_label=POS_LABEL))
print("Recall   :", recall_score(y_test, test_pred, pos_label=POS_LABEL))
print("F1       :", f1_score(y_test, test_pred, pos_label=POS_LABEL))
print("ROC_AUC  :", get_auc(best_pipe, X_test, y_test))

# Rows/columns ordered as [positive, negative].
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, test_pred, labels=[POS_LABEL, "Negative"]))

print("\nClassification Report:")
print(classification_report(y_test, test_pred))


Mejor modelo en Validation: AdaBoost

=== TEST ===
Accuracy : 0.5004
Precision: 0.5030864197530864
Recall   : 0.32496012759170656
F1       : 0.3948643410852713
ROC_AUC  : 0.498489744534984

Confusion Matrix:
[[ 815 1693]
 [ 805 1687]]

Classification Report:
              precision    recall  f1-score   support

    Negative       0.50      0.68      0.57      2492
    Positive       0.50      0.32      0.39      2508

    accuracy                           0.50      5000
   macro avg       0.50      0.50      0.48      5000
weighted avg       0.50      0.50      0.48      5000



## Guardado del mejor pipeline (para reproducibilidad / evidencia)

In [None]:
import joblib

# Persist the refitted winning pipeline (preprocessing + model) to disk.
joblib.dump(value=best_pipe, filename="best_brain_tumor_classifier.pkl")

print("Guardado: best_brain_tumor_classifier.pkl")

Guardado: best_brain_tumor_classifier.pkl


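AdaBoost obtuvo el mayor F1 en validación (0.534) y por eso fue el modelo seleccionado; sin embargo, su desempeño en test (accuracy 0.500, F1 0.395, ROC-AUC 0.498) es equivalente al azar, al igual que el del resto de los modelos evaluados (lineales, de árboles, de boosting, etc.). Con las variables disponibles, ningún algoritmo logra predecir la variable MRI_Result de forma fiable en este conjunto de datos.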

Por lo tanto, en su estado actual el modelo no debe usarse para predicciones clínicas. Incluso con un desempeño mejor, solo podría servir como herramienta de apoyo diagnóstico, y debe considerarse como complemento y no reemplazo de la evaluación médica especializada.