In [1]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import label_binarize, StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score, make_scorer, roc_curve, auc
from sklearn.pipeline import Pipeline
from skopt.space import Real, Categorical
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
import statsmodels.api as sm
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

In [2]:
df = pd.read_csv("brain_tumor_dataset.csv")
df.head()

Unnamed: 0,Patient_ID,Age,Gender,Tumor_Type,Tumor_Size,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,Follow_Up_Required
0,1,73,Male,Malignant,5.375612,Temporal,Astrocytoma,III,Vision Issues,Seizures,Seizures,No,No,No,51.312579,0.111876,No,Positive,Yes
1,2,26,Male,Benign,4.847098,Parietal,Glioblastoma,II,Headache,Headache,Nausea,Yes,Yes,Yes,46.373273,2.165736,Yes,Positive,Yes
2,3,31,Male,Benign,5.588391,Parietal,Meningioma,I,Vision Issues,Headache,Seizures,No,No,No,47.072221,1.884228,No,Negative,No
3,4,29,Male,Malignant,1.4366,Temporal,Medulloblastoma,IV,Vision Issues,Seizures,Headache,Yes,No,Yes,51.853634,1.283342,Yes,Negative,No
4,5,54,Female,Benign,2.417506,Parietal,Glioblastoma,I,Headache,Headache,Seizures,No,No,Yes,54.708987,2.069477,No,Positive,Yes


In [3]:
df2 = df.dropna().drop_duplicates()

In [4]:
df3 = pd.get_dummies(df2, columns=["Gender", "Location", "Histology", "Stage", "Symptom_1", "Symptom_2", "Symptom_3", "Radiation_Treatment", "Surgery_Performed", "Chemotherapy", "Family_History", "MRI_Result", "Follow_Up_Required"], drop_first=True)

In [5]:
X = df3.drop(columns=["Tumor_Type"], axis=1)
y = df3["Tumor_Type"].map({'Malignant': 1, 'Benign': 0})

In [6]:
x = sm.add_constant(X)

ols = sm.OLS(y, X)
results = ols.fit()
results.summary()

0,1,2,3
Dep. Variable:,Tumor_Type,R-squared (uncentered):,0.494
Model:,OLS,Adj. R-squared (uncentered):,0.493
Method:,Least Squares,F-statistic:,650.3
Date:,"Mon, 27 Oct 2025",Prob (F-statistic):,0.0
Time:,20:47:57,Log-Likelihood:,-14662.0
No. Observations:,20000,AIC:,29380.0
Df Residuals:,19970,BIC:,29620.0
Df Model:,30,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Patient_ID,2.422e-06,6.02e-07,4.021,0.000,1.24e-06,3.6e-06
Age,0.0016,0.000,8.279,0.000,0.001,0.002
Tumor_Size,0.0033,0.001,2.589,0.010,0.001,0.006
Survival_Rate,0.0019,0.000,10.601,0.000,0.002,0.002
Tumor_Growth_Rate,0.0177,0.004,4.280,0.000,0.010,0.026
Gender_Male,0.0118,0.007,1.670,0.095,-0.002,0.026
Location_Occipital,0.0281,0.010,2.814,0.005,0.009,0.048
Location_Parietal,0.0392,0.010,3.943,0.000,0.020,0.059
Location_Temporal,0.0495,0.010,5.005,0.000,0.030,0.069

0,1,2,3
Omnibus:,71303.923,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3108.29
Skew:,-0.006,Prob(JB):,0.0
Kurtosis:,1.069,Cond. No.,46800.0


okay voy a eliminar varias considerando este análisis OLS

In [7]:
X = df3[["Patient_ID", "Age", "Tumor_Size", "Survival_Rate", "Tumor_Growth_Rate", "Location_Occipital", "Location_Parietal", "Location_Temporal", "Histology_Glioblastoma", "Histology_Medulloblastoma", "Histology_Meningioma", "Stage_IV", "Symptom_1_Nausea", "Symptom_3_Vision Issues", "Radiation_Treatment_Yes", "Chemotherapy_Yes"]]
y = df3["Tumor_Type"].map({'Malignant': 1, 'Benign': 0})

In [8]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)

In [9]:
scoring = "roc_auc_ovr"
kernels = ['linear']
svc_scores = {}

In [10]:
from sklearn.model_selection import train_test_split

X_small, _, y_small, _ = train_test_split(X, y, train_size=0.2, stratify=y, random_state=42)

In [11]:
np.random.seed(40)

# --- 1. Función objetivo ---
def objective(C):
    base_model = LinearSVC(C=C, random_state=42, max_iter=5000)
    # Para poder obtener probabilidades (equivalente a probability=True):
    model = make_pipeline(StandardScaler(), CalibratedClassifierCV(base_model, cv=3))
    scores = cross_val_score(model, X_small, y_small, cv=kfold, scoring=scoring, n_jobs=-1)
    return scores.mean()

# --- 2a. Puntos iniciales ---
X_params = np.random.uniform(0.01, 100, size=(3, 1))  # solo C
y_auc = np.array([objective(p[0]) for p in X_params]).reshape(-1, 1)

# --- 2b. Gaussian Process ---
kernel = 1.0 * RBF(length_scale=1.0)
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=1e-6)

x1 = np.linspace(0.01, 100, 30).reshape(-1, 1)  # rango de C

# --- 2c. Iteraciones de optimización bayesiana ---
for i in range(7):
    gp.fit(X_params, y_auc)
    y_pred = gp.predict(x1)
    idx_max = np.argmax(y_pred)
    best_C = x1[idx_max][0]
    # Evaluar AUC real
    y_real = objective(best_C)
    # Agregar punto al dataset
    X_params = np.vstack([X_params, [best_C]])
    y_auc = np.vstack([y_auc, [[y_real]]])

# --- 3. Mejor punto encontrado ---
best_idx = np.argmax(y_auc)
C_best = X_params[best_idx][0]
AUC_best = y_auc[best_idx][0]
print(f"Mejor C: {C_best:.4f}, AUC: {AUC_best:.4f}")

Mejor C: 100.0000, AUC: 0.5162


Esto fue lo máximo que logré obtener. Tuve que hacer algunas modificaciones en el código, como reducir el for, disminuir el rango de C a 30 valores, y modifiqué el uso del kernel, en lugar de SVC(kernel='linear') a LinearSVC. 