## **EDA**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    LabelEncoder,
    OrdinalEncoder,
)
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

In [None]:
# Location of the raw brain-tumour CSV.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# will not run elsewhere until it points at a local copy of the file.
csv_path = r"C:\Users\dayan\Downloads\brain_tumor_dataset.csv"

# Load the raw dataset and preview the first rows.
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Patient_ID,Age,Gender,Tumor_Type,Tumor_Size,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,Follow_Up_Required
0,1,73,Male,Malignant,5.375612,Temporal,Astrocytoma,III,Vision Issues,Seizures,Seizures,No,No,No,51.312579,0.111876,No,Positive,Yes
1,2,26,Male,Benign,4.847098,Parietal,Glioblastoma,II,Headache,Headache,Nausea,Yes,Yes,Yes,46.373273,2.165736,Yes,Positive,Yes
2,3,31,Male,Benign,5.588391,Parietal,Meningioma,I,Vision Issues,Headache,Seizures,No,No,No,47.072221,1.884228,No,Negative,No
3,4,29,Male,Malignant,1.4366,Temporal,Medulloblastoma,IV,Vision Issues,Seizures,Headache,Yes,No,Yes,51.853634,1.283342,Yes,Negative,No
4,5,54,Female,Benign,2.417506,Parietal,Glioblastoma,I,Headache,Headache,Seizures,No,No,Yes,54.708987,2.069477,No,Positive,Yes


In [None]:
# Remove rows with missing values first, then exact duplicate rows.
data = data.dropna().drop_duplicates()

# Target column, and the feature matrix without the identifier and the target.
y = data["Tumor_Type"]
X = data.drop(columns=["Patient_ID", "Tumor_Type"])

# Encode the 'Malignant'/'Benign' labels as integer class codes.
le = LabelEncoder()
y_processed = le.fit_transform(y)

In [None]:
# Encode the 'Malignant'/'Benign' labels as integer class codes.
# NOTE(review): this repeats the encoding performed at the end of the previous
# cell and produces the same `le` / `y_processed`; the cell looks redundant.
le = LabelEncoder()
y_processed = le.fit_transform(y)

In [None]:
# Continuous numeric columns.
numeric_features = ["Age", "Tumor_Size", "Survival_Rate", "Tumor_Growth_Rate"]

# Nominal categorical columns (several categories, no inherent order).
categorical_features = ["Location", "Histology"] + [
    f"Symptom_{slot}" for slot in (1, 2, 3)
]

# Two-valued categorical columns (Yes/No, Male/Female, Positive/Negative).
binary_features = [
    "Gender",
    "Radiation_Treatment", "Surgery_Performed", "Chemotherapy",
    "Family_History",
    "MRI_Result",
    "Follow_Up_Required",
]

# Ordinal categorical column: 'Stage' has a meaningful order, listed here from
# lowest to highest so the encoder can map it to increasing integers.
stage_order = ["I", "II", "III", "IV"]
ordinal_features = ["Stage"]

In [None]:

# Numeric columns: standardise with StandardScaler.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Multi-class categorical columns: one indicator column per category;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Binary columns: one-hot encoding with drop="if_binary", so a two-category
# column yields a single indicator column instead of two.
binary_transformer = Pipeline(
    steps=[
        (
            "onehot",
            OneHotEncoder(
                drop="if_binary",
                handle_unknown="ignore",
                sparse_output=False,
            ),
        ),
    ]
)

# Ordinal column: map 'Stage' to integers following `stage_order`;
# values outside that list are encoded as -1.
ordinal_transformer = Pipeline(
    steps=[
        (
            "ordinal",
            OrdinalEncoder(
                categories=[stage_order],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
    ]
)

# Route each column group through its own pipeline in a single ColumnTransformer.
# Columns not listed in any group are dropped (ColumnTransformer's default).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("bin", binary_transformer, binary_features),
        ("ord", ordinal_transformer, ordinal_features),
    ],
)

In [None]:
# Fit the preprocessing on the full feature matrix and recover the names of the
# generated columns.
X_processed = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names_out()

# Build the DataFrame of processed features and append the encoded target.
final_cleaned_df = pd.DataFrame(X_processed, columns=feature_names)
final_cleaned_df['Target_Encoded'] = y_processed

# `output_csv` was used here without being defined in any earlier cell, which
# raises NameError on a fresh kernel. Define the destination explicitly
# (relative to the notebook's working directory); adjust the name as needed.
output_csv = "brain_tumor_dataset_processed.csv"
final_cleaned_df.to_csv(output_csv, index=False)


In [None]:
def caja_negra(C_value):
    """Black-box objective: mean 10-fold cross-validated ROC AUC of a linear SVC.

    The preprocessing (`preprocessor`) and the classifier are wrapped in a single
    Pipeline and cross-validated on the raw feature frame `X`, so the scaler and
    the encoders are re-fitted on the training part of every fold. Scoring the
    pre-transformed matrix instead would let statistics computed on the
    validation rows leak into training.

    Parameters
    ----------
    C_value : float
        Regularisation parameter passed to ``SVC(C=...)``.

    Returns
    -------
    float
        Mean ROC AUC over the folds. ``0.0`` is returned when
        ``C_value <= 0.0001``, when cross-validation raises ``ValueError``,
        or when the mean score is not finite.
    """
    # Reject (near-)zero or negative C before building any model.
    if C_value <= 0.0001:
        return 0.0

    # `probability=True` is not needed: the 'roc_auc' scorer ranks samples with
    # decision_function, and enabling probabilities only adds an extra internal
    # calibration fit per model.
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('svc', SVC(kernel='linear', C=C_value, random_state=10)),
    ])

    try:
        # cross_val_score clones the estimator for every fold, so the
        # module-level `preprocessor` is not modified by this call.
        scores = cross_val_score(model, X, y_processed, scoring='roc_auc', cv=10)
        meanscore = np.mean(scores)
    except ValueError as error:
        # Keep the best-effort behaviour (score 0.0) but make the failure visible.
        print("  Aviso: la validación cruzada falló para C=" + str(C_value) + " (" + str(error) + ")")
        meanscore = 0.0

    # Failed folds are scored as NaN by default; a NaN target would break the
    # Gaussian-process fit downstream, so treat it like any other failure.
    if not np.isfinite(meanscore):
        meanscore = 0.0

    print("  Probando C=" + str(round(C_value, 4)) + " -> AUC: " + str(round(meanscore, 4)))

    return meanscore

# Search bounds for 'C' (a float between 0.01 and 100).
bounds = np.array([[0.01, 100.0]])

# Seed NumPy's global RNG so the random initial points (and every later
# np.random draw in this notebook) are reproducible after Restart & Run All.
np.random.seed(42)

# Number of random evaluations used to initialise the surrogate model.
n_initial = 3
X_train = []
y_train = []

print("\n" + "Evaluando puntos iniciales (aleatorios)")
for _ in range(n_initial):
    # Draw C uniformly inside the bounds and score it with the black box.
    c_sample = np.random.uniform(bounds[0,0], bounds[0,1])
    score = caja_negra(c_sample)

    # Each observation is stored as a one-element row so X_train ends up 2-D.
    X_train.append([c_sample])
    y_train.append(score)

X_train = np.array(X_train)
y_train = np.array(y_train)


# Configure the Gaussian-process (GP) surrogate.
# A bare RBF kernel implies a zero-mean, unit-variance prior, while the targets
# are AUC scores; normalize_y=True centres and scales the targets before
# fitting so the predicted mean/std are on a scale comparable to the data.
kernel = RBF()
gp = GaussianProcessRegressor(kernel=kernel, normalize_y=True, random_state=42)

# Acquisition function: Upper Confidence Bound (UCB).
def upper_confidence_bound(mu, std, kappa=1.5):
    """Return the UCB score ``mu + kappa * std``.

    Parameters
    ----------
    mu : float or array-like
        Predicted mean(s).
    std : float or array-like
        Predicted standard deviation(s).
    kappa : float, default 1.5
        Weight of the uncertainty term; larger values favour exploration.

    Returns
    -------
    Same type/shape as ``mu + kappa * std``.
    """
    exploration_bonus = kappa * std
    return mu + exploration_bonus

# Number of additional, surrogate-guided evaluations.
n_iterations = 10

print("\nIniciando Optimización Bayesiana")

for i in range(n_iterations):
    print(f"\nIteracion {i + 1}/{n_iterations}")

    # 1. Refit the GP on every observation collected so far.
    gp.fit(X_train, y_train)

    # 2. Draw 1000 random candidate values of C inside the bounds and turn
    #    them into a column vector, as the GP expects 2-D input.
    candidates_c = np.random.uniform(low=bounds[0, 0], high=bounds[0, 1], size=1000)
    X_candidates = candidates_c[:, np.newaxis]

    # 3. Query the surrogate for its predicted mean and standard deviation.
    mu, std = gp.predict(X_candidates, return_std=True)

    # 4. Score every candidate with the UCB acquisition function.
    ucb_scores = upper_confidence_bound(mu, std, kappa=1.5)

    # 5. Select the candidate with the highest acquisition value.
    next_point_index = ucb_scores.argmax()
    next_x = X_candidates[next_point_index]

    # 6. Evaluate the black box at the selected C.
    next_y = caja_negra(next_x[0])

    # 7. Append the new observation to the training set.
    X_train = np.concatenate([X_train, next_x.reshape(1, -1)], axis=0)
    y_train = np.concatenate([y_train, [next_y]])


# Locate the evaluated point with the highest mean AUC.
best_index = y_train.argmax()
best_auc = y_train[best_index]
best_hyperparam_C = X_train[best_index, 0]

print("\nOptimización Terminada")

# Round to four decimals for display only.
best_auc_formatted = round(best_auc, 4)
best_C_formatted = round(best_hyperparam_C, 4)

print("Mejor AUC encontrado:", best_auc_formatted)
print("Mejor Hiperparámetro 'C':", best_C_formatted)


--- Evaluando puntos iniciales (aleatorios) ---
