# Actividad de práctica parcial 2

- Utiliza el dataset `Brain Tumor`, modela con SVC y haz Cross-Validation con kernel 'linear'
- Modela con Optimización Bayesiana ().
- El método de Cross-Validation es K-Folds con $k=10$.
- Utiliza el AUC como métrico de Cross-Validation.
- Compara resultados.

In [2]:
# Librerias
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler

In [5]:
# Cargar datos
data = pd.read_csv('brain_tumor_dataset.csv')
data.head()

Unnamed: 0,Patient_ID,Age,Gender,Tumor_Type,Tumor_Size,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,Follow_Up_Required
0,1,73,Male,Malignant,5.375612,Temporal,Astrocytoma,III,Vision Issues,Seizures,Seizures,No,No,No,51.312579,0.111876,No,Positive,Yes
1,2,26,Male,Benign,4.847098,Parietal,Glioblastoma,II,Headache,Headache,Nausea,Yes,Yes,Yes,46.373273,2.165736,Yes,Positive,Yes
2,3,31,Male,Benign,5.588391,Parietal,Meningioma,I,Vision Issues,Headache,Seizures,No,No,No,47.072221,1.884228,No,Negative,No
3,4,29,Male,Malignant,1.4366,Temporal,Medulloblastoma,IV,Vision Issues,Seizures,Headache,Yes,No,Yes,51.853634,1.283342,Yes,Negative,No
4,5,54,Female,Benign,2.417506,Parietal,Glioblastoma,I,Headache,Headache,Seizures,No,No,Yes,54.708987,2.069477,No,Positive,Yes


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Patient_ID           20000 non-null  int64  
 1   Age                  20000 non-null  int64  
 2   Gender               20000 non-null  object 
 3   Tumor_Type           20000 non-null  object 
 4   Tumor_Size           20000 non-null  float64
 5   Location             20000 non-null  object 
 6   Histology            20000 non-null  object 
 7   Stage                20000 non-null  object 
 8   Symptom_1            20000 non-null  object 
 9   Symptom_2            20000 non-null  object 
 10  Symptom_3            20000 non-null  object 
 11  Radiation_Treatment  20000 non-null  object 
 12  Surgery_Performed    20000 non-null  object 
 13  Chemotherapy         20000 non-null  object 
 14  Survival_Rate        20000 non-null  float64
 15  Tumor_Growth_Rate    20000 non-null 

In [8]:
# Definir variables predictoras y objetivo
y = data['Tumor_Type']
X = data.drop(columns=['Tumor_Type'])

In [9]:
# Identificar variables numéricas y categóricas
num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()

# Preprocesamiento: One-Hot Encoding para categóricas y escalado para numéricas
numerical_transformer = StandardScaler()
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('num', numerical_transformer, num_features)
    ]
)

In [10]:
# Modelado con SVC y kernel 'linear'
SVC_model = SVC(kernel='linear', probability=True, random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocess),
    ('SVC', SVC_model)
])

In [11]:
# Cross-validation estratificada y AUC
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

auc = make_scorer(roc_auc_score, needs_proba=True, greater_is_better=True)

# Probar
scores = cross_val_score(pipeline, X, y, scoring=auc, cv=cv, n_jobs=-1)

print("AUC por fold:", scores)
print("AUC promedio:", scores.mean())



AUC por fold: [0.5114271  0.51744366 0.50099351 0.50531355 0.50830257 0.5
 0.50542555 0.50214952 0.49549096 0.5       ]
AUC promedio: 0.504654641891777


In [12]:
# Definir función objetivo
def obj(logC):
    C = 10**logC
    SVC_model = SVC(kernel='linear', probability=True, random_state=42, C=C)
    pipeline = Pipeline(steps=[
    ('preprocessor', preprocess),
    ('SVC', SVC_model)
])
    scores = cross_val_score(pipeline, X, y, scoring=auc, cv=cv, n_jobs=-1)
    return scores.mean()


In [None]:
#%pip install bayesian_optimization

Collecting bayesian_optimization
  Downloading bayesian_optimization-3.1.0-py3-none-any.whl.metadata (11 kB)
Downloading bayesian_optimization-3.1.0-py3-none-any.whl (36 kB)
Installing collected packages: bayesian_optimization
Successfully installed bayesian_optimization-3.1.0
Note: you may need to restart the kernel to use updated packages.
