In [15]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [9]:
train_df = pd.read_excel('../Data/train.xlsx')
validation_df = pd.read_excel('../Data/validation.xlsx')
test_df = pd.read_excel('../Data/test.xlsx')

In [10]:

y_train = train_df['Treatment_Type']
X_train = train_df.drop(columns=['Treatment_Type'])
y_val = validation_df['Treatment_Type']
X_val = validation_df.drop(columns=['Treatment_Type'])

for col in X_train.columns:
    if pd.api.types.is_datetime64_any_dtype(X_train[col]):
        X_train = X_train.drop(columns=[col])
        X_val = X_val.drop(columns=[col])


In [11]:
depths = [5, 10]
criterions = ['gini', 'entropy']
min_samples = [2, 5]

results = []

In [12]:

# Entrenar modelos con distintas combinaciones de hiperparámetros
for depth in depths:
    for crit in criterions:
        for min_split in min_samples:
            model = DecisionTreeClassifier(max_depth=depth, criterion=crit, min_samples_split=min_split, random_state=42)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            acc = accuracy_score(y_val, y_pred)
            results.append({
                'max_depth': depth,
                'criterion': crit,
                'min_samples_split': min_split,
                'accuracy': acc
            })

In [13]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='accuracy', ascending=False).reset_index(drop=True)
print("Resultados del experimento con Árbol de Decisión:")
print(results_df)

Resultados del experimento con Árbol de Decisión:
   max_depth criterion  min_samples_split  accuracy
0          5   entropy                  5  0.245625
1          5   entropy                  2  0.245625
2          5      gini                  5  0.240000
3          5      gini                  2  0.239375
4         10      gini                  5  0.239375
5         10   entropy                  5  0.239375
6         10      gini                  2  0.238125
7         10   entropy                  2  0.232500


In [14]:
print("\nMejor configuración:")
print(results_df.iloc[0])



Mejor configuración:
max_depth                   5
criterion             entropy
min_samples_split           5
accuracy             0.245625
Name: 0, dtype: object


In [16]:
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])
X_test = test_df.drop(columns=["Treatment_Type"])
y_test = test_df["Treatment_Type"]

In [17]:
for col in X_test.columns:
    if pd.api.types.is_datetime64_any_dtype(X_test[col]):
        X_test = X_test.drop(columns=[col])
        if col in X_full.columns:
            X_full = X_full.drop(columns=[col])

In [18]:
imputer = SimpleImputer(strategy='mean')
X_full = pd.DataFrame(imputer.fit_transform(X_full), columns=X_full.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [19]:
best_tree = DecisionTreeClassifier(max_depth=5, criterion="gini", min_samples_split=2, random_state=42)
best_tree.fit(X_full, y_full)
y_pred = best_tree.predict(X_test)

In [20]:
print("Accuracy (Árbol de Decisión) en test:", accuracy_score(y_test, y_pred))
print("Reporte de clasificación:\n", classification_report(y_test, y_pred))

Accuracy (Árbol de Decisión) en test: 0.2425
Reporte de clasificación:
                precision    recall  f1-score   support

 Chemotherapy       0.12      0.01      0.02       499
Immunotherapy       0.24      0.11      0.15       511
    Radiation       0.24      0.69      0.36       500
      Surgery       0.25      0.17      0.20       490

     accuracy                           0.24      2000
    macro avg       0.22      0.24      0.18      2000
 weighted avg       0.22      0.24      0.18      2000

