In [25]:
import pandas as pd

# Cargar el archivo CSV
file_path = "car_evaluation.csv"
df = pd.read_csv(file_path)

# Mostrar las primeras filas
df

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good


In [26]:
# Definir nombres de columnas según el dataset original
column_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]

# Recargar el archivo con nombres de columna correctos
df = pd.read_csv(file_path, names=column_names)

# Mostrar las primeras filas corregidas
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [28]:
# Verificar valores nulos
print("Valores nulos por columna:")
print(df.isnull().sum())

# Verificar filas duplicadas
print("\nNúmero de filas duplicadas:")
print(df.duplicated().sum())

Valores nulos por columna:
buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

Número de filas duplicadas:
0


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Codificar variables categóricas a valores numéricos
df_encoded = df.apply(LabelEncoder().fit_transform)

# Separar características (X) y variable objetivo (y)
X = df_encoded.drop(columns=["safety"])
y = df_encoded["safety"]

# Dividir en conjunto de entrenamiento (80%) y prueba (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Verificar dimensiones de los conjuntos
X_train.shape, X_test.shape

((1382, 6), (346, 6))

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

# Binarizar las etiquetas para calcular AUC en clasificación multiclase
y_test_binarized = label_binarize(y_test, classes=[0, 1, 2])

# Modelo 1: Árbol de decisión sin hiperparámetros
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_train, y_train)
y_pred_proba1 = clf1.predict_proba(X_test)
auc1 = roc_auc_score(y_test_binarized, y_pred_proba1, multi_class="ovr")

# Modelo 2: Árbol de decisión con número máximo de hojas
clf2 = DecisionTreeClassifier(max_leaf_nodes=10, random_state=42)
clf2.fit(X_train, y_train)
y_pred_proba2 = clf2.predict_proba(X_test)
auc2 = roc_auc_score(y_test_binarized, y_pred_proba2, multi_class="ovr")

# Modelo 3: Árbol de decisión con número mínimo de muestras por hoja
clf3 = DecisionTreeClassifier(min_samples_leaf=10, random_state=42)
clf3.fit(X_train, y_train)
y_pred_proba3 = clf3.predict_proba(X_test)
auc3 = roc_auc_score(y_test_binarized, y_pred_proba3, multi_class="ovr")

auc1, auc2, auc3

(np.float64(0.4717867527275324),
 np.float64(0.736255843939502),
 np.float64(0.6818375173884919))