In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the car dataset
data = np.genfromtxt('car.data', delimiter=',', dtype=str)

In [3]:
# Extract features and labels
X = data[:, :-1]
y = data[:, -1]

# Encode categorical features
label_encoders = {}
for i in range(X.shape[1]):
    label_encoders[i] = LabelEncoder()
    X[:, i] = label_encoders[i].fit_transform(X[:, i])

# Define class labels and attribute values
class_labels = ['unacc', 'acc', 'good', 'vgood']

In [4]:
# Function to split data into training and testing sets
def split_data(X, y, test_size):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y)
    return X_train, X_test, y_train, y_test

In [5]:

# Function to construct decision tree and evaluate performance
def evaluate_tree(X_train, X_test, y_train, y_test, criterian):
    # Construct decision tree
    tree = DecisionTreeClassifier(criterion=criterian)
    tree.fit(X_train, y_train)
    
    # Predict labels for test set
    y_pred = tree.predict(X_test)
    
    # Calculate confusion matrix and F-score
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    f_score = f1_score(y_test, y_pred, average='weighted')
    
    # Calculate accuracy
    accuracy = np.sum(np.diag(cm)) / np.sum(cm)
    
    return accuracy, cm, f_score

In [6]:
# Define parameters
test_size = 0.4
X_train, X_test, y_train, y_test = split_data(X, y, test_size)
# Perform experiment with entropy
accuacy, cm, f_score = evaluate_tree(X_train, X_test, y_train, y_test, 'entropy')
print("Accuracy:", accuacy)
print("Confusion Matrix:\n", cm)
print("F1 score:", f_score)

Accuracy: 0.9797687861271677
Confusion Matrix:
 [[481   3   0   0]
 [  1 147   5   1]
 [  0   4  24   0]
 [  0   0   0  26]]
F1 score: 0.9798905210172479


In [7]:
# Function to repeat experiment multiple times
def repeat_experiment(X, y, test_size, num_repeats, criterian):
    accuracies = []
    for _ in range(num_repeats):
        X_train, X_test, y_train, y_test = split_data(X, y, test_size)
        accuracy, _, _ = evaluate_tree(X_train, X_test, y_train, y_test, criterian)
        accuracies.append(accuracy)
    avg_accuracy = np.mean(accuracies)
    return avg_accuracy

In [8]:
# Define parameters
test_size = 0.4
num_repeats = 20

# Perform experiment with entropy
avg_accuracy = repeat_experiment(X, y, test_size, num_repeats, 'entropy')
print("Average accuracy with entropy:", avg_accuracy)

Average accuracy with entropy: 0.9704479768786125


In [9]:
# Perform experiment with gini
accuacy, cm, f_score = evaluate_tree(X_train, X_test, y_train, y_test, 'gini')
print("Accuracy with gini:", accuacy)
print("Confusion Matrix with gini:\n", cm)
print("F1 score with gini:", f_score)

Accuracy with gini: 0.9638728323699421
Confusion Matrix with gini:
 [[474   8   2   0]
 [  6 142   5   1]
 [  0   3  25   0]
 [  0   0   0  26]]
F1 score with gini: 0.9642660592362181


In [10]:
avg_accuracy_gini = repeat_experiment(X, y, test_size, num_repeats, 'gini')
print("Average accuracy with Gini index:", avg_accuracy_gini)

Average accuracy with Gini index: 0.9696531791907512


In [11]:
# Define parameters
test_size = 0.3
num_repeats = 20

# Perform experiment with entropy
accuacy, cm, f_score = evaluate_tree(X_train, X_test, y_train, y_test, 'entropy')
print("Accuracy:", accuacy)
print("Confusion Matrix:\n", cm)
print("F1 score:", f_score)
avg_accuracy = repeat_experiment(X, y, test_size, num_repeats, 'entropy')
print("Average accuracy with entropy:", avg_accuracy)

# Perform experiment with gini
accuacy, cm, f_score = evaluate_tree(X_train, X_test, y_train, y_test, 'gini')
print("Accuracy with gini:", accuacy)
print("Confusion Matrix with gini:\n", cm)
print("F1 score with gini:", f_score)
avg_accuracy_gini = repeat_experiment(X, y, test_size, num_repeats, 'gini')
print("Average accuracy with Gini index:", avg_accuracy_gini)

Accuracy: 0.9710982658959537
Confusion Matrix:
 [[479   5   0   0]
 [  3 144   6   1]
 [  0   5  23   0]
 [  0   0   0  26]]
F1 score: 0.9712393693884904
Average accuracy with entropy: 0.9766859344894028
Accuracy with gini: 0.9638728323699421
Confusion Matrix with gini:
 [[474   8   2   0]
 [  6 142   5   1]
 [  0   3  25   0]
 [  0   0   0  26]]
F1 score with gini: 0.9642660592362181
Average accuracy with Gini index: 0.9712909441233141


In [12]:
# Define parameters
test_size = 0.2
num_repeats = 20

# Perform experiment with entropy
accuacy, cm, f_score = evaluate_tree(X_train, X_test, y_train, y_test, 'entropy')
print("Accuracy:", accuacy)
print("Confusion Matrix:\n", cm)
print("F1 score:", f_score)
avg_accuracy = repeat_experiment(X, y, test_size, num_repeats, 'entropy')
print("Average accuracy with entropy:", avg_accuracy)

# Perform experiment with gini
accuacy, cm, f_score = evaluate_tree(X_train, X_test, y_train, y_test, 'gini')
print("Accuracy with gini:", accuacy)
print("Confusion Matrix with gini:\n", cm)
print("F1 score with gini:", f_score)
avg_accuracy_gini = repeat_experiment(X, y, test_size, num_repeats, 'gini')
print("Average accuracy with Gini index:", avg_accuracy_gini)

Accuracy: 0.9739884393063584
Confusion Matrix:
 [[479   5   0   0]
 [  3 145   5   1]
 [  0   4  24   0]
 [  0   0   0  26]]
F1 score: 0.9741041904005581
Average accuracy with entropy: 0.9812138728323699
Accuracy with gini: 0.9696531791907514
Confusion Matrix with gini:
 [[476   6   2   0]
 [  4 144   5   1]
 [  0   3  25   0]
 [  0   0   0  26]]
F1 score with gini: 0.9700678127234541
Average accuracy with Gini index: 0.9777456647398843
