In [35]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import model_selection

In [29]:
data = arff.loadarff('breast.w.arff')
df = pd.DataFrame(data[0])
df.dropna(inplace=True)
df.replace(b'benign', 0, inplace=True)
df.replace(b'malignant', 1, inplace=True)

data = df.drop(["Class"],axis=1).values
target = df["Class"].values

# Training the decision tree models

In [40]:
fold = model_selection.KFold(n_splits=10, shuffle=True, random_state=9)
features_mean_scores = []
max_depth_mean_scores = []

for n_features in (1, 3, 5, 9):
    scores = []
    for train_filter, test_filter in fold.split(data):
        X_train, X_test, y_train, y_test = data[train_filter], data[test_filter], target[train_filter], target[test_filter]
        clf = DecisionTreeClassifier(max_features=n_features)
        # fit the classifier to the training data
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        scores.append(accuracy_score(y_test, predictions))

    mean_score = np.array(scores).mean()
    features_mean_scores.append(mean_score)
    print("Features =", n_features, "-", mean_score)

for max_depth in (1, 3, 5, 9):
    scores = []
    for train_filter, test_filter in fold.split(data):
        X_train, X_test, y_train, y_test = data[train_filter], data[test_filter], target[train_filter], target[test_filter]
        clf = DecisionTreeClassifier(max_depth=max_depth)
        # fit the classifier to the training data
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        scores.append(accuracy_score(y_test, predictions))

    mean_score = np.array(scores).mean()
    max_depth_mean_scores.append(mean_score)
    print("Max depth =", max_depth, "-", mean_score)

Features = 1 - 0.9545183290707587
Features = 3 - 0.9531116794543906
Features = 5 - 0.9575660699062235
Features = 9 - 0.9545609548167093
Max depth = 1 - 0.9165174765558397
Max depth = 3 - 0.9501705029838023
Max depth = 5 - 0.9574808184143222
Max depth = 9 - 0.9560315430520034


In [48]:
import plotly.express as px
fig = px.line(x=(1, 3, 5, 9), y=features_mean_scores, title="Average Accuracy per Number of Features")
fig.show()

fig = px.line(x=(1, 3, 5, 9), y=max_depth_mean_scores, title="Average Accuracy per Max depth bruh")
fig.show()