In [272]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import model_selection
import plotly.graph_objects as go
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [273]:
# Load the ARFF file into a DataFrame and drop every row that has a missing
# value. The raw ARFF tuple gets its own name so that `data` only ever refers
# to the feature matrix.
raw_records, _arff_meta = arff.loadarff('breast.w.arff')
df = pd.DataFrame(raw_records).dropna()

# Encode the class label explicitly: benign -> 0, malignant -> 1.
# loadarff yields nominal values as bytes, hence the bytes keys.
# Mapping only the "Class" column and casting to int avoids relying on
# DataFrame.replace silently downcasting an object column, and guarantees
# that `target` is an integer array.
CLASS_CODES = {b'benign': 0, b'malignant': 1}
class_codes = df["Class"].map(CLASS_CODES)
if class_codes.isna().any():
    # Fail loudly instead of handing a half-encoded label column to the models.
    unexpected = df.loc[class_codes.isna(), "Class"].unique().tolist()
    raise ValueError(f"Unexpected values in 'Class' column: {unexpected}")
df["Class"] = class_codes.astype(int)

# Feature matrix (every column except the label) and label vector.
data = df.drop(columns=["Class"]).values
target = df["Class"].values

# Training the decision tree models

In [274]:
SEED = 9  # shared by the fold shuffling, the trees and the feature scoring
N_FEATURES_GRID = (1, 3, 5, 9)  # values of k tried with SelectKBest
MAX_DEPTH_GRID = (1, 3, 5, 9)   # values of max_depth tried on the tree

fold = model_selection.KFold(n_splits=10, shuffle=True, random_state=SEED)


def _seeded_mutual_info(X, y):
    """Score features with mutual information using a fixed seed.

    `mutual_info_classif` takes a `random_state`; pinning it keeps the
    feature ranking (and therefore the selected columns) identical between
    runs, like every other seeded component in this cell.
    """
    return mutual_info_classif(X, y, random_state=SEED)


def mean_cv_accuracy(max_depth=None, n_features=None):
    """Cross-validate a decision tree over the splits produced by `fold`.

    Parameters
    ----------
    max_depth : int or None
        Passed to `DecisionTreeClassifier`; None is that parameter's default.
    n_features : int or None
        When given, the `n_features` best columns according to mutual
        information are selected before training. When None, all columns of
        `data` are used.

    Returns
    -------
    tuple
        (mean test accuracy, mean train accuracy) across the folds.
    """
    test_scores, train_scores = [], []
    for train_idx, test_idx in fold.split(data):
        X_train, X_test = data[train_idx], data[test_idx]
        y_train, y_test = target[train_idx], target[test_idx]

        if n_features is not None:
            # The selector is fitted on the training fold only, then applied
            # to both folds.
            selector = SelectKBest(_seeded_mutual_info, k=n_features).fit(X_train, y_train)
            X_train, X_test = selector.transform(X_train), selector.transform(X_test)

        clf = DecisionTreeClassifier(max_depth=max_depth, random_state=SEED)
        clf.fit(X_train, y_train)
        test_scores.append(accuracy_score(y_test, clf.predict(X_test)))
        train_scores.append(accuracy_score(y_train, clf.predict(X_train)))

    return np.mean(test_scores), np.mean(train_scores)


# In both result pairs, index 0 collects the mean test accuracies and
# index 1 the mean train accuracies, one entry per grid value.
features_mean_scores = [], []
max_depth_mean_scores = [], []

# For every grid value two lines are printed with the same label:
# the first is the test accuracy, the second the train accuracy.
for n_features in N_FEATURES_GRID:
    test_mean, train_mean = mean_cv_accuracy(n_features=n_features)
    features_mean_scores[0].append(test_mean)
    features_mean_scores[1].append(train_mean)
    print("Features =", n_features, "-", test_mean)
    print("Features =", n_features, "-", train_mean)

for max_depth in MAX_DEPTH_GRID:
    test_mean, train_mean = mean_cv_accuracy(max_depth=max_depth)
    max_depth_mean_scores[0].append(test_mean)
    max_depth_mean_scores[1].append(train_mean)
    print("Max depth =", max_depth, "-", test_mean)
    print("Max depth =", max_depth, "-", train_mean)

Features = 1 - 0.9238704177323104
Features = 1 - 0.9284205926749822
Features = 3 - 0.9458866155157715
Features = 3 - 0.9889380577844866
Features = 5 - 0.9546248934356351
Features = 5 - 1.0
Features = 9 - 0.9560741687979538
Features = 9 - 1.0
Max depth = 1 - 0.9165174765558397
Max depth = 1 - 0.9292333359815684
Max depth = 3 - 0.9486999147485081
Max depth = 3 - 0.9653481104843621
Max depth = 5 - 0.9575021312872976
Max depth = 5 - 0.98568337702921
Max depth = 9 - 0.951705029838022
Max depth = 9 - 0.9996745319244724


In [275]:
def plot_mean_scores(x_values, mean_scores, title, x_label):
    """Build a line chart comparing mean test and train accuracy.

    Parameters
    ----------
    x_values : sequence
        Hyper-parameter values placed on the x axis.
    mean_scores : pair of sequences
        `mean_scores[0]` is drawn as the test curve and `mean_scores[1]`
        as the train curve.
    title : str
        Figure title.
    x_label : str
        Title of the x axis.

    Returns
    -------
    plotly.graph_objects.Figure
        The configured figure; the caller decides when to show it.
    """
    fig = go.Figure()
    fig.add_scatter(x=x_values, y=mean_scores[0], name="Dados de teste")
    fig.add_scatter(x=x_values, y=mean_scores[1], name="Dados de treino")
    fig.update_layout(
        width=500,
        height=400,
        title_text=title,
        # Axis titles let the figure be read without the surrounding code.
        xaxis_title=x_label,
        yaxis_title="Eficácia média",
    )
    return fig


# The x values must match the grids evaluated in the training cell.
fig = plot_mean_scores(
    (1, 3, 5, 9),
    features_mean_scores,
    "Eficácia média por número de features selecionado",
    "Número de features selecionado",
)
fig.show()

fig = plot_mean_scores(
    (1, 3, 5, 9),
    max_depth_mean_scores,
    "Eficácia média por profundidade máxima da árvore",
    "Profundidade máxima da árvore",
)
fig.show()