In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.tree import plot_tree

In [None]:
# Read the train and test CSVs in one pass; both are comma-separated.
train, test = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in (
        "./data/feature_selected/train_gearbox_classif.csv",
        "./data/feature_selected/test_gearbox_classif.csv",
    )
)

In [None]:
# Names of the two target columns present in both CSVs; only the
# classification target is used as the label in this notebook.
reg_target_name, class_target_name = "RUL (Target)", "Failure (Target)"

# Columns removed from the feature matrices: both targets plus what are
# presumably identifier / leftover index columns (confirm against the CSVs).
drop_cols = [
    reg_target_name,
    class_target_name,
    "Turbine_ID",
    "Timestamp",
    "Unnamed: 0",
    "index_y",
]

# Features are everything that is left; labels are the classification target.
X_train, y_train = train.drop(columns=drop_cols), train[class_target_name]
X_test, y_test = test.drop(columns=drop_cols), test[class_target_name]

In [None]:
# Preview the first five feature rows as the cell's rich output.
X_train.head(n=5)

In [None]:
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide ``"rank_test_score"`` (compared element-wise against an
        integer rank, so it is expected to be a NumPy array),
        ``"mean_test_score"``, ``"std_test_score"`` and ``"params"``, all
        indexable by candidate position.
    n_top : int, default 3
        Ranks ``1 .. n_top`` are reported, in that order.

    Every candidate sharing a rank is printed, so a tie produces more than
    one entry for that rank.
    """
    for rank in range(1, n_top + 1):
        tied_positions = np.flatnonzero(results["rank_test_score"] == rank)
        for position in tied_positions:
            print("Model with rank: {0}".format(rank))
            mean_score = results["mean_test_score"][position]
            std_score = results["std_test_score"][position]
            print(
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    mean_score, std_score
                )
            )
            print("Parameters: {0}".format(results["params"][position]))
            print("")

In [None]:
# Exhaustive grid search over tree depth and minimum leaf size.
base_estimator = DecisionTreeClassifier(random_state=0)
param_grid = {
    "max_depth": [5, 10, 20],
    "min_samples_leaf": [1, 10, 50, 100],
}
# Score with a classification metric. The previous "neg_mean_squared_error" is a
# regression metric: it only works when the labels are numeric and it reports
# negative error values. For 0/1 labels it ranks candidates exactly like
# accuracy, so the selected model is unchanged in that case.
# NOTE(review): if failures are rare, "f1" or "balanced_accuracy" may be a
# better selection criterion - confirm against the class balance.
sh = GridSearchCV(base_estimator, param_grid, scoring="accuracy").fit(X_train, y_train)
clf = sh.best_estimator_
report(sh.cv_results_)

In [None]:
# Rotate the x tick labels, then draw one bar per feature column with the
# fitted tree's importance for that feature.
plt.xticks(rotation=90)
plt.bar(x=X_test.columns, height=clf.feature_importances_)

In [None]:
@interact_manual
def understand_decisions(column=X_test.columns, datapoints=100):
    """Scatter-plot the model's prediction while sweeping a single feature.

    One random row of ``X_test`` is used as the baseline. ``column`` is then
    replaced by ``datapoints`` evenly spaced values that start at the
    integer-truncated column minimum and advance in steps of
    ``(max - min) / datapoints``; every other feature keeps its baseline value.
    ``clf``'s prediction for each swept value is plotted against that value.

    Parameters
    ----------
    column : str
        Name of the feature in ``X_test`` to sweep (chosen through the widget).
    datapoints : int, default 100
        Number of values to evaluate. Values below 1 plot nothing.
    """
    if datapoints < 1:
        # The widget slider can reach zero or negative values; there is nothing
        # to sweep then, and zero would divide by zero below.
        print("datapoints must be at least 1, got {}".format(datapoints))
        return

    lower = int(X_test[column].min())
    upper = int(X_test[column].max())
    step = (upper - lower) / datapoints

    # Build the swept values once and reuse them for prediction and plotting.
    # Regenerating them with np.arange(lower, upper, step) can yield a different
    # number of points for a float step (and fails when lower == upper), which
    # breaks the scatter call.
    sweep_values = lower + np.arange(datapoints) * step

    # Repeat the baseline row once per swept value so the model is called once
    # on the whole batch instead of once per value.
    baseline = X_test.sample()
    sweep = pd.concat([baseline] * datapoints, ignore_index=True)
    sweep[column] = sweep_values

    predictions = clf.predict(sweep)
    plt.scatter(sweep_values, predictions)
    

In [None]:
def plot_decision_tree(model, feature_names, class_names):
    """Draw ``model`` with ``plot_tree`` and return what ``plot_tree`` returns.

    The drawing uses rounded, filled boxes, proportions instead of counts,
    two-digit precision and a font size of 10. The return value is handed
    back so callers can post-process the drawn nodes.
    """
    render_options = dict(
        rounded=True,
        proportion=True,
        precision=2,
        filled=True,
        fontsize=10,
    )
    return plot_tree(
        model,
        feature_names=feature_names,
        class_names=class_names,
        **render_options,
    )

In [None]:
def plot_decision_path_tree(model, X, class_names=None):
    """Plot ``model`` showing only the nodes visited when predicting ``X``.

    Parameters
    ----------
    model : fitted decision tree
        Must provide ``decision_path`` and be drawable by ``plot_decision_tree``.
    X : pandas.DataFrame or pandas.Series
        The sample(s) to trace. A Series is treated as one sample whose index
        holds the feature names.
    class_names : list of str or None, default None
        Class labels forwarded to the tree plot. ``None`` or an empty sequence
        means no class labels are drawn.
    """
    if isinstance(X, pd.Series):
        # decision_path expects 2-D input; turn the Series into a one-row frame
        # whose columns are the feature names.
        X = X.to_frame().T
    # Pass a plain list: some scikit-learn releases reject a pandas Index here.
    feature_names = list(X.columns)

    # An empty sequence cannot be indexed by class position inside plot_tree,
    # so treat it the same as "no class names".
    if class_names is not None and len(class_names) == 0:
        class_names = None

    plt.figure(figsize=(10, 10))

    # One drawn artist per tree node.
    annotations = plot_decision_tree(model, feature_names, class_names)

    # Node ids visited by X; a set keeps the membership test below cheap.
    visited = set(model.decision_path(X).indices.tolist())

    # Hide every node that is not on the decision path.
    # NOTE(review): this assumes plot_tree returns its artists in node-id order
    # - confirm if the tree is built with max_leaf_nodes set.
    for node_id, annotation in enumerate(annotations):
        if node_id not in visited:
            plt.setp(annotation, visible=False)

    plt.show()

In [None]:
@interact_manual
def plot_tree_paths(column=X_test.columns, datapoint=(1, 100, 1)):
    """Show the decision path for a random test row with one feature overridden.

    Parameters
    ----------
    column : str
        Name of the feature in ``X_test`` to override (chosen through the widget).
    datapoint : int
        Slider value from 1 to 100, read as a percentage of the way from the
        integer-truncated column minimum to the integer-truncated column maximum.
    """
    lower = int(X_test[column].min())
    upper = int(X_test[column].max())
    new_value = lower + (datapoint / 100) * (upper - lower)
    print("feature: {}, with value: {}".format(column, new_value))

    sample = X_test.sample()
    sample[column] = new_value

    # plot_tree looks class names up by class position, so it needs one name per
    # class of the fitted classifier. The previous single-entry list (the
    # regression target's name) is too short for a multi-class tree.
    class_labels = [str(label) for label in clf.classes_]
    plot_decision_path_tree(clf, sample, class_names=class_labels)