In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import roc_curve,auc

import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('../data/haberman_csv.csv')

In [3]:
df.columns

Index(['Age_of_patient_at_time_of_operation', 'Patients_year_of_operation',
       'Number_of_positive_axillary_nodes_detected', 'Survival_status'],
      dtype='object')

In [4]:
df=df.drop(columns=['Patients_year_of_operation'])

In [5]:
df.shape

(306, 3)

In [6]:
df=df.rename(columns={'Age_of_patient_at_time_of_operation':'age', 'Number_of_positive_axillary_nodes_detected':'detections'})

In [7]:
data=df.values
X=data[:,:2]
Y=data[:,2]

In [8]:
X_train,x_test,Y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=9)

In [9]:
tree=DecisionTreeClassifier(random_state=9)

tree_params={'max_depth':[3,4,5,6],'max_features':[1,2]}

tree_grid=GridSearchCV(tree,tree_params,scoring='accuracy',n_jobs=-1,cv=10,verbose=2)

tree_grid.fit(X_train,Y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    1.9s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=9,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [3, 4, 5, 6], 'max_features': [1, 2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_

In [10]:
import pydotplus
from sklearn.tree import export_graphviz

def draw_tree(tree,features,location):
    tree_string=export_graphviz(tree,feature_names=features,filled=True,out_file=None)
    graph=pydotplus.graph_from_dot_data(tree_string)
    graph.write_png(location)

In [11]:
draw_tree(tree_grid.best_estimator_,['age', 'detections'],\
          location='../imgs/tree_grid1.png')

<img src='../imgs/tree_grid1.png'>

In the above Decision Tree representation that is generated, blue colored leaves are for minority class(it could change on other representations).

values parameter represents number of samples of both classes.
Value=[0,4] meaning 0 sample from 1st class and 4 samples from 2nd class.

Blue colore leaves only contains samples from minority class. So, if we can extract rules that made these pure minority class regions in decision tree which i beleieve represents decsion bounderies in N-dimensional space also, then we have samples that are relatively represent pure regions as there are no samples from other class

<img src='../imgs/tree_grid1 - paths.png'>

In the above pictures red circled leaves contains samples from minority class and do not contain any sample from majority class. So, if we extract rules following the green line and replicate samples within those rules I believe it will fall in the same pure regions.

In [12]:
from sklearn.tree.export import export_text

Below  a sklearn provided rule extraction process is done.

In [13]:
rules=export_text(tree_grid.best_estimator_,feature_names=['age', 'detections'])

In [14]:
print(rules)

|--- detections <= 4.50
|   |--- age <= 69.50
|   |   |--- age <= 42.50
|   |   |   |--- detections <= 0.50
|   |   |   |   |--- age <= 35.50
|   |   |   |   |   |--- age <= 33.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  33.50
|   |   |   |   |   |   |--- class: 2
|   |   |   |   |--- age >  35.50
|   |   |   |   |   |--- age <= 38.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- age >  38.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |--- detections >  0.50
|   |   |   |   |--- class: 1
|   |   |--- age >  42.50
|   |   |   |--- age <= 43.50
|   |   |   |   |--- detections <= 1.00
|   |   |   |   |   |--- class: 2
|   |   |   |   |--- detections >  1.00
|   |   |   |   |   |--- detections <= 2.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- detections >  2.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |--- age >  43.50
|   |   |   |   |--- detections <= 0.50
|   |   |   |   |   |--- age <= 50.50
|   |   |   

In [15]:
%matplotlib inline
tree_grid.best_estimator_.__getstate__()['tree_']

<sklearn.tree._tree.Tree at 0x59c5a0c2a0>

In [16]:
tree_grid.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
                       max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=9, splitter='best')

In [17]:
clf=tree_grid.best_estimator_

In [18]:
type(clf.tree_)

sklearn.tree._tree.Tree

As sklearn provided rules extraction does not provide details about end leaves and their contents, I have also collected another code from which we can know what type of samples leaves have and we can then extract rules only for those that have 0 majority class samples. It is given below:

In [19]:
from sklearn.tree import _tree

In [24]:
def tree_to_code(tree, feature_names):# found this code online

    '''
    Outputs a decision tree model as a Python function
    
    Parameters:
    -----------
    tree: decision tree model
        The decision tree to represent as a function
    feature_names: list
        The feature names of the dataset used for building the decision tree
    '''

    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print ("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print( "{}if {} <= {}:".format(indent, name, threshold) )
            recurse(tree_.children_left[node], depth + 1)
            print( "{}else:  # if {} > {}".format(indent, name, threshold) )
            recurse(tree_.children_right[node], depth + 1)
        else:
            print(  "{}return {}".format(indent, tree_.value[node]))

    recurse(0, 1)

In [23]:
tree_to_code(clf,df.columns)

def tree(age, detections, Survival_status):
  if detections <= 4.5:
    if age <= 69.5:
      if age <= 42.5:
        if detections <= 0.5:
          if age <= 35.5:
            if age <= 33.5:
              return [[2. 0.]]
            else:  # if age > 33.5
              return [[0. 1.]]
          else:  # if age > 35.5
            if age <= 38.5:
              return [[5. 0.]]
            else:  # if age > 38.5
              return [[9. 2.]]
        else:  # if detections > 0.5
          return [[12.  0.]]
      else:  # if age > 42.5
        if age <= 43.5:
          if detections <= 1.0:
            return [[0. 2.]]
          else:  # if detections > 1.0
            if detections <= 2.5:
              return [[2. 1.]]
            else:  # if detections > 2.5
              return [[1. 0.]]
        else:  # if age > 43.5
          if detections <= 0.5:
            if age <= 50.5:
              return [[13.  4.]]
            else:  # if age > 50.5
              return [[45.  4.]]
   