In [71]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from six import StringIO
from IPython.display import Image
import pydotplus
from sklearn.tree import export_graphviz
from sklearn.tree import plot_tree
import graphviz
from sklearn import tree
from sklearn.tree import _tree

%matplotlib inline

In [3]:
# Load Dataset

abs_path = Path.cwd()
data_address = abs_path.parent.parent/ 'data/' / 'Cleaned_US.csv'
df = pd.read_csv(data_address)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774575 entries, 0 to 2774574
Data columns (total 34 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Severity           int64  
 1   Start_Time         object 
 2   Start_Lat          float64
 3   Start_Lng          float64
 4   End_Lat            float64
 5   End_Lng            float64
 6   Distance(mi)       float64
 7   City               object 
 8   County             object 
 9   State              object 
 10  Temperature(F)     float64
 11  Wind_Chill(F)      float64
 12  Humidity(%)        float64
 13  Pressure(in)       float64
 14  Visibility(mi)     float64
 15  Wind_Speed(mph)    float64
 16  Precipitation(in)  float64
 17  Weather_Condition  object 
 18  Amenity            bool   
 19  Bump               bool   
 20  Crossing           bool   
 21  Give_Way           bool   
 22  Junction           bool   
 23  No_Exit            bool   
 24  Railway            bool   
 25  Roundabout        

In [5]:
features = ['Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
            'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)',
             'Distance(mi)', 'Amenity', 'Bump', 'Crossing',
            'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
            'Stop', 'Traffic_Calming', 'Traffic_Signal']

In [6]:
X = df[features]
y = df['Severity']

In [7]:
y.value_counts()

2    2470962
3     151145
4     126766
1      25702
Name: Severity, dtype: int64

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774575 entries, 0 to 2774574
Data columns (total 20 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Temperature(F)     float64
 1   Wind_Chill(F)      float64
 2   Humidity(%)        float64
 3   Pressure(in)       float64
 4   Visibility(mi)     float64
 5   Wind_Speed(mph)    float64
 6   Precipitation(in)  float64
 7   Distance(mi)       float64
 8   Amenity            bool   
 9   Bump               bool   
 10  Crossing           bool   
 11  Give_Way           bool   
 12  Junction           bool   
 13  No_Exit            bool   
 14  Railway            bool   
 15  Roundabout         bool   
 16  Station            bool   
 17  Stop               bool   
 18  Traffic_Calming    bool   
 19  Traffic_Signal     bool   
dtypes: bool(12), float64(8)
memory usage: 201.1 MB


In [9]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [10]:
dtree = DecisionTreeClassifier()

In [11]:
dtree = dtree.fit(X_train, y_train)

In [12]:
y_pred = dtree.predict(X_test)

In [13]:
# Accuracy, how often classfier predict correct?
metrics.accuracy_score(y_test, y_pred)

0.847279007675407

In [21]:
dtree.feature_importances_

array([1.18233304e-01, 7.59869952e-02, 1.52882235e-01, 1.97538556e-01,
       3.47263982e-02, 1.04958952e-01, 5.89228776e-02, 2.19230300e-01,
       2.43134850e-03, 5.44745451e-05, 8.69913259e-03, 1.27047809e-03,
       7.28712048e-03, 5.69963902e-04, 2.24666312e-03, 2.89876847e-05,
       3.57344905e-03, 3.67616303e-03, 1.92463170e-04, 7.49013827e-03])

In [22]:
dtree_opt = DecisionTreeClassifier(criterion='entropy', max_depth=15)

In [23]:
dtree_opt = dtree_opt.fit(X_train, y_train)

In [24]:
y_pred = dtree_opt.predict(X_test)

In [25]:
metrics.accuracy_score(y_test, y_pred)

0.8930186089694425

In [26]:
dtree_opt.feature_importances_

array([4.10028321e-02, 1.00041933e-01, 6.51000422e-02, 1.34065417e-01,
       1.70384875e-02, 4.36001616e-02, 2.51567300e-01, 2.83523670e-01,
       1.21164649e-03, 5.68513632e-05, 2.14861722e-02, 5.14332771e-04,
       1.60195359e-02, 2.48603564e-04, 1.07020703e-03, 0.00000000e+00,
       2.45846446e-03, 2.16074774e-03, 1.65816570e-04, 1.86677777e-02])

In [35]:
leaf_nodes = np.where(dtree_opt.tree_.children_left == -1)[0]
leaf_node_values = dtree_opt.tree_.value[leaf_nodes]

print(f'Total number of valuable nodes is: {len(leaf_nodes)}')

for node, value in zip(leaf_nodes, leaf_node_values):
    class_counts = value[0]
    node_info = f"Node {node}: Class Counts = {class_counts}"
    print(node_info)


Total number of valuable nodes is: 10286
Node 7: Class Counts = [0. 2. 0. 0.]
Node 9: Class Counts = [ 0.  0. 69.  0.]
Node 12: Class Counts = [0. 0. 0. 1.]
Node 15: Class Counts = [0. 0. 0. 1.]
Node 18: Class Counts = [0. 0. 1. 0.]
Node 19: Class Counts = [0. 0. 0. 1.]
Node 20: Class Counts = [0. 0. 7. 0.]
Node 21: Class Counts = [0. 0. 9. 0.]
Node 26: Class Counts = [0. 0. 1. 0.]
Node 27: Class Counts = [0. 1. 0. 0.]
Node 28: Class Counts = [ 0.  0. 11.  0.]
Node 29: Class Counts = [0. 1. 0. 0.]
Node 30: Class Counts = [0. 2. 0. 0.]
Node 38: Class Counts = [0. 0. 1. 0.]
Node 39: Class Counts = [ 0. 24.  0.  0.]
Node 43: Class Counts = [0. 2. 0. 0.]
Node 44: Class Counts = [0. 2. 5. 1.]
Node 45: Class Counts = [0. 3. 0. 0.]
Node 46: Class Counts = [0. 4. 0. 0.]
Node 48: Class Counts = [0. 5. 0. 0.]
Node 52: Class Counts = [0. 0. 2. 0.]
Node 53: Class Counts = [0. 7. 2. 0.]
Node 54: Class Counts = [0. 4. 0. 0.]
Node 57: Class Counts = [0. 0. 2. 0.]
Node 58: Class Counts = [0. 4. 0. 0.]

Node 5918: Class Counts = [ 0. 61.  2.  0.]
Node 5921: Class Counts = [0. 0. 0. 1.]
Node 5922: Class Counts = [0. 5. 0. 0.]
Node 5924: Class Counts = [  0. 203.   0.   2.]
Node 5925: Class Counts = [  0. 361.   0.   0.]
Node 5929: Class Counts = [  5. 562.   0.   9.]
Node 5930: Class Counts = [  0. 636.   1.   3.]
Node 5932: Class Counts = [  0. 109.   0.   8.]
Node 5933: Class Counts = [ 0. 66.  2.  0.]
Node 5936: Class Counts = [  2. 236.   2.   5.]
Node 5937: Class Counts = [  0. 236.   5.  14.]
Node 5939: Class Counts = [ 0. 69.  0.  0.]
Node 5940: Class Counts = [1. 9. 0. 0.]
Node 5943: Class Counts = [0. 0. 0. 2.]
Node 5947: Class Counts = [0. 3. 1. 2.]
Node 5948: Class Counts = [ 0. 57.  0.  3.]
Node 5950: Class Counts = [  0. 211.   0.   0.]
Node 5951: Class Counts = [  0. 127.   0.   4.]
Node 5954: Class Counts = [  0. 269.   2.  18.]
Node 5955: Class Counts = [ 0. 40.  0.  9.]
Node 5957: Class Counts = [  0. 295.   0.   3.]
Node 5958: Class Counts = [ 0. 59.  1.  7.]
Node 596

Node 11147: Class Counts = [0. 0. 1. 0.]
Node 11149: Class Counts = [1. 0. 0. 0.]
Node 11151: Class Counts = [0. 0. 1. 0.]
Node 11152: Class Counts = [0. 0. 1. 2.]
Node 11155: Class Counts = [0. 0. 0. 2.]
Node 11158: Class Counts = [2. 0. 0. 0.]
Node 11159: Class Counts = [0. 1. 0. 0.]
Node 11160: Class Counts = [19.  0.  0.  0.]
Node 11161: Class Counts = [0. 4. 0. 0.]
Node 11167: Class Counts = [  0. 211.   0.   1.]
Node 11168: Class Counts = [0. 1. 0. 1.]
Node 11170: Class Counts = [  0. 276.   4.   3.]
Node 11171: Class Counts = [  0. 377.  23.   2.]
Node 11174: Class Counts = [0. 1. 1. 0.]
Node 11175: Class Counts = [0. 0. 3. 0.]
Node 11177: Class Counts = [0. 2. 0. 2.]
Node 11178: Class Counts = [ 0. 11.  0.  0.]
Node 11180: Class Counts = [2. 0. 0. 0.]
Node 11181: Class Counts = [0. 5. 0. 0.]
Node 11186: Class Counts = [  0. 131.   2.   0.]
Node 11187: Class Counts = [ 1. 23.  2.  0.]
Node 11189: Class Counts = [0. 4. 7. 0.]
Node 11190: Class Counts = [ 0. 16.  0.  0.]
Node 1119

Node 16376: Class Counts = [0. 1. 0. 2.]
Node 16379: Class Counts = [0. 9. 0. 0.]
Node 16380: Class Counts = [0. 0. 2. 0.]
Node 16381: Class Counts = [0. 0. 2. 0.]
Node 16385: Class Counts = [  0. 127.  15.  12.]
Node 16386: Class Counts = [  0. 142.   3.   6.]
Node 16387: Class Counts = [0. 0. 2. 0.]
Node 16390: Class Counts = [  5. 174.  10.  19.]
Node 16391: Class Counts = [ 0. 55.  0. 20.]
Node 16393: Class Counts = [0. 0. 0. 3.]
Node 16394: Class Counts = [  0. 127.  21.  24.]
Node 16400: Class Counts = [ 0. 37.  0.  0.]
Node 16401: Class Counts = [0. 1. 0. 1.]
Node 16402: Class Counts = [0. 0. 0. 1.]
Node 16405: Class Counts = [0. 2. 0. 0.]
Node 16406: Class Counts = [0. 0. 0. 3.]
Node 16408: Class Counts = [0. 1. 1. 0.]
Node 16409: Class Counts = [0. 5. 0. 0.]
Node 16410: Class Counts = [ 0. 42.  0.  0.]
Node 16415: Class Counts = [0. 5. 0. 6.]
Node 16416: Class Counts = [0. 2. 1. 0.]
Node 16418: Class Counts = [0. 5. 0. 1.]
Node 16419: Class Counts = [0. 6. 5. 0.]
Node 16422: C

In [36]:
leaf_nodes = np.where(dtree_opt.tree_.children_left == -1)[0]
counter = 0

for node in leaf_nodes:
    class_counts = dtree_opt.tree_.value[node][0]
    if class_counts[0] == 0 and class_counts[1] == 0:
        node_info = f"Node {node}: Class Counts = {class_counts}"
        print(node_info)
        counter+=1

print(f'Total number of valuable nodes is: {counter}')

Node 9: Class Counts = [ 0.  0. 69.  0.]
Node 12: Class Counts = [0. 0. 0. 1.]
Node 15: Class Counts = [0. 0. 0. 1.]
Node 18: Class Counts = [0. 0. 1. 0.]
Node 19: Class Counts = [0. 0. 0. 1.]
Node 20: Class Counts = [0. 0. 7. 0.]
Node 21: Class Counts = [0. 0. 9. 0.]
Node 26: Class Counts = [0. 0. 1. 0.]
Node 28: Class Counts = [ 0.  0. 11.  0.]
Node 38: Class Counts = [0. 0. 1. 0.]
Node 52: Class Counts = [0. 0. 2. 0.]
Node 57: Class Counts = [0. 0. 2. 0.]
Node 61: Class Counts = [0. 0. 7. 0.]
Node 66: Class Counts = [0. 0. 0. 1.]
Node 71: Class Counts = [0. 0. 0. 1.]
Node 77: Class Counts = [0. 0. 0. 2.]
Node 85: Class Counts = [ 0.  0. 13.  0.]
Node 88: Class Counts = [ 0.  0. 24.  0.]
Node 94: Class Counts = [0. 0. 2. 0.]
Node 106: Class Counts = [ 0.  0. 12.  0.]
Node 120: Class Counts = [0. 0. 1. 0.]
Node 121: Class Counts = [0. 0. 1. 0.]
Node 132: Class Counts = [0. 0. 6. 0.]
Node 135: Class Counts = [0. 0. 1. 0.]
Node 139: Class Counts = [0. 0. 2. 0.]
Node 141: Class Counts = 

In [42]:
def get_path(node, path):
    if node == -1:
        return path
    feature = dtree_opt.tree_.feature[node]
    threshold = dtree_opt.tree_.threshold[node]
    if feature != -2:  # Not a leaf node
        feature_name = features[feature]
        if path:
            current_path = f"{feature_name} {path}"
        else:
            current_path = feature_name
        parent = np.where(dtree_opt.tree_.children_left == node)[0]
        if parent:
            return get_path(parent[0], current_path)
    return path

leaf_nodes = np.where(dtree_opt.tree_.children_left == -1)[0]

for node in leaf_nodes:
    class_counts = dtree_opt.tree_.value[node][0]
    if class_counts[0] == 0 and class_counts[1] == 0:
        path = get_path(node, "")
        node_info = f"Node {node}: Class Counts = {class_counts}, Rule: {path}"
        print(node_info)


Node 9: Class Counts = [ 0.  0. 69.  0.], Rule: 
Node 12: Class Counts = [0. 0. 0. 1.], Rule: 
Node 15: Class Counts = [0. 0. 0. 1.], Rule: 
Node 18: Class Counts = [0. 0. 1. 0.], Rule: 
Node 19: Class Counts = [0. 0. 0. 1.], Rule: 
Node 20: Class Counts = [0. 0. 7. 0.], Rule: 
Node 21: Class Counts = [0. 0. 9. 0.], Rule: 
Node 26: Class Counts = [0. 0. 1. 0.], Rule: 
Node 28: Class Counts = [ 0.  0. 11.  0.], Rule: 
Node 38: Class Counts = [0. 0. 1. 0.], Rule: 
Node 52: Class Counts = [0. 0. 2. 0.], Rule: 
Node 57: Class Counts = [0. 0. 2. 0.], Rule: 
Node 61: Class Counts = [0. 0. 7. 0.], Rule: 
Node 66: Class Counts = [0. 0. 0. 1.], Rule: 
Node 71: Class Counts = [0. 0. 0. 1.], Rule: 
Node 77: Class Counts = [0. 0. 0. 2.], Rule: 
Node 85: Class Counts = [ 0.  0. 13.  0.], Rule: 
Node 88: Class Counts = [ 0.  0. 24.  0.], Rule: 
Node 94: Class Counts = [0. 0. 2. 0.], Rule: 
Node 106: Class Counts = [ 0.  0. 12.  0.], Rule: 
Node 120: Class Counts = [0. 0. 1. 0.], Rule: 
Node 121: Cla

Node 16247: Class Counts = [0. 0. 1. 0.], Rule: 
Node 16257: Class Counts = [ 0.  0.  0. 18.], Rule: 
Node 16269: Class Counts = [0. 0. 1. 0.], Rule: 
Node 16270: Class Counts = [0. 0. 0. 2.], Rule: 
Node 16271: Class Counts = [0. 0. 3. 0.], Rule: 
Node 16276: Class Counts = [0. 0. 0. 7.], Rule: 
Node 16277: Class Counts = [0. 0. 1. 0.], Rule: 
Node 16278: Class Counts = [ 0.  0.  0. 28.], Rule: 
Node 16281: Class Counts = [0. 0. 1. 0.], Rule: 
Node 16282: Class Counts = [0. 0. 0. 1.], Rule: 
Node 16285: Class Counts = [ 0.  0.  0. 11.], Rule: 
Node 16293: Class Counts = [0. 0. 1. 1.], Rule: 
Node 16294: Class Counts = [ 0.  0.  0. 10.], Rule: 
Node 16298: Class Counts = [0. 0. 0. 5.], Rule: 
Node 16299: Class Counts = [0. 0. 1. 0.], Rule: 
Node 16300: Class Counts = [0. 0. 3. 0.], Rule: 
Node 16304: Class Counts = [0. 0. 2. 0.], Rule: 
Node 16308: Class Counts = [0. 0. 0. 4.], Rule: 
Node 16311: Class Counts = [0. 0. 0. 7.], Rule: 
Node 16314: Class Counts = [0. 0. 2. 0.], Rule: 
Node

In [46]:
dtree_opt.tree_.node_count

20571

In [48]:
dtree_opt.tree_.max_depth

15

In [52]:
dtree_opt.tree_.value

array([[[1.926600e+04, 1.853094e+06, 1.134600e+05, 9.511100e+04]],

       [[1.660800e+04, 1.462509e+06, 4.570600e+04, 4.483800e+04]],

       [[1.501900e+04, 1.840430e+05, 2.075700e+04, 3.058000e+03]],

       ...,

       [[0.000000e+00, 0.000000e+00, 2.000000e+00, 0.000000e+00]],

       [[0.000000e+00, 1.000000e+00, 0.000000e+00, 0.000000e+00]],

       [[0.000000e+00, 1.500000e+01, 0.000000e+00, 0.000000e+00]]])

In [58]:
dtree_opt.children_left[0]

AttributeError: 'DecisionTreeClassifier' object has no attribute 'children_left'

In [64]:
dtree_opt.tree_.left_children[0]

AttributeError: 'sklearn.tree._tree.Tree' object has no attribute 'left_children'

In [68]:
tree.export_text(dtree_opt, feature_names=features)

'|--- Precipitation(in) <= 0.00\n|   |--- Distance(mi) <= 0.00\n|   |   |--- Crossing <= 0.50\n|   |   |   |--- Pressure(in) <= 24.71\n|   |   |   |   |--- Traffic_Signal <= 0.50\n|   |   |   |   |   |--- Pressure(in) <= 22.26\n|   |   |   |   |   |   |--- Pressure(in) <= 20.09\n|   |   |   |   |   |   |   |--- class: 2\n|   |   |   |   |   |   |--- Pressure(in) >  20.09\n|   |   |   |   |   |   |   |--- Pressure(in) <= 20.80\n|   |   |   |   |   |   |   |   |--- class: 3\n|   |   |   |   |   |   |   |--- Pressure(in) >  20.80\n|   |   |   |   |   |   |   |   |--- Temperature(F) <= 36.50\n|   |   |   |   |   |   |   |   |   |--- Pressure(in) <= 20.82\n|   |   |   |   |   |   |   |   |   |   |--- class: 4\n|   |   |   |   |   |   |   |   |   |--- Pressure(in) >  20.82\n|   |   |   |   |   |   |   |   |   |   |--- Wind_Speed(mph) <= 7.20\n|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 4\n|   |   |   |   |   |   |   |   |   |   |--- Wind_Speed(mph) >  7.20\n|  

In [80]:
def get_rules(tree, feature_names, class_names, min_samples=0):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    paths = []
    path = []
    
    def recurse(node, path, paths):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            p1, p2 = list(path), list(path)
            p1 += [f"({name} <= {np.round(threshold, 3)})"]
            recurse(tree_.children_left[node], p1, paths)
            p2 += [f"({name} > {np.round(threshold, 3)})"]
            recurse(tree_.children_right[node], p2, paths)
        else:
            path += [(tree_.value[node], tree_.n_node_samples[node])]
            paths += [path]
            
    recurse(0, path, paths)

    rules = []
    for path in paths:
        class_counts = path[-1][0][0]
        if (class_counts[0] == 0 and class_counts[1] == 0 and class_counts[2] != 0 and class_counts[3] == 0) or \
           (class_counts[0] == 0 and class_counts[1] == 0 and class_counts[2] == 0 and class_counts[3] != 0):
            if path[-1][1] >= min_samples:
                rule = "if "
                for p in path[:-1]:
                    if rule != "if ":
                        rule += " and "
                    rule += str(p)
                rule += " then "
                if class_names is None:
                    rule += "response: " + str(np.round(path[-1][0][0][0], 3))
                else:
                    classes = path[-1][0][0]
                    l = np.argmax(classes)
                    rule += f"class: {class_names[l]} (proba: {np.round(100.0*classes[l]/np.sum(classes), 2)}%)"
                rule += f" | based on {path[-1][1]:,} samples"

                # Compute support and confidence
                support = path[-1][1] / tree_.n_node_samples[0]
                confidence = classes[l] / np.sum(classes)

                # Add support and confidence to the rule
                rule += f" | support: {support:.3f} | confidence: {confidence:.3f}"
                rules += [rule]
        
    return rules

rules = get_rules(dtree_opt, features, [1,2,3,4], min_samples=10)

with open("temp1.txt", "w") as file:
    for r in rules:
        file.write(r + "\n")


In [81]:
rules = get_rules(dtree_opt, features, [1,2,3,4], min_samples=10)

with open("temp1.txt", "w") as file:
    for r in rules:
        file.write(r + "\n")