-
Notifications
You must be signed in to change notification settings - Fork 14
/
inTrees.py
309 lines (237 loc) · 10.5 KB
/
inTrees.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
"""
inTrees / STEL
--------------
Merges different decision trees in an ensemble together in an ordered rule list
Written by Gilles Vandewiele in commission of IDLab - INTEC from University Ghent.
Reference:
Houtao Deng
"Interpreting Tree Ensembles with inTrees"
"""
import sys
import re
import numpy as np
import pandas as pd
import rpy2
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from constructors import ensemble
sys.path.append('../')
def convert_to_r_posixct(obj):
"""
Convert DatetimeIndex or np.datetime array to R POSIXct using
m8[s] format.
Parameters
----------
obj : source pandas object (one of [DatetimeIndex, np.datetime])
Returns
-------
An R POSIXct vector (rpy2.robjects.vectors.POSIXct)
"""
import time
from rpy2.rinterface import StrSexpVector
# convert m8[ns] to m8[s]
vals = robj.vectors.FloatSexpVector(obj.values.view('i8') / 1E9)
as_posixct = robj.baseenv.get('as.POSIXct')
origin = StrSexpVector([time.strftime("%Y-%m-%d",
time.gmtime(0)), ])
# We will be sending ints as UTC
tz = obj.tz.zone if hasattr(
obj, 'tz') and hasattr(obj.tz, 'zone') else 'UTC'
tz = StrSexpVector([tz])
utc_tz = StrSexpVector(['UTC'])
posixct = as_posixct(vals, origin=origin, tz=utc_tz)
posixct.do_slot_assign('tzone', tz)
return posixct
class Condition:
"""
Class which represents one part of the rule (which can be seen as a conjunction of conditions)
"""
def __init__(self, feature, test, value):
self.feature = feature
'''The feature on which the test is performed'''
self.test = test
'''What kind of test is done. Must be either `==`, `>` or `<=`'''
self.value = value
'''The threshold value'''
def evaluate(self, feature_vector):
"""Create a prediction for a sample (using its feature vector)
**Params**
----------
- `feature_vector` (pandas Series or dict) - the sample to evaluate, must be a `pandas Series` object or a
`dict`. It is important that the attribute keys in the sample are the same as the labels occuring in the rules.
**Returns**
-----------
`True` if feature_vector[<feature>] <test> <value>, where <test> is equal to `==`, `>` or `<=`
"""
if self.value is None:
return True
elif self.test == '==':
return feature_vector[self.feature] == self.value
elif self.test == '>':
return feature_vector[self.feature] > self.value
else:
return feature_vector[self.feature] <= self.value
class Rule:
"""
Class which represents a rule, which is a conjunction of conditions
"""
def __init__(self, index, conditions, prediction):
self.index = index
'''The index of this rule in a rule list (which is traversed sequentially until a match is found).'''
self.rules = conditions
'''A list of `constructors.inTrees.Condition`'''
self.prediction = prediction
'''This is returned when a sample fully complies to the rule (`True` for all conditions)'''
def evaluate(self, feature_vector):
"""Create a prediction for a sample (using its feature vector)
**Params**
----------
- `feature_vector` (pandas Series or dict) - the sample to evaluate, must be a `pandas Series` object or a
`dict`. It is important that the attribute keys in the sample are the same as the labels occuring in the rules.
**Returns**
-----------
`True` if `True` for each condition in conditions
"""
for rule in self.rules:
if not rule.evaluate(feature_vector): return False, -1
return True, self.prediction
class OrderedRuleList:
"""
Class which represents a list of rules. To make a prediction, the list is traversed and when a rule is found where
the sample complies to, its prediction is returned.
"""
def __init__(self, rule_list):
self.rule_list = rule_list
'''A list of `constructors.inTrees.Rule`'''
def _evaluate(self, feature_vector):
for ruleset in sorted(self.rule_list, key=lambda x: x.index): # Sort to make sure they are evaluated in order
rule_evaluation_result, rule_evaluation_pred = ruleset.evaluate(feature_vector)
if rule_evaluation_result: return rule_evaluation_pred
return None
def print_rules(self):
"""Print the rules"""
for rule_set in self.rule_list:
print '*' + ' & '.join([str(rule.feature)+' '+str(rule.test)+' '+str(rule.value) for rule in rule_set.rules]), '==>', rule_set.prediction
def evaluate_multiple(self, feature_vectors):
"""Wrapper method to evaluate multiple vectors at once (just a for loop where evaluate is called)
**Params**
----------
- `feature_vectors` (pandas DataFrame or list of dicts) - the samples to evaluate
**Returns**
-----------
a class label
"""
results = []
for _index, feature_vector in feature_vectors.iterrows():
results.append(self._evaluate(feature_vector))
return np.asarray(results)
class inTreesClassifier:
def __init__(self):
pass
def _convert_to_r_dataframe(self, df, strings_as_factors=False):
"""
Convert a pandas DataFrame to a R data.frame.
Parameters
----------
df: The DataFrame being converted
strings_as_factors: Whether to turn strings into R factors (default: False)
Returns
-------
A R data.frame
"""
import rpy2.rlike.container as rlc
columns = rlc.OrdDict()
# FIXME: This doesn't handle MultiIndex
for column in df:
value = df[column]
value_type = value.dtype.type
if value_type == np.datetime64:
value = convert_to_r_posixct(value)
else:
value = [item if pd.notnull(item) else rpy2.rinterface.NA_Integer#com.NA_TYPES[value_type]
for item in value]
value = rpy2.robjects.vectors.FloatVector(value)#com.VECTOR_TYPES[value_type](value)
if not strings_as_factors:
I = ro.baseenv.get("I")
value = I(value)
columns[column] = value
r_dataframe = ro.DataFrame(columns)
del columns
r_dataframe.rownames = ro.StrVector(list(df.index))
r_dataframe.colnames = list(df.columns)
return r_dataframe
def _tree_to_R_object(self, tree, feature_mapping):
node_mapping = {}
nodes = tree._get_nodes()
nodes.extend(tree._get_leaves())
for i, node in enumerate(nodes):
node_mapping[node] = i+1
vectors = []
for node in nodes:
if node.value is not None:
vectors.append([node_mapping[node], node_mapping[node.left], node_mapping[node.right],
feature_mapping[node.label], node.value, 1, 0])
else:
vectors.append([node_mapping[node], 0, 0, 0, 0.0, -1, node.label])
df = pd.DataFrame(vectors)
df.columns = ['id', 'left daughter', 'right daughter', 'split var', 'split point', 'status', 'prediction']
df = df.set_index('id')
df.index.name = None
return self._convert_to_r_dataframe(df)
def construct_rule_list(self, train_df, label_col, tree_constructors, nr_bootstraps=3):
""" Construct an `constructors.inTrees.OrderedRuleList` from an ensemble of decision trees
**Params**
----------
- `train_df` (pandas DataFrame) - the training data
- `label_col` (string) - the column identifier for the class labels
- `tree_constructors` (`constructors.treeconstructor.TreeConstructor`) - the decision tree induction algorithms used to create an ensemble with
- `nr_bootstraps` (pandas DataFrame) - how many times do we apply bootstrapping for each TreeConstructor? The size of the ensemble will be equal to
|tree_constructors|*nr_bootstraps
**Returns**
-----------
an OrderedRuleList
"""
y_train = train_df[label_col]
X_train = train_df.copy()
X_train = X_train.drop(label_col, axis=1)
importr('randomForest')
importr('inTrees')
ro.globalenv["X"] = pandas2ri.py2ri(X_train)
ro.globalenv["target"] = ro.FactorVector(y_train.values.tolist())
feature_mapping = {}
feature_mapping_reverse = {}
for i, feature in enumerate(X_train.columns):
feature_mapping[feature] = i + 1
feature_mapping_reverse[i + 1] = feature
treeList = []
for tree in ensemble.bootstrap(train_df, label_col, tree_constructors, nr_classifiers=nr_bootstraps):
if tree.count_nodes() > 1: treeList.append(self._tree_to_R_object(tree, feature_mapping))
ro.globalenv["treeList"] = ro.Vector([len(treeList), ro.Vector(treeList)])
ro.r('names(treeList) <- c("ntree", "list")')
rules = ro.r('buildLearner(getRuleMetric(extractRules(treeList, X), X, target), X, target)')
rules=list(rules)
conditions=rules[int(0.6*len(rules)):int(0.8*len(rules))]
predictions=rules[int(0.8*len(rules)):]
# Create a OrderedRuleList
rulesets = []
for idx, (condition, prediction) in enumerate(zip(conditions, predictions)):
# Split each condition in Rules to form a RuleSet
rulelist = []
condition_split = [x.lstrip().rstrip() for x in condition.split('&')]
for rule in condition_split:
feature = feature_mapping_reverse[int(re.findall(r',[0-9]+]', rule)[0][1:-1])]
lte = re.findall(r'<=', rule)
gt = re.findall(r'>', rule)
eq = re.findall(r'==', rule)
cond = lte[0] if len(lte) else (gt[0] if len(gt) else eq[0])
extract_value = re.findall(r'[=>]-?[0-9\.]+', rule)
if len(extract_value):
value = float(re.findall(r'[=>]-?[0-9\.]+', rule)[0][1:])
else:
feature = 'True'
value = None
rulelist.append(Condition(feature, cond, value))
rulesets.append(Rule(idx, rulelist, prediction))
return OrderedRuleList(rulesets)