# Copyright 2019-2020 Abien Fred Agarap
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Modified trust score module from Google and Seldon Alibi"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
__author__ = "Arnaud Van Looveren, Abien Fred Agarap"
__version__ = "1.0.0"
import logging
import numpy as np
from sklearn.neighbors import KDTree
from sklearn.neighbors import KNeighborsClassifier
from typing import Any, Optional, Tuple
logger = logging.getLogger(__name__)
class TrustScore(object):
def __init__(
self,
k_filter: int = 10,
alpha: float = 0.0,
filter_type: Optional[str] = None,
leaf_size: int = 40,
metric: str = "euclidean",
dist_filter_type: str = "point",
) -> None:
"""
Initialize trust scores.
Parameters
----------
k_filter
Number of neighbors used during either kNN distance or probability filtering.
alpha
Fraction of instances to filter out to reduce impact of outliers.
filter_type
Filter method; either 'distance_knn' or 'probability_knn'
leaf_size
Number of points at which to switch to brute-force. Affects speed and memory required to build trees.
Memory to store the tree scales with n_samples / leaf_size.
metric
Distance metric used for the tree. See sklearn's DistanceMetric class for a list of available metrics.
dist_filter_type
Use either the distance to the k-nearest point (dist_filter_type = 'point') or
the average distance from the first to the k-nearest point in the data (dist_filter_type = 'mean').
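Examples
--------
Illustrative sketch only; the parameter values below are arbitrary choices,
not recommended defaults.
>>> ts = TrustScore(k_filter=10, alpha=0.05, filter_type="distance_knn")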
"""
self.k_filter = k_filter
self.alpha = alpha
self.filter = filter_type
self.eps = 1e-12
self.leaf_size = leaf_size
self.metric = metric
self.dist_filter_type = dist_filter_type
def filter_by_distance_knn(self, X: np.ndarray) -> np.ndarray:
"""
Filter out instances with low kNN density. Calculate the distance to the k-nearest point in the data
for each instance and remove instances above a cutoff distance.
Parameters
----------
X
Data
Returns
-------
Filtered data.
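Examples
--------
Minimal sketch; the array shape is an illustrative assumption.
>>> X = np.random.rand(100, 4)
>>> X_keep = TrustScore(alpha=0.05).filter_by_distance_knn(X)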
"""
kdtree = KDTree(X, leaf_size=self.leaf_size, metric=self.metric)
knn_r = kdtree.query(X, k=self.k_filter + 1)[
0
] # distances from 0 to k-nearest points
if self.dist_filter_type == "point":
knn_r = knn_r[:, -1]
elif self.dist_filter_type == "mean":
knn_r = np.mean(
knn_r[:, 1:], axis=1
) # exclude distance of instance to itself
cutoff_r = np.percentile(knn_r, (1 - self.alpha) * 100) # cutoff distance
X_keep = X[np.where(knn_r <= cutoff_r)[0], :] # define instances to keep
return X_keep
def filter_by_probability_knn(
self, X: np.ndarray, Y: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
"""
Filter out instances with high label disagreement amongst their k nearest neighbors.
Parameters
----------
X
Data
Y
Predicted class labels
Returns
-------
Filtered data and labels.
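Examples
--------
Minimal sketch with synthetic data; shapes and label values are illustrative.
>>> X = np.random.rand(100, 4)
>>> Y = np.random.randint(0, 2, size=100)
>>> X_keep, Y_keep = TrustScore(alpha=0.05).filter_by_probability_knn(X, Y)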
"""
if self.k_filter == 1:
logger.warning(
"Number of nearest neighbors used for probability density filtering should "
"be >1, otherwise the prediction probabilities are either 0 or 1 making "
"probability filtering useless."
)
# fit kNN classifier and make predictions on X
clf = KNeighborsClassifier(
n_neighbors=self.k_filter, leaf_size=self.leaf_size, metric=self.metric
)
clf.fit(X, Y)
preds_proba = clf.predict_proba(X)
# define cutoff and instances to keep
preds_max = np.max(preds_proba, axis=1)
cutoff_proba = np.percentile(preds_max, self.alpha * 100) # cutoff probability
keep_id = np.where(preds_max >= cutoff_proba)[
0
] # define id's of instances to keep
X_keep, Y_keep = X[keep_id, :], Y[keep_id]
return X_keep, Y_keep
def fit(self, X: np.ndarray, Y: np.ndarray, classes: Optional[int] = None) -> None:
"""
Build KDTrees for each prediction class.
Parameters
----------
X
Data
Y
Target labels, either one-hot encoded or the actual class label.
classes
Number of prediction classes; needs to be provided if Y contains the predicted class labels
instead of one-hot encodings.
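Examples
--------
Minimal sketch, assuming Y is passed as one-hot encodings; shapes are illustrative.
>>> X = np.random.rand(100, 4)
>>> Y = np.eye(3)[np.random.randint(0, 3, size=100)]
>>> ts = TrustScore(k_filter=10, alpha=0.05, filter_type="distance_knn")
>>> ts.fit(X, Y)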
"""
self.classes = classes if classes is not None else Y.shape[1]
self.kdtrees = [None] * self.classes # type: Any
self.X_kdtree = [None] * self.classes # type: Any
# KDTree and kNeighborsClassifier need 2D data
if len(X.shape) > 2:
logger.warning(
"Reshaping data from {0} to {1} so k-d trees can "
"be built.".format(X.shape, X.reshape(X.shape[0], -1).shape)
)
X = X.reshape(X.shape[0], -1)
# make sure Y represents predicted classes, not one-hot encodings
if len(Y.shape) > 1:
Y = np.argmax(Y, axis=1)
if self.filter == "probability_knn":
X_filter, Y_filter = self.filter_by_probability_knn(X, Y)
for c in range(self.classes):
if self.filter is None:
X_fit = X[np.where(Y == c)[0]]
elif self.filter == "distance_knn":
X_fit = self.filter_by_distance_knn(X[np.where(Y == c)[0]])
elif self.filter == "probability_knn":
X_fit = X_filter[np.where(Y_filter == c)[0]]
no_x_fit = len(X_fit) == 0
if no_x_fit and len(X[np.where(Y == c)[0]]) == 0:
logger.warning("No instances available for class %s", c)
elif no_x_fit:
logger.warning(
"Filtered all the instances for class %s. Lower alpha or check data.",
c,
)
self.kdtrees[c] = KDTree(
X_fit, leaf_size=self.leaf_size, metric=self.metric
) # build KDTree for class c
self.X_kdtree[c] = X_fit
def score(
self, X: np.ndarray, Y: np.ndarray, k: int = 2, dist_type: str = "point"
) -> Tuple[np.ndarray, ...]:
"""
Calculate trust scores = ratio of distance to closest class other than the
predicted class to distance to predicted class.
Parameters
----------
X
Instances to calculate trust score for.
Y
Either prediction probabilities for each class or the predicted class.
k
Number of nearest neighbors used for distance calculation.
dist_type
Use either the distance to the k-nearest point (dist_type = 'point') or
the average distance from the first to the k-nearest point in the data (dist_type = 'mean').
Returns
-------
Trust scores for the batch, the closest not predicted class, the nearest-neighbor indices in the
predicted class and in the closest other class, and the corresponding distances to both classes.
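Examples
--------
Minimal sketch; the data, predictions, and parameter values are illustrative assumptions.
>>> ts = TrustScore()
>>> ts.fit(np.random.rand(100, 4), np.eye(3)[np.random.randint(0, 3, size=100)])
>>> trust, closest = ts.score(np.random.rand(10, 4), np.random.randint(0, 3, size=10), k=2)[:2]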
"""
# make sure Y represents predicted classes, not probabilities
if len(Y.shape) > 1:
Y = np.argmax(Y, axis=1)
# KDTree needs 2D data
if len(X.shape) > 2:
logger.warning(
"Reshaping data from {0} to {1} so k-d trees can "
"be queried.".format(X.shape, X.reshape(X.shape[0], -1).shape)
)
X = X.reshape(X.shape[0], -1)
d = np.tile(
None, (X.shape[0], self.classes)
) # init distance matrix: [nb instances, nb classes]
idx = np.tile(None, (X.shape[0], self.classes))
for c in range(self.classes):
d_tmp, d_tmp_idx = self.kdtrees[c].query(X, k=k)
if dist_type == "point":
d[:, c] = d_tmp[:, -1]
elif dist_type == "mean":
d[:, c] = np.mean(d_tmp, axis=1)
idx[:, c] = d_tmp_idx[:, -1]
sorted_d = np.sort(
d, axis=1
) # sort distance each instance in batch over classes
sorted_d_idx = np.take_along_axis(
idx, np.argsort(d, axis=1), axis=1
) # reorder neighbor indices so they stay aligned with the sorted distances
# get distance to predicted and closest other class and calculate trust score
d_to_pred = d[range(d.shape[0]), Y]
d_to_pred_idx = idx[range(idx.shape[0]), Y]
d_to_closest_not_pred = np.where(
sorted_d[:, 0] != d_to_pred, sorted_d[:, 0], sorted_d[:, 1]
)
d_to_closest_not_pred_idx = np.where(
sorted_d[:, 0] != d_to_pred, sorted_d_idx[:, 0], sorted_d_idx[:, 1]
)
trust_score = d_to_closest_not_pred / (d_to_pred + self.eps)
# closest not predicted class
class_closest_not_pred = np.where(d == d_to_closest_not_pred.reshape(-1, 1))[1]
return (
trust_score,
class_closest_not_pred,
d_to_pred_idx,
d_to_closest_not_pred_idx,
d_to_pred,
d_to_closest_not_pred,
)
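# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): fits trust scores on
# synthetic data and scores hypothetical predictions. Array shapes, the random
# seed, and parameter values below are illustrative assumptions only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    # synthetic training data: 200 instances, 5 features, 3 classes (one-hot labels)
    X_train = np.random.rand(200, 5)
    Y_train = np.eye(3)[np.random.randint(0, 3, size=200)]
    # synthetic test instances with hypothetical predicted class labels
    X_test = np.random.rand(20, 5)
    Y_pred = np.random.randint(0, 3, size=20)
    ts = TrustScore(k_filter=10, alpha=0.05, filter_type="distance_knn")
    ts.fit(X_train, Y_train)
    trust, closest_class = ts.score(X_test, Y_pred, k=2)[:2]
    print("trust scores:", trust)
    print("closest not predicted class:", closest_class)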