forked from rabitt/contour_classification
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clf_utils.py
166 lines (136 loc) · 5.08 KB
/
clf_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
""" Utilities for classifier experiments """
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn import cross_validation
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
def cross_val_sweep(x_train, y_train, max_search=100,
                    step=5, plot=True):
    """ Choose the best max_depth by cross fold validation.

    Sweeps the random forest ``max_depth`` over
    ``np.arange(5, max_search, step)`` and scores every value with 5-fold
    cross validation.

    Parameters
    ----------
    x_train : np.array [n_samples, n_features]
        Training features.
    y_train : np.array [n_samples]
        Training labels
    max_search : int
        Exclusive upper bound of the max_depth sweep (the sweep starts at 5).
    step : int
        Step size in parameter sweep
    plot : bool
        If true, plot error bars and cv accuracy

    Returns
    -------
    best_depth : int
        First max_depth in the sweep reaching the highest mean cv score.
    max_cv_accuracy : float
        Mean cross validation score obtained with best_depth.
    plot_data : tuple
        ``(depth, accuracy, std_dev)`` lists with one entry per depth tried.

    Raises
    ------
    ValueError
        If the sweep contains no depth at all (e.g. ``max_search <= 5``).
    """
    depths = np.arange(5, max_search, step)
    # Fail before any training or plotting: otherwise an empty sweep only
    # surfaces later as an opaque "argmax of an empty sequence" error.
    if len(depths) == 0:
        raise ValueError(
            "empty max_depth sweep: np.arange(5, %s, %s) yields no values"
            % (max_search, step))

    depth = []
    accuracy = []
    std_dev = []
    for max_depth in depths:
        # Parenthesised single-argument form prints identically on
        # Python 2 and Python 3.
        print("training with max_depth=%s" % max_depth)
        clf = RFC(n_estimators=100, max_depth=max_depth, n_jobs=-1,
                  class_weight='auto', max_features=None)
        all_scores = cross_validation.cross_val_score(clf, x_train, y_train,
                                                      cv=5)
        depth.append(max_depth)
        accuracy.append(np.mean(all_scores))
        std_dev.append(np.std(all_scores))

    if plot:
        # Error bars show the standard deviation across the folds.
        plt.errorbar(depth, accuracy, std_dev, linestyle='-', marker='o')
        plt.title('Mean cross validation accuracy')
        plt.xlabel('max depth')
        plt.ylabel('mean accuracy')
        plt.show()

    # np.argmax returns the first maximum, so ties go to the earliest depth.
    best_depth = depth[np.argmax(accuracy)]
    max_cv_accuracy = np.max(accuracy)
    plot_data = (depth, accuracy, std_dev)

    return best_depth, max_cv_accuracy, plot_data
def train_clf(x_train, y_train, best_depth):
    """ Fit a random forest classifier on the training data.

    Parameters
    ----------
    x_train : np.array [n_samples, n_features]
        Training features.
    y_train : np.array [n_samples]
        Training labels
    best_depth : int
        Value passed to the forest as max_depth.

    Returns
    -------
    clf : classifier
        Whatever ``fit`` returns, i.e. the trained scikit-learn classifier.
    """
    # NOTE(review): class_weight='auto' was deprecated in scikit-learn 0.17
    # in favour of 'balanced' -- confirm the targeted version before changing.
    forest_params = dict(n_estimators=100, max_depth=best_depth, n_jobs=-1,
                         class_weight='auto', max_features=None)
    return RFC(**forest_params).fit(x_train, y_train)
def clf_predictions(x_train, x_valid, x_test, clf):
    """ Predict class-1 probabilities for the train, validation and test sets.

    Parameters
    ----------
    x_train : np.array [n_samples, n_features]
        Training features.
    x_valid : np.array [n_samples, n_features]
        Validation features.
    x_test : np.array [n_samples, n_features]
        Testing features.
    clf : classifier
        Trained scikit-learn classifier exposing ``predict_proba``.

    Returns
    -------
    p_train : np.array [n_samples]
        predicted probabilities for training set
    p_valid : np.array [n_samples]
        predicted probabilities for validation set
    p_test : np.array [n_samples]
        predicted probabilities for testing set
    """
    def second_column_proba(features):
        # Column 1 of predict_proba; presumably the positive class --
        # verify against clf.classes_.
        return clf.predict_proba(features)[:, 1]

    return (second_column_proba(x_train),
            second_column_proba(x_valid),
            second_column_proba(x_test))
def _split_scores(y_true, p_pred, threshold):
    """ Compute every metric for one data split (helper for clf_metrics). """
    # Coerce so plain sequences work with the element-wise comparison below.
    p_pred = np.asarray(p_pred)
    # Hard 0/1 predictions: 1 where the probability reaches the threshold.
    y_pred = (p_pred >= threshold).astype(int)

    scores = {}
    scores['accuracy'] = metrics.accuracy_score(y_true, y_pred)
    scores['mcc'] = metrics.matthews_corrcoef(y_true, y_pred)
    (precision, recall, f_score, support) = \
        metrics.precision_recall_fscore_support(y_true, y_pred)
    scores['precision'] = precision
    scores['recall'] = recall
    scores['f1'] = f_score
    scores['support'] = support
    scores['confusion matrix'] = \
        metrics.confusion_matrix(y_true, y_pred, labels=[0, 1])
    # AUC depends only on the ranking of the scores, so the raw
    # probabilities are passed as-is; shifting them by a constant cannot
    # change the result and only risks merging near-equal scores.
    scores['auc score'] = \
        metrics.roc_auc_score(y_true, p_pred, average='weighted')
    return scores


def clf_metrics(p_train, p_test, y_train, y_test, threshold=0.5):
    """ Compute metrics on classifier predictions

    Parameters
    ----------
    p_train : np.array [n_samples]
        predicted probabilities for training set
    p_test : np.array [n_samples]
        predicted probabilities for testing set
    y_train : np.array [n_samples]
        Training labels.
    y_test : np.array [n_samples]
        Testing labels.
    threshold : float
        Probabilities greater than or equal to this value are predicted
        as class 1. Defaults to 0.5.

    Returns
    -------
    clf_scores : dict
        ``{'train': train_scores, 'test': test_scores}``. Each value is a
        dict with the keys 'accuracy', 'mcc', 'precision', 'recall', 'f1',
        'support', 'confusion matrix' and 'auc score'; precision, recall,
        f1 and support are whatever
        ``metrics.precision_recall_fscore_support`` returns for the split.
    """
    clf_scores = {'train': _split_scores(y_train, p_train, threshold),
                  'test': _split_scores(y_test, p_test, threshold)}
    return clf_scores