
Commit

Super!
diegoesteves committed Aug 27, 2018
1 parent ee9dcf9 commit cf03d14
Showing 6 changed files with 170 additions and 200 deletions.
64 changes: 32 additions & 32 deletions python/defacto/definitions.py
@@ -10,8 +10,9 @@
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, Ridge, PassiveAggressiveClassifier, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, LinearSVC
from sklearn.svm import SVR, LinearSVC, LinearSVR
from sklearn.tree import DecisionTreeClassifier
import numpy as np

@@ -76,7 +77,7 @@
# sampling parameters
CROSS_VALIDATION_K_FOLDS = 10

TEST_SIZE = 0.2
TEST_SIZE = 0.25

RANDOM_STATE = 53

@@ -98,7 +99,7 @@
BEST_PAD_EXPERIMENT_TYPE = EXP_5_CLASSES_LABEL
BEST_PAD_ALGORITHM = 'nb'

THRESHOLD_LABEL_2class = 0.70
THRESHOLD_LABEL_2class = 0.68
THRESHOLD_LABEL_3class = 0.45

# classifiers x hyper-parameters x search method
@@ -120,6 +121,7 @@
dt_param = trees_param_basic.copy()
dt_param["criterion"] = ['gini', 'entropy']

BEST_FEATURES_PERCENT = [1.0, 0.85, 0.70, 0.55, 0.40, 0.25, 0.10]

CONFIG_FEATURES_BASIC = ['basic',
['basic_text', 'domain', 'suffix', 'source', 'outbound_links_http', 'outbound_links_https',
@@ -157,47 +159,45 @@
CONFIG_FEATURES = [CONFIG_FEATURES_BASIC, CONFIG_FEATURES_BASIC_GI, CONFIG_FEATURES_ALL, CONFIG_FEATURES_ALL_HTML2SEQ]


CONFIGS_HIGH_DIMEN = [(MultinomialNB(), dict(alpha=[1.0, 0.7, 0.5, 0.1]), SEARCH_METHOD_GRID),
(BernoulliNB(), dict(alpha=[1.0, 0.7, 0.5, 0.1]), SEARCH_METHOD_GRID),
(GaussianNB(), None, None),
(LinearSVC(), dict(loss=['hinge', 'squared_hinge'], C=[1e0, 1e-1, 1e-2], multi_class=['ovr', 'crammer_singer']), SEARCH_METHOD_GRID),
#(KMeans(), dict(init=["k-means++", "random"], n_init=[5, 10], tol=[1e0, 1e-1, 1e-2],
# algorithm=['auto', 'elkan'], n_clusters=[2, 3, 5, 7, 10, 15]),
# SEARCH_METHOD_RANDOMIZED_GRID),
CONFIGS_HIGH_DIMEN_CLASSIFICATION = [(MultinomialNB(), dict(alpha=[1.0, 0.7, 0.5, 0.1]), SEARCH_METHOD_GRID),
(BernoulliNB(), dict(alpha=[1.0, 0.7, 0.5, 0.1]), SEARCH_METHOD_GRID),
(MLPClassifier(), dict(hidden_layer_sizes=[(50,), (100,), (200,)], activation=['logistic', 'tanh', 'relu'],
solver=['lbfgs', 'sgd', 'adam'], alpha=[1e0, 1e-1, 1e-2, 1e-3],
early_stopping=[True]), SEARCH_METHOD_RANDOMIZED_GRID),
(LinearSVC(), dict(loss=['hinge', 'squared_hinge'], C=[1e0, 1e-1, 1e-2], multi_class=['ovr', 'crammer_singer']), SEARCH_METHOD_GRID),
]

CONFIGS_HIGH_DIMEN_REGRESSION = [(MLPRegressor(), dict(hidden_layer_sizes=[(50,), (100,), (200,)], activation=['logistic', 'tanh', 'relu'],
solver=['lbfgs', 'sgd', 'adam'], alpha=[1e0, 1e-1, 1e-2, 1e-3],
early_stopping=[True]), SEARCH_METHOD_RANDOMIZED_GRID),
(LinearSVR(), dict(loss=['epsilon_insensitive', 'squared_epsilon_insensitive'], C=[1e0, 1e-1, 1e-2, 1e-3], tol=[1e0, 1e-1, 1e-2, 1e-3, 1e-4], epsilon=[0, 0.1]), SEARCH_METHOD_RANDOMIZED_GRID),
]

CONFIGS_REGRESSION = [(LogisticRegression(n_jobs=-1),
CONFIGS_REGRESSION = [(LogisticRegression(),
dict(solver=["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
multi_class=["ovr", "multinomial"], tol=[1e0, 1e-1, 1e-2, 1e-3], penalty=["l1", "l2"],
multi_class=["ovr", "multinomial"], tol=[1e0, 1e-1, 1e-2, 1e-3, 1e-4], penalty=["l1", "l2"],
C=[0.1, 0.5, 1.0, 3.0, 5.0, 10.0, 50.0, 100.0]),
SEARCH_METHOD_RANDOMIZED_GRID),
(Ridge(), dict(alpha=[1e0, 1e-1, 1e-2, 1e-3],
solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
tol=[1e0, 1e-1, 1e-2, 1e-3]),
tol=[1e0, 1e-1, 1e-2, 1e-3, 1e-4]),
SEARCH_METHOD_RANDOMIZED_GRID),
(SVR(), dict(epsilon=[1e0, 1e-1, 1e-2, 1e-3], kernel=["linear", "poly", "rbf", "sigmoid"],
tol=[1e0, 1e-1, 1e-2, 1e-3], C=[0.1, 0.5, 1.0, 3.0, 5.0, 10.0, 50.0, 100.0]), SEARCH_METHOD_RANDOMIZED_GRID)
]

CONFIGS_CLASSIFICATION = [
(DecisionTreeClassifier(), dt_param, SEARCH_METHOD_RANDOMIZED_GRID),
#(GradientBoostingClassifier(), gb_param, SEARCH_METHOD_RANDOMIZED_GRID),
#(RandomForestClassifier(n_jobs=-1), trees_param_bootstrap, SEARCH_METHOD_RANDOMIZED_GRID),
#(ExtraTreesClassifier(n_jobs=-1), trees_param_bootstrap, SEARCH_METHOD_RANDOMIZED_GRID),
#(BaggingClassifier(), {"n_estimators": [10, 25, 50, 100, 200, 400, 600, 1000, 1500, 2000],
# "base_estimator__max_depth": [1, 2, 3, 4, 5],
# "max_samples": [0.05, 0.1, 0.2, 0.5]}, SEARCH_METHOD_RANDOMIZED_GRID),
#(AdaBoostClassifier(), {"n_estimators": [10, 25, 50, 100, 200, 400, 600, 1000, 1500, 2000],
# "algorithm": ["SAMME", "SAMME.R"]}, SEARCH_METHOD_RANDOMIZED_GRID),
#(PassiveAggressiveClassifier(n_jobs=-1), {"tol": [1e0, 1e-1, 1e-2, 1e-3],
# "C": [0.1, 0.5, 1.0, 3.0, 5.0, 10.0, 50.0, 100.0],
# "loss": ["hinge", "squared_hinge"]}, SEARCH_METHOD_GRID),
#(SGDClassifier(n_jobs=-1), {"loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
# "penality": ["none", "l2", "l1", "elasticnet"],
# "alpha": [1e0, 1e-1, 1e-2, 1e-3],
# "tol": [1e0, 1e-1, 1e-2, 1e-3],
# "learning_rate": ["constant", "invscaling", "optimal"]},
# SEARCH_METHOD_RANDOMIZED_GRID),
(GradientBoostingClassifier(), gb_param, SEARCH_METHOD_RANDOMIZED_GRID),
(RandomForestClassifier(), trees_param_bootstrap, SEARCH_METHOD_RANDOMIZED_GRID),
(ExtraTreesClassifier(), trees_param_bootstrap, SEARCH_METHOD_RANDOMIZED_GRID),
(BaggingClassifier(), dict(n_estimators=[10, 25, 50, 100, 200, 400, 600, 1000, 1500, 2000], base_estimator__max_depth=[1, 2, 3, 4, 5],
max_samples=[0.05, 0.1, 0.2, 0.5]), SEARCH_METHOD_RANDOMIZED_GRID),
(AdaBoostClassifier(), dict(n_estimators=[10, 25, 50, 100, 200, 400, 600, 1000, 1500, 2000], algorithm=["SAMME", "SAMME.R"]), SEARCH_METHOD_RANDOMIZED_GRID),
(PassiveAggressiveClassifier(), dict(tol=[1e0, 1e-1, 1e-2, 1e-3], C=[0.1, 0.5, 1.0, 3.0, 5.0, 10.0, 50.0, 100.0], loss=["hinge", "squared_hinge"]), SEARCH_METHOD_RANDOMIZED_GRID),
(SGDClassifier(n_jobs=-1), dict(loss=["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
penalty=["none", "l2", "l1", "elasticnet"], alpha=[1e0, 1e-1, 1e-2, 1e-3],
tol=[1e0, 1e-1, 1e-2, 1e-3], learning_rate=["constant", "invscaling", "optimal"]), SEARCH_METHOD_RANDOMIZED_GRID),
(BernoulliNB(), {"alpha": [1e0, 1e-1, 1e-2, 1e-3]}, SEARCH_METHOD_GRID),
##MLPClassifier(hidden_layer_sizes=(hidden_nodes,hidden_nodes,hidden_nodes), solver='adam', alpha=1e-05)
]
(MultinomialNB(), dict(alpha=[1.0, 0.7, 0.5, 0.1]), SEARCH_METHOD_GRID)]
#MLPClassifier(hidden_layer_sizes=(hidden_nodes,hidden_nodes,hidden_nodes), solver='adam', alpha=1e-05)
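
Each tuple in the CONFIG lists above pairs an estimator with a hyper-parameter space and a search strategy. A minimal, self-contained sketch of how such tuples could be consumed follows; the string values of SEARCH_METHOD_GRID / SEARCH_METHOD_RANDOMIZED_GRID and the toy dataset are assumptions, not taken from the repository.

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier

# assumed values of the search-method constants
SEARCH_METHOD_GRID = 'grid'
SEARCH_METHOD_RANDOMIZED_GRID = 'randomized'

CONFIGS_CLASSIFICATION = [
    (DecisionTreeClassifier(), dict(criterion=['gini', 'entropy'], max_depth=[3, 5, None]), SEARCH_METHOD_RANDOMIZED_GRID),
    (BernoulliNB(), dict(alpha=[1e0, 1e-1, 1e-2, 1e-3]), SEARCH_METHOD_GRID),
]

X, y = make_classification(n_samples=200, n_features=20, random_state=53)

for estimator, hyperparam, grid_method in CONFIGS_CLASSIFICATION:
    if grid_method == SEARCH_METHOD_GRID:
        search = GridSearchCV(estimator, hyperparam, cv=10, scoring='f1_macro')
    else:
        # small n_iter keeps the randomized search cheap on the toy data
        search = RandomizedSearchCV(estimator, hyperparam, n_iter=5, cv=10,
                                    scoring='f1_macro', random_state=53)
    search.fit(X, y)
    print(type(estimator).__name__, round(search.best_score_, 3), search.best_params_)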
76 changes: 30 additions & 46 deletions python/trustworthiness/benchmark.py
@@ -11,8 +11,9 @@
from defacto.definitions import OUTPUT_FOLDER, TEST_SIZE, \
HEADER, EXP_5_CLASSES_LABEL, EXP_3_CLASSES_LABEL, EXP_2_CLASSES_LABEL, LINE_TEMPLATE, \
LABELS_2_CLASSES, LABELS_5_CLASSES, CROSS_VALIDATION_K_FOLDS, SEARCH_METHOD_RANDOMIZED_GRID, SEARCH_METHOD_GRID, \
CONFIGS_CLASSIFICATION, CONFIGS_REGRESSION, CONFIGS_HIGH_DIMEN, LABELS_3_CLASSES, THRESHOLD_LABEL_2class, \
THRESHOLD_LABEL_3class, RANDOM_STATE
CONFIGS_CLASSIFICATION, CONFIGS_REGRESSION, CONFIGS_HIGH_DIMEN_CLASSIFICATION, LABELS_3_CLASSES, \
THRESHOLD_LABEL_2class, \
THRESHOLD_LABEL_3class, RANDOM_STATE, BEST_FEATURES_PERCENT
from trustworthiness.benchmark_utils import train_test_export_save_per_exp_type
from trustworthiness.feature_extractor import *
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, KFold, RandomizedSearchCV
@@ -202,17 +203,16 @@ def feature_selection():
except:
raise

def benchmark(X, y5, exp_folder, ds_folder, subfolder, random_state, test_size):
def benchmark(X, y5, exp_folder, ds_folder, subfolder, random_state, test_size, perc_f):

config.logger.info('benchmark_text()')

try:
y_train_3, y_test_3 = [], []
y_train_2, y_test_2 = [], []

out_models_folder = OUTPUT_FOLDER + exp_folder + ds_folder + 'models/' + subfolder
if not os.path.exists(out_models_folder):
os.makedirs(out_models_folder)
path = OUTPUT_FOLDER + exp_folder + ds_folder + 'benchmark/' + subfolder


#input_layer_neurons = len(X) + 1
#output_layer_neurons = 1
@@ -237,9 +237,9 @@ def benchmark(X, y5, exp_folder, ds_folder, subfolder, random_state, test_size):

config.logger.debug('OK. feature selection')
# feature selection
X_best_5 = SelectPercentile(f_regression, 0.5).fit_transform(X, y5)
X_best_3 = SelectPercentile(chi2, 0.5).fit_transform(X, y3)
X_best_2 = SelectPercentile(chi2, 0.5).fit_transform(X, y2)
X_best_5 = SelectPercentile(f_regression, perc_f).fit_transform(X, y5)
X_best_3 = SelectPercentile(chi2, perc_f).fit_transform(X, y3)
X_best_2 = SelectPercentile(chi2, perc_f).fit_transform(X, y2)

scaler.fit(X_train)
X_train = scaler.transform(X_train)
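
Note that scikit-learn's SelectPercentile interprets its percentile argument on a 0-100 scale, so fractions such as those in BEST_FEATURES_PERCENT would keep well under 1% of the features if passed through unchanged. The standalone sketch below shows the selection step in isolation; the * 100 rescaling and the toy data are assumptions about the intended meaning of the fractions, not the project's pipeline.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile, chi2

BEST_FEATURES_PERCENT = [1.0, 0.85, 0.70, 0.55, 0.40, 0.25, 0.10]

# toy, non-negative features so chi2 is applicable
X, y = make_classification(n_samples=100, n_features=40, random_state=53)
X = np.abs(X)

for perc_f in BEST_FEATURES_PERCENT:
    # percentile is a percentage in [0, 100]; the fractions above are
    # assumed to be meant as ratios, hence the rescaling here
    selector = SelectPercentile(chi2, percentile=perc_f * 100)
    X_best = selector.fit_transform(X, y)
    print(perc_f, X_best.shape[1], 'features kept')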
@@ -255,13 +255,15 @@ def benchmark(X, y5, exp_folder, ds_folder, subfolder, random_state, test_size):
x_axis_label = 'Classifiers'
y_axis_label = 'F1-measure'



# --------------------------------------------------------------------------------------------------------------
# classification experiment
# --------------------------------------------------------------------------------------------------------------
config.logger.info('starting experiments classification (2-classes and 3-classes)')
i = 1
for exp_type in (EXP_2_CLASSES_LABEL, EXP_3_CLASSES_LABEL):
with open(out_models_folder + exp_type + '/perf.classification.log', "w") as file_log_classification:
with open(path + exp_type + '/log/results.txt', "w") as file_log_classification:
file_log_classification.write(HEADER)
if exp_type == EXP_2_CLASSES_LABEL:
y_train = y_train_2
@@ -310,7 +312,7 @@ def benchmark(X, y5, exp_folder, ds_folder, subfolder, random_state, test_size):
# --------------------------------------------------------------------------------------------------------------
config.logger.info('starting experiments regression (5-classes)')

with open(out_models_folder + exp_type + '/perf.regression.log', "w") as file_log_regression:
with open(path + exp_type + '/log/results.txt', "w") as file_log_regression:
file_log_regression.write(HEADER)
for estimator, hyperparam, grid_method in CONFIGS_REGRESSION:
out = []
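
The regression branch walks CONFIGS_REGRESSION in the same way, fitting each estimator against the 5-class scores. A standalone sketch of one such search with Ridge follows; the scorer and the toy data are assumptions, since the project's own scoring setup is not visible in this diff.

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

X, y = make_regression(n_samples=200, n_features=20, noise=0.5, random_state=53)

# one entry of CONFIGS_REGRESSION, searched with a regression metric
params = dict(alpha=[1e0, 1e-1, 1e-2, 1e-3],
              solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
              tol=[1e0, 1e-1, 1e-2, 1e-3, 1e-4])
search = RandomizedSearchCV(Ridge(), params, n_iter=10, cv=10,
                            scoring='neg_mean_squared_error', random_state=53)
search.fit(X, y)
print(search.best_params_, round(search.best_score_, 3))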
@@ -330,41 +332,23 @@
ds = 'microsoft/'
K1 ='882'
exp ='exp010/'


EXP_CONFIGS = [
{'EXP_FOLDER': exp, 'DS_FOLDER': ds, 'FEATURES_FILE': 'features.basic.' + K1 + '.pkl'},
{'EXP_FOLDER': exp, 'DS_FOLDER': ds, 'FEATURES_FILE': 'features.basic_gi.' + K1 + '.pkl'},
{'EXP_FOLDER': exp, 'DS_FOLDER': ds, 'FEATURES_FILE': 'features.all.' + K1 + '.pkl'},
{'EXP_FOLDER': exp, 'DS_FOLDER': ds, 'FEATURES_FILE': 'features.all+html2seq.' + K1 + '.pkl'},
{'EXP_FOLDER': exp, 'DS_FOLDER': ds, 'FEATURES_FILE': 'features.html2seq.' + K1 + '.pkl'},
]


exit(0)

# benchmarking text features and text features + html2seq (with best HTML2seq model)

for conf in EXP_CONFIGS:
config.logger.debug('')
config.logger.debug('------------------------------------------------------------------')
config.logger.debug('01. TEXT FEATURES (only)')
config.logger.debug('------------------------------------------------------------------')
config.logger.debug('')

features_tex, y5, y3, y2 = get_text_features(conf['EXP_FOLDER'], conf['DS_FOLDER'], conf['FEATURES_FILE'], html2seq=False)

benchmark(features_tex, y5, conf['EXP_FOLDER'], conf['DS_FOLDER'], 'text/', RANDOM_STATE, TEST_SIZE)

config.logger.debug('')
config.logger.debug('------------------------------------------------------------------')
config.logger.debug('02. TEXT + HTML2Seq features combined (out of best configurations)')
config.logger.debug('------------------------------------------------------------------')
config.logger.debug('')

features_combined, y5, y3, y2 = get_text_features(conf['EXP_FOLDER'], conf['DS_FOLDER'], conf['FEATURES_FILE'], html2seq=True)

benchmark(features_combined, y5, conf['EXP_FOLDER'], conf['DS_FOLDER'], 'text+html/', RANDOM_STATE, TEST_SIZE)
features_file = 'features.all+html2seq.2977.pkl'

verify_and_create_experiment_folders(exp, ds)

#EXP_CONFIGS = [
# {'EXP_FOLDER': exp, 'DS_FOLDER': ds, 'FEATURES_FILE': 'features.basic.' + K1 + '.pkl'},
# {'EXP_FOLDER': exp, 'DS_FOLDER': ds, 'FEATURES_FILE': 'features.basic_gi.' + K1 + '.pkl'},
# {'EXP_FOLDER': exp, 'DS_FOLDER': ds, 'FEATURES_FILE': 'features.all.' + K1 + '.pkl'},
# {'EXP_FOLDER': exp, 'DS_FOLDER': ds, 'FEATURES_FILE': 'features.all+html2seq.' + K1 + '.pkl'},
#]

# benchmarking text features + html2seq (with best HTML2seq model)
config.logger.debug('02. TEXT + HTML2Seq features combined (out of best configurations)')
features_combined, y5, y3, y2 = get_text_features(exp, ds, features_file, html2seq=True)
for best_k_features in BEST_FEATURES_PERCENT:
benchmark(features_combined, y5, exp, ds, 'best_k/', RANDOM_STATE, TEST_SIZE, best_k_features)
config.logger.info('done!')


except Exception as e:
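The imported THRESHOLD_LABEL_2class and THRESHOLD_LABEL_3class suggest that the 5-class scores are collapsed into 2-class and 3-class targets before the classification experiments, but the mapping itself is not part of this diff. One plausible thresholding scheme, purely as a hedged illustration: the function names, the [0, 1] score range, and the binning rule are all assumptions, not code from the repository.

import numpy as np

THRESHOLD_LABEL_2class = 0.68
THRESHOLD_LABEL_3class = 0.45

def to_2_classes(y5, threshold=THRESHOLD_LABEL_2class):
    # assumption: y5 holds credibility scores normalised to [0, 1];
    # anything at or above the threshold counts as the positive class
    y5 = np.asarray(y5, dtype=float)
    return (y5 >= threshold).astype(int)

def to_3_classes(y5, low=THRESHOLD_LABEL_3class, high=THRESHOLD_LABEL_2class):
    # assumption: below `low` -> 0, between the thresholds -> 1, at or above `high` -> 2
    y5 = np.asarray(y5, dtype=float)
    return np.digitize(y5, bins=[low, high])

y5 = [0.1, 0.5, 0.7, 0.9]
print(to_2_classes(y5))   # [0 0 1 1]
print(to_3_classes(y5))   # [0 1 2 2]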
