In [39]:
!pip install deslib



In [40]:
import warnings
# Silence every warning for the rest of the session to keep cell outputs short.
# NOTE(review): this also hides deprecation and data-conversion warnings raised
# by the libraries used below; comment it out when debugging.
warnings.filterwarnings('ignore')

In [41]:
import pandas as pd
import numpy as np
from collections import Counter

from scipy.io import arff
import operator
from functools import partial
from deslib.util.instance_hardness import kdn_score
from deslib.util import diversity
from prefit_voting_classifier import PrefitVotingClassifier

from sklearn.linear_model import Perceptron
from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from imblearn.metrics import geometric_mean_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE

Carregamento dos conjuntos de dados

Dois bancos de dados binários e com atributos numéricos do [repositório Promise](http://promise.site.uottawa.ca/SERepository/datasets-page.html).

In [42]:
def load_data(file_name):
	"""Load an ARFF dataset, balance it with SMOTE and standardise its features.

	The last column of the file is used as the class attribute and rows with
	missing values are dropped before any processing.

	Parameters
	----------
	file_name : str
		Path of the ``.arff`` file to read.

	Returns
	-------
	instances : pandas.DataFrame
		Oversampled feature matrix after ``preprocessing.scale``.
	labels : pandas.DataFrame
		Single-column frame with the class labels, where ``"false"`` is
		replaced by 0 and ``"true"`` by 1.

	Notes
	-----
	NOTE(review): SMOTE and scaling are applied to the whole dataset, i.e.
	before any train/validation/test split done by the caller, so synthetic
	samples and scaling statistics can leak into held-out data.
	"""
	data, _ = arff.loadarff(file_name)

	df = pd.DataFrame(data).dropna()
	target = df.columns[-1]

	# The class column holds bytes values; decode it in one vectorised call
	# instead of a row-wise apply.
	labels = df[target].str.decode('utf-8')
	instances = df.drop(columns=[target])

	# pre-processing
	print('Original dataset shape %s' % labels.value_counts())
	sm = SMOTE(random_state=42)
	instances, labels = sm.fit_resample(instances, labels)
	# Depending on the imbalanced-learn version, fit_resample returns either
	# ndarrays or pandas objects. Flatten to a 1-D array so that Counter counts
	# class values (iterating a DataFrame would yield its column names).
	labels = np.ravel(labels)
	print('Resampled dataset shape %s' % Counter(labels))
	instances = pd.DataFrame(preprocessing.scale(instances))

	# replace labels
	labels = pd.DataFrame(labels).replace(["false", "true"], [0, 1])
	# Make sure the replaced column ends up numeric even when pandas does not
	# downcast object columns automatically.
	return instances, labels.infer_objects()

In [43]:
# Files to load; each one is expected in the current working directory.
datasets_filenames = ['cm1.arff', 'kc1.arff']

# Feature matrices and label frames, both keyed by dataset file name.
instances, labels = {}, {}

print("load data")
for name in datasets_filenames:
  print(name)
  instances[name], labels[name] = load_data(name)

load data
cm1.arff
Original dataset shape defects
false      449
true        49
dtype: int64
Resampled dataset shape Counter({'false': 449, 'true': 449})
kc1.arff
Original dataset shape defects
false      1783
true        326
dtype: int64
Resampled dataset shape Counter({'false': 1783, 'true': 1783})


Suponha três variantes do conjunto de validação:

(a) o conjunto original 𝒱; 

(b) o conjunto 𝒱’ contendo apenas as instâncias difíceis (kDN > 0,5) de 𝒱;

(c) o conjunto 𝒱’’ contendo apenas as instâncias fáceis (kDN <= 0,5) de 𝒱.

In [44]:
# kDN value that separates "hard" from "easy" validation instances.
threshold = 0.5

# (name, predicate) pairs; each predicate receives one kDN score and tells
# whether the instance is kept in the validation set.
#   None: keep everything (2 > score holds for every score, assuming kDN
#         scores lie in [0, 1])
#   Hard: threshold <  score, i.e. kDN >  0.5
#   Easy: threshold >= score, i.e. kDN <= 0.5, so that a score equal to the
#         threshold is not dropped from both subsets
kdn_conf = [
  ("None", partial(operator.gt, 2)),
  ("Hard", partial(operator.lt, threshold)),
  ("Easy", partial(operator.ge, threshold)),
]

In [45]:
def _filter_based_hardness(instances, labels, hards, op):

  triples = [(instances[i], labels[i], hards[i]) for i in range(len(hards))]

  return filter(lambda t: op(t[2]), triples)

In [46]:
def select_val_set(instances, labels, kdn_config, k):
  """Build the validation subset whose kDN scores satisfy `kdn_config`.

  kDN scores are computed with `k` neighbours over the given instances and
  flattened labels; `kdn_config` is a predicate applied to each score.
  Returns the selected instances and their labels as two numpy arrays.
  """
  # The neighbour indices returned alongside the scores are not needed here.
  scores, _ = kdn_score(instances, labels.flatten(), k)

  selected = list(_filter_based_hardness(instances, labels, scores, kdn_config))

  X_val = np.array([sample for sample, _, _ in selected])
  y_val = np.array([target for _, target, _ in selected])
  return X_val, y_val

**Best first**

1. Ordena os classificadores de acordo com o erro

2. Inicializa o ensemble com o melhor classificador (menor erro)

3. Insere o próximo melhor classificador no ensemble e avalia o erro, repetindo este passo até que todos os classificadores tenham sido inseridos

4. Retorna o ensemble com menor erro



In [47]:
def _order_clfs(pool_clf, validation_instances, validation_labels):
	"""Rank the base classifiers of a fitted bagging pool by validation error.

	Parameters
	----------
	pool_clf : fitted ensemble exposing ``estimators_`` and
		``estimators_features_`` (e.g. ``BaggingClassifier``).
	validation_instances : array-like of shape (n_samples, n_features)
	validation_labels : array-like with the true labels.

	Returns
	-------
	list of ``(classifier, feature_indices, error)`` triples sorted by
	ascending error, where error is ``1 - accuracy``.
	"""
	X_val = np.asarray(validation_instances)

	triples = []
	for clf, feats in zip(pool_clf.estimators_, pool_clf.estimators_features_):
		# Each base estimator was fitted on the columns listed in its
		# `estimators_features_` entry, so it has to be evaluated on the same
		# columns, in the same order, as the bagging ensemble itself does.
		predicted_labels = clf.predict(X_val[:, feats])
		error = 1 - accuracy_score(validation_labels, predicted_labels)
		triples.append((clf, feats, error))

	# sorted() is stable: classifiers with equal error keep their pool order.
	return sorted(triples, key=lambda t: t[2])

In [48]:
def _find_best_first(triples, validation_instances, validation_labels):
	"""Grow an ensemble in the given order and keep the best-scoring prefix.

	Parameters
	----------
	triples : iterable of ``(classifier, feature_indices, error)``, expected to
		be already sorted from best to worst classifier.
	validation_instances, validation_labels : data used to score each
		candidate ensemble.

	Returns
	-------
	The voting ensemble with the lowest validation error (``1 - accuracy``).
	Ties are resolved in favour of the smaller ensemble because only a
	strictly lower error replaces the current best. ``None`` is returned when
	`triples` is empty.
	"""
	best_ensemble_error = float('inf')
	best_ensemble = None

	cur_clfs = []
	cur_feats = []
	# The per-classifier error stored in each triple is not needed here; only
	# the error of the whole candidate ensemble matters.
	for clf, clf_feat, _ in triples:
		cur_clfs.append(clf)
		cur_feats.append(clf_feat)
		# Hand over snapshots: the working lists keep growing, and an ensemble
		# built earlier must not see classifiers/features appended later
		# (assumes the voting classifier keeps the lists it is given -- TODO
		# confirm against PrefitVotingClassifier).
		ensemble = _get_voting_clf(list(cur_clfs), list(cur_feats))
		predicted = ensemble.predict(validation_instances)
		error = 1 - accuracy_score(validation_labels, predicted)

		if error < best_ensemble_error:
			best_ensemble_error = error
			best_ensemble = ensemble

	return best_ensemble

In [49]:
def _get_voting_clf(base_clfs, clfs_feats, weights=None):
	"""Wrap already-fitted classifiers in a hard-voting PrefitVotingClassifier.

	Each classifier is named after its position in `base_clfs` ("0", "1", ...).
	`clfs_feats` and `weights` are forwarded unchanged; `weights=None` is
	passed through as ``None``.
	"""
	named_clfs = [(str(position), clf) for position, clf in enumerate(base_clfs)]
	return PrefitVotingClassifier(named_clfs, clfs_feats, voting='hard', weights=weights)

In [50]:
#from https://github.com/jpedrocm/pool-pruning-experiment/blob/master/code/prefit_voting_classifier.py
def _best_first_pruning(pool_clf, validation_instances, validation_labels):
	"""Prune `pool_clf` with the best-first strategy on the validation data.

	The base classifiers are first ranked by validation error and then added
	one at a time; the ensemble with the lowest validation error is returned.
	"""
	ranked_triples = _order_clfs(pool_clf, validation_instances, validation_labels)
	pruned_ensemble = _find_best_first(ranked_triples, validation_instances, validation_labels)
	return pruned_ensemble

Medidas de diversidade

Disagreement measure


In [51]:
def divesity_measure(pool, valid_instances,valid_labels,strategy):
  """Average pairwise disagreement measure of an ensemble on validation data.

  Parameters
  ----------
  pool : fitted ensemble exposing ``estimators_``.
  valid_instances : array-like of shape (n_samples, n_features)
  valid_labels : array-like with the true labels.
  strategy : label stored in the ``strategy`` column of the result.

  Returns
  -------
  pandas.DataFrame with a single row and the columns ``strategy``,
  ``pool_size`` and ``disagreement_measure``. The measure is the mean of
  ``diversity.disagreement_measure`` over all unordered pairs of base
  classifiers, and 0 when the ensemble has fewer than two members.
  """
  estimators = pool.estimators_
  pool_size = len(estimators)

  measure_dm = 0

  if pool_size > 1:
    # Predict once per classifier instead of twice per pair.
    feats = getattr(pool, 'estimators_features_', None)
    if feats is None:
      predicted = [est.predict(valid_instances) for est in estimators]
    else:
      # When the ensemble records the feature subset of each member (as
      # BaggingClassifier does), evaluate every member on its own columns.
      X_val = np.asarray(valid_instances)
      predicted = [est.predict(X_val[:, f]) for est, f in zip(estimators, feats)]

    total = 0
    for i in range(pool_size - 1):
      for j in range(i + 1, pool_size):
        total += diversity.disagreement_measure(valid_labels, predicted[i], predicted[j])

    # There are pool_size * (pool_size - 1) / 2 pairs.
    measure_dm = (2 * total) / (pool_size * (pool_size - 1))

  new_row = {'strategy': strategy,
             'pool_size': pool_size,
             'disagreement_measure': measure_dm}

  # Build the one-row frame directly; DataFrame.append is not available in
  # pandas >= 2.0.
  return pd.DataFrame([new_row], columns=['strategy', 'pool_size', 'disagreement_measure'])

taxa de acerto, AUC, g-mean e f-measure

K-fold

In [53]:
# predictions[dataset][fold]["labels"]            -> true test labels
# predictions[dataset][fold][hardness][strategy]  -> predicted test labels
# subpredictions[dataset][fold][hardness]         -> [pool diversity, pruned diversity]
predictions = {}
subpredictions = {}
n_folds = 10

for ds_name in datasets_filenames:
  print("Dataset name: ", ds_name)
  X = instances[ds_name]
  y = labels[ds_name]

  predictions[ds_name] = {}
  subpredictions[ds_name] = {}

  skf = StratifiedKFold(n_splits=n_folds)
  k = 5 # number of neighbours used by kDN

  # One fold out of ten is the test set; the remaining nine are split into
  # training and validation in a 7:2 proportion.
  train_size = 0.70
  val_size = 0.20
  new_train_size = np.around(train_size / (val_size + train_size), 2)
  val_size = 1.0 - new_train_size

  for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    # Seeded so that a rerun reproduces the same train/validation partition.
    train, valid = train_test_split(train_index, test_size=val_size, random_state=0)

    X_train, X_valid, X_test = X.iloc[train], X.iloc[valid], X.iloc[test_index]
    y_train, y_valid, y_test = y.iloc[train], y.iloc[valid], y.iloc[test_index]

    predictions[ds_name][fold] = {}
    subpredictions[ds_name][fold] = {}

    predictions[ds_name][fold]["labels"] = np.array(y_test).tolist()

    # The pool depends only on the training data and a fixed seed, so it is
    # trained (and evaluated on the test fold) once per fold rather than once
    # per hardness type. The base estimator is passed positionally because its
    # keyword name differs between scikit-learn versions.
    clf_pool = BaggingClassifier(Perceptron(), n_estimators=100, random_state=0)
    clf_pool.fit(np.array(X_train), np.ravel(y_train))
    cur_predictions_ = clf_pool.predict(np.array(X_test))

    # Use the KDN to select the validation set
    for hardness_type, filter_func in kdn_conf:
      X_val, y_val = select_val_set(np.array(X_valid), np.array(y_valid), filter_func, k)

      best_first = _best_first_pruning(clf_pool, X_val, y_val)
      cur_predictions_bf = best_first.predict(np.array(X_test))

      predictions[ds_name][fold][hardness_type] = {}
      predictions[ds_name][fold][hardness_type]['None'] = np.array(cur_predictions_).tolist()
      predictions[ds_name][fold][hardness_type]['best_first'] = np.array(cur_predictions_bf).tolist()

      pool_diversity = divesity_measure(clf_pool, X_val, y_val, 'None')
      bf_diversity = divesity_measure(best_first, X_val, y_val, 'best_first')

      # Order matters: index 0 is the full pool, index 1 the pruned ensemble.
      subpredictions[ds_name][fold][hardness_type] = [pool_diversity, bf_diversity]

Dataset name:  cm1.arff
Dataset name:  kc1.arff


In [None]:
result_columns = ['dataset','estrategia','pool_size','kdn','taxa_acerto','AUC', 'g-mean', 'f-measure','disagreement_measure']
# Order must match the [pool, pruned] order of the diversity frames stored in
# subpredictions[dataset][fold][hardness].
pruned_strategy = ['None','best_first']

# Rows are collected in a list and turned into a frame once at the end;
# DataFrame.append is not available in pandas >= 2.0.
result_rows = []

for ds_name in datasets_filenames:
  for hardness_type, filter_func in kdn_conf:
    for i, strategy in enumerate(pruned_strategy):
      taxa_acerto = []
      auc = []
      gmean = []
      fmeasure = []
      pool_size = []
      disagrement = []
      for fold in range(1, n_folds + 1):
        _labels = predictions[ds_name][fold]["labels"]
        _labels_predict = predictions[ds_name][fold][hardness_type][strategy]

        df_data = subpredictions[ds_name][fold][hardness_type]

        taxa_acerto.append(accuracy_score(_labels, _labels_predict))
        # NOTE(review): AUC is computed from hard class predictions, not from
        # scores or probabilities.
        auc.append(roc_auc_score(_labels, _labels_predict))
        gmean.append(geometric_mean_score(np.array(_labels), np.array(_labels_predict)))
        fmeasure.append(f1_score(_labels, _labels_predict))
        pool_size.append(df_data[i].pool_size[0])
        disagrement.append(df_data[i].disagreement_measure[0])

      # pool_size is reported as "min (mean) max"; every metric as "mean (std)"
      # over the folds.
      result_rows.append({
        'dataset': ds_name,
        'estrategia': strategy,
        'pool_size': "%d (%0.3f) %d" % (np.min(pool_size), np.mean(pool_size), np.max(pool_size)),
        'kdn': hardness_type,
        'taxa_acerto': "%0.3f (%0.3f)" % (np.mean(taxa_acerto), np.std(taxa_acerto)),
        'AUC': "%0.3f (%0.3f)" % (np.mean(auc), np.std(auc)),
        'g-mean': "%0.3f (%0.3f)" % (np.mean(gmean), np.std(gmean)),
        'f-measure': "%0.3f (%0.3f)" % (np.mean(fmeasure), np.std(fmeasure)),
        'disagreement_measure': "%0.3f (%0.3f)" % (np.mean(disagrement), np.std(disagrement)),
      })

df_result = pd.DataFrame(result_rows, columns=result_columns)

# NOTE(review): absolute path that exists on Google Colab; adjust when running
# elsewhere.
df_result.to_csv("/content/df_result.csv", index=False)
df_result