# Evaluator

In [3]:
from src.read.processing import Processing

In [4]:
data_loader = Processing()
data_loader.read("glass")
data_loader.general_preprocessing()
df = data_loader.df
df.head()



Unnamed: 0,RI,Na,Mg,Al,Si,Ca,Ba,Fe,Type
0,-0.143715,-0.758384,0.566677,-0.652289,0.490551,-0.13168,-0.352877,-0.586451,0
1,-0.638803,-1.531681,0.580575,-0.190536,0.309376,-0.300715,-0.352877,-0.586451,1
2,-0.143715,-0.242853,0.552779,-0.070079,-0.014151,-0.371146,-0.352877,-0.586451,0
3,-1.774207,1.21782,-0.656366,0.190912,2.457593,-0.962769,-0.352877,-0.586451,2
4,5.137232,-1.359837,-1.865511,-0.893204,-3.223534,5.094318,-0.352877,1.882411,3


# Evaluation Class

In [5]:
import config
from itertools import product
import pandas as pd

class Evaluator:

    def __init__(self, 
                 model, 
                 data_loader, 
                 target: str, 
                 d_metrics: dict=config.D_METRICS,
                 d_hyperparams: dict=config.D_HYPERPARAMS):
        '''
        Input:
            model: class of the model to be evaluated 
                    (not instantiated, e.g. KMeans not KMeans())
            data_loader: instance of the class Processing
            target: name of the target column
            d_metrics: dictionary with the metrics to be used
            d_hyperparams: dictionary with the hyperparameters to be used

        Attributes:
            model, target, d_metrics, d_hyperparams
            
            data: dataframe with the data from data_loader
            y_true: true labels (target column from data)
        '''
        self.model = model
        self.data = data_loader.df
        self.target = target
        self.y_true = self.data[self.target]
        self.d_metrics = d_metrics
        self.d_hyperparams = d_hyperparams
        self.df_results = None

    def get_best_hyperparams(self, metric: str):
        '''
        Input:
            metric: name of the metric to be used
        Output:
            d_best: dictionary with the best hyperparameters
        '''
        # Get the best hyperparameters
        model_name = self.model.__name__
        hyperparams = self.d_hyperparams[model_name]

        d_best = self.get_top_k_hyperparams(metric, 1).to_dict('records')[0]

        d_best = {k:v for k,v in d_best.items() for k in hyperparams.keys()}

        return d_best
    
    def get_top_k_hyperparams(self, metric: str, k: int):
        '''
        Input:
            metric: name of the metric to be used
            k: number of hyperparameters to be returned

        Output:
            d_best: dictionary with the top k best hyperparameters
        '''
        # Get the results for each combination of hyperparameters
        if self.df_results is None:
            df_results = self.get_results_hyperparams()
        else:
            df_results = self.df_results

        # Get the best hyperparameters
        df_top_k = df_results.sort_values(by=metric, ascending=False).iloc[:k]

        return df_top_k
        


    def get_results_hyperparams(self):
        '''
        Returns a dataframe with the results of the evaluation
        for each combination of hyperparameters
        '''
        # Get the grid of hyperparameters
        model_name = self.model.__name__
        ls_hyps = self.grid_from_dict(self.d_hyperparams[model_name])

        # Get the scores for each combination of hyperparameters
        ls_scores = []
        for hyps in ls_hyps:
            d_scores = self.score_hyperparams(**hyps)
            ls_scores.append(d_scores)

        # Create a dataframe with the results
        df_results = pd.DataFrame(ls_scores)

        self.df_results = df_results
        return df_results


    def score_hyperparams(self, **kw_hyperparams):
        '''
        Input:
            d_hyperparams: dictionary with the hyperparameters to be used
        Output:
            d_scores: dictionary with the scores for each metric
        '''

        # Get feature matrix X
        X = self.data.drop(self.target, axis=1).copy()
        
        # Instantiate and fit the model with the hyperparameters
        model = self.model(**kw_hyperparams)
        model.fit(X)

        # Get the predicted labels
        y_pred = model.labels_

        # Get the scores iterating over the internal and external scorers
        internal_scorers = self.d_metrics["internal"]
        external_scorers = self.d_metrics["external"]

        d_scores = kw_hyperparams.copy()
        d_scores['n_out_clusters'] = len(set(y_pred))
        d_scores['n_in_classes'] = len(set(self.y_true))
        
        for scorer in external_scorers:
            try:
                score = scorer(self.y_true, y_pred)
                d_scores[scorer.__name__] = score
            except:
                d_scores[scorer.__name__] = None
        
        for scorer in internal_scorers:
            try:
                score = scorer(X, y_pred)
                d_scores[scorer.__name__] = score
            except:
                d_scores[scorer.__name__] = None
        
        return d_scores

    @staticmethod
    def grid_from_dict(d_in):
        '''
        Receives a dictionary of lists of hyperparameters and 
        returns a list of dictionaries with all the possible combinations
        '''
        # Create a list of all possible hyperparameter combinations
        combinations = list(product(*d_in.values()))

        # Print the grid of hyperparameter combinations
        ls_hyps = []
        for combo in combinations:
            hyperparameters = dict(zip(d_in.keys(), combo))
            ls_hyps.append(hyperparameters)
            
        return ls_hyps

In [6]:
from src.clustering.DBSCAN import DBSCAN
model = DBSCAN
evaluator = Evaluator(model, data_loader, target="Type")
evaluator.score_hyperparams(eps=0.5, min_samples=5)
evaluator.get_results_hyperparams()
evaluator.get_top_k_hyperparams("silhouette_score", 2)
evaluator.get_best_hyperparams("silhouette_score")

{'eps': 0.1854298518460819, 'min_samples': 0.1854298518460819}

In [7]:
evaluator.get_top_k_hyperparams("silhouette_score", 5)

Unnamed: 0,eps,min_samples,n_out_clusters,n_in_classes,homogeneity_score,completeness_score,v_measure_score,silhouette_score
67,0.9,5,5,6,0.236938,0.34077,0.279523,0.18543
69,0.9,7,4,6,0.204923,0.324901,0.251327,0.169437
68,0.9,6,4,6,0.204923,0.324901,0.251327,0.169437
70,0.9,8,3,6,0.168327,0.307984,0.217681,0.168011
58,0.8,4,6,6,0.230076,0.323693,0.268972,0.161685
