Since the classes in df_performance are imbalanced, we undersample the majority class with k-medoids, using k = the size of the minority class. Only the k medoids are kept from the majority class, so that both classes have the same number of instances in the training datasets.

In [None]:
#########################################################################################################################################################################################################

def auto_extraction_balanced(list_df_data, list_df_metadata, list_df_performance):
    """Extract Good and Bad intervals for every performance measure.

    The three lists are read in parallel: position ``i`` of each list is
    used together. Every frame in ``list_df_performance`` is identified by
    the name of its first column.

    Parameters
    ----------
    list_df_data : list
        Data frames, one per performance measure, forwarded to
        ``get_intervals``.
    list_df_metadata : list
        Meta-feature frames; every column is processed separately.
    list_df_performance : list
        Performance frames, forwarded whole to ``get_intervals``.

    Returns
    -------
    tuple of dict
        ``(dict_G, dict_B)``, both nested as
        ``dict[performance][metafeature]`` with the keys ``'interval_ind'``
        and ``'interval_val'`` holding what ``get_intervals`` returned for
        the Good and the Bad instances respectively.
    """
    dict_G = {}
    dict_B = {}

    for position, df_perf in enumerate(list_df_performance):
        # The first column name identifies the performance measure
        performance = list(df_perf.columns)[0]

        good_by_metafeature = {}
        bad_by_metafeature = {}
        dict_G[performance] = good_by_metafeature
        dict_B[performance] = bad_by_metafeature

        df_meta = list_df_metadata[position]

        # One set of intervals per meta-feature column (MFj)
        for metafeature in list(df_meta.columns):
            G_int_ind, G_int_val, B_int_ind, B_int_val = get_intervals(
                list_df_data[position],
                df_meta[metafeature],
                df_perf,
            )

            good_by_metafeature[metafeature] = {
                'interval_ind': G_int_ind,
                'interval_val': G_int_val,
            }
            bad_by_metafeature[metafeature] = {
                'interval_ind': B_int_ind,
                'interval_val': B_int_val,
            }

    return dict_G, dict_B

In [None]:
import numpy as np
import preprocess as pp

# Parallel lists: position i of each list holds the balanced data,
# meta-features and performance returned for the i-th performance column.
list_df_train_balanced = []
list_df_metadata_train_balanced = []
list_df_performance_train_balanced = []

# Balance the training set once per performance measure
# (iterating a DataFrame yields its column names).
for column in df_performance_train:
    
    # Alternative balancing strategy (k-medoids), currently disabled:
    #data, metadata, performance  = pp.kmedoids_balance(column, df_performance_train, df_metadata_train, df_train)
    
    # Active strategy: random balancing via preprocess.random_balance
    data, metadata, performance  = pp.random_balance(column, df_performance_train, df_metadata_train, df_train)

    # Keep the three results aligned by appending them in the same iteration
    list_df_train_balanced.append(data)
    list_df_metadata_train_balanced.append(metadata)
    list_df_performance_train_balanced.append(performance)
    
    

AEM

In [1]:
# dict_G, dict_B, = aem.auto_extraction_balanced(list_df_train_balanced, 
#                                      list_df_metadata_train_balanced, 
#                                      list_df_performance_train_balanced)

# visualize the tables of intervals obtained by the AEM for each performance measure
# for performance in dict_good_rules:
#    print(f"{performance}")
#    aem.display_side_by_side(dict_good_rules[performance] ,dict_bad_rules[performance], titles=['Good','Bad'])


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import cf_matrix as cm

# Let's get the performance names for the current dataset
performance_names = list(df_performance_test.columns)

# Values that can appear in the 'Final' column, in the same order as the
# `categories` used for plotting: -1 = None, 0 = Bad, 1 = Good.
class_values = [-1, 0, 1]

# Step 1: combine the PRD and NRD flags into one final prediction per instance
for performance in performance_names:
    pred_table = dict_pred[performance]

    # Normalise the rule flags to booleans
    pred_table['PRD'] = pred_table['PRD'] == 1
    pred_table['NRD'] = pred_table['NRD'] == 1

    # An instance is only classified when exactly one of the two fires
    pred_table['PRD_not_NRD'] = pred_table['PRD'] & ~pred_table['NRD']
    pred_table['NRD_not_PRD'] = ~pred_table['PRD'] & pred_table['NRD']

    # Which rules are used to classify Good and Bad instances
    mask1 = pred_table['PRD_not_NRD']
    mask2 = pred_table['NRD_not_PRD']

    # Start with "no decision" (-1) everywhere, then fill in Bad and Good
    pred_table['Final'] = [-1] * len(df_metadata_test)

    pred_table.loc[mask2, 'Final'] = 0
    pred_table.loc[mask1, 'Final'] = 1

# Step 2: plot one confusion matrix per performance measure
for performance in performance_names:
    # Passing `labels` pins the matrix to 3x3 even when no instance ends up
    # as -1 (or a class is absent), so it always matches `categories`.
    matrix = confusion_matrix(df_performance_test[performance],
                              dict_pred[performance]['Final'],
                              labels=class_values)
    # NOTE(review): four group names cannot describe the nine cells of a 3x3
    # matrix - confirm how cm.make_confusion_matrix treats this mismatch.
    labels = ['True Neg','False Pos','False Neg','True Pos']
    #categories = ['Bad', 'Good']
    categories = ['None', 'Bad', 'Good']
    cm.make_confusion_matrix(matrix,
                      group_names=labels,
                      categories=categories,
                      cmap='Blues',
                      title = performance)

In [None]:
def auto_extraction_original(df_data, df_performance, percent_drop, percent_merge):
    """Extract Good and Bad intervals over the original features.

    Every column of ``df_performance`` is combined with every column of
    ``df_data`` and handed to ``get_intervals``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Original features; each column is processed separately.
    df_performance : pandas.DataFrame
        One column per performance measure.
    percent_drop, percent_merge
        Forwarded to ``get_intervals``. When ``percent_drop`` is a
        ``pandas.Series``, BOTH arguments are looked up by performance name
        before being forwarded; otherwise both are passed through as given.

    Returns
    -------
    tuple of dict
        ``(dict_G, dict_B)``, both nested as ``dict[performance][feature]``
        with the keys ``'interval_ind'`` and ``'interval_val'``.
    """
    dict_G = {}
    dict_B = {}

    # Only the type of percent_drop decides whether values are per measure
    per_measure = isinstance(percent_drop, pd.Series)

    for performance in list(df_performance.columns):

        dict_G[performance] = {}
        dict_B[performance] = {}

        # One set of intervals per original feature column (OFj)
        for feature in list(df_data.columns):

            if per_measure:
                drop = percent_drop[performance]
                merge = percent_merge[performance]
            else:
                drop = percent_drop
                merge = percent_merge

            G_int_ind, G_int_val, B_int_ind, B_int_val = get_intervals(
                df_data,
                df_data[feature],
                df_performance[performance],
                drop,
                merge,
            )

            dict_G[performance][feature] = {
                'interval_ind': G_int_ind,
                'interval_val': G_int_val,
            }
            dict_B[performance][feature] = {
                'interval_ind': B_int_ind,
                'interval_val': B_int_val,
            }

    return dict_G, dict_B

In [None]:
def display_side_by_side(*args,titles=cycle([''])):
    """Render several tables next to each other in a Jupyter notebook.

    Parameters
    ----------
    *args
        Objects exposing ``to_html()`` (e.g. pandas DataFrames), shown in
        the order given.
    titles : iterable, optional
        Headings paired with the tables in order. When it runs out before
        the tables do, the remaining tables get ``'</br>'`` as heading.

    The assembled markup is passed to ``display_html(..., raw=True)``;
    nothing is returned.
    """
    # Titles first, then an endless filler so zip never stops early
    headings = chain(titles, cycle(['</br>']))

    fragments = [
        '<th style="text-align:center"><td style="vertical-align:top">'
        + f'<h2>{heading}</h2>'
        # Inline display is what lets consecutive tables share a row
        + frame.to_html().replace('table', 'table style="display:inline"')
        + '</td></th>'
        for frame, heading in zip(args, headings)
    ]

    display_html(''.join(fragments), raw=True)
    

In [None]:
#########################################################################################################################################################################################################

def rd_support(dict_good_rules, dict_bad_rules, n):
    """Print the support of the PRD and the NRD for each performance measure.

    For every key of ``dict_good_rules`` the half-open index ranges
    ``[start, stop)`` found in the ``"Index"`` column of the good and of the
    bad rule tables are merged, and the number of distinct indices covered
    is divided by ``n`` and rounded to two decimals.

    Parameters
    ----------
    dict_good_rules, dict_bad_rules : dict
        Map a performance name to a table with the columns ``'Support'``
        and ``'Index'``; each ``'Index'`` entry is indexed as
        ``(start, stop)``.
    n : int
        Denominator of the support.

    Nothing is returned; the results are printed.
    """

    def covered_fraction(rules):
        # Distinct instance indices covered by at least one rule interval
        ordered = rules.sort_values('Support', ascending=False)
        instances = set()
        for bounds in ordered["Index"]:
            instances.update(range(bounds[0], bounds[1]))
        return round(len(instances) / n, 2)

    for performance in dict_good_rules:
        print(f"{performance}")

        # Positive Rule Disjunction (PRD)
        support_good = covered_fraction(dict_good_rules[performance])
        # Negative Rule Disjunction (NRD)
        support_bad = covered_fraction(dict_bad_rules[performance])

        print("Good: ", support_good, "Bad: ", support_bad, "\n")

In [None]:
def kmedoids_balance(column, df_performance_train, df_metadata_train, df_train):
    """Balance one performance column by k-medoids undersampling.

    The majority class is clustered on its meta-features with as many
    clusters as the minority class has instances; only the medoids are kept
    and combined with every minority instance.

    The three input frames must share the same index labels: rows are
    selected by label (``.loc``) throughout, so the index does not have to
    be ``0..n-1``. At least two distinct classes are expected in ``column``.

    Parameters
    ----------
    column : str
        Name of the performance column to balance.
    df_performance_train : pandas.DataFrame
        Performance labels; ``column`` holds the class of each instance.
    df_metadata_train : pandas.DataFrame
        Meta-features, used as the clustering space.
    df_train : pandas.DataFrame
        Training data.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train_balanced, df_metadata_train_balanced,
        df_performance_train_balanced)``, each restricted to the balanced
        index and sorted by index; the performance frame contains only
        ``column``.
    """
    class_labels = df_performance_train[column]

    # value_counts sorts by descending frequency, so the first entry is the
    # majority class and the last one the minority class. Taking the two
    # ends keeps them distinct even when both classes are equally frequent.
    count = class_labels.value_counts()
    majority_class = count.index[0]
    minority_class = count.index[-1]
    minority_size = int(count.iloc[-1])

    # index labels of the instances of each class
    train_majority_index = np.array(class_labels[class_labels == majority_class].index)
    train_minority_index = np.array(class_labels[class_labels == minority_class].index)

    # these are index LABELS, so select with .loc (not .iloc)
    df_metadata_train_majority = df_metadata_train.loc[train_majority_index]

    # get the cluster centers (medoids) for the majority class
    kmedoids = KMedoids(n_clusters=minority_size, random_state=0).fit(df_metadata_train_majority.to_numpy())

    # medoid_indices_ are positions within the fitted array; map them back
    # to index labels of the majority class
    medoids_indexes = train_majority_index[kmedoids.medoid_indices_]
    index_balanced = np.concatenate((medoids_indexes, train_minority_index))

    # separate data, metadata and performance with index for the balanced set
    df_train_balanced = df_train.loc[index_balanced].sort_index()
    df_metadata_train_balanced = df_metadata_train.loc[index_balanced].sort_index()
    df_performance_train_balanced = df_performance_train.loc[index_balanced, [column]].sort_index()

    return df_train_balanced, df_metadata_train_balanced, df_performance_train_balanced

############################################################################################
    
def random_balance(column, df_performance_train, df_metadata_train, df_train):
    """Balance one performance column by random undersampling.

    A random sample of the majority class, as large as the minority class,
    is combined with every minority instance.

    The three input frames must share the same index labels: rows are
    selected by label (``.loc``) throughout, so the index does not have to
    be ``0..n-1``. At least two distinct classes are expected in ``column``.
    Sampling uses the global ``random`` state; seed it for reproducibility.

    Parameters
    ----------
    column : str
        Name of the performance column to balance.
    df_performance_train : pandas.DataFrame
        Performance labels; ``column`` holds the class of each instance.
    df_metadata_train : pandas.DataFrame
        Meta-features of the training instances.
    df_train : pandas.DataFrame
        Training data.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train_balanced, df_metadata_train_balanced,
        df_performance_train_balanced)``, each restricted to the balanced
        index and sorted by index; the performance frame contains only
        ``column``.
    """
    class_labels = df_performance_train[column]

    # value_counts sorts by descending frequency, so the first entry is the
    # majority class and the last one the minority class. Taking the two
    # ends keeps them distinct even when both classes are equally frequent.
    count = class_labels.value_counts()
    majority_class = count.index[0]
    minority_class = count.index[-1]
    minority_size = int(count.iloc[-1])

    # index labels of the instances of each class
    train_majority_index = np.array(class_labels[class_labels == majority_class].index)
    train_minority_index = np.array(class_labels[class_labels == minority_class].index)

    # randomly sample majority class with the size of minority class
    majority_sample_indexes = np.array(random.sample(list(train_majority_index), minority_size))
    index_balanced = np.concatenate((majority_sample_indexes, train_minority_index))

    # separate data, metadata and performance with index for the balanced set
    # (index_balanced holds labels, hence .loc)
    df_train_balanced = df_train.loc[index_balanced].sort_index()
    df_metadata_train_balanced = df_metadata_train.loc[index_balanced].sort_index()
    df_performance_train_balanced = df_performance_train.loc[index_balanced, [column]].sort_index()

    return df_train_balanced, df_metadata_train_balanced, df_performance_train_balanced


Let's evaluate the performance of the ruleset

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import cf_matrix as cm
import os
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score



# Change plot font size and update to save as latex
#matplotlib.use("png")
plt.rcParams.update({
    'font.size': 13
})


# Let's get the performance names for the current dataset
#performance_names = list(df_performance_test.columns)

# Only this performance measure is evaluated in this cell
performance = "IsBetaEasy"

# Values that can appear in the 'Final' column, in the same order as the
# `categories` used for plotting: -1 = None, 0 = Hard, 1 = Easy.
class_values = [-1, 0, 1]

pred_table = dict_pred[performance]

# Normalise the rule flags to booleans
pred_table['PRD'] = pred_table['PRD'] == 1
pred_table['NRD'] = pred_table['NRD'] == 1

# An instance is only classified when exactly one of the two fires
pred_table['PRD_not_NRD'] = pred_table['PRD'] & ~pred_table['NRD']
pred_table['NRD_not_PRD'] = ~pred_table['PRD'] & pred_table['NRD']

# Which rules are used to classify easy and hard instances
mask1 = pred_table['PRD_not_NRD']
mask2 = pred_table['NRD_not_PRD']

# Start with "no decision" (-1) everywhere, then fill in Hard and Easy
pred_table['Final'] = [-1] * len(df_metadata_train)

pred_table.loc[mask2, 'Final'] = 0
pred_table.loc[mask1, 'Final'] = 1

# Passing `labels` pins the matrix to 3x3 even when no instance ends up as -1
# (or a class is absent), so it always matches `categories`.
matrix = confusion_matrix(df_performance_train[performance],
                          pred_table['Final'],
                          labels=class_values)
# NOTE(review): four group names cannot describe the nine cells of a 3x3
# matrix - confirm how cm.make_confusion_matrix treats this mismatch.
labels = ['True Neg','False Pos','False Neg','True Pos']
categories = ['None', 'Hard', 'Easy']

# Get direc name
direc_name = os.path.basename(direc)

cm.make_confusion_matrix(matrix,
                  group_names=labels,
                  categories=categories,
                  cmap='Blues',
                  sum_stats=False
                        )

# f1 score (weighted over the classes present, including the -1 "None" class)
f1 = f1_score(df_performance_train[performance],
              pred_table['Final'],
              average='weighted')

# NOTE(review): the value printed as "Test" is computed from
# df_performance_train, while "Train" comes from df_best - confirm that the
# two labels are not swapped or stale.
print(f"Weighted F1 Score Train: {round(df_best.loc['f1', performance], 2)}")
print(f"Weighted F1 Score Test: {round(f1, 2)}")

# Saves the current matplotlib figure, i.e. the confusion matrix drawn above
plt.savefig(f'{direc}/Images/performance_{performance}_{direc_name}.pgf')