In [1]:
import pandas as pd
import numpy as np

########################################################################
# Find the highest Macro-averaged Precision and Recall (id and value)  #
#  for the each machine learning algorithm used and target variable    #
# -----------------------------  Tenure  ------------------------------#
########################################################################

df = pd.read_excel("performance_results\excell_performance_results_tenure_algs.xlsx")
metrics = ['Precision', 'Recall']
df_max_avgs_and_ids = pd.DataFrame(columns=[f'{metrics[0]}_max_avg', 
                                            f'{metrics[0]}_max_avg_index', 
                                            f'{metrics[1]}_max_avg', 
                                            f'{metrics[1]}_max_avg_index'])
indexes = [] # "Non-NaN" indexes in Precision
for metric in metrics:
    df_ids_know_all_classes = df.copy(deep=True)

    # Get indexes of Precision and Recall that acknowledge the existence of #
    #  all classes                                                          #
    #
    # (Recall) check only the rows that in 'Precision' have no nans (meaning
    #  the training set, and thus the model, "knows" all 3 classes, at least as
    #   False Positives).
    if metric == metrics[1]:
        #* Note: 'indexes' will have values from previous iterations
        df_ids_know_all_classes = df.loc[indexes] 
    # (Precision) Get rows whose Precision captured 3 results on its "list"
    #  (it's actually a string)  
    if metric == metrics[0]: 
        df_ids_know_all_classes = df_ids_know_all_classes[df_ids_know_all_classes[metric].apply(lambda x: len(str(x).strip("[]").split()) == 3)]
    # Remove the uncessary spaces and brackets and split the string into 
    #  3 strings (the values)
    df_ids_know_all_classes[metric] = df_ids_know_all_classes[metric].apply(lambda x: str(x).strip("[]").split())
    # (Precision) Remove those that contain a 'nan'
    if metric == metrics[0]:
        df_ids_know_all_classes = df_ids_know_all_classes[df_ids_know_all_classes[metric].apply(lambda x: 'nan' not in x)]
        indexes = df_ids_know_all_classes.index.to_list()
    # (Recall) Replace those that contain a 'nan' by 0 
    #  This is because if 'Precision' acknowldged them, then despite there
    #  being no False Negatives, all classes were acknowledged by the model, 
    #  at least as False Positives. Thus, it's fair to use that entry, and also 
    #  to only divide the result by 2, since TP and FN are impossible to measure
    #  in the cases the test set did not have any instances of one of the classes.
    elif metric == metrics[1]:
        df_ids_know_all_classes[metric] = df_ids_know_all_classes[metric].apply(
            lambda x: [string.replace('nan', '0') for string in x]
        )
        
    # Convert list of strings to list of floats
    df_ids_know_all_classes[metric] = df_ids_know_all_classes[metric].apply(
        lambda x: [float(str_float) for str_float in x]
    ) 
    # Get mean of the list of floats
    df_means = df_ids_know_all_classes[['ml_algorithm', metric]].copy()
    if metric == metrics[0]:
        df_means[metric] = df_ids_know_all_classes[metric].apply(np.mean)
    elif metric == metrics[1]: 
        # (Recall) Divide by len(x)-1 since it would be "unfair" to account for 
        # the "Share of freehold" 0
        df_means[metric] = df_ids_know_all_classes[metric].apply(lambda x: sum(x) / (len(x)-1))
        
    # Get id of the max mean for each ml_algorithm
    df_max_avgs_and_ids[f'{metric}_max_avg'] = df_means.groupby(by=['ml_algorithm']).max()
    df_max_avgs_and_ids[f'{metric}_max_avg_index'] = df_means.groupby(by=['ml_algorithm']).idxmax()

# Print best averages and respective indexes 
#  by machine learning method
print(df_max_avgs_and_ids)

              Precision_max_avg  Precision_max_avg_index  Recall_max_avg  \
ml_algorithm                                                               
A                      0.629823                    13112        0.864675   
B                      0.568537                    51856        0.815395   

              Recall_max_avg_index  
ml_algorithm                        
A                            39038  
B                            51856  
