# Evaluation of results

In [1]:
from eval_utils import *

%load_ext autoreload
%aimport eval_utils

In [2]:
# Locations on disk: the evaluation results that are read, and the directory
# the run_* functions pass to save_stats as the output location.
results_dir = "/Users/duculet/Thesis/NewWork/RecommenderServer/evalsets/lim_100/"
stats_dir = "/Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/"

# Experiment variants and evaluation methods to build input paths for.
experiments = ['FF', 'FT', 'TF', 'TT']
eval_methods = ['base', 'tOBT', 'tABT']

# The run_* functions index the result as in_paths[eval_method][experiment].
in_paths = generate_paths(results_dir, eval_methods, experiments)

In [3]:
def run_simple_exp(eval_method, experiment, verbose=False):
    """Compute per-model and experiment-wide statistics and save them as one table.

    Models are generated from ``in_paths[eval_method][experiment]``; models
    whose ``model_id`` is 'P1855' or 'P5192' are dropped and the remaining
    ones are ordered by ascending ``trans_count``. The saved table holds one
    row per model (``Pos`` is the model's position in that ordering) preceded
    by a single row of experiment-wide statistics marked with ``Pos = -1.0``.

    Args:
        eval_method: First-level key into ``in_paths``; also the name prefix.
        experiment: Second-level key into ``in_paths``; also the name suffix.
        verbose: Forwarded to ``generate_models``. When true, the run name and
            the experiment-wide statistics are printed as well.

    Returns:
        None. The table is handed to ``save_stats`` together with
        ``stats_dir`` and the name ``eval_method + "_" + experiment``.
    """
    source_path = in_paths[eval_method][experiment]
    target_dir = stats_dir

    def tag_with_position(position, model):
        # Attach the model's rank in the trans-count ordering to its statistics.
        row = model.get_statistics()
        row['Pos'] = position
        return row

    all_models = generate_models(source_path, filelimit=0, mintrans=0, verbose=verbose)

    # Drop the models flagged as irrelevant, then order the rest by trans count.
    skipped_ids = ['P1855', 'P5192']
    ranked_models = [m for m in all_models if not (m.model_id in skipped_ids)]
    ranked_models.sort(key=lambda m: m.trans_count)

    run_name = eval_method + "_" + experiment

    # Per-model rows first, then the statistics over the whole experiment.
    per_model_rows = [tag_with_position(i, m) for i, m in enumerate(ranked_models)]
    overall = get_models_simple_stats(ranked_models)

    if verbose:
        print(run_name)
        print(overall)
        print('-' * 20)

    per_model_table = pd.DataFrame(per_model_rows)

    # Shape the experiment-wide statistics like the per-model table; the
    # negative position distinguishes this row from the model rows.
    summary_row = pd.DataFrame([overall], columns=per_model_table.columns)
    summary_row['Pos'] = -1.0

    # Summary row goes on top, followed by the models in ranked order.
    combined = pd.concat([summary_row, per_model_table]).reset_index(drop=True)

    save_stats(target_dir, run_name, combined)

In [4]:
# Run the simple (ungrouped) statistics for every experiment variant of one
# evaluation method; each call saves its own "<method>_<experiment>" table.
eval_method, verbose = 'tABT', True
experiments = ['FF', 'FT', 'TF', 'TT']

for experiment in experiments:
    # Echo the variant so the long-running output below can be told apart.
    print(experiment)
    run_simple_exp(eval_method, experiment, verbose=verbose)

FF


Processing files: 100%|██████████| 1060/1060 [06:03<00:00,  2.91it/s] 


1060 models generated
tABT_FF
Mean         1.8298
Median       1.4887
StdDev       1.1380
Top1        70.7192
Top5        96.2196
Top10       98.6489
Missing      0.0030
Duration     0.0074
dtype: float64
--------------------
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tABT_FF_stats.csv
FT


Processing files: 100%|██████████| 1061/1061 [05:36<00:00,  3.15it/s] 


1061 models generated
tABT_FT
Mean         1.5738
Median       1.3534
StdDev       0.7794
Top1        75.8690
Top5        97.8243
Top10       99.2349
Missing      0.0031
Duration     0.0276
dtype: float64
--------------------
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tABT_FT_stats.csv
TF


Processing files: 100%|██████████| 1058/1058 [05:49<00:00,  3.02it/s] 


1058 models generated
tABT_TF
Mean         1.6807
Median       1.4261
StdDev       0.8752
Top1        73.6173
Top5        97.1556
Top10       98.9881
Missing      0.0032
Duration     0.0185
dtype: float64
--------------------
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tABT_TF_stats.csv
TT


Processing files: 100%|██████████| 1059/1059 [05:59<00:00,  2.95it/s] 


1059 models generated
tABT_TT
Mean         1.5293
Median       1.3369
StdDev       0.6823
Top1        77.0860
Top5        98.1101
Top10       99.2896
Missing      0.0031
Duration     0.0356
dtype: float64
--------------------
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tABT_TT_stats.csv


In [3]:
def general_stats(stats: pd.DataFrame) -> tuple[int, int, float, float, float, float]:
    """Collapse per-group statistics into one count-weighted summary.

    Args:
        stats: Frame with a ``Count`` column and the metric columns ``Mean``,
            ``Top1``, ``Top5`` and ``Top10``, one row per group.

    Returns:
        ``(-1, total_count, avg_rank, avg_top1, avg_top5, avg_top10)``. The
        leading ``-1`` marks the tuple as group-wide statistics,
        ``total_count`` is the sum of ``Count``, and each average is the
        ``Count``-weighted mean of the matching column rounded to 4 decimals.
    """
    weights = stats['Count']
    total_count = weights.sum()

    def weighted_mean(column):
        # Count-weighted average of one metric column, rounded to 4 decimals.
        return round((stats[column] * weights).sum() / total_count, 4)

    averages = [weighted_mean(column) for column in ('Mean', 'Top1', 'Top5', 'Top10')]

    # -1 in the first slot flags the row as the summary over all groups.
    return (-1, total_count, *averages)

In [4]:
def display_statistics(stats: tuple[int, int, float, float, float, float]) -> None:
    """Print a group-wide statistics tuple, one labelled value per line.

    Args:
        stats: Tuple laid out as returned by ``general_stats``. Element 0
            (the group marker) is not printed; elements 1-5 are printed as
            total count, average rank and the top-1/5/10 averages.
    """
    labels = (
        "Total count: ",
        "Average rank: ",
        "Average top 1: ",
        "Average top 5: ",
        "Average top 10: ",
    )
    # Labels line up with tuple positions 1..5; position 0 is skipped.
    for position, label in enumerate(labels, start=1):
        print(label + str(stats[position]))

In [5]:
def run_experiment(eval_method: str, experiment: str, groupby: list[str] = None, verbose: bool = False):
    """Compute grouped statistics for one experiment and save one table per grouping.

    For every entry of ``groupby`` the models generated from
    ``in_paths[eval_method][experiment]`` are summarised with ``get_stats``
    twice: once over all models and once without the models whose
    ``model_id`` is 'P1855' or 'P5192'. Only the second (relevant) table is
    saved; its first row is the count-weighted summary from ``general_stats``.
    The summary over all models is only displayed when ``verbose`` is true.

    Args:
        eval_method: First-level key into ``in_paths``; also the name prefix.
        experiment: Second-level key into ``in_paths``; part of the saved name.
        groupby: Grouping names passed one at a time to ``get_stats``. A
            single string is treated as a one-element list. ``None`` or an
            empty collection means there is nothing to compute, so the
            function returns before generating any models.
        verbose: Forwarded to ``generate_models``; when true the summaries for
            all models and for the relevant models are printed per grouping.

    Returns:
        None. Each table is handed to ``save_stats`` with ``stats_dir`` and
        the name ``"<eval_method>_<experiment>_<group>"``.
    """
    # Normalise the grouping argument up front. Previously the default of
    # None failed in the loop and an empty list failed in the cleanup code,
    # both only after the expensive model generation had already run.
    if groupby is None:
        return
    if isinstance(groupby, str):
        # Iterating a bare string would yield single characters as groups.
        groupby = [groupby]
    groups = list(groupby)
    if not groups:
        return

    # Setup paths and naming conventions
    in_path = in_paths[eval_method][experiment]
    out_path = stats_dir

    # Generate models
    models = generate_models(in_path, filelimit=0, mintrans=0, verbose=verbose)

    # Exclude irrelevant models
    excluded = ['P1855', 'P5192']
    models_kept = [model for model in models if model.model_id not in excluded]

    for group in groups:
        name = eval_method + "_" + experiment + "_" + group

        # Statistics over every model; used for display only.
        stats_complete = get_stats(models, group)
        general_stats_complete = general_stats(stats_complete)

        # Statistics over the relevant models; this is what gets saved.
        stats_relevant = get_stats(models_kept, group)
        general_stats_relevant = general_stats(stats_relevant)

        if verbose:
            print("Statistics for " + name)
            print("All models")
            display_statistics(general_stats_complete)
            # Print separator
            print('-' * 20)
            print("Relevant models")
            display_statistics(general_stats_relevant)

        # Prepend the summary as the first row. The summary tuple is mapped
        # positionally onto the grouped table's columns, so its leading -1
        # lands in the first column.
        new_stats = pd.DataFrame([general_stats_relevant], columns=stats_relevant.columns)
        stats_relevant = pd.concat([new_stats, stats_relevant]).reset_index(drop=True)
        # Save stats
        save_stats(out_path, name, stats_relevant)

        # Print separator (emitted regardless of verbose)
        print("#" * 20)

    # No explicit `del` of the locals: they are released when the function
    # returns, and deleting names that were never bound raised an error when
    # the loop body had not run.

## Baseline evaluation

In [6]:
# Setup variables for the baseline section; they are read by the
# run_experiment call in the next cell.
eval_method = 'base'
# NOTE(review): the original remark says the experiment does not matter for
# the baseline, but the value still selects in_paths['base']['TT'] and ends
# up in the saved file names — confirm the choice is really irrelevant.
experiment = 'TT' # doesn't matter in this case
# Grouping names; run_experiment saves one statistics table per entry.
groupby = ['SetSize', 'NumNonTypes', 'NumTypes', 'NumObjTypes', 'NumSubjTypes']

In [7]:
# Run experiment: generates the models for the current eval_method/experiment
# and saves one grouped statistics table per entry in groupby.
run_experiment(eval_method, experiment, groupby, verbose = True)

Processing files:   0%|          | 0/1059 [00:00<?, ?it/s]

Processing files: 100%|██████████| 1059/1059 [04:08<00:00,  4.26it/s] 


1059 models generated
Statistics for base_TT_SetSize
All models
Total count: 56262350
Average rank: 7.1568
Average top 1: 70.5497
Average top 5: 96.0294
Average top 10: 98.4445
--------------------
Relevant models
Total count: 56256548
Average rank: 1.832
Average top 1: 70.6825
Average top 5: 96.2079
Average top 10: 98.6246
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/base_TT_SetSize_stats.csv
####################
Statistics for base_TT_NumNonTypes
All models
Total count: 56262350
Average rank: 7.1568
Average top 1: 70.5497
Average top 5: 96.0294
Average top 10: 98.4445
--------------------
Relevant models
Total count: 56256548
Average rank: 1.832
Average top 1: 70.6825
Average top 5: 96.2079
Average top 10: 98.6246
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/base_TT_NumNonTypes_stats.csv
####################
Statistics for base_TT_NumTypes
All models
Total count: 56262350
Avera

## Take All But Type evaluation

In [8]:
# Evaluation method for every experiment in this section ('tABT', the
# "Take All But Type" evaluation named in the section heading).
eval_method = 'tABT'
# Grouping names; run_experiment saves one statistics table per entry.
groupby = ['SetSize', 'NumNonTypes', 'NumTypes', 'NumObjTypes', 'NumSubjTypes']

### No type information (FF)

#### Setup and generate models

In [9]:
# Select the 'FF' variant (no type information, per the section heading);
# read by the run_experiment call in the next cell.
experiment = 'FF'

In [10]:
# Run experiment: generates the models for the current eval_method/experiment
# and saves one grouped statistics table per entry in groupby.
run_experiment(eval_method, experiment, groupby, verbose = True)

Processing files: 100%|██████████| 1060/1060 [04:24<00:00,  4.00it/s] 


1060 models generated
Statistics for tABT_FF_SetSize
All models
Total count: 56253705
Average rank: 7.0102
Average top 1: 70.6528
Average top 5: 96.1296
Average top 10: 98.557
--------------------
Relevant models
Total count: 56248238
Average rank: 1.8298
Average top 1: 70.7192
Average top 5: 96.2196
Average top 10: 98.6489
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tABT_FF_SetSize_stats.csv
####################
Statistics for tABT_FF_NumNonTypes
All models
Total count: 56253705
Average rank: 7.0102
Average top 1: 70.6528
Average top 5: 96.1296
Average top 10: 98.557
--------------------
Relevant models
Total count: 56248238
Average rank: 1.8298
Average top 1: 70.7192
Average top 5: 96.2196
Average top 10: 98.6489
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tABT_FF_NumNonTypes_stats.csv
####################
Statistics for tABT_FF_NumTypes
All models
Total count: 56253705
Avera

### Only object type information (TF)

In [11]:
# Select the 'TF' variant (only object type information, per the section
# heading); read by the run_experiment call in the next cell.
experiment = 'TF'

In [12]:
# Run experiment: generates the models for the current eval_method/experiment
# and saves one grouped statistics table per entry in groupby.
run_experiment(eval_method, experiment, groupby, verbose = True)

Processing files: 100%|██████████| 1058/1058 [04:21<00:00,  4.05it/s] 


1058 models generated
Statistics for tABT_TF_SetSize
All models
Total count: 56260835
Average rank: 4.5249
Average top 1: 72.3681
Average top 5: 96.6515
Average top 10: 98.6686
--------------------
Relevant models
Total count: 56255219
Average rank: 1.748
Average top 1: 72.5039
Average top 5: 96.8268
Average top 10: 98.8426
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tABT_TF_SetSize_stats.csv
####################
Statistics for tABT_TF_NumNonTypes
All models
Total count: 56260835
Average rank: 4.5108
Average top 1: 73.4832
Average top 5: 96.9844
Average top 10: 98.8186
--------------------
Relevant models
Total count: 56255219
Average rank: 1.6807
Average top 1: 73.6173
Average top 5: 97.1556
Average top 10: 98.9881
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tABT_TF_NumNonTypes_stats.csv
####################
Statistics for tABT_TF_NumTypes
All models
Total count: 56260835
Aver

### Only subject type information (FT)

In [13]:
# Select the 'FT' variant (only subject type information, per the section
# heading); read by the run_experiment call in the next cell.
experiment = 'FT'

In [14]:
# Run experiment: generates the models for the current eval_method/experiment
# and saves one grouped statistics table per entry in groupby.
run_experiment(eval_method, experiment, groupby, verbose = True)

Processing files: 100%|██████████| 1061/1061 [04:21<00:00,  4.06it/s] 


1061 models generated
Statistics for tABT_FT_SetSize
All models
Total count: 56253327
Average rank: 1.8305
Average top 1: 75.2064
Average top 5: 97.5367
Average top 10: 99.0496
--------------------
Relevant models
Total count: 56247608
Average rank: 1.6022
Average top 1: 75.34
Average top 5: 97.6918
Average top 10: 99.1947
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tABT_FT_SetSize_stats.csv
####################
Statistics for tABT_FT_NumNonTypes
All models
Total count: 56253327
Average rank: 1.7101
Average top 1: 75.7561
Average top 5: 97.7019
Average top 10: 99.1308
--------------------
Relevant models
Total count: 56247608
Average rank: 1.5738
Average top 1: 75.869
Average top 5: 97.8243
Average top 10: 99.2349
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tABT_FT_NumNonTypes_stats.csv
####################
Statistics for tABT_FT_NumTypes
All models
Total count: 56253327
Averag

### Both subject and object type information (TT)

In [15]:
# Select the 'TT' variant (both subject and object type information, per the
# section heading); read by the run_experiment call in the next cell.
experiment = 'TT'

In [16]:
# Run experiment: generates the models for the current eval_method/experiment
# and saves one grouped statistics table per entry in groupby.
run_experiment(eval_method, experiment, groupby, verbose = True)

Processing files: 100%|██████████| 1059/1059 [04:29<00:00,  3.93it/s] 


1059 models generated
Statistics for tABT_TT_SetSize
All models
Total count: 56262350
Average rank: 2.7582
Average top 1: 75.7317
Average top 5: 97.5522
Average top 10: 99.0001
--------------------
Relevant models
Total count: 56256548
Average rank: 1.5923
Average top 1: 75.8613
Average top 5: 97.6952
Average top 10: 99.132
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tABT_TT_SetSize_stats.csv
####################
Statistics for tABT_TT_NumNonTypes
All models
Total count: 56262350
Average rank: 2.7143
Average top 1: 76.9725
Average top 5: 97.9913
Average top 10: 99.1891
--------------------
Relevant models
Total count: 56256548
Average rank: 1.5293
Average top 1: 77.086
Average top 5: 98.1101
Average top 10: 99.2896
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tABT_TT_NumNonTypes_stats.csv
####################
Statistics for tABT_TT_NumTypes
All models
Total count: 56262350
Avera