# Evaluation of results

In [1]:
from eval_utils import *

%load_ext autoreload
%aimport eval_utils

In [2]:
# Evaluation methods to build input paths for; later cells use one of these
# names as the first key into `in_paths`.
eval_methods = ['base', 'tOBT','tABT']
# Experiment codes. Per the section headings further down: FF = no type
# information, TF = only object types, FT = only subject types, TT = both.
experiments = ['FF', 'FT', 'TF', 'TT']
# NOTE(review): both directories are absolute paths on a single machine; make
# them configurable (e.g. relative to a project root) before sharing the notebook.
# Root directory handed to generate_paths to locate the evaluation result sets.
results_dir = "/Users/duculet/Thesis/NewWork/RecommenderServer/evalsets/lim_100/"
# Directory the run_* functions pass to save_stats as the output location.
stats_dir = "/Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/"

# Input-path lookup; the run_* functions index it as in_paths[eval_method][experiment].
in_paths = generate_paths(results_dir, eval_methods, experiments)

In [17]:
def run_simple_exp(eval_method, experiment, verbose=False):
    """Compute per-model and experiment-wide statistics for one run and save them.

    Models are generated from ``in_paths[eval_method][experiment]``; the models
    whose ``model_id`` is 'P1855' or 'P5192' are dropped and the remainder is
    ordered by ascending ``trans_count``. Each remaining model contributes one
    row (its ``get_statistics()`` result plus a 'Pos' entry holding its index in
    that ordering). The experiment-wide statistics returned by
    ``get_models_simple_stats`` are placed in the first row with 'Pos' set to
    -1.0, and the combined table is handed to ``save_stats`` under the name
    ``<eval_method>_<experiment>``.

    Args:
        eval_method: first key into the global ``in_paths`` lookup.
        experiment: second key into the global ``in_paths`` lookup.
        verbose: forwarded to ``generate_models``; when true the run name, the
            experiment-wide statistics and a separator line are printed.

    Returns:
        None. The result is written through ``save_stats`` to ``stats_dir``.
    """
    source_path = in_paths[eval_method][experiment]
    target_dir = stats_dir

    generated = generate_models(source_path, filelimit=0, mintrans=0, verbose=verbose)

    # Drop the models that are not relevant for the evaluation.
    skipped_ids = ['P1855', 'P5192']
    ranked = [candidate for candidate in generated if candidate.model_id not in skipped_ids]

    label = "_".join([eval_method, experiment])

    # Order the kept models by their transaction count (ascending, stable).
    ranked.sort(key=lambda candidate: candidate.trans_count)

    def _row_for(position, entry):
        # A model's own statistics, tagged with its position in the ordering.
        row = entry.get_statistics()
        row['Pos'] = position
        return row

    per_model_rows = [_row_for(position, entry) for position, entry in enumerate(ranked)]

    # Statistics over the whole experiment (all kept models together).
    overall = get_models_simple_stats(ranked)

    if verbose:
        for line in (label, overall, '-' * 20):
            print(line)

    per_model_frame = pd.DataFrame(per_model_rows)

    # Single-row frame aligned to the per-model columns; the negative 'Pos'
    # marks it as the experiment-wide row rather than a model.
    overall_frame = pd.DataFrame([overall], columns=per_model_frame.columns)
    overall_frame['Pos'] = -1.0

    combined = pd.concat([overall_frame, per_model_frame]).reset_index(drop=True)

    save_stats(target_dir, label, combined)

In [18]:
# Evaluation method whose result sets are processed by this cell.
eval_method = 'tOBT'
# Rebinds the notebook-global `experiments` list with the four experiment codes.
experiments = ['FF', 'FT', 'TF', 'TT']
# Forwarded to run_simple_exp so each run prints its name and summary statistics.
verbose = True

# Run experiments: one run_simple_exp call per experiment code, printing the
# code first. The loop variable `experiment` is a notebook global and stays
# bound to the last code ('TT') once the loop finishes.
for experiment in experiments:
    print(experiment)
    run_simple_exp(eval_method, experiment, verbose)

FF


Processing files: 100%|██████████| 1060/1060 [05:30<00:00,  3.21it/s] 


1060 models generated
tOBT_FF
Mean         5.1816
Median       3.1143
StdDev       7.0066
Top1        52.6749
Top5        77.4949
Top10       87.8853
Missing      0.0030
Duration     0.0092
dtype: float64
--------------------
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tOBT_FF_stats.csv
FT


Processing files: 100%|██████████| 1061/1061 [05:43<00:00,  3.09it/s] 


1061 models generated
tOBT_FT
Mean         1.3535
Median       1.0812
StdDev       0.6670
Top1        89.7055
Top5        98.2256
Top10       99.3074
Missing      0.0031
Duration     0.0364
dtype: float64
--------------------
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tOBT_FT_stats.csv
TF


Processing files: 100%|██████████| 1058/1058 [05:18<00:00,  3.32it/s] 


1058 models generated
tOBT_TF
Mean         1.4301
Median       1.0972
StdDev       0.7393
Top1        87.6284
Top5        97.8587
Top10       99.1271
Missing      0.0032
Duration     0.0237
dtype: float64
--------------------
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tOBT_TF_stats.csv
TT


Processing files: 100%|██████████| 1059/1059 [04:30<00:00,  3.92it/s] 


1059 models generated
tOBT_TT
Mean         1.3132
Median       1.0666
StdDev       0.5918
Top1        91.1623
Top5        98.4774
Top10       99.3390
Missing      0.0031
Duration     0.0474
dtype: float64
--------------------
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tOBT_TT_stats.csv


In [3]:
def general_stats(stats: pd.DataFrame) -> tuple[int, int, float, float, float, float]:
    """Collapse per-group statistics into one count-weighted summary row.

    Args:
        stats: frame with the columns 'Count', 'Mean', 'Top1', 'Top5' and
            'Top10'; 'Count' supplies the weight of each row.

    Returns:
        A 6-tuple ``(-1, total_count, mean, top1, top5, top10)`` where the last
        four entries are the 'Count'-weighted averages of the respective
        columns, each rounded to 4 decimals. The leading -1 marks the row as
        group-wide rather than belonging to a single group.
    """
    # Sum of all group sizes: the denominator of every weighted average.
    total_count = stats['Count'].sum()

    # Same formula for every metric: sum(value * count) / total count.
    weighted_averages = (
        round((stats[metric] * stats['Count']).sum() / total_count, 4)
        for metric in ('Mean', 'Top1', 'Top5', 'Top10')
    )

    return (-1, total_count, *weighted_averages)

In [4]:
def display_statistics(stats: tuple[int, int, float, float, float, float]) -> None:
    """Print the entries at positions 1 to 5 of a statistics tuple, one per line.

    Args:
        stats: tuple laid out as ``(marker, total count, average rank,
            average top 1, average top 5, average top 10)``. The entry at
            position 0 is not printed.
    """
    line_prefixes = (
        "Total count: ",
        "Average rank: ",
        "Average top 1: ",
        "Average top 5: ",
        "Average top 10: ",
    )
    # Prefixes line up with tuple positions 1..5.
    for position, prefix in enumerate(line_prefixes, start=1):
        print(prefix + str(stats[position]))

In [5]:
def run_experiment(eval_method: str, experiment: str, groupby: list[str] = None, verbose: bool = False):
    """Compute grouped statistics for one experiment and save one table per grouping.

    Models are generated from ``in_paths[eval_method][experiment]``. For every
    grouping key, statistics are computed twice with ``get_stats``: once over
    all models and once over the models left after dropping the ids 'P1855'
    and 'P5192'. Only the second table is saved: its count-weighted summary
    (from ``general_stats``) is prepended as the first row and the result is
    passed to ``save_stats`` under the name
    ``<eval_method>_<experiment>_<group>``.

    Args:
        eval_method: first key into the global ``in_paths`` lookup.
        experiment: second key into the global ``in_paths`` lookup.
        groupby: grouping keys forwarded to ``get_stats``. ``None`` or an empty
            sequence means there is nothing to do; a single string is treated
            as a one-element list.
        verbose: forwarded to ``generate_models``; when true the summaries for
            all models and for the kept models are printed per grouping.

    Returns:
        None. Results are written through ``save_stats`` to ``stats_dir``.
    """
    # Normalise the grouping argument. Iterating the default None raised a
    # TypeError, and a bare string would be walked character by character.
    if groupby is None:
        groups = []
    elif isinstance(groupby, str):
        groups = [groupby]
    else:
        groups = list(groupby)

    # Nothing to group by: return before the expensive model generation.
    if not groups:
        return

    # Setup paths and naming conventions
    in_path = in_paths[eval_method][experiment]
    out_path = stats_dir

    # Generate models
    models = generate_models(in_path, filelimit = 0, mintrans = 0, verbose = verbose)

    # Exclude irrelevant models
    excluded = ['P1855', 'P5192']
    models_kept = [model for model in models if model.model_id not in excluded]

    for group in groups:
        name = eval_method + "_" + experiment + "_" + group
        # Statistics over every generated model (only shown in verbose mode)
        stats_complete = get_stats(models, group)
        general_stats_complete = general_stats(stats_complete)

        # Statistics over the kept models; these are the ones that get saved
        stats_relevant = get_stats(models_kept, group)
        general_stats_relevant = general_stats(stats_relevant)

        if verbose:
            print("Statistics for " + name)
            print("All models")
            display_statistics(general_stats_complete)
            # Print separator
            print('-' * 20)
            print("Relevant models")
            display_statistics(general_stats_relevant)

        # Prepend the summary tuple as the first row; it is mapped positionally
        # onto the columns of the per-group table.
        new_stats = pd.DataFrame([general_stats_relevant], columns=stats_relevant.columns)
        stats_relevant = pd.concat([new_stats, stats_relevant]).reset_index(drop=True)
        # Save stats
        save_stats(out_path, name, stats_relevant)

        # Print separator
        print("#" * 20)

    # No explicit `del` of the locals here: they are released when the function
    # returns, and deleting names the loop never bound raised UnboundLocalError
    # for an empty `groupby`.

## Baseline evaluation

In [6]:
# Setup variables for the baseline run. run_experiment uses eval_method and
# experiment to look up its input path in `in_paths`.
eval_method = 'base'
experiment = 'TT' # doesn't matter in this case
# Grouping keys: run_experiment saves one statistics table per entry, under a
# name ending in "_<group>".
groupby = ['SetSize', 'NumNonTypes', 'NumTypes', 'NumObjTypes', 'NumSubjTypes']

In [7]:
# Run the baseline experiment for every grouping key in `groupby`. With
# verbose=True the summaries for all models and for the relevant (non-excluded)
# models are printed per grouping.
run_experiment(eval_method, experiment, groupby, verbose = True)

Processing files:   0%|          | 0/1059 [00:00<?, ?it/s]

Processing files: 100%|██████████| 1059/1059 [04:18<00:00,  4.10it/s] 


1059 models generated
Statistics for base_TT_SetSize
All models
Total count: 56262350
Average rank: 7.1568
Average top 1: 70.5497
Average top 5: 96.0294
Average top 10: 98.4445
--------------------
Relevant models
Total count: 56256548
Average rank: 1.832
Average top 1: 70.6825
Average top 5: 96.2079
Average top 10: 98.6246
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/base_TT_SetSize_stats.csv
####################
Statistics for base_TT_NumNonTypes
All models
Total count: 56262350
Average rank: 7.1568
Average top 1: 70.5497
Average top 5: 96.0294
Average top 10: 98.4445
--------------------
Relevant models
Total count: 56256548
Average rank: 1.832
Average top 1: 70.6825
Average top 5: 96.2079
Average top 10: 98.6246
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/base_TT_NumNonTypes_stats.csv
####################
Statistics for base_TT_NumTypes
All models
Total count: 56262350
Avera

## Take One But Type evaluation

In [8]:
# Evaluation method for this section; first key into `in_paths` for every
# run_experiment call below.
eval_method = 'tOBT'
# Grouping keys: run_experiment saves one statistics table per entry.
groupby = ['SetSize', 'NumNonTypes', 'NumTypes', 'NumObjTypes', 'NumSubjTypes']

### No type information (FF)

#### Setup and generate models

In [9]:
# Select the 'FF' experiment (no type information); the next cell passes it to
# run_experiment together with the `eval_method` and `groupby` globals.
experiment = 'FF'

In [10]:
# Run the selected experiment for every grouping key in `groupby`. With
# verbose=True the summaries for all models and for the relevant (non-excluded)
# models are printed per grouping.
run_experiment(eval_method, experiment, groupby, verbose = True)

Processing files: 100%|██████████| 1060/1060 [04:21<00:00,  4.05it/s] 


1060 models generated
Statistics for tOBT_FF_SetSize
All models
Total count: 56253705
Average rank: 11.6136
Average top 1: 38.7296
Average top 5: 69.6657
Average top 10: 83.3747
--------------------
Relevant models
Total count: 56248238
Average rank: 6.6764
Average top 1: 38.7625
Average top 5: 69.7273
Average top 10: 83.449
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tOBT_FF_SetSize_stats.csv
####################
Statistics for tOBT_FF_NumNonTypes
All models
Total count: 56253705
Average rank: 11.6136
Average top 1: 38.7296
Average top 5: 69.6657
Average top 10: 83.3747
--------------------
Relevant models
Total count: 56248238
Average rank: 6.6764
Average top 1: 38.7625
Average top 5: 69.7273
Average top 10: 83.449
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tOBT_FF_NumNonTypes_stats.csv
####################
Statistics for tOBT_FF_NumTypes
All models
Total count: 56253705
Ave

### Only object type information (TF)

In [11]:
# Select the 'TF' experiment (only object type information); the next cell
# passes it to run_experiment together with the `eval_method` and `groupby` globals.
experiment = 'TF'

In [12]:
# Run the selected experiment for every grouping key in `groupby`. With
# verbose=True the summaries for all models and for the relevant (non-excluded)
# models are printed per grouping.
run_experiment(eval_method, experiment, groupby, verbose = True)

Processing files: 100%|██████████| 1058/1058 [04:32<00:00,  3.88it/s] 


1058 models generated
Statistics for tOBT_TF_SetSize
All models
Total count: 56260835
Average rank: 4.627
Average top 1: 80.6691
Average top 5: 96.218
Average top 10: 98.453
--------------------
Relevant models
Total count: 56255219
Average rank: 1.7126
Average top 1: 80.8215
Average top 5: 96.39
Average top 10: 98.6257
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tOBT_TF_SetSize_stats.csv
####################
Statistics for tOBT_TF_NumNonTypes
All models
Total count: 56260835
Average rank: 4.471
Average top 1: 81.6449
Average top 5: 96.4135
Average top 10: 98.503
--------------------
Relevant models
Total count: 56255219
Average rank: 1.6753
Average top 1: 81.7936
Average top 5: 96.5797
Average top 10: 98.6695
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tOBT_TF_NumNonTypes_stats.csv
####################
Statistics for tOBT_TF_NumTypes
All models
Total count: 56260835
Average ra

### Only subject type information (FT)

In [13]:
# Select the 'FT' experiment (only subject type information); the next cell
# passes it to run_experiment together with the `eval_method` and `groupby` globals.
experiment = 'FT'

In [14]:
# Run the selected experiment for every grouping key in `groupby`. With
# verbose=True the summaries for all models and for the relevant (non-excluded)
# models are printed per grouping.
run_experiment(eval_method, experiment, groupby, verbose = True)

Processing files:   0%|          | 0/1061 [00:00<?, ?it/s]

Processing files: 100%|██████████| 1061/1061 [04:33<00:00,  3.87it/s] 


1061 models generated
Statistics for tOBT_FT_SetSize
All models
Total count: 56253327
Average rank: 1.6961
Average top 1: 85.8374
Average top 5: 97.4289
Average top 10: 98.854
--------------------
Relevant models
Total count: 56247608
Average rank: 1.4836
Average top 1: 85.9875
Average top 5: 97.5791
Average top 10: 98.9947
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tOBT_FT_SetSize_stats.csv
####################
Statistics for tOBT_FT_NumNonTypes
All models
Total count: 56253327
Average rank: 1.7285
Average top 1: 84.4403
Average top 5: 97.1421
Average top 10: 98.7789
--------------------
Relevant models
Total count: 56247608
Average rank: 1.5484
Average top 1: 84.5569
Average top 5: 97.2537
Average top 10: 98.8765
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tOBT_FT_NumNonTypes_stats.csv
####################
Statistics for tOBT_FT_NumTypes
All models
Total count: 56253327
Aver

### Both subject and object type information (TT)

In [15]:
# Select the 'TT' experiment (both subject and object type information); the
# next cell passes it to run_experiment together with the `eval_method` and
# `groupby` globals.
experiment = 'TT'

In [16]:
# Run the selected experiment for every grouping key in `groupby`. With
# verbose=True the summaries for all models and for the relevant (non-excluded)
# models are printed per grouping.
run_experiment(eval_method, experiment, groupby, verbose = True)

Processing files: 100%|██████████| 1059/1059 [06:04<00:00,  2.91it/s] 


1059 models generated
Statistics for tOBT_TT_SetSize
All models
Total count: 56262350
Average rank: 2.6571
Average top 1: 87.0073
Average top 5: 97.5135
Average top 10: 98.8718
--------------------
Relevant models
Total count: 56256548
Average rank: 1.4673
Average top 1: 87.1557
Average top 5: 97.6551
Average top 10: 99.0048
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tOBT_TT_SetSize_stats.csv
####################
Statistics for tOBT_TT_NumNonTypes
All models
Total count: 56262350
Average rank: 2.6982
Average top 1: 86.5911
Average top 5: 97.4748
Average top 10: 98.8784
--------------------
Relevant models
Total count: 56256548
Average rank: 1.4855
Average top 1: 86.7114
Average top 5: 97.5852
Average top 10: 98.9749
Saved results to /Users/duculet/Thesis/NewWork/RecommenderServer/evaluation/python/statistics/full/tOBT_TT_NumNonTypes_stats.csv
####################
Statistics for tOBT_TT_NumTypes
All models
Total count: 56262350
Ave