In [1]:
import re
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl # Import base matplotlib

In [2]:
def parse_parameters(parameter_str):
    # Extract the main minimizer and its parameters
    minimizer_match = re.match(r"(\w+)\((.*)\)", parameter_str)
    if not minimizer_match:
        return {}

    minimizer_name = minimizer_match.group(1)
    parameters_str = minimizer_match.group(2)

    # Extract individual parameters using regular expressions
    params = {}
    param_pattern = re.compile(r'(\w+)=([^,]+(?:\([^)]*\))?)')
    for match in param_pattern.finditer(parameters_str):
        key = match.group(1).strip()
        value = match.group(2).strip()
        # Check if value should be converted to float, int, or kept as is
        if re.match(r'^-?\d+(\.\d+)?$', value):
            if '.' in value:
                value = float(value)
            else:
                value = int(value)
        elif value.lower() == 'true':
            value = True
        elif value.lower() == 'false':
            value = False
        params[key] = value

    # Include the minimizer name
    params['minimizer_name'] = minimizer_name
    return params

In [3]:
# Please provide the full path to the 'batch_results.csv' file for the EASE model on the Netflix dataset.
ease_netflix = 'YOUR_FILE_PATH_HERE' # <-- EDIT THIS LINE

# Please provide the full path to the 'batch_results.csv' file for the EASE model on the MovieLens dataset.
ease_movielens = 'YOUR_FILE_PATH_HERE' # <-- EDIT THIS LINE

# Please provide the full path to the 'batch_results.csv' file for the EASE model on the MSD dataset.
ease_msd = 'YOUR_FILE_PATH_HERE' # <-- EDIT THIS LINE

# Please provide the full path to the 'batch_results.csv' file for the ItemKNN model on the Netflix dataset.
itemknn_netflix = 'YOUR_FILE_PATH_HERE' # <-- EDIT THIS LINE

# Please provide the full path to the 'batch_results.csv' file for the ItemKNN model on the MovieLens dataset.
itemknn_movielens = 'YOUR_FILE_PATH_HERE' # <-- EDIT THIS LINE

# Please provide the full path to the 'batch_results.csv' file for the ItemKNN model on the MSD dataset.
itemknn_msd = 'YOUR_FILE_PATH_HERE' # <-- EDIT THIS LINE

In [5]:
ease_netflix = pd.read_csv(ease_netflix)
ease_movielens = pd.read_csv(ease_movielens)
ease_msd = pd.read_csv(ease_msd)
itemknn_netflix = pd.read_csv(itemknn_netflix)
itemknn_movielens = pd.read_csv(itemknn_movielens)
itemknn_msd = pd.read_csv(itemknn_msd)

In [6]:
# Apply the parse_parameters function to create new columns in the dataframe
parameters_df = ease_netflix['minimizer_identifier'].apply(parse_parameters)
parameters_df = pd.json_normalize(parameters_df)
ease_netflix = pd.concat([ease_netflix, parameters_df], axis=1)

In [7]:
# Apply the parse_parameters function to create new columns in the dataframe
parameters_df = ease_movielens['minimizer_identifier'].apply(parse_parameters)
parameters_df = pd.json_normalize(parameters_df)
ease_movielens = pd.concat([ease_movielens, parameters_df], axis=1)

In [8]:
# Apply the parse_parameters function to create new columns in the dataframe
parameters_df = ease_msd['minimizer_identifier'].apply(parse_parameters)
parameters_df = pd.json_normalize(parameters_df)
ease_msd = pd.concat([ease_msd, parameters_df], axis=1)

In [9]:
parameters_df = itemknn_netflix['minimizer_identifier'].apply(parse_parameters)
parameters_df = pd.json_normalize(parameters_df)
itemknn_netflix = pd.concat([itemknn_netflix, parameters_df], axis=1)

In [10]:
parameters_df = itemknn_movielens['minimizer_identifier'].apply(parse_parameters)
parameters_df = pd.json_normalize(parameters_df)
itemknn_movielens = pd.concat([itemknn_movielens, parameters_df], axis=1)

In [11]:
parameters_df = itemknn_msd['minimizer_identifier'].apply(parse_parameters)
parameters_df = pd.json_normalize(parameters_df)
itemknn_msd = pd.concat([itemknn_msd, parameters_df], axis=1)

In [12]:
# Choose a specific eta value to filter the data:
eta = 1.0

In [13]:
# Select only algorithms with eta=1.0
# Select only Greedy algorithms (forward, beam forward and backward)
greedy_algorithms = ['GreedyForwardMinimizer', 'GreedyBeamForwardMinimizer', 'GreedyBackwardMinimizer']

# Apply both filters at once
ease_netflix_filtered = ease_netflix[
    (ease_netflix['eta'] == eta) &
    (ease_netflix['minimizer_name'].isin(greedy_algorithms))
].copy()

In [14]:
# Apply both filters at once
ease_movielens_filtered = ease_movielens[
    (ease_movielens['eta'] == eta) &
    (ease_movielens['minimizer_name'].isin(greedy_algorithms))
].copy()

In [15]:
# Apply both filters at once
ease_msd_filtered = ease_msd[
    (ease_msd['eta'] == eta) &
    (ease_msd['minimizer_name'].isin(greedy_algorithms))
].copy()

In [16]:
itemknn_netflix_filtered = itemknn_netflix[
    (itemknn_netflix['eta'] == eta) &
    (itemknn_netflix['minimizer_name'].isin(greedy_algorithms))
].copy()

In [17]:
itemknn_movielens_filtered = itemknn_movielens[
    (itemknn_movielens['eta'] == eta) &
    (itemknn_movielens['minimizer_name'].isin(greedy_algorithms))
].copy()

In [18]:
itemknn_msd_filtered = itemknn_msd[
    (itemknn_msd['eta'] == eta) &
    (itemknn_msd['minimizer_name'].isin(greedy_algorithms))
].copy()

In [19]:
ease_netflix_filtered.head()

Unnamed: 0.1,Unnamed: 0,minimizer_identifier,estimator_identifier,batch_id,batch_number_of_users,batch_number_of_users_processed,batch_percentage_of_users_processed,batch_original_number_of_input_interactions,batch_minimized_number_of_input_interactions,batch_number_of_target_interactions,...,max_size,metric,model,density,l2,remove_history,timeout,timeout_manager,minimizer_name,beam_depth
2,2,"GreedyForwardMinimizer(eta=1.0,max_size=None,m...","ExponentialDecayEstimator(K=128, gamma=-0.4)",0,27058,27058,1.0,883633,878020,3462857,...,,NDCG_100,EASE(alpha=0,0.003,2400),True,345600,<minipack.minimizers.base.TimeoutManager objec...,GreedyForwardMinimizer,
5,5,"GreedyForwardMinimizer(eta=1.0,max_size=None,m...","ExponentialDecayEstimator(K=128, gamma=-0.4)",1,7105,7105,1.0,898738,898687,909440,...,,NDCG_100,EASE(alpha=0,0.003,2400),True,345600,<minipack.minimizers.base.TimeoutManager objec...,GreedyForwardMinimizer,
12,12,"GreedyForwardMinimizer(eta=1.0,max_size=None,m...","ExponentialDecayEstimator(K=128, gamma=-0.4)",2,3281,3281,1.0,707148,707131,419968,...,,NDCG_100,EASE(alpha=0,0.003,2400),True,345600,<minipack.minimizers.base.TimeoutManager objec...,GreedyForwardMinimizer,
13,13,"GreedyForwardMinimizer(eta=1.0,max_size=None,m...","ExponentialDecayEstimator(K=128, gamma=-0.4)",4,886,886,1.0,346979,346975,113408,...,,NDCG_100,EASE(alpha=0,0.003,2400),True,345600,<minipack.minimizers.base.TimeoutManager objec...,GreedyForwardMinimizer,
14,14,"GreedyForwardMinimizer(eta=1.0,max_size=None,m...","ExponentialDecayEstimator(K=128, gamma=-0.4)",3,1670,1670,1.0,506480,506474,213760,...,,NDCG_100,EASE(alpha=0,0.003,2400),True,345600,<minipack.minimizers.base.TimeoutManager objec...,GreedyForwardMinimizer,


In [20]:
itemknn_movielens_filtered.head()

Unnamed: 0.1,Unnamed: 0,minimizer_identifier,estimator_identifier,batch_id,batch_number_of_users,batch_number_of_users_processed,batch_percentage_of_users_processed,batch_original_number_of_input_interactions,batch_minimized_number_of_input_interactions,batch_number_of_target_interactions,...,model,normalize_X,normalize_sim,pop_discount,similarity,remove_history,timeout,timeout_manager,minimizer_name,beam_depth
10,10,"GreedyForwardMinimizer(eta=1.0,max_size=None,m...","ExponentialDecayEstimator(K=128, gamma=-0.2)",0,6873,6873,1.0,164942,164940,879744,...,ItemKNN(K=600,True,True,,cosine),True,345600,<minipack.minimizers.base.TimeoutManager objec...,GreedyForwardMinimizer,
16,16,"GreedyForwardMinimizer(eta=1.0,max_size=None,m...","ExponentialDecayEstimator(K=128, gamma=-0.2)",1,1860,1860,1.0,143262,143261,238080,...,ItemKNN(K=600,True,True,,cosine),True,345600,<minipack.minimizers.base.TimeoutManager objec...,GreedyForwardMinimizer,
22,22,"GreedyForwardMinimizer(eta=1.0,max_size=None,m...","ExponentialDecayEstimator(K=128, gamma=-0.2)",2,734,734,1.0,96748,96748,93952,...,ItemKNN(K=600,True,True,,cosine),True,345600,<minipack.minimizers.base.TimeoutManager objec...,GreedyForwardMinimizer,
24,24,"GreedyForwardMinimizer(eta=1.0,max_size=None,m...","ExponentialDecayEstimator(K=128, gamma=-0.2)",3,345,345,1.0,64169,64169,44160,...,ItemKNN(K=600,True,True,,cosine),True,345600,<minipack.minimizers.base.TimeoutManager objec...,GreedyForwardMinimizer,
25,25,"GreedyForwardMinimizer(eta=1.0,max_size=None,m...","ExponentialDecayEstimator(K=128, gamma=-0.2)",4,188,188,1.0,45129,45129,24064,...,ItemKNN(K=600,True,True,,cosine),True,345600,<minipack.minimizers.base.TimeoutManager objec...,GreedyForwardMinimizer,


In [19]:
# Aggregate (add across batches for each distinct algorithm) batch_number_of_users, batch_number_of_users_processed, batch_original_number_of_input_interactions, batch_sample_count, batch_minimized_number_of_input_interactions. Remove all other columns.
# Define the columns to be aggregated
columns_to_sum = [
    'batch_number_of_users',
    'batch_number_of_users_processed',
    'batch_original_number_of_input_interactions',
    'batch_sample_count',
    'batch_minimized_number_of_input_interactions'
]

ease_netflix_aggregated = ease_netflix_filtered.groupby('minimizer_name')[columns_to_sum].sum()
ease_movielens_aggregated = ease_movielens_filtered.groupby('minimizer_name')[columns_to_sum].sum()
ease_msd_aggregated = ease_msd_filtered.groupby('minimizer_name')[columns_to_sum].sum()
itemknn_netflix_aggregated = itemknn_netflix_filtered.groupby('minimizer_name')[columns_to_sum].sum()
itemknn_movielens_aggregated = itemknn_movielens_filtered.groupby('minimizer_name')[columns_to_sum].sum()
itemknn_msd_aggregated = itemknn_msd_filtered.groupby('minimizer_name')[columns_to_sum].sum()

In [20]:
#  Calculate the minimization ratio (dividing batch_minimized_number_of_input_interactions by batch_original_number_of_input_interactions) and the sample efficiency by batch_sample_count / batch_number_of_users.
# Handle potential division by zero: if original interactions is 0, the ratio is undefined (NaN)
ease_netflix_aggregated['minimization_ratio'] = ease_netflix_aggregated['batch_minimized_number_of_input_interactions'] / ease_netflix_aggregated['batch_original_number_of_input_interactions']
ease_movielens_aggregated['minimization_ratio'] = ease_movielens_aggregated['batch_minimized_number_of_input_interactions'] / ease_movielens_aggregated['batch_original_number_of_input_interactions']
ease_msd_aggregated['minimization_ratio'] = ease_msd_aggregated['batch_minimized_number_of_input_interactions'] / ease_msd_aggregated['batch_original_number_of_input_interactions']
itemknn_netflix_aggregated['minimization_ratio'] = itemknn_netflix_aggregated['batch_minimized_number_of_input_interactions'] / itemknn_netflix_aggregated['batch_original_number_of_input_interactions']
itemknn_movielens_aggregated['minimization_ratio'] = itemknn_movielens_aggregated['batch_minimized_number_of_input_interactions'] / itemknn_movielens_aggregated['batch_original_number_of_input_interactions']
itemknn_msd_aggregated['minimization_ratio'] = itemknn_msd_aggregated['batch_minimized_number_of_input_interactions'] / itemknn_msd_aggregated['batch_original_number_of_input_interactions']

# Calculate Sample Efficiency
# Handle potential division by zero: if number of users is 0, efficiency is undefined (NaN)
ease_netflix_aggregated['sample_efficiency'] = ease_netflix_aggregated['batch_sample_count'] / ease_netflix_aggregated['batch_number_of_users']
ease_movielens_aggregated['sample_efficiency'] = ease_movielens_aggregated['batch_sample_count'] / ease_movielens_aggregated['batch_number_of_users']
ease_msd_aggregated['sample_efficiency'] = ease_msd_aggregated['batch_sample_count'] / ease_msd_aggregated['batch_number_of_users']
itemknn_netflix_aggregated['sample_efficiency'] = itemknn_netflix_aggregated['batch_sample_count'] / itemknn_netflix_aggregated['batch_number_of_users']
itemknn_movielens_aggregated['sample_efficiency'] = itemknn_movielens_aggregated['batch_sample_count'] / itemknn_movielens_aggregated['batch_number_of_users']
itemknn_msd_aggregated['sample_efficiency'] = itemknn_msd_aggregated['batch_sample_count'] / itemknn_msd_aggregated['batch_number_of_users']

In [21]:
ease_netflix_aggregated

Unnamed: 0_level_0,batch_number_of_users,batch_number_of_users_processed,batch_original_number_of_input_interactions,batch_sample_count,batch_minimized_number_of_input_interactions,minimization_ratio,sample_efficiency
minimizer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GreedyBackwardMinimizer,40000,40000,3342978,12325370.0,3293481,0.985194,308.13425
GreedyBeamForwardMinimizer,40000,40000,3342978,1492617000.0,3334226,0.997382,37315.427325
GreedyForwardMinimizer,40000,40000,3342978,304761700.0,3337287,0.998298,7619.0415


In [22]:
ease_movielens_aggregated

Unnamed: 0_level_0,batch_number_of_users,batch_number_of_users_processed,batch_original_number_of_input_interactions,batch_sample_count,batch_minimized_number_of_input_interactions,minimization_ratio,sample_efficiency
minimizer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GreedyBackwardMinimizer,10000,10000,514250,1347993.0,507562,0.986995,134.7993
GreedyBeamForwardMinimizer,10000,10000,514250,127636918.0,512953,0.997478,12763.6918
GreedyForwardMinimizer,10000,10000,514250,26456234.0,513452,0.998448,2645.6234


In [23]:
ease_msd_aggregated

Unnamed: 0_level_0,batch_number_of_users,batch_number_of_users_processed,batch_original_number_of_input_interactions,batch_sample_count,batch_minimized_number_of_input_interactions,minimization_ratio,sample_efficiency
minimizer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GreedyBackwardMinimizer,50000,50000,2130589,6298211.0,2082983,0.977656,125.96422
GreedyBeamForwardMinimizer,50000,50000,2130589,312584766.0,2105183,0.988076,6251.69532
GreedyForwardMinimizer,50000,50000,2130589,66668997.0,2113275,0.991874,1333.37994


In [24]:
itemknn_netflix_aggregated

Unnamed: 0_level_0,batch_number_of_users,batch_number_of_users_processed,batch_original_number_of_input_interactions,batch_sample_count,batch_minimized_number_of_input_interactions,minimization_ratio,sample_efficiency
minimizer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GreedyBackwardMinimizer,40000,40000,3342978,57489980.0,3091596,0.924803,1437.249625
GreedyBeamForwardMinimizer,40000,40000,3342978,1492181000.0,3327659,0.995418,37304.528175
GreedyForwardMinimizer,40000,40000,3342978,304762300.0,3335064,0.997633,7619.058125


In [25]:
itemknn_movielens_aggregated

Unnamed: 0_level_0,batch_number_of_users,batch_number_of_users_processed,batch_original_number_of_input_interactions,batch_sample_count,batch_minimized_number_of_input_interactions,minimization_ratio,sample_efficiency
minimizer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GreedyBackwardMinimizer,10000,10000,514250,584393.0,513883,0.999286,58.4393
GreedyBeamForwardMinimizer,10000,10000,514250,127985428.0,514230,0.999961,12798.5428
GreedyForwardMinimizer,10000,10000,514250,26458436.0,514247,0.999994,2645.8436


In [26]:
itemknn_msd_aggregated

Unnamed: 0_level_0,batch_number_of_users,batch_number_of_users_processed,batch_original_number_of_input_interactions,batch_sample_count,batch_minimized_number_of_input_interactions,minimization_ratio,sample_efficiency
minimizer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GreedyBackwardMinimizer,50000,50000,2130589,19631891.0,1878915,0.881876,392.63782
GreedyBeamForwardMinimizer,50000,50000,2130589,312232567.0,2043142,0.958956,6244.65134
GreedyForwardMinimizer,50000,50000,2130589,66504832.0,2064522,0.968991,1330.09664
