In [1]:
import json
import os
from itertools import zip_longest

import pandas as pd
import textdistance

In [2]:
# Reference trace that the counterfactual search results are compared against.
# The activity and resource lists are index-aligned (both are sliced with the
# same index further down) and both start with the '<SOS>' token.
input_activities = [
    '<SOS>',
    'A_SUBMITTED_COMPLETE',
    'A_PARTLYSUBMITTED_COMPLETE',
    'A_PREACCEPTED_COMPLETE',
    'W_Completeren aanvraag_COMPLETE',
    'W_Completeren aanvraag_COMPLETE',
    'A_ACCEPTED_COMPLETE',
    'A_FINALIZED_COMPLETE',
    'O_SELECTED_COMPLETE',
    'O_CREATED_COMPLETE',
    'O_SENT_COMPLETE',
    'W_Completeren aanvraag_COMPLETE',
    'O_SENT_BACK_COMPLETE',
    'W_Nabellen offertes_COMPLETE',
    'O_ACCEPTED_COMPLETE',
    'A_APPROVED_COMPLETE',
    'A_REGISTERED_COMPLETE',
    'A_ACTIVATED_COMPLETE',
    'W_Valideren aanvraag_COMPLETE',
]
input_resources = [
    '<SOS>',
    '112',
    '112',
    '112',
    '11180',
    '11201',
    '11201',
    '11201',
    '11201',
    '11201',
    '11201',
    '11201',
    '11049',
    '11049',
    '10629',
    '10629',
    '10629',
    '10629',
    '10629',
]
# Amount attached to the reference trace; it is interpolated into the result
# file names below.
input_amount = 15500.0

In [3]:
def pad_length(x, to_length, padding_value=None):
    """Return a new list: `x` followed by enough `padding_value` entries to reach `to_length`.

    If `x` already has `to_length` or more items, the result is an unpadded
    copy of `x` (a non-positive repeat count yields an empty filler list).
    """
    missing = to_length - len(x)
    filler = [padding_value] * missing
    return x + filler

def get_L2(x, y):
    """Return the square root of the number of positions where `x` and `y` differ.

    The two sequences are compared position by position. When one is shorter,
    every position that exists only in the longer one counts as a difference.

    Args:
        x: First sequence (any iterable of comparable items).
        y: Second sequence (any iterable of comparable items).

    Returns:
        float: `sqrt(number of differing positions)`; `0.0` for identical input.
    """
    # A fresh object is used as the filler so that a real element (including a
    # genuine None) can never compare equal to padding.
    filler = object()
    mismatches = sum(
        0 if left == right else 1
        for left, right in zip_longest(x, y, fillvalue=filler)
    )
    return mismatches ** (1/2)


In [4]:
# Evaluated result frames, keyed by result file name (filled by the evaluation loop).
all_results = dict()

In [5]:
all_ground_truth_vocabs = ["A_ACCEPTED_COMPLETE", "A_APPROVED_COMPLETE", "A_FINALIZED_COMPLETE"]


def _parse_vocab(raw):
    """Turn a stringified list from a result CSV cell back into a Python list.

    Single quotes are swapped for double quotes so the text can be read as JSON.
    NOTE(review): this breaks if a token itself contains a quote character;
    confirm the vocabularies never do.
    """
    return json.loads(raw.replace('\'', "\""))


# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs('./cf_searching_eval', exist_ok=True)

for ground_truth_vocab in all_ground_truth_vocabs:
    # The "milestone" is the prefix of the reference trace that ends just
    # before the first occurrence of the target activity.
    idx = input_activities.index(ground_truth_vocab)
    milestone_activities = input_activities[:idx]
    milestone_resources = input_resources[:idx]
    milestone_amount = input_amount

    # Three result files are evaluated per target activity.
    same_amount_file_name =  f'cf_{ground_truth_vocab}_Amount_{input_amount}_ReplaceAmount_None_result'
    amount_replace_file_name= f'cf_{ground_truth_vocab}_Amount_None_ReplaceAmount_{input_amount}_result'
    case_amonut_file_name = f'cf_{ground_truth_vocab}_Amount_None_ReplaceAmount_None_result'

    for file_name in [same_amount_file_name, amount_replace_file_name, case_amonut_file_name]:
        result_df = pd.read_csv(f'./cf_searching_result/{file_name}.csv')

        # Parse each stringified column once; both metrics reuse the result.
        parsed_activities = result_df['activity_vocab'].apply(_parse_vocab)
        parsed_resources = result_df['resource_vocab'].apply(_parse_vocab)

        # Sparsity: Levenshtein distance between the milestone and each row's sequence.
        result_df['activity_sparcity'] = parsed_activities.apply(lambda seq: textdistance.levenshtein.distance(milestone_activities, seq))
        result_df['resource_sparcity'] = parsed_resources.apply(lambda seq: textdistance.levenshtein.distance(milestone_resources, seq))
        result_df['sparcity'] = result_df['activity_sparcity'] + result_df['resource_sparcity']

        # Proximity: get_L2 distance between the milestone and each row's sequence.
        result_df['activity_proximity'] = parsed_activities.apply(lambda seq: get_L2(milestone_activities, seq))
        result_df['resource_proximity'] = parsed_resources.apply(lambda seq: get_L2(milestone_resources, seq))
        # Combine the two per-axis distances into a single L2 norm.
        result_df['proximity'] = ((result_df['activity_proximity']**2) + (result_df['resource_proximity']**2))**(1/2)

        result_df.to_csv(f'./cf_searching_eval/{file_name}_eval.csv')
        # Keep an independent copy so later changes to result_df cannot alter stored results.
        all_results[file_name] = result_df.copy(deep=True)

In [10]:
# Per-row metric columns that are pooled across all result files.
eval_cols = [
    'activity_proximity',
    'resource_proximity',
    'activity_sparcity',
    'resource_sparcity',
]

In [11]:
# One empty list per metric column; each key gets its own list object.
eval_results = {col: [] for col in eval_cols}

In [15]:
# Pool every metric column across all evaluated result frames.
# Each list is rebuilt from scratch rather than appended to, so re-running
# this cell cannot duplicate values that were already collected.
for col in eval_cols:
    eval_results[col] = [
        value
        for frame in all_results.values()
        for value in frame[col].tolist()
    ]

In [18]:
import numpy as np

In [20]:
# Report the mean of each pooled metric.
for metric, values in eval_results.items():
    print(f"{metric}: {np.mean(values)}")

activity_proximity: 2.3531804328705026
resource_proximity: 2.656455064429225
activity_sparcity: 5.4492207139265965
resource_sparcity: 8.294746103569633
