In [63]:
import pickle
import numpy as np
import os
import json
from sklearn.metrics import f1_score, accuracy_score

# Reptile

Note that the code below implements loading of data from new experiments. We also provide pickled data from running our experiments -- loading of this data is at the end of this notebook.

In [64]:
# Name of the model sub-directory that the loaders below look for inside each run.
model_name = "Reptile"

In [65]:
# Locations of the experiment outputs, relative to this notebook.
DATA_PATH = os.path.join(os.pardir, os.pardir, 'results', 'stability', 'mrpc')
# Predictions of the per-factor experiments.
FACTORS = os.path.join(DATA_PATH, 'factors', 'predictions')
# Predictions of the golden model.
GOLDEN = os.path.join(DATA_PATH, 'golden', 'predictions', 'golden_model')

In [8]:
# Collect the macro-F1 score of every golden-model evaluation.
# Directory layout walked: GOLDEN/<split>/<label>/<run>/<model_name>/<evaluation>/results.json
results = []   # one macro-F1 score (0..1) per evaluation
failed = 0     # number of evaluations with macro-F1 below 0.5
# NOTE(review): `all` shadows the Python builtin; the name is kept because the
# "Failed percentage" cell reads it under this name.
all = 0        # total number of evaluations

for split in os.listdir(GOLDEN):
    split_path = os.path.join(GOLDEN, split)
    # Skip stray files: os.listdir() on a non-directory raises NotADirectoryError.
    if not os.path.isdir(split_path):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not os.path.isdir(label_path):
            continue
        for run in os.listdir(label_path):
            run_path = os.path.join(label_path, run)
            model_path = os.path.join(run_path, model_name)
            # Runs without output for this model are skipped, as in the factor loaders.
            if not (os.path.isdir(run_path) and os.path.isdir(model_path)):
                continue
            for evaluation in os.listdir(model_path):
                evaluation_path = os.path.join(model_path, evaluation)
                if not os.path.isdir(evaluation_path):
                    continue
                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    data = json.load(file)
                # data['predictions'] holds two parallel label sequences
                # (presumably ground truth and model output -- TODO confirm order).
                score = f1_score(
                    np.array(data['predictions'][0]),
                    np.array(data['predictions'][1]),
                    average='macro',
                )
                results.append(score)
                if score < 0.5:
                    failed += 1
                all += 1

In [9]:
# Mean golden-model macro-F1, expressed in percent.
100 * np.mean(results)

61.05549650382373

In [10]:
# Standard deviation of the golden-model macro-F1, expressed in percent.
100 * np.std(results)

5.698015853615155

In [12]:
# Share of golden-model evaluations whose macro-F1 fell below 0.5.
golden_failed_ratio = failed / all
print(f"Failed percentage of runs: {golden_failed_ratio * 100}%")

Failed percentage of runs: 5.26%


## Load Data

#### Data Split

In [13]:
# Collect macro-F1 scores for the "data split" factor.
# Layout walked: <factor_path>/split_*/label_*/run_*/<model_name>/evaluation_*/results.json
# 'failed' counts evaluations with macro-F1 below 0.5; 'all' counts every evaluation.
data_split_results = {'results': [], 'failed': 0, 'all': 0}

factor_path = os.path.join(FACTORS, 'data_split')

for split in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split)
    if not (split.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not (label.startswith('label_') and os.path.isdir(label_path)):
            continue
        for run in os.listdir(label_path):
            run_path = os.path.join(label_path, run)
            model_path = os.path.join(run_path, model_name)
            if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                continue
            for evaluation in os.listdir(model_path):
                evaluation_path = os.path.join(model_path, evaluation)
                if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                    continue
                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    data = json.load(file)
                predictions = data['predictions']
                score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                # The numeric ids are parsed from the directory names ("split_3" -> 3).
                data_split_results['results'].append({
                    'score': score,
                    'split': int(split.split('_')[1]),
                    'label': int(label.split('_')[1]),
                    'run': int(run.split('_')[1]),
                    'adaptation': int(evaluation.split('_')[1]),
                })
                if score < 0.5:
                    data_split_results['failed'] += 1
                data_split_results['all'] += 1

In [14]:
# (number of collected results, number of runs with macro-F1 below 0.5)
(
    len(data_split_results['results']),
    data_split_results['failed'],
)

(10000, 572)

#### Label Selection

In [15]:
# Collect macro-F1 scores for the "label selection" factor.
# Layout walked: <factor_path>/label_*/split_*/run_*/<model_name>/evaluation_*/results.json
# 'failed' counts evaluations with macro-F1 below 0.5; 'all' counts every evaluation.
label_results = {'results': [], 'failed': 0, 'all': 0}

factor_path = os.path.join(FACTORS, 'label_selection')

for label in os.listdir(factor_path):
    label_path = os.path.join(factor_path, label)
    if not (label.startswith('label_') and os.path.isdir(label_path)):
        continue
    for split in os.listdir(label_path):
        split_path = os.path.join(label_path, split)
        if not (split.startswith('split_') and os.path.isdir(split_path)):
            continue
        for run in os.listdir(split_path):
            run_path = os.path.join(split_path, run)
            model_path = os.path.join(run_path, model_name)
            if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                continue
            for evaluation in os.listdir(model_path):
                evaluation_path = os.path.join(model_path, evaluation)
                if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                    continue
                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    data = json.load(file)
                predictions = data['predictions']
                score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                # The numeric ids are parsed from the directory names ("label_3" -> 3).
                label_results['results'].append({
                    'score': score,
                    'split': int(split.split('_')[1]),
                    'label': int(label.split('_')[1]),
                    'run': int(run.split('_')[1]),
                    'adaptation': int(evaluation.split('_')[1]),
                })
                if score < 0.5:
                    label_results['failed'] += 1
                label_results['all'] += 1

In [16]:
# (number of collected results, number of runs with macro-F1 below 0.5)
(
    len(label_results['results']),
    label_results['failed'],
)

(10000, 654)

#### Choice of Adaptation Data

In [None]:
# Collect macro-F1 scores for the "choice of adaptation data" factor.
# Layout walked: <factor_path>/split_*/label_*/run_*/<model_name>/evaluation_*/results.json
# 'failed' counts evaluations with macro-F1 below 0.5; 'all' counts every evaluation.
adaptation_results = {'results': [], 'failed': 0, 'all': 0}

factor_path = os.path.join(FACTORS, 'model_adaptation')

for split in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split)
    if not (split.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not (label.startswith('label_') and os.path.isdir(label_path)):
            continue
        for run in os.listdir(label_path):
            run_path = os.path.join(label_path, run)
            model_path = os.path.join(run_path, model_name)
            if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                continue
            for evaluation in os.listdir(model_path):
                evaluation_path = os.path.join(model_path, evaluation)
                if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                    continue
                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    data = json.load(file)
                predictions = data['predictions']
                score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                # The numeric ids are parsed from the directory names ("evaluation_3" -> 3).
                adaptation_results['results'].append({
                    'score': score,
                    'split': int(split.split('_')[1]),
                    'label': int(label.split('_')[1]),
                    'run': int(run.split('_')[1]),
                    'adaptation': int(evaluation.split('_')[1]),
                })
                if score < 0.5:
                    adaptation_results['failed'] += 1
                adaptation_results['all'] += 1

In [None]:
# (number of collected results, number of runs with macro-F1 below 0.5)
(
    len(adaptation_results['results']),
    adaptation_results['failed'],
)

##### Stable

In [66]:
# Collect macro-F1 scores of the "adaptation_stable" experiment.
# Layout walked: <factor_path>/split_*/label_*/run_*/<model_name>/evaluation_*/results.json
# 'failed' counts evaluations with macro-F1 below 0.5; 'all' counts every evaluation.
stable_adaptation_results = {'results': [], 'failed': 0, 'all': 0}

factor_path = os.path.join(DATA_PATH, 'adaptation_stable', 'predictions', 'model_adaptation')

for split in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split)
    if not (split.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not (label.startswith('label_') and os.path.isdir(label_path)):
            continue
        for run in os.listdir(label_path):
            run_path = os.path.join(label_path, run)
            model_path = os.path.join(run_path, model_name)
            if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                continue
            for evaluation in os.listdir(model_path):
                evaluation_path = os.path.join(model_path, evaluation)
                if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                    continue
                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    data = json.load(file)
                predictions = data['predictions']
                score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                # The numeric ids are parsed from the directory names ("evaluation_3" -> 3).
                stable_adaptation_results['results'].append({
                    'score': score,
                    'split': int(split.split('_')[1]),
                    'label': int(label.split('_')[1]),
                    'run': int(run.split('_')[1]),
                    'adaptation': int(evaluation.split('_')[1]),
                })
                if score < 0.5:
                    stable_adaptation_results['failed'] += 1
                stable_adaptation_results['all'] += 1

In [67]:
# (number of collected results, number of runs with macro-F1 below 0.5)
(
    len(stable_adaptation_results['results']),
    stable_adaptation_results['failed'],
)

(100000, 5635)

##### Unstable

In [68]:
# Collect macro-F1 scores of the "adaptation_unstable" experiment.
# Layout walked: <factor_path>/split_*/label_*/run_*/<model_name>/evaluation_*/results.json
# 'failed' counts evaluations with macro-F1 below 0.5; 'all' counts every evaluation.
unstable_adaptation_results = {'results': [], 'failed': 0, 'all': 0}

# Number of results.json files that could not be decoded and were skipped.
skipped_files = 0

factor_path = os.path.join(DATA_PATH, 'adaptation_unstable', 'predictions', 'model_adaptation')

for split in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split)
    if not (split.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not (label.startswith('label_') and os.path.isdir(label_path)):
            continue
        for run in os.listdir(label_path):
            run_path = os.path.join(label_path, run)
            model_path = os.path.join(run_path, model_name)
            if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                continue
            for evaluation in os.listdir(model_path):
                evaluation_path = os.path.join(model_path, evaluation)
                if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                    continue
                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    try:
                        data = json.load(file)
                    except (json.JSONDecodeError, UnicodeDecodeError):
                        # Best-effort loading: an undecodable results file is skipped.
                        # Only decode errors are caught, so KeyboardInterrupt and
                        # genuine bugs still surface instead of being swallowed.
                        skipped_files += 1
                        continue
                predictions = data['predictions']
                score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                # The numeric ids are parsed from the directory names ("evaluation_3" -> 3).
                unstable_adaptation_results['results'].append({
                    'score': score,
                    'split': int(split.split('_')[1]),
                    'label': int(label.split('_')[1]),
                    'run': int(run.split('_')[1]),
                    'adaptation': int(evaluation.split('_')[1]),
                })
                if score < 0.5:
                    unstable_adaptation_results['failed'] += 1
                unstable_adaptation_results['all'] += 1

# Make dropped files visible rather than silently shrinking the sample.
if skipped_files:
    print(f"Skipped {skipped_files} unreadable results.json file(s)")

In [69]:
# (number of collected results, number of runs with macro-F1 below 0.5)
(
    len(unstable_adaptation_results['results']),
    unstable_adaptation_results['failed'],
)

(100000, 83073)

#### Initialisation of Model

In [21]:
# Collect macro-F1 scores for the "model initialisation" factor.
# Layout walked:
#   <factor_path>/split_*/label_*/init_*/run_*/<model_name>/evaluation_*/results.json
# 'failed' counts evaluations with macro-F1 below 0.5; 'all' counts every evaluation.
initialisation_results = {'results': [], 'failed': 0, 'all': 0}

factor_path = os.path.join(FACTORS, 'model_initialisation')

for split in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split)
    if not (split.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not (label.startswith('label_') and os.path.isdir(label_path)):
            continue
        for initialisation in os.listdir(label_path):
            initialisation_path = os.path.join(label_path, initialisation)
            if not (initialisation.startswith('init_') and os.path.isdir(initialisation_path)):
                continue
            for run in os.listdir(initialisation_path):
                run_path = os.path.join(initialisation_path, run)
                model_path = os.path.join(run_path, model_name)
                if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                    continue
                for evaluation in os.listdir(model_path):
                    evaluation_path = os.path.join(model_path, evaluation)
                    if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                        continue
                    with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                        data = json.load(file)
                    predictions = data['predictions']
                    score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                    # The numeric ids are parsed from the directory names ("init_3" -> 3).
                    initialisation_results['results'].append({
                        'score': score,
                        'split': int(split.split('_')[1]),
                        'label': int(label.split('_')[1]),
                        'run': int(run.split('_')[1]),
                        'adaptation': int(evaluation.split('_')[1]),
                        'initialisation': int(initialisation.split('_')[1]),
                    })
                    if score < 0.5:
                        initialisation_results['failed'] += 1
                    initialisation_results['all'] += 1

In [22]:
# (number of collected results, number of runs with macro-F1 below 0.5)
(
    len(initialisation_results['results']),
    initialisation_results['failed'],
)

(20000, 1413)

#### Order of Train Data

In [23]:
# Collect macro-F1 scores for the "order of train data" factor.
# Layout walked:
#   <factor_path>/split_*/label_*/data_order_*/run_*/<model_name>/evaluation_*/results.json
# 'failed' counts evaluations with macro-F1 below 0.5; 'all' counts every evaluation.
order_results = {'results': [], 'failed': 0, 'all': 0}

factor_path = os.path.join(FACTORS, 'data_order')

for split in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split)
    if not (split.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not (label.startswith('label_') and os.path.isdir(label_path)):
            continue
        for order in os.listdir(label_path):
            order_path = os.path.join(label_path, order)
            if not (order.startswith('data_order_') and os.path.isdir(order_path)):
                continue
            for run in os.listdir(order_path):
                run_path = os.path.join(order_path, run)
                model_path = os.path.join(run_path, model_name)
                if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                    continue
                for evaluation in os.listdir(model_path):
                    evaluation_path = os.path.join(model_path, evaluation)
                    if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                        continue
                    with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                        data = json.load(file)
                    predictions = data['predictions']
                    score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                    # The numeric ids are parsed from the directory names; "data_order_3"
                    # has two underscores, hence index 2 for the order id.
                    order_results['results'].append({
                        'score': score,
                        'split': int(split.split('_')[1]),
                        'label': int(label.split('_')[1]),
                        'run': int(run.split('_')[1]),
                        'adaptation': int(evaluation.split('_')[1]),
                        'order': int(order.split('_')[2]),
                    })
                    if score < 0.5:
                        order_results['failed'] += 1
                    order_results['all'] += 1

In [24]:
# (number of collected results, number of runs with macro-F1 below 0.5)
(
    len(order_results['results']),
    order_results['failed'],
)

(20000, 15664)

## Compare Factors

### Aggregation by investigated factor

In this part we use the aggregation by the main investigated factor in the following way:
- select runs where the value of factors only differ in the investigated factor (non-investigated factors have the same value; investigated has 10 values)
- calculate mean and standard deviation across the values of investigated factor
- results in ~10 000 values of mean and standard deviation
- calculate the final performance values as a mean of the pre-calculated many mean values
- calculate the instability of factor by calculating mean of the pre-calculated standard deviations

#### Data Split

In [43]:
# Bucket the data-split scores (in percent) by the values of the other factors
# (label, run, adaptation), so the scores within a bucket differ in the split.
data_split_by_other_factors = {}
overall_score = []

for result in data_split_results['results']:
    key = f"label_{result['label']}-run_{result['run']}-adaptation_{result['adaptation']}"
    score = result['score'] * 100
    data_split_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# (mean, std, min, max) over every individual score
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(60.54295859257066, 5.923228524932688, 24.675324675324678, 68.53550070004127)

In [44]:
# Per-bucket mean and standard deviation of the data-split scores.
aggregated_data_split = {
    'mean': [np.mean(scores) for scores in data_split_by_other_factors.values()],
    'std': [np.std(scores) for scores in data_split_by_other_factors.values()],
}

print(f"Investigated factor mean: {np.mean(aggregated_data_split['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_data_split['std'])}")
print(f"Other factors deviation: {np.std(aggregated_data_split['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_data_split['std'])}")

Investigated factor mean: 60.54295859257067
Investigated factor deviation: 4.7404534104497
Other factors deviation: 1.776239138634637
Variability of factor deviation: 3.0753393543338534


In [45]:
# Share of data-split runs whose macro-F1 fell below 0.5.
data_split_failed_ratio = data_split_results['failed'] / data_split_results['all']
print(f"Failed percentage of runs: {data_split_failed_ratio * 100}%")

Failed percentage of runs: 5.72%


#### Label Selection

In [46]:
# Bucket the label-selection scores (in percent) by the values of the other factors
# (split, run, adaptation), so the scores within a bucket differ in the label selection.
label_by_other_factors = {}
overall_score = []

for result in label_results['results']:
    key = f"split_{result['split']}-run_{result['run']}-adaptation_{result['adaptation']}"
    score = result['score'] * 100
    label_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# (mean, std, min, max) over every individual score
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(60.30079517956267, 5.8942833690102665, 24.675324675324678, 66.98310018597881)

In [47]:
# Per-bucket mean and standard deviation of the label-selection scores.
aggregated_label = {
    'mean': [np.mean(scores) for scores in label_by_other_factors.values()],
    'std': [np.std(scores) for scores in label_by_other_factors.values()],
}

print(f"Investigated factor mean: {np.mean(aggregated_label['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_label['std'])}")
print(f"Other factors deviation: {np.std(aggregated_label['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_label['std'])}")

Investigated factor mean: 60.300795179562655
Investigated factor deviation: 4.744593395544843
Other factors deviation: 1.818779997891091
Variability of factor deviation: 2.9872142983747523


In [48]:
# Share of label-selection runs whose macro-F1 fell below 0.5.
label_failed_ratio = label_results['failed'] / label_results['all']
print(f"Failed percentage of runs: {label_failed_ratio * 100}%")

Failed percentage of runs: 6.54%


#### Choice of Adaptation Data

In [None]:
# Bucket the adaptation scores (in percent) by the values of the other factors
# (split, label, run), so the scores within a bucket differ in the adaptation data.
adaptation_by_other_factors = {}
overall_score = []

for result in adaptation_results['results']:
    key = f"split_{result['split']}-label_{result['label']}-run_{result['run']}"
    score = result['score'] * 100
    adaptation_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# (mean, std, min, max) over every individual score
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

In [None]:
# Per-bucket mean and standard deviation of the adaptation scores.
aggregated_adaptation = {
    'mean': [np.mean(scores) for scores in adaptation_by_other_factors.values()],
    'std': [np.std(scores) for scores in adaptation_by_other_factors.values()],
}

print(f"Investigated factor mean: {np.mean(aggregated_adaptation['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_adaptation['std'])}")
print(f"Other factors deviation: {np.std(aggregated_adaptation['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_adaptation['std'])}")

In [None]:
# Share of adaptation runs whose macro-F1 fell below 0.5.
adaptation_failed_ratio = adaptation_results['failed'] / adaptation_results['all']
print(f"Failed percentage of runs: {adaptation_failed_ratio * 100}%")

##### Stable

In [70]:
# Bucket the stable-adaptation scores (in percent) by the values of the other factors
# (split, label, run), so the scores within a bucket differ in the adaptation data.
stable_adaptation_by_other_factors = {}
overall_score = []

for result in stable_adaptation_results['results']:
    key = f"split_{result['split']}-label_{result['label']}-run_{result['run']}"
    score = result['score'] * 100
    stable_adaptation_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# (mean, std, min, max) over every individual score
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(60.77485535318761, 5.6262345428846965, 24.675324675324678, 68.40454931972789)

In [71]:
# Per-bucket mean and standard deviation of the stable-adaptation scores.
aggregated_stable_adaptation = {
    'mean': [np.mean(scores) for scores in stable_adaptation_by_other_factors.values()],
    'std': [np.std(scores) for scores in stable_adaptation_by_other_factors.values()],
}

print(f"Investigated factor mean: {np.mean(aggregated_stable_adaptation['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_stable_adaptation['std'])}")
print(f"Other factors deviation: {np.std(aggregated_stable_adaptation['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_stable_adaptation['std'])}")

Investigated factor mean: 60.77485535318761
Investigated factor deviation: 4.220975512627844
Other factors deviation: 2.6119980766724096
Variability of factor deviation: 2.6486500147820053


In [72]:
# Share of stable-adaptation runs whose macro-F1 fell below 0.5.
stable_failed_ratio = stable_adaptation_results['failed'] / stable_adaptation_results['all']
print(f"Failed percentage of runs: {stable_failed_ratio * 100}%")

Failed percentage of runs: 5.635%


##### Unstable

In [73]:
# Bucket the unstable-adaptation scores (in percent) by the values of the other factors
# (split, label, run), so the scores within a bucket differ in the adaptation data.
unstable_adaptation_by_other_factors = {}
overall_score = []

for result in unstable_adaptation_results['results']:
    key = f"split_{result['split']}-label_{result['label']}-run_{result['run']}"
    score = result['score'] * 100
    unstable_adaptation_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# (mean, std, min, max) over every individual score
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(39.381341788386166, 11.119234588680872, 24.57737321196359, 67.53805520272586)

In [74]:
# Per-bucket mean and standard deviation of the unstable-adaptation scores.
aggregated_unstable_adaptation = {
    'mean': [np.mean(scores) for scores in unstable_adaptation_by_other_factors.values()],
    'std': [np.std(scores) for scores in unstable_adaptation_by_other_factors.values()],
}

print(f"Investigated factor mean: {np.mean(aggregated_unstable_adaptation['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_unstable_adaptation['std'])}")
print(f"Other factors deviation: {np.std(aggregated_unstable_adaptation['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_unstable_adaptation['std'])}")

Investigated factor mean: 39.381341788386166
Investigated factor deviation: 10.99172784348871
Other factors deviation: 1.4265697685352652
Variability of factor deviation: 0.8855481626051485


In [75]:
# Share of unstable-adaptation runs whose macro-F1 fell below 0.5.
unstable_failed_ratio = unstable_adaptation_results['failed'] / unstable_adaptation_results['all']
print(f"Failed percentage of runs: {unstable_failed_ratio * 100}%")

Failed percentage of runs: 83.073%


#### Initialisation of Model

In [55]:
# Bucket the initialisation scores (in percent) by the values of the other factors
# (split, label, run, adaptation), so the scores within a bucket differ in the initialisation.
initialisation_by_other_factors = {}
overall_score = []

for result in initialisation_results['results']:
    key = f"split_{result['split']}-label_{result['label']}-run_{result['run']}-adaptation_{result['adaptation']}"
    score = result['score'] * 100
    initialisation_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# (mean, std, min, max) over every individual score
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(60.037051839610186, 6.425176463538835, 24.675324675324678, 68.36340317004795)

In [56]:
# Per-bucket mean and standard deviation of the initialisation scores.
aggregated_initialisation = {
    'mean': [np.mean(scores) for scores in initialisation_by_other_factors.values()],
    'std': [np.std(scores) for scores in initialisation_by_other_factors.values()],
}

print(f"Investigated factor mean: {np.mean(aggregated_initialisation['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_initialisation['std'])}")
print(f"Other factors deviation: {np.std(aggregated_initialisation['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_initialisation['std'])}")

Investigated factor mean: 60.037051839610186
Investigated factor deviation: 4.579839484694967
Other factors deviation: 2.690200158541821
Variability of factor deviation: 3.6153541996634186


In [57]:
# Share of initialisation runs whose macro-F1 fell below 0.5.
initialisation_failed_ratio = initialisation_results['failed'] / initialisation_results['all']
print(f"Failed percentage of runs: {initialisation_failed_ratio * 100}%")

Failed percentage of runs: 7.065%


#### Order of Train Data

In [58]:
# Bucket the data-order scores (in percent) by the values of the other factors
# (split, label, run, adaptation), so the scores within a bucket differ in the data order.
order_by_other_factors = {}
overall_score = []

for result in order_results['results']:
    key = f"split_{result['split']}-label_{result['label']}-run_{result['run']}-adaptation_{result['adaptation']}"
    score = result['score'] * 100
    order_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# (mean, std, min, max) over every individual score
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(39.86991040381419, 12.331980466367302, 24.57737321196359, 68.62709634850819)

In [59]:
# Per-bucket mean and standard deviation of the data-order scores.
aggregated_order = {
    'mean': [np.mean(scores) for scores in order_by_other_factors.values()],
    'std': [np.std(scores) for scores in order_by_other_factors.values()],
}

print(f"Investigated factor mean: {np.mean(aggregated_order['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_order['std'])}")
print(f"Other factors deviation: {np.std(aggregated_order['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_order['std'])}")

Investigated factor mean: 39.869910403814195
Investigated factor deviation: 11.445580227359514
Other factors deviation: 3.9917955156166793
Variability of factor deviation: 2.26759873951372


In [60]:
# Share of data-order runs whose macro-F1 fell below 0.5.
order_failed_ratio = order_results['failed'] / order_results['all']
print(f"Failed percentage of runs: {order_failed_ratio * 100}%")

Failed percentage of runs: 78.32000000000001%


# Save Data

In [61]:
# Directory holding the pickled result bundles, relative to this notebook.
PICKLE_PATH = os.path.join(os.pardir, os.pardir, 'pickled', 'mrpc')

In [62]:
# Bundle every collected result set and pickle it to PICKLE_PATH/Reptile-data.
# The payload is built BEFORE the file is opened: opening with 'wb' truncates the
# existing pickle, so an undefined name (e.g. a loader cell that was not run) must
# raise before the previously saved data is destroyed.
pickle_payload = {
    'golden': results,
    'split': data_split_results,
    'label': label_results,
    'initialisation': initialisation_results,
    'order': order_results,
    'adaptation': adaptation_results,
    's_adaptation': stable_adaptation_results,
    'u_adaptation': unstable_adaptation_results,
}

with open(os.path.join(PICKLE_PATH, 'Reptile-data'), 'wb') as file:
    pickle.dump(pickle_payload, file)

In [None]:
# Restore the result sets written by the save cell.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open(os.path.join(PICKLE_PATH, 'Reptile-data'), 'rb') as file:
    pickled = pickle.load(file)

results = pickled['golden']
data_split_results = pickled['split']
label_results = pickled['label']
initialisation_results = pickled['initialisation']
order_results = pickled['order']
# The save cell stores this entry under 'adaptation'; the previous key
# 'adaptation_results' does not exist in the bundle and raised KeyError.
adaptation_results = pickled['adaptation']
stable_adaptation_results = pickled['s_adaptation']
unstable_adaptation_results = pickled['u_adaptation']