In [1]:
import pickle
import numpy as np
import os
import json
from sklearn.metrics import f1_score, accuracy_score

# Reptile

Note that the code below implements loading of data from new experiments. We also provide pickled data from running our experiments -- loading of this data is at the end of this notebook.

In [3]:
# Name of the model sub-directory that is read inside every run folder below.
model_name = 'Reptile'

In [None]:
# Root of the 'cola' stability results, given relative to the working directory.
DATA_PATH = os.path.join('..', '..', 'results', 'stability', 'cola')
# Directory holding one sub-directory per investigated factor.
FACTORS = os.path.join(DATA_PATH, 'factors', 'predictions')
# Directory holding the golden-model predictions.
GOLDEN = os.path.join(DATA_PATH, 'golden', 'predictions', 'golden_model')

In [5]:
# Collect the macro-F1 score of every golden-model evaluation found under
# GOLDEN/<split>/<label>/<run>/<model_name>/<evaluation>/results.json.
results = []  # one macro-F1 score per evaluation
failed = 0    # number of evaluations scoring below 0.5
# NOTE: `all` shadows the Python builtin for the rest of the notebook. The name
# is kept because the failure-percentage cell below reads it.
all = 0

for split in os.listdir(GOLDEN):
    split_path = os.path.join(GOLDEN, split)
    # Stray files at this level would make os.listdir raise NotADirectoryError.
    if not os.path.isdir(split_path):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not os.path.isdir(label_path):
            continue
        for run in os.listdir(label_path):
            model_path = os.path.join(label_path, run, model_name)
            # Runs without a directory for this model are skipped instead of
            # raising FileNotFoundError, as the per-factor loaders in this
            # notebook already do.
            if not os.path.isdir(model_path):
                continue
            for evaluation in os.listdir(model_path):
                evaluation_path = os.path.join(model_path, evaluation)
                if not os.path.isdir(evaluation_path):
                    continue
                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    data = json.load(file)
                # The score is computed from the first two entries stored under 'predictions'.
                score = f1_score(np.array(data['predictions'][0]), np.array(data['predictions'][1]), average='macro')
                results.append(score)
                if score < 0.5:
                    failed += 1
                all += 1

In [6]:
# Mean macro-F1 of the golden-model evaluations, scaled to a percentage.
np.mean(results) * 100

57.167725115980986

In [7]:
# Standard deviation of the golden-model macro-F1 scores, scaled to a percentage.
np.std(results) * 100

10.501107030589822

In [8]:
# Share of golden-model evaluations that scored below 0.5 (`failed` out of `all`).
print(f"Failed percentage of runs: {failed / all * 100}%")

Failed percentage of runs: 13.81%


## Load Data

#### Data Split

In [10]:
# Gather macro-F1 scores for the data-split factor by walking
# split_*/label_*/run_*/<model_name>/evaluation_* under FACTORS/data_split.
data_split_results = {'results': [], 'failed': 0, 'all': 0}

factor_path = os.path.join(FACTORS, 'data_split')

for split in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split)
    if not (split.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not (label.startswith('label_') and os.path.isdir(label_path)):
            continue
        for run in os.listdir(label_path):
            run_path = os.path.join(label_path, run)
            model_path = os.path.join(run_path, model_name)
            # Runs without a directory for this model are ignored.
            if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                continue
            for evaluation in os.listdir(model_path):
                evaluation_path = os.path.join(model_path, evaluation)
                if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                    continue

                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    data = json.load(file)
                predictions = data['predictions']
                score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                # Each directory name carries its numeric id after the first underscore.
                data_split_results['results'].append({
                    'score': score,
                    'split': int(split.split('_')[1]),
                    'label': int(label.split('_')[1]),
                    'run': int(run.split('_')[1]),
                    'adaptation': int(evaluation.split('_')[1]),
                })
                data_split_results['all'] += 1
                if score < 0.5:
                    data_split_results['failed'] += 1

In [11]:
# Number of loaded data-split evaluations and how many of them scored below 0.5.
len(data_split_results['results']), data_split_results['failed']

(10000, 1411)

#### Label Selection

In [12]:
# Gather macro-F1 scores for the label-selection factor. Here the directory
# nesting is label_*/split_*/run_*/<model_name>/evaluation_*.
label_results = {'results': [], 'failed': 0, 'all': 0}

factor_path = os.path.join(FACTORS, 'label_selection')

for label in os.listdir(factor_path):
    label_path = os.path.join(factor_path, label)
    if not (label.startswith('label_') and os.path.isdir(label_path)):
        continue
    for split in os.listdir(label_path):
        split_path = os.path.join(label_path, split)
        if not (split.startswith('split_') and os.path.isdir(split_path)):
            continue
        for run in os.listdir(split_path):
            run_path = os.path.join(split_path, run)
            model_path = os.path.join(run_path, model_name)
            # Runs without a directory for this model are ignored.
            if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                continue
            for evaluation in os.listdir(model_path):
                evaluation_path = os.path.join(model_path, evaluation)
                if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                    continue

                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    data = json.load(file)
                predictions = data['predictions']
                score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                # Each directory name carries its numeric id after the first underscore.
                label_results['results'].append({
                    'score': score,
                    'split': int(split.split('_')[1]),
                    'label': int(label.split('_')[1]),
                    'run': int(run.split('_')[1]),
                    'adaptation': int(evaluation.split('_')[1]),
                })
                label_results['all'] += 1
                if score < 0.5:
                    label_results['failed'] += 1

In [13]:
# Number of loaded label-selection evaluations and how many of them scored below 0.5.
len(label_results['results']), label_results['failed']

(10000, 1645)

#### Choice of Adaptation Data

In [14]:
# Gather macro-F1 scores for the adaptation-data factor by walking
# split_*/label_*/run_*/<model_name>/evaluation_* under FACTORS/model_adaptation.
adaptation_results = {'results': [], 'failed': 0, 'all': 0}

factor_path = os.path.join(FACTORS, 'model_adaptation')

for split in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split)
    if not (split.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not (label.startswith('label_') and os.path.isdir(label_path)):
            continue
        for run in os.listdir(label_path):
            run_path = os.path.join(label_path, run)
            model_path = os.path.join(run_path, model_name)
            # Runs without a directory for this model are ignored.
            if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                continue
            for evaluation in os.listdir(model_path):
                evaluation_path = os.path.join(model_path, evaluation)
                if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                    continue

                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    data = json.load(file)
                predictions = data['predictions']
                score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                # Each directory name carries its numeric id after the first underscore.
                adaptation_results['results'].append({
                    'score': score,
                    'split': int(split.split('_')[1]),
                    'label': int(label.split('_')[1]),
                    'run': int(run.split('_')[1]),
                    'adaptation': int(evaluation.split('_')[1]),
                })
                adaptation_results['all'] += 1
                if score < 0.5:
                    adaptation_results['failed'] += 1

In [15]:
# Number of loaded adaptation evaluations and how many of them scored below 0.5.
len(adaptation_results['results']), adaptation_results['failed']

(10000, 9722)

##### Stable

In [16]:
# Gather macro-F1 scores from the 'adaptation_stable' experiment by walking
# split_*/label_*/run_*/<model_name>/evaluation_* under its model_adaptation folder.
stable_adaptation_results = {'results': [], 'failed': 0, 'all': 0}

factor_path = os.path.join(DATA_PATH, 'adaptation_stable', 'predictions', 'model_adaptation')

for split in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split)
    if not (split.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not (label.startswith('label_') and os.path.isdir(label_path)):
            continue
        for run in os.listdir(label_path):
            run_path = os.path.join(label_path, run)
            model_path = os.path.join(run_path, model_name)
            # Runs without a directory for this model are ignored.
            if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                continue
            for evaluation in os.listdir(model_path):
                evaluation_path = os.path.join(model_path, evaluation)
                if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                    continue

                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    data = json.load(file)
                predictions = data['predictions']
                score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                # Each directory name carries its numeric id after the first underscore.
                stable_adaptation_results['results'].append({
                    'score': score,
                    'split': int(split.split('_')[1]),
                    'label': int(label.split('_')[1]),
                    'run': int(run.split('_')[1]),
                    'adaptation': int(evaluation.split('_')[1]),
                })
                stable_adaptation_results['all'] += 1
                if score < 0.5:
                    stable_adaptation_results['failed'] += 1

In [17]:
# Number of loaded stable-adaptation evaluations and how many of them scored below 0.5.
len(stable_adaptation_results['results']), stable_adaptation_results['failed']

(100000, 2687)

##### Unstable

In [18]:
# Gather macro-F1 scores from the 'adaptation_unstable' experiment by walking
# split_*/label_*/run_*/<model_name>/evaluation_* under its model_adaptation folder.
# Result files that cannot be decoded are skipped (best effort), but they are
# recorded and reported instead of being silently dropped.
unstable_adaptation_results = {
    'results': [],
    'failed': 0,
    'all': 0
}
unstable_unreadable_files = []  # paths of results.json files that failed to decode

factor_path = os.path.join(DATA_PATH, 'adaptation_unstable', 'predictions', 'model_adaptation')

for split in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split)
    if not (split.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not (label.startswith('label_') and os.path.isdir(label_path)):
            continue
        for run in os.listdir(label_path):
            run_path = os.path.join(label_path, run)
            model_path = os.path.join(run_path, model_name)
            # Runs without a directory for this model are ignored.
            if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                continue
            for evaluation in os.listdir(model_path):
                evaluation_path = os.path.join(model_path, evaluation)
                if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                    continue

                results_file = os.path.join(evaluation_path, 'results.json')
                with open(results_file, 'r') as file:
                    try:
                        data = json.load(file)
                    # Catch only decoding problems; a bare `except:` would also
                    # hide KeyboardInterrupt and genuine programming errors.
                    except (json.JSONDecodeError, UnicodeDecodeError):
                        unstable_unreadable_files.append(results_file)
                        continue
                score = f1_score(np.array(data['predictions'][0]), np.array(data['predictions'][1]), average='macro')

                # Each directory name carries its numeric id after the first underscore.
                split_number = int(split.split('_')[1])
                label_number = int(label.split('_')[1])
                run_number = int(run.split('_')[1])
                adaptation_number = int(evaluation.split('_')[1])

                unstable_adaptation_results['results'].append({
                    'score': score,
                    'split': split_number,
                    'label': label_number,
                    'run': run_number,
                    'adaptation': adaptation_number
                })
                if score < 0.5:
                    unstable_adaptation_results['failed'] += 1
                unstable_adaptation_results['all'] += 1

# Make skipped files visible so that missing data does not go unnoticed.
if unstable_unreadable_files:
    print(f"Skipped {len(unstable_unreadable_files)} unreadable results.json file(s)")

In [19]:
# Number of loaded unstable-adaptation evaluations and how many of them scored below 0.5.
len(unstable_adaptation_results['results']), unstable_adaptation_results['failed']

(100000, 96152)

#### Initialisation of Model

In [20]:
# Gather macro-F1 scores for the model-initialisation factor by walking
# split_*/label_*/init_*/run_*/<model_name>/evaluation_* under FACTORS/model_initialisation.
initialisation_results = {'results': [], 'failed': 0, 'all': 0}

factor_path = os.path.join(FACTORS, 'model_initialisation')

for split in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split)
    if not (split.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not (label.startswith('label_') and os.path.isdir(label_path)):
            continue
        for initialisation in os.listdir(label_path):
            initialisation_path = os.path.join(label_path, initialisation)
            if not (initialisation.startswith('init_') and os.path.isdir(initialisation_path)):
                continue
            for run in os.listdir(initialisation_path):
                run_path = os.path.join(initialisation_path, run)
                model_path = os.path.join(run_path, model_name)
                # Runs without a directory for this model are ignored.
                if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                    continue
                for evaluation in os.listdir(model_path):
                    evaluation_path = os.path.join(model_path, evaluation)
                    if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                        continue

                    with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                        data = json.load(file)
                    predictions = data['predictions']
                    score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                    # Each directory name carries its numeric id after the first underscore.
                    initialisation_results['results'].append({
                        'score': score,
                        'split': int(split.split('_')[1]),
                        'label': int(label.split('_')[1]),
                        'run': int(run.split('_')[1]),
                        'adaptation': int(evaluation.split('_')[1]),
                        'initialisation': int(initialisation.split('_')[1]),
                    })
                    initialisation_results['all'] += 1
                    if score < 0.5:
                        initialisation_results['failed'] += 1

In [21]:
# Number of loaded initialisation evaluations and how many of them scored below 0.5.
len(initialisation_results['results']), initialisation_results['failed']

(20000, 2832)

#### Order of Train Data

In [22]:
# Gather macro-F1 scores for the data-order factor by walking
# split_*/label_*/data_order_*/run_*/<model_name>/evaluation_* under FACTORS/data_order.
order_results = {'results': [], 'failed': 0, 'all': 0}

factor_path = os.path.join(FACTORS, 'data_order')

for split in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split)
    if not (split.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if not (label.startswith('label_') and os.path.isdir(label_path)):
            continue
        for order in os.listdir(label_path):
            order_path = os.path.join(label_path, order)
            if not (order.startswith('data_order_') and os.path.isdir(order_path)):
                continue
            for run in os.listdir(order_path):
                run_path = os.path.join(order_path, run)
                model_path = os.path.join(run_path, model_name)
                # Runs without a directory for this model are ignored.
                if not (run.startswith('run_') and os.path.isdir(run_path) and os.path.exists(model_path)):
                    continue
                for evaluation in os.listdir(model_path):
                    evaluation_path = os.path.join(model_path, evaluation)
                    if not (evaluation.startswith('evaluation_') and os.path.isdir(evaluation_path)):
                        continue

                    with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                        data = json.load(file)
                    predictions = data['predictions']
                    score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                    # Ids follow the first underscore, except for 'data_order_<n>'
                    # where the id is the third underscore-separated field.
                    order_results['results'].append({
                        'score': score,
                        'split': int(split.split('_')[1]),
                        'label': int(label.split('_')[1]),
                        'run': int(run.split('_')[1]),
                        'adaptation': int(evaluation.split('_')[1]),
                        'order': int(order.split('_')[2]),
                    })
                    order_results['all'] += 1
                    if score < 0.5:
                        order_results['failed'] += 1

In [23]:
# Number of loaded data-order evaluations and how many of them scored below 0.5.
len(order_results['results']), order_results['failed']

(20000, 1213)

## Compare Factors

### Aggregation by investigated factor

In this part we aggregate the results by the main investigated factor in the following way:
- select runs whose factor values differ only in the investigated factor (the non-investigated factors have the same value; the investigated factor has 10 values)
- calculate mean and standard deviation across the values of investigated factor
- results in ~10 000 values of mean and standard deviation
- calculate the final performance values as a mean of the pre-calculated many mean values
- calculate the instability of factor by calculating mean of the pre-calculated standard deviations

#### Data Split

In [45]:
# Group the percentage scores by label, run and adaptation. The split is left
# out of the key, so every group collects the scores obtained across splits.
data_split_by_other_factors = {}
overall_score = []

for result in data_split_results['results']:
    score = result['score'] * 100
    key = f"label_{result['label']}-run_{result['run']}-adaptation_{result['adaptation']}"
    data_split_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# Mean, standard deviation, minimum and maximum over all scores.
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(56.44589397698361, 10.240922173953663, 22.878970647366305, 66.11793179361038)

In [46]:
# Mean and standard deviation of the scores inside every group.
aggregated_data_split = {
    'mean': [np.mean(scores) for scores in data_split_by_other_factors.values()],
    'std': [np.std(scores) for scores in data_split_by_other_factors.values()],
}

# Summarise the per-group statistics across all groups.
print(f"Investigated factor mean: {np.mean(aggregated_data_split['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_data_split['std'])}")
print(f"Other factors deviation: {np.std(aggregated_data_split['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_data_split['std'])}")

Investigated factor mean: 56.445893976983605
Investigated factor deviation: 8.549640070907888
Other factors deviation: 3.1749591603067815
Variability of factor deviation: 4.658301832351148


In [47]:
# Share of data-split evaluations that scored below 0.5.
print(f"Failed percentage of runs: {data_split_results['failed'] / data_split_results['all'] * 100}%")

Failed percentage of runs: 14.11%


#### Label Selection

In [48]:
# Group the percentage scores by split, run and adaptation. The label is left
# out of the key, so every group collects the scores obtained across labels.
label_by_other_factors = {}
overall_score = []

for result in label_results['results']:
    score = result['score'] * 100
    key = f"split_{result['split']}-run_{result['run']}-adaptation_{result['adaptation']}"
    label_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# Mean, standard deviation, minimum and maximum over all scores.
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(56.15618759447901, 11.076961711167472, 22.909967845659164, 65.78783450286805)

In [49]:
# Mean and standard deviation of the scores inside every group.
aggregated_label = {
    'mean': [np.mean(scores) for scores in label_by_other_factors.values()],
    'std': [np.std(scores) for scores in label_by_other_factors.values()],
}

# Summarise the per-group statistics across all groups.
print(f"Investigated factor mean: {np.mean(aggregated_label['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_label['std'])}")
print(f"Other factors deviation: {np.std(aggregated_label['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_label['std'])}")

Investigated factor mean: 56.156187594479015
Investigated factor deviation: 9.482023299061868
Other factors deviation: 3.3975413408742807
Variability of factor deviation: 4.609449830919988


In [50]:
# Share of label-selection evaluations that scored below 0.5.
print(f"Failed percentage of runs: {label_results['failed'] / label_results['all'] * 100}%")

Failed percentage of runs: 16.45%


#### Choice of Adaptation Data

In [51]:
# Group the percentage scores by split, label and run. The adaptation id is left
# out of the key, so every group collects the scores obtained across adaptations.
adaptation_by_other_factors = {}
overall_score = []

for result in adaptation_results['results']:
    score = result['score'] * 100
    key = f"split_{result['split']}-label_{result['label']}-run_{result['run']}"
    adaptation_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# Mean, standard deviation, minimum and maximum over all scores.
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(35.00167380894774, 9.790735846128984, 22.75473217881595, 53.73433918894539)

In [52]:
# Mean and standard deviation of the scores inside every group.
aggregated_adaptation = {
    'mean': [np.mean(scores) for scores in adaptation_by_other_factors.values()],
    'std': [np.std(scores) for scores in adaptation_by_other_factors.values()],
}

# Summarise the per-group statistics across all groups.
print(f"Investigated factor mean: {np.mean(aggregated_adaptation['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_adaptation['std'])}")
print(f"Other factors deviation: {np.std(aggregated_adaptation['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_adaptation['std'])}")

Investigated factor mean: 35.00167380894774
Investigated factor deviation: 9.036489861529297
Other factors deviation: 3.47799031879031
Variability of factor deviation: 1.4504974090132399


In [53]:
# Share of adaptation evaluations that scored below 0.5.
print(f"Failed percentage of runs: {adaptation_results['failed'] / adaptation_results['all'] * 100}%")

Failed percentage of runs: 97.22%


##### Stable

In [54]:
# Group the percentage scores by split, label and run. The adaptation id is left
# out of the key, so every group collects the scores obtained across adaptations.
stable_adaptation_by_other_factors = {}
overall_score = []

for result in stable_adaptation_results['results']:
    score = result['score'] * 100
    key = f"split_{result['split']}-label_{result['label']}-run_{result['run']}"
    stable_adaptation_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# Mean, standard deviation, minimum and maximum over all scores.
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(60.00161994794071, 4.968874220054435, 22.878970647366305, 66.25381812949264)

In [55]:
# Mean and standard deviation of the scores inside every group.
aggregated_stable_adaptation = {
    'mean': [np.mean(scores) for scores in stable_adaptation_by_other_factors.values()],
    'std': [np.std(scores) for scores in stable_adaptation_by_other_factors.values()],
}

# Summarise the per-group statistics across all groups.
print(f"Investigated factor mean: {np.mean(aggregated_stable_adaptation['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_stable_adaptation['std'])}")
print(f"Other factors deviation: {np.std(aggregated_stable_adaptation['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_stable_adaptation['std'])}")

Investigated factor mean: 60.001619947940696
Investigated factor deviation: 2.5103252701732757
Other factors deviation: 2.171166490481539
Variability of factor deviation: 3.6978391153836205


In [56]:
# Share of stable-adaptation evaluations that scored below 0.5.
print(f"Failed percentage of runs: {stable_adaptation_results['failed'] / stable_adaptation_results['all'] * 100}%")

Failed percentage of runs: 2.6870000000000003%


##### Unstable

In [57]:
# Group the percentage scores by split, label and run. The adaptation id is left
# out of the key, so every group collects the scores obtained across adaptations.
unstable_adaptation_by_other_factors = {}
overall_score = []

for result in unstable_adaptation_results['results']:
    score = result['score'] * 100
    key = f"split_{result['split']}-label_{result['label']}-run_{result['run']}"
    unstable_adaptation_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# Mean, standard deviation, minimum and maximum over all scores.
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(35.551894438690994, 10.012034385556646, 22.630092779346516, 55.59678884145763)

In [58]:
# Mean and standard deviation of the scores inside every group.
aggregated_unstable_adaptation = {
    'mean': [np.mean(scores) for scores in unstable_adaptation_by_other_factors.values()],
    'std': [np.std(scores) for scores in unstable_adaptation_by_other_factors.values()],
}

# Summarise the per-group statistics across all groups.
print(f"Investigated factor mean: {np.mean(aggregated_unstable_adaptation['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_unstable_adaptation['std'])}")
print(f"Other factors deviation: {np.std(aggregated_unstable_adaptation['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_unstable_adaptation['std'])}")

Investigated factor mean: 35.551894438691
Investigated factor deviation: 9.843380132014717
Other factors deviation: 1.7259314328018474
Variability of factor deviation: 0.6081618234427127


In [59]:
# Share of unstable-adaptation evaluations that scored below 0.5.
print(f"Failed percentage of runs: {unstable_adaptation_results['failed'] / unstable_adaptation_results['all'] * 100}%")

Failed percentage of runs: 96.152%


#### Initialisation of Model

In [60]:
# Group the percentage scores by split, label, run and adaptation. The
# initialisation id is left out of the key, so every group collects the scores
# obtained across initialisations.
initialisation_by_other_factors = {}
overall_score = []

for result in initialisation_results['results']:
    score = result['score'] * 100
    key = f"split_{result['split']}-label_{result['label']}-run_{result['run']}-adaptation_{result['adaptation']}"
    initialisation_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# Mean, standard deviation, minimum and maximum over all scores.
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(56.86936996662, 10.650003357050839, 22.909967845659164, 65.93044505307995)

In [61]:
# Mean and standard deviation of the scores inside every group.
aggregated_initialisation = {
    'mean': [np.mean(scores) for scores in initialisation_by_other_factors.values()],
    'std': [np.std(scores) for scores in initialisation_by_other_factors.values()],
}

# Summarise the per-group statistics across all groups.
print(f"Investigated factor mean: {np.mean(aggregated_initialisation['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_initialisation['std'])}")
print(f"Other factors deviation: {np.std(aggregated_initialisation['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_initialisation['std'])}")

Investigated factor mean: 56.86936996662
Investigated factor deviation: 8.325033983520006
Other factors deviation: 3.9786815455383167
Variability of factor deviation: 5.318502969598116


In [62]:
# Share of initialisation evaluations that scored below 0.5.
print(f"Failed percentage of runs: {initialisation_results['failed'] / initialisation_results['all'] * 100}%")

Failed percentage of runs: 14.16%


#### Order of Train Data

In [63]:
# Group the percentage scores by split, label, run and adaptation. The data
# order id is left out of the key, so every group collects the scores obtained
# across data orders.
order_by_other_factors = {}
overall_score = []

for result in order_results['results']:
    score = result['score'] * 100
    key = f"split_{result['split']}-label_{result['label']}-run_{result['run']}-adaptation_{result['adaptation']}"
    order_by_other_factors.setdefault(key, []).append(score)
    overall_score.append(score)

# Mean, standard deviation, minimum and maximum over all scores.
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(59.171245357227875, 7.33660323456349, 22.878970647366305, 66.49344340651919)

In [64]:
# Mean and standard deviation of the scores inside every group.
aggregated_order = {
    'mean': [np.mean(scores) for scores in order_by_other_factors.values()],
    'std': [np.std(scores) for scores in order_by_other_factors.values()],
}

# Summarise the per-group statistics across all groups.
print(f"Investigated factor mean: {np.mean(aggregated_order['mean'])}")
print(f"Investigated factor deviation: {np.mean(aggregated_order['std'])}")
print(f"Other factors deviation: {np.std(aggregated_order['mean'])}")
print(f"Variability of factor deviation: {np.std(aggregated_order['std'])}")

Investigated factor mean: 59.17124535722787
Investigated factor deviation: 4.689745835801505
Other factors deviation: 2.9592420180661727
Variability of factor deviation: 4.803635882901779


In [65]:
# Share of data-order evaluations that scored below 0.5.
print(f"Failed percentage of runs: {order_results['failed'] / order_results['all'] * 100}%")

Failed percentage of runs: 6.065%


# Save Data

In [66]:
# Directory of the pickled 'cola' results, given relative to the working directory.
PICKLE_PATH = os.path.join('..', '..', 'pickle', 'cola')

In [67]:
# Bundle every loaded result set under a short key and pickle the bundle into
# a single file inside PICKLE_PATH.
reptile_data = {
    'golden': results,
    'split': data_split_results,
    'label': label_results,
    'initialisation': initialisation_results,
    'order': order_results,
    'adaptation': adaptation_results,
    's_adaptation': stable_adaptation_results,
    'u_adaptation': unstable_adaptation_results,
}

with open(os.path.join(PICKLE_PATH, 'Reptile-data'), 'wb') as file:
    pickle.dump(reptile_data, file)

In [None]:
# Restore the result sets from the pickle file written by the save cell, so the
# per-factor analysis can be run without the raw results directory.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(os.path.join(PICKLE_PATH, 'Reptile-data'), 'rb') as file:
    pickled = pickle.load(file)

results = pickled['golden']
data_split_results = pickled['split']
label_results = pickled['label']
initialisation_results = pickled['initialisation']
order_results = pickled['order']
# The save cell stores this entry under 'adaptation'; reading the key
# 'adaptation_results' raised KeyError.
adaptation_results = pickled['adaptation']
stable_adaptation_results = pickled['s_adaptation']
unstable_adaptation_results = pickled['u_adaptation']