In [None]:
import pickle
import numpy as np
import os
import json
from sklearn.metrics import f1_score, accuracy_score

# BERT

Note that the code below loads data from newly run experiments. We also provide pickled data from our own experiment runs -- the cell that loads this data is at the end of this notebook.

In [2]:
# Name of the sub-directory inside each run folder that holds this model's results.json.
model_name = 'BERT'

In [3]:
# Root directory of the stability results for the 'cola' dataset (relative to the working directory).
DATA_PATH = os.path.join('..', '..', 'results', 'stability', 'cola')
# Predictions of the per-factor runs; each investigated factor has its own sub-directory.
FACTORS = os.path.join(DATA_PATH, 'factors', 'predictions')
# Predictions of the golden-model runs.
GOLDEN = os.path.join(DATA_PATH, 'golden', 'predictions', 'golden_model')

In [4]:
# Load the macro-F1 score of every golden-model run.
# Directory layout walked below: GOLDEN/<split>/<label>/<run>/<model_name>/results.json
results = []  # one macro-F1 score per loaded run
failed = 0    # number of runs with macro-F1 below 0.5
# NOTE: `all` shadows the Python built-in; the name is kept because the
# failed-percentage cell further down reads it.
all = 0       # number of runs loaded

for split in os.listdir(GOLDEN):
    split_path = os.path.join(GOLDEN, split)
    # Ignore stray files next to the split directories (e.g. .DS_Store);
    # calling os.listdir() on a file would raise NotADirectoryError.
    if not os.path.isdir(split_path):
        continue
    for label in os.listdir(split_path):
        label_path = os.path.join(split_path, label)
        if os.path.isdir(label_path):
            for run in os.listdir(label_path):
                run_path = os.path.join(label_path, run)
                if os.path.isdir(run_path):
                    # Only runs that contain a results directory for this model are counted.
                    evaluation_path = os.path.join(run_path, model_name)
                    if os.path.isdir(evaluation_path):
                        with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                            data = json.load(file)
                        # data['predictions'] holds two parallel label sequences -- presumably
                        # gold labels and model predictions; TODO confirm which is which.
                        score = f1_score(np.array(data['predictions'][0]), np.array(data['predictions'][1]), average='macro')
                        results.append(score)
                        if score < 0.5:
                            failed += 1
                        all += 1

In [5]:
# Mean macro-F1 over all loaded golden-model runs, in percent.
np.mean(results) * 100

56.38148511322326

In [6]:
# Standard deviation of the macro-F1 over all loaded golden-model runs, in percent.
np.std(results) * 100

3.8468625685824653

In [7]:
# Share of golden-model runs whose macro-F1 fell below 0.5 (`all` is the run counter, not the built-in).
print(f"Failed percentage of runs: {failed / all * 100}%")

Failed percentage of runs: 6.1%


## Load Data

#### Data Split

In [8]:
# Gather the macro-F1 of every run stored for the data-split factor.
# Directories are walked as split_*/label_*/run_*/<model_name>/results.json.
data_split_results = {
    'results': [],  # one record per run: score plus the split/label/run numbers
    'failed': 0,    # runs with macro-F1 below 0.5
    'all': 0        # runs loaded
}

factor_path = os.path.join(FACTORS, 'data_split')

for split_dir in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split_dir)
    if not (split_dir.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label_dir in os.listdir(split_path):
        label_path = os.path.join(split_path, label_dir)
        if not (label_dir.startswith('label_') and os.path.isdir(label_path)):
            continue
        for run_dir in os.listdir(label_path):
            # A run only counts when it contains a results directory for this model.
            evaluation_path = os.path.join(label_path, run_dir, model_name)
            if not (run_dir.startswith('run_') and os.path.isdir(evaluation_path)):
                continue

            with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                predictions = json.load(file)['predictions']
            # Two parallel label sequences -- presumably gold labels and predictions; TODO confirm order.
            score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

            # The numeric id is the part after the first underscore of each directory name.
            data_split_results['results'].append({
                'score': score,
                'split': int(split_dir.split('_')[1]),
                'label': int(label_dir.split('_')[1]),
                'run': int(run_dir.split('_')[1]),
            })
            if score < 0.5:
                data_split_results['failed'] += 1
            data_split_results['all'] += 1

In [9]:
# (number of loaded runs, number of runs with macro-F1 below 0.5)
len(data_split_results['results']), data_split_results['failed']

(1000, 57)

#### Label Selection

In [10]:
# Gather the macro-F1 of every run stored for the label-selection factor.
# Directories are walked as label_*/split_*/run_*/<model_name>/results.json
# (label level first, unlike the other factors).
label_results = {
    'results': [],  # one record per run: score plus the split/label/run numbers
    'failed': 0,    # runs with macro-F1 below 0.5
    'all': 0        # runs loaded
}

factor_path = os.path.join(FACTORS, 'label_selection')

for label_dir in os.listdir(factor_path):
    label_path = os.path.join(factor_path, label_dir)
    if not (label_dir.startswith('label_') and os.path.isdir(label_path)):
        continue
    for split_dir in os.listdir(label_path):
        split_path = os.path.join(label_path, split_dir)
        if not (split_dir.startswith('split_') and os.path.isdir(split_path)):
            continue
        for run_dir in os.listdir(split_path):
            # A run only counts when it contains a results directory for this model.
            evaluation_path = os.path.join(split_path, run_dir, model_name)
            if not (run_dir.startswith('run_') and os.path.isdir(evaluation_path)):
                continue

            with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                predictions = json.load(file)['predictions']
            # Two parallel label sequences -- presumably gold labels and predictions; TODO confirm order.
            score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

            # The numeric id is the part after the first underscore of each directory name.
            label_results['results'].append({
                'score': score,
                'split': int(split_dir.split('_')[1]),
                'label': int(label_dir.split('_')[1]),
                'run': int(run_dir.split('_')[1]),
            })
            if score < 0.5:
                label_results['failed'] += 1
            label_results['all'] += 1

In [11]:
# (number of loaded runs, number of runs with macro-F1 below 0.5)
len(label_results['results']), label_results['failed']

(1000, 60)

#### Initialisation of Model

In [12]:
# Gather the macro-F1 of every run stored for the model-initialisation factor.
# Directories are walked as split_*/label_*/init_*/run_*/<model_name>/results.json.
initialisation_results = {
    'results': [],  # one record per run: score plus split/label/run/initialisation numbers
    'failed': 0,    # runs with macro-F1 below 0.5
    'all': 0        # runs loaded
}

factor_path = os.path.join(FACTORS, 'model_initialisation')

for split_dir in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split_dir)
    if not (split_dir.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label_dir in os.listdir(split_path):
        label_path = os.path.join(split_path, label_dir)
        if not (label_dir.startswith('label_') and os.path.isdir(label_path)):
            continue
        for init_dir in os.listdir(label_path):
            init_path = os.path.join(label_path, init_dir)
            if not (init_dir.startswith('init_') and os.path.isdir(init_path)):
                continue
            for run_dir in os.listdir(init_path):
                # A run only counts when it contains a results directory for this model.
                evaluation_path = os.path.join(init_path, run_dir, model_name)
                if not (run_dir.startswith('run_') and os.path.isdir(evaluation_path)):
                    continue

                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    predictions = json.load(file)['predictions']
                # Two parallel label sequences -- presumably gold labels and predictions; TODO confirm order.
                score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                # The numeric id is the part after the first underscore of each directory name.
                initialisation_results['results'].append({
                    'score': score,
                    'split': int(split_dir.split('_')[1]),
                    'label': int(label_dir.split('_')[1]),
                    'run': int(run_dir.split('_')[1]),
                    'initialisation': int(init_dir.split('_')[1])
                })
                if score < 0.5:
                    initialisation_results['failed'] += 1
                initialisation_results['all'] += 1

In [13]:
# (number of loaded runs, number of runs with macro-F1 below 0.5)
len(initialisation_results['results']), initialisation_results['failed']

(2000, 122)

#### Order of Train Data

In [14]:
# Gather the macro-F1 of every run stored for the data-order factor.
# Directories are walked as split_*/label_*/data_order_*/run_*/<model_name>/results.json.
order_results = {
    'results': [],  # one record per run: score plus split/label/run/order numbers
    'failed': 0,    # runs with macro-F1 below 0.5
    'all': 0        # runs loaded
}

factor_path = os.path.join(FACTORS, 'data_order')

for split_dir in os.listdir(factor_path):
    split_path = os.path.join(factor_path, split_dir)
    if not (split_dir.startswith('split_') and os.path.isdir(split_path)):
        continue
    for label_dir in os.listdir(split_path):
        label_path = os.path.join(split_path, label_dir)
        if not (label_dir.startswith('label_') and os.path.isdir(label_path)):
            continue
        for order_dir in os.listdir(label_path):
            order_path = os.path.join(label_path, order_dir)
            if not (order_dir.startswith('data_order_') and os.path.isdir(order_path)):
                continue
            for run_dir in os.listdir(order_path):
                # A run only counts when it contains a results directory for this model.
                evaluation_path = os.path.join(order_path, run_dir, model_name)
                if not (run_dir.startswith('run_') and os.path.isdir(evaluation_path)):
                    continue

                with open(os.path.join(evaluation_path, 'results.json'), 'r') as file:
                    predictions = json.load(file)['predictions']
                # Two parallel label sequences -- presumably gold labels and predictions; TODO confirm order.
                score = f1_score(np.array(predictions[0]), np.array(predictions[1]), average='macro')

                # 'data_order_<n>' contains two underscores, so its id is the third piece.
                order_results['results'].append({
                    'score': score,
                    'split': int(split_dir.split('_')[1]),
                    'label': int(label_dir.split('_')[1]),
                    'run': int(run_dir.split('_')[1]),
                    'order': int(order_dir.split('_')[2])
                })
                if score < 0.5:
                    order_results['failed'] += 1
                order_results['all'] += 1

In [15]:
# (number of loaded runs, number of runs with macro-F1 below 0.5)
len(order_results['results']), order_results['failed']

(2000, 144)

## Compare Factors

### Aggregation by investigated factor

In this part we aggregate the results by the main investigated factor in the following way:
- select runs whose factor values differ only in the investigated factor (non-investigated factors have the same value; the investigated factor has 10 values)
- calculate the mean and standard deviation across the values of the investigated factor
- this results in ~10 000 values of mean and standard deviation
- calculate the final performance value as the mean of the many pre-calculated mean values
- calculate the instability of the factor as the mean of the pre-calculated standard deviations

#### Data Split

In [28]:
# Group the data-split scores (in percent) by the non-investigated factors:
# runs sharing the same label and run number differ only in the data split.
data_split_by_other_factors = {}
overall_score = []

for result in data_split_results['results']:
    group = f"label_{result['label']}-run_{result['run']}"
    percent = result['score'] * 100
    data_split_by_other_factors.setdefault(group, []).append(percent)
    overall_score.append(percent)

# Mean, standard deviation, minimum and maximum over all runs of this factor.
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(55.728850501178606, 3.552808446953908, 27.433191866430366, 62.958577790344236)

In [29]:
# Per-group mean and standard deviation; within a group only the data split varies.
group_scores = list(data_split_by_other_factors.values())
aggregated_data_split = {
    'mean': [np.mean(scores) for scores in group_scores],
    'std': [np.std(scores) for scores in group_scores],
}

# Mean of the group means.
print(f"Investigated factor mean: {np.mean(aggregated_data_split['mean'])}")
# Mean of the group standard deviations.
print(f"Investigated factor deviation: {np.mean(aggregated_data_split['std'])}")
# Spread of the group means.
print(f"Other factors deviation: {np.std(aggregated_data_split['mean'])}")
# Spread of the group standard deviations.
print(f"Variability of factor deviation: {np.std(aggregated_data_split['std'])}")

Investigated factor mean: 55.72885050117859
Investigated factor deviation: 3.170602059302921
Other factors deviation: 1.1329082627495337
Variability of factor deviation: 1.1341293182370993


In [30]:
# Share of data-split runs whose macro-F1 fell below 0.5.
print(f"Failed percentage of runs: {data_split_results['failed'] / data_split_results['all'] * 100}%")

Failed percentage of runs: 5.7%


#### Label Selection

In [31]:
# Group the label-selection scores (in percent) by the non-investigated factors:
# runs sharing the same split and run number differ only in the label selection.
label_by_other_factors = {}
overall_score = []

for result in label_results['results']:
    group = f"split_{result['split']}-run_{result['run']}"
    percent = result['score'] * 100
    label_by_other_factors.setdefault(group, []).append(percent)
    overall_score.append(percent)

# Mean, standard deviation, minimum and maximum over all runs of this factor.
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(56.318841551236794,
 3.7982238015478513,
 33.797222394757796,
 62.511866867705365)

In [32]:
# Per-group mean and standard deviation; within a group only the label selection varies.
group_scores = list(label_by_other_factors.values())
aggregated_label = {
    'mean': [np.mean(scores) for scores in group_scores],
    'std': [np.std(scores) for scores in group_scores],
}

# Mean of the group means.
print(f"Investigated factor mean: {np.mean(aggregated_label['mean'])}")
# Mean of the group standard deviations.
print(f"Investigated factor deviation: {np.mean(aggregated_label['std'])}")
# Spread of the group means.
print(f"Other factors deviation: {np.std(aggregated_label['mean'])}")
# Spread of the group standard deviations.
print(f"Variability of factor deviation: {np.std(aggregated_label['std'])}")

Investigated factor mean: 56.318841551236794
Investigated factor deviation: 3.382363141616169
Other factors deviation: 1.184810707005022
Variability of factor deviation: 1.2579138338722868


In [33]:
# Share of label-selection runs whose macro-F1 fell below 0.5.
print(f"Failed percentage of runs: {label_results['failed'] / label_results['all'] * 100}%")

Failed percentage of runs: 6.0%


#### Initialisation of Model

In [34]:
# Group the initialisation scores (in percent) by the non-investigated factors:
# runs sharing the same split, label and run number differ only in the initialisation.
initialisation_by_other_factors = {}
overall_score = []

for result in initialisation_results['results']:
    group = f"split_{result['split']}-label_{result['label']}-run_{result['run']}"
    percent = result['score'] * 100
    initialisation_by_other_factors.setdefault(group, []).append(percent)
    overall_score.append(percent)

# Mean, standard deviation, minimum and maximum over all runs of this factor.
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(56.067599727931544,
 3.4047551740869646,
 40.103784378540944,
 62.878889497828894)

In [35]:
# Per-group mean and standard deviation; within a group only the model initialisation varies.
group_scores = list(initialisation_by_other_factors.values())
aggregated_initialisation = {
    'mean': [np.mean(scores) for scores in group_scores],
    'std': [np.std(scores) for scores in group_scores],
}

# Mean of the group means.
print(f"Investigated factor mean: {np.mean(aggregated_initialisation['mean'])}")
# Mean of the group standard deviations.
print(f"Investigated factor deviation: {np.mean(aggregated_initialisation['std'])}")
# Spread of the group means.
print(f"Other factors deviation: {np.std(aggregated_initialisation['mean'])}")
# Spread of the group standard deviations.
print(f"Variability of factor deviation: {np.std(aggregated_initialisation['std'])}")

Investigated factor mean: 56.06759972793154
Investigated factor deviation: 2.9934868386748574
Other factors deviation: 1.4266268813224812
Variability of factor deviation: 0.7720946079597077


In [36]:
# Share of model-initialisation runs whose macro-F1 fell below 0.5.
print(f"Failed percentage of runs: {initialisation_results['failed'] / initialisation_results['all'] * 100}%")

Failed percentage of runs: 6.1%


#### Order of Train Data

In [37]:
# Group the data-order scores (in percent) by the non-investigated factors:
# runs sharing the same split, label and run number differ only in the data order.
order_by_other_factors = {}
overall_score = []

for result in order_results['results']:
    group = f"split_{result['split']}-label_{result['label']}-run_{result['run']}"
    percent = result['score'] * 100
    order_by_other_factors.setdefault(group, []).append(percent)
    overall_score.append(percent)

# Mean, standard deviation, minimum and maximum over all runs of this factor.
np.mean(overall_score), np.std(overall_score), np.min(overall_score), np.max(overall_score)

(56.22319751226404, 3.862468695437335, 28.711898900907517, 63.07103411254219)

In [38]:
# Per-group mean and standard deviation; within a group only the order of train data varies.
group_scores = list(order_by_other_factors.values())
aggregated_order = {
    'mean': [np.mean(scores) for scores in group_scores],
    'std': [np.std(scores) for scores in group_scores],
}

# Mean of the group means.
print(f"Investigated factor mean: {np.mean(aggregated_order['mean'])}")
# Mean of the group standard deviations.
print(f"Investigated factor deviation: {np.mean(aggregated_order['std'])}")
# Spread of the group means.
print(f"Other factors deviation: {np.std(aggregated_order['mean'])}")
# Spread of the group standard deviations.
print(f"Variability of factor deviation: {np.std(aggregated_order['std'])}")

Investigated factor mean: 56.22319751226403
Investigated factor deviation: 3.345373621105372
Other factors deviation: 1.457876127224389
Variability of factor deviation: 1.2655974700175976


In [39]:
# Share of data-order runs whose macro-F1 fell below 0.5.
print(f"Failed percentage of runs: {order_results['failed'] / order_results['all'] * 100}%")

Failed percentage of runs: 7.199999999999999%


# Save Data and Load Pickle Data

In [40]:
# Directory holding the pickled results for the 'cola' dataset (relative to the working directory).
PICKLE_PATH = os.path.join('..', '..', 'pickled', 'cola')

In [41]:
# Persist everything loaded above so the analysis can be repeated without the raw result files.
# Create the target directory first: open(..., 'wb') does not create missing parent
# directories and would fail with FileNotFoundError on a fresh checkout.
os.makedirs(PICKLE_PATH, exist_ok=True)
# NOTE: this overwrites any existing 'BERT-data' file in PICKLE_PATH.
with open(os.path.join(PICKLE_PATH, 'BERT-data'), 'wb') as file:
    pickle.dump({
        'golden': results,                         # list of golden-model macro-F1 scores
        'split': data_split_results,               # data-split factor runs
        'label': label_results,                    # label-selection factor runs
        'initialisation': initialisation_results,  # model-initialisation factor runs
        'order': order_results,                    # data-order factor runs
    }, file)

In [None]:
# Alternative to the directory-walking cells: restore previously pickled results.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load files from a trusted source.
with open(os.path.join(PICKLE_PATH, 'BERT-data'), 'rb') as file:
    pickled = pickle.load(file)

# Rebind the names that the analysis cells read.
results = pickled['golden']
data_split_results = pickled['split']
label_results = pickled['label']
initialisation_results = pickled['initialisation']
order_results = pickled['order']