In [None]:
import __init__

### Preprocessing
- Dataset: Materials Project structures (April 2025)
  - Columns: Database, Reduced Formula, CIF, Bandgap (eV), Energy Above Hull (eV)
- Deduplicated, filters applied: 
  - removed negative and null bandgaps
- Cleaned for CIF augmentation, normalisations:
  - Bandgap - power-log ($\beta = 0.8$)
  - $E_{hull}$ - min-max
- Note: The dataset was not filtered to the model context length at this stage, because that filter had not been implemented yet. Instead, the filter-to-context option was set to True during model training, which effectively does the same thing (but less efficiently).
- Pushed to Hugging Face as c-bone/mpdb-2prop_clean

### Training

In [None]:
models = ['PKV', 'prepend', 'raw', 'slider']
regimes = ['ft', 'scratch']

for models in models:
    for regimes in regimes:
        config_path = f'_config_files/training/conditional/pretraining_benefits/mpdb_{regimes}-{models}.jsonc'

        !torchrun --nproc_per_node=2 _train.py --config config_path

### CIF Generation for each

In [None]:
for models in models:
    for regimes in regimes:
        gen_config_path = f'_config_files/generation/conditional/pretraining_benefits/mpdb_{regimes}-{models}.jsonc'

        !python _utils/_generating/generate_CIFs.py --config gen_config_path

### Metrics for each

In [None]:
for models in models:
    for regimes in regimes:
        gen_structs_path = f'_artifacts/pretrain_benefits/{regimes}-methods/mpdb_{regimes}-{models}_gen.parquet'

        postprocessed_path = f'_artifacts/pretrain_benefits/{regimes}-methods/mpdb-{regimes}-{models}_post.parquet'

        !python _utils/_metrics/VUN_metrics.py \
            --input_parquet gen_structs_path \
            --huggingface_dataset 'c-bone/mpdb-2prop_clean' \
            --load_processed_data 'HF-databases/mpdb-2prop_clean/mpdb_2prop_proc.parquet' \
            --output_parquet posprocessed_path \
            --num_workers 32

        !python _utils/_metrics/mace_ehull.py \
            --post_parquet posprocessed_path \
            --output_parquet posprocessed_path \
            --mp_data 'mp_computed_structure_entries.json.gz' \
            --num_workers 16

        !python _utils/_metrics/property_metrics.py \
            --post_parquet posprocessed_path \
            --output_parquet posprocessed_path \
            --property_targets ["Bandgap (eV)", "Energy Above Hull (eV)"] \
            --num_workers 16 \
            --property1_normaliser "power_log" \
            --property2_normaliser "linear" \
            --max_property1 17.891 \
            --min_property1 0.0 \
            --max_property2 5.418 \
            --min_property2 0.0

### Results

In [None]:
import __init__
import pandas as pd
from datasets import load_dataset
from __scripts_in_dev._plotting import pretraining_benefits
from _utils import get_metrics_ptnd_vs_scratch

models = ['PKV', 'prepend', 'raw', 'slider']
regimes = ['ft', 'scratch']

# Load the post-processed generations into a dict keyed 'mpdb_<model>_<regime>'.
# Singular loop names keep the `models` list intact after the cell has run.
dfs = {}
for model in models:
    for regime in regimes:
        # The 'raw' / 'ft' combination is deliberately skipped.
        if model == 'raw' and regime == 'ft':
            continue
        postprocessed_path = f'_artifacts/pretrain_benefits/{regime}-methods/mpdb-{regime}-{model}_post.parquet'
        dfs[f'mpdb_{model}_{regime}'] = pd.read_parquet(postprocessed_path)

# Training split of the cleaned dataset, converted to a DataFrame.
ds = load_dataset('c-bone/mpdb-2prop_clean', split='train')
train_df = ds.to_pandas()

In [None]:
# Compare the loaded result frames against the training data.
# hit_tol_eV is passed as 0.5 (presumably the tolerance, in eV, for counting a property hit — confirm in _utils).
metrics = get_metrics_ptnd_vs_scratch(dfs, train_df=train_df, hit_tol_eV=0.5)