In [1]:
import __init__

Navigated to package root: /home/cyprien/CrystaLLMv2_PKV
Added package root to Python path


#### 1st pass finetune - Mattergen XRD
- Dataset Source: [Mattergen Alex-MP-20](https://github.com/microsoft/mattergen/tree/main/data-release/alex-mp)
  - Columns: Database (manual) 
  - Reduced Formula (Source)
  - CIF (pmg - Cifwriter with symprec 0.1)
  - XRD 'Condition Vector' (with [_calculate_XRD.py](_utils/_preprocessing/_calculate_XRD.py))
    - pmg - XRDCalculator(wavelength="CuKa")
    - top 20 most intense peaks selected ($2\theta$ and int)
    - Normalisations
      - $2\theta$ min-max for 0,90
      - intensities min-max for 0,100
- Deduplicated
- Cleaned for CIF augmentation
  -  Note: I didn't filter to context length here because that step was not implemented yet; instead, filter-to-context was flagged as True during model training, which effectively does the same thing (just less efficiently)
- dataset pushed to HuggingFace as: c-bone/mattergen_XRD (90:10 train/valid sets)

In [None]:
# First-pass fine-tuning on the Mattergen XRD dataset: torchrun starts 2 worker
# processes (--nproc_per_node=2), each running _train.py with the config below.
# The relative paths presumably require the working directory to be the package
# root (the `import __init__` cell reports navigating there).
!torchrun --nproc_per_node=2 _train.py --config '_config_files/training/conditional/xrd_studies/mattergen_XRD-slider.jsonc'

#### 2nd pass finetune - COD XRD
- Dataset Source: [COD hkl data](https://www.crystallography.net/hkl/)
  - Columns: Database (manual) 
  - Reduced Formula (automated extraction from source)
  - CIF
    - automated extraction of material id from COD source
    - converted to structure using pmg COD.get_structure_by_id
    - Cifwriter with symprec 0.1 for CIF
    - note: this was done because a lot of COD CIFs aren't in a clean standard format. Pymatgen has already done most of the work of cleaning them up, so rather than reinventing the wheel by taking the CIF data straight from the source, we reuse its structures.
  - XRD data
    - For every Material ID that has experimental hkl data and associated intensities, we extract it
    - Then:
      1. Calculate d_hkl from crystal structure.lattice.d_hkl([h, k, l])
      2. Use Bragg's law: sin($\theta$) = $\lambda$/($2$ × d_hkl)
      3. Find $\theta$ = arcsin($\lambda$/($2$ × d_hkl))
      4. Convert to degrees: $2\theta$ = $2$ × $\theta$ × (180/$\pi$)
    - Where:
      - $\lambda$: X-ray wavelength ($1.5406$ $\AA$ for Cu K$\alpha$)
      - d_hkl: d-spacing for the (hkl) planes
      - $\theta$: Bragg angle
    - Created 'Condition Vector'
      - top 20 most intense peaks selected ($2\theta$ and int)
      - Normalisations
        - $2\theta$ min-max for 0,90
        - intensities min-max for 0,100
  - Filtered out all hydrocarbons
    - symbols = struct.symbol_set
    - if "C" in symbols and "H" in symbols, remove it
  - Then cleaning for CIF augmentation
    - set --make_disordered_ordered flag
      - Makes every occupancy exactly integer if occupancy is int $\pm 0.05$. Element set needs to be exactly preserved or structure discarded.
    - Filtered to 1024 context length
  - Pushed to HuggingFace as c-bone/COD_XRD_small_nohc

### Training
> **Note**: Here the hyperparameters change compared to regular finetuning because it's the 2nd pass. Backbone learning rates were set to decay from $5\times10^{-8}$ to $5\times10^{-10}$, while the learning rates for the newly initialised conditioning parameters were set 100 times higher

In [None]:
# Split the cleaned COD XRD parquet and save the result as
# COD_XRD_small_nohc_full.parquet. The split sizes requested here are
# valid = 0 and test = 0.125; --save_hub presumably also pushes the dataset to
# the HuggingFace hub (c-bone/COD_XRD_small_nohc per the notes above) — confirm
# against the script.
!python _utils/_preprocessing/_save_dataset_to_HF.py \
    --input_parquet 'HF-databases/COD_dev/COD_xrd_clean_nohc_small.parquet' \
    --output_parquet 'HF-databases/COD_XRD_small_nohc_full.parquet' \
    --valid_size 0.000 \
    --test_size 0.125 \
    --save_hub

In [None]:
# Second-pass fine-tuning on the COD XRD dataset: 2 torchrun worker processes,
# each running _train.py with the COD_XRD_small-slider-opt config.
!torchrun --nproc_per_node=2 _train.py --config '_config_files/training/conditional/xrd_studies/COD_XRD_small-slider-opt.jsonc'

In [None]:
# Build the reference set used by the metrics cells: keep every row of the full
# COD parquet whose 'Split' label is 'test' and save it as its own parquet file
# (without the DataFrame index).
import pandas as pd

df = pd.read_parquet('HF-databases/COD_XRD_small_nohc_full.parquet')
is_test_row = df['Split'].eq('test')
df_test = df.loc[is_test_row].copy()
df_test.to_parquet(
    '_artifacts/cod-xrd/cod-test_ref.parquet',
    index=False,
)

### Generating

In [None]:
# Build the generation prompts from the 'test' split of the HuggingFace dataset
# and write them to cod-test_prompts.parquet. The 'Condition Vector' column is
# passed as the conditioning column. The exact meaning of --automatic and
# --level 'level_3' is defined in make_prompts.py (not shown here).
!python _utils/_generating/make_prompts.py \
    --HF_dataset 'c-bone/COD_XRD_small_nohc' \
    --split 'test' \
    --automatic \
    --output_parquet '_artifacts/cod-xrd/cod-test_prompts.parquet' \
    --level 'level_3' \
    --condition_columns 'Condition Vector'

#### Generate materials using 2-pass finetuning and XRD information (Repeated 3x)

In [None]:
import __init__

Navigated to package root: /home/cyprien/CrystaLLMv2_PKV
Added package root to Python path


In [None]:
# Generate CIFs for the XRD-conditioned evaluation using the cod-xrd_eval config.
# NOTE: the script path is relative, so the `import __init__` cell must have run
# in this kernel first (it reports navigating to the package root). The saved
# error output below this cell shows the path being resolved under notebooks/,
# i.e. a run made from the wrong working directory.
!python _utils/_generating/generate_CIFs.py --config '_config_files/generation/conditional/xrd_studies/cod-xrd_eval.jsonc'

python: can't open file '/home/cyprien/CrystaLLMv2_PKV/notebooks/_utils/_generating/generate_CIFs.py': [Errno 2] No such file or directory


In [None]:
# Post-process the 'Generated CIF' column of the conditional generations and
# write the result to a separate *_post.parquet file (32 workers requested).
!python _utils/_generating/postprocess.py \
    --input_parquet '_artifacts/cod-xrd/cod-ft-20perp-test_gen.parquet' \
    --output_parquet '_artifacts/cod-xrd/cod-ft-20perp-test_post.parquet' \
    --num_workers 32 \
    --column_name 'Generated CIF'

In [None]:
# Score the post-processed generations against the test reference set, using
# 20 generations per compound (--num_gens 20) and no validity filtering.
!python _utils/_metrics/XRD_metrics.py \
    --input_parquet '_artifacts/cod-xrd/cod-ft-20perp-test_post.parquet' \
    --num_gens 20 \
    --ref_parquet '_artifacts/cod-xrd/cod-test_ref.parquet' \
    --output_parquet '_artifacts/cod-xrd/cod-ft-20perp-test_metrics.parquet' \
    --num_workers 32 \
    --validity_check "none"

In [None]:
# Same scoring as the 20-generation run, but with a single generation per
# compound (--num_gens 1). No --sort_gens is given, so the script's default
# ordering decides which generation is kept — presumably the perplexity
# ranking, given the '1perp' output name; confirm in XRD_metrics.py.
!python _utils/_metrics/XRD_metrics.py \
    --input_parquet '_artifacts/cod-xrd/cod-ft-20perp-test_post.parquet' \
    --num_gens 1 \
    --ref_parquet '_artifacts/cod-xrd/cod-test_ref.parquet' \
    --output_parquet '_artifacts/cod-xrd/cod-ft-1perp-test_metrics.parquet' \
    --num_workers 32 \
    --validity_check "none"

In [None]:
!python _utils/_metrics/XRD_metrics.py \
    --input_parquet '_artifacts/cod-xrd/cod-ft-20perp-test_post.parquet' \
    --num_gens 1 \
    --ref_parquet '_artifacts/cod-xrd/cod-test_ref.parquet' \
    --output_parquet '_artifacts/cod-xrd/cod-ft-1rand-test_metrics.parquet' \
    --num_workers 32 \
    --validity_check "none"\
    --sort_gens "random"

#### Generate materials using 2-pass finetuning (Mattergen XRD + COD XRD nohc) but no XRD information fed during inference (repeated 3x)

> **Note**: replaced the condition_vector column in the prompt df made above with a series of [-100] missing values meaning no XRD information is fed during generation

In [10]:
import __init__

In [None]:
# Generate CIFs for the "no XRD information" baseline using the
# cod-xrd-uncond_eval config (per the note above, its prompts carry -100
# placeholders instead of a condition vector). Requires the working directory
# to be the package root, i.e. run the `import __init__` cell first.
!python _utils/_generating/generate_CIFs.py --config '_config_files/generation/conditional/xrd_studies/cod-xrd-uncond_eval.jsonc'

In [None]:
# Post-process the 'Generated CIF' column of the unconditioned generations;
# input and output both live under _artifacts/cod-xrd/perp-repeats/.
!python _utils/_generating/postprocess.py \
    --input_parquet '_artifacts/cod-xrd/perp-repeats/cod-ft-uncond-20perp-test_gen.parquet' \
    --output_parquet '_artifacts/cod-xrd/perp-repeats/cod-ft-uncond-20perp-test_post.parquet' \
    --num_workers 32 \
    --column_name 'Generated CIF'

In [None]:
# Score the unconditioned generations with 20 generations per compound.
# NOTE(review): the reference file is read from perp-repeats/, whereas the cell
# that creates it writes to _artifacts/cod-xrd/cod-test_ref.parquet — confirm
# the file was copied there before running.
!python _utils/_metrics/XRD_metrics.py \
    --input_parquet '_artifacts/cod-xrd/perp-repeats/cod-ft-uncond-20perp-test_post.parquet' \
    --num_gens 20 \
    --ref_parquet '_artifacts/cod-xrd/perp-repeats/cod-test_ref.parquet' \
    --output_parquet '_artifacts/cod-xrd/perp-repeats/cod-ft-uncond-20perp-test_metrics.parquet' \
    --num_workers 32 \
    --validity_check "none"

In [None]:
# Score the unconditioned generations with a single generation per compound.
# NOTE(review): the reference file is read from perp-repeats/, whereas the cell
# that creates it writes to _artifacts/cod-xrd/cod-test_ref.parquet — confirm
# the file was copied there before running.
!python _utils/_metrics/XRD_metrics.py \
    --input_parquet '_artifacts/cod-xrd/perp-repeats/cod-ft-uncond-20perp-test_post.parquet' \
    --num_gens 1 \
    --ref_parquet '_artifacts/cod-xrd/perp-repeats/cod-test_ref.parquet' \
    --output_parquet '_artifacts/cod-xrd/perp-repeats/cod-ft-uncond-1perp-test_metrics.parquet' \
    --num_workers 32 \
    --validity_check "none"

In [4]:
# Summarise every metrics file (conditional / unconditional, 20 or 1 generation
# per compound, three repeats each) into one table with a row per run.
# `import __init__` comes first so that the `_utils` package is importable.
import __init__
from _utils import get_metrics_xrd
import pandas as pd
import numpy as np

# NOTE(review): all files are read from perp-repeats/, while the metrics cells
# above write the conditional results to _artifacts/cod-xrd/ and do not produce
# the -v2 / -v3 repeats — presumably the repeats were generated by re-running
# those cells with other output names and moving the files; confirm.

# Conditional runs, single generation per compound
metrics_1perp_parquet = '_artifacts/cod-xrd/perp-repeats/cod-ft-1perp-test_metrics.parquet'
metrics_1perp_v2_parquet = '_artifacts/cod-xrd/perp-repeats/cod-ft-1perp-v2-test_metrics.parquet'
metrics_1perp_v3_parquet = '_artifacts/cod-xrd/perp-repeats/cod-ft-1perp-v3-test_metrics.parquet'
# Conditional runs, 20 generations per compound
metrics_20perp_parquet = '_artifacts/cod-xrd/perp-repeats/cod-ft-20perp-test_metrics.parquet'
metrics_20perp_v2_parquet = '_artifacts/cod-xrd/perp-repeats/cod-ft-20perp-v2-test_metrics.parquet'
metrics_20perp_v3_parquet = '_artifacts/cod-xrd/perp-repeats/cod-ft-20perp-v3-test_metrics.parquet'
# Runs without XRD information, single generation per compound
metrics_uncond_1perp_parquet = '_artifacts/cod-xrd/perp-repeats/cod-ft-uncond-1perp-test_metrics.parquet'
metrics_uncond_1perp_v2_parquet = '_artifacts/cod-xrd/perp-repeats/cod-ft-uncond-1perp-v2-test_metrics.parquet'
metrics_uncond_1perp_v3_parquet = '_artifacts/cod-xrd/perp-repeats/cod-ft-uncond-1perp-v3-test_metrics.parquet'
# Runs without XRD information, 20 generations per compound
metrics_uncond_20perp_parquet = '_artifacts/cod-xrd/perp-repeats/cod-ft-uncond-20perp-test_metrics.parquet'
metrics_uncond_20perp_v2_parquet = '_artifacts/cod-xrd/perp-repeats/cod-ft-uncond-20perp-v2-test_metrics.parquet'
metrics_uncond_20perp_v3_parquet = '_artifacts/cod-xrd/perp-repeats/cod-ft-uncond-20perp-v3-test_metrics.parquet'

# Row label -> metrics file; insertion order fixes the row order of the table.
paths = dict([
    ('cond-20perp', metrics_20perp_parquet),
    ('cond-20perp-v2', metrics_20perp_v2_parquet),
    ('cond-20perp-v3', metrics_20perp_v3_parquet),
    ('cond-1perp', metrics_1perp_parquet),
    ('cond-1perp-v2', metrics_1perp_v2_parquet),
    ('cond-1perp-v3', metrics_1perp_v3_parquet),
    ('uncond-20perp', metrics_uncond_20perp_parquet),
    ('uncond-20perp-v2', metrics_uncond_20perp_v2_parquet),
    ('uncond-20perp-v3', metrics_uncond_20perp_v3_parquet),
    ('uncond-1perp', metrics_uncond_1perp_parquet),
    ('uncond-1perp-v2', metrics_uncond_1perp_v2_parquet),
    ('uncond-1perp-v3', metrics_uncond_1perp_v3_parquet),
])

# One summary per run. n_test=198 matches the "Total number of structures"
# reported in the saved summary table.
results = {
    run_name: get_metrics_xrd(
        pd.read_parquet(run_path), n_test=198, only_matched=False, verbose=False
    )
    for run_name, run_path in paths.items()
}

# Final table: one row per run, persisted for the averaging cell.
final_table = pd.DataFrame.from_dict(results, orient='index')
final_table.to_parquet('_artifacts/cod-xrd/cod-ft-vs-uncond-all-results.parquet')

In [1]:
import __init__
import pandas as pd
import numpy as np

final_table = pd.read_parquet('_artifacts/cod-xrd/cod-ft-vs-uncond-all-results.parquet')

# Repeat runs are labelled '<condition>', '<condition>-v2', '<condition>-v3', ...
# Dropping a trailing '-v<N>' maps every repeat onto its base condition, so any
# number of repeats is handled and the suffix is only matched at the end of the
# label. sort=False keeps the conditions in their order of first appearance.
base_names = final_table.index.str.replace(r'-v\d+$', '', regex=True)
runs_by_condition = final_table.groupby(base_names, sort=False)

# Mean over the repeats, and standard error of that mean: sample standard
# deviation (ddof=1) divided by sqrt(number of repeats). A condition with a
# single run gets NaN here and is shown without an error term below.
averaged_table = runs_by_condition.mean()
stderr_table = runs_by_condition.std().div(np.sqrt(runs_by_condition.size()), axis=0)


def _format_mean_stderr(mean_val, stderr_val):
    """Render one table cell as 'mean' or 'mean (±stderr)', 3 decimals each.

    The error term is omitted when it is missing (NaN) or exactly zero, e.g.
    for a metric that is identical in every repeat.
    """
    if pd.isna(stderr_val) or stderr_val == 0:
        return f"{mean_val:.3f}"
    return f"{mean_val:.3f} (±{stderr_val:.3f})"


# Same rows/columns as the averaged table, holding display strings.
formatted_table = pd.DataFrame(
    {
        col: [
            _format_mean_stderr(mean_val, stderr_val)
            for mean_val, stderr_val in zip(averaged_table[col], stderr_table[col])
        ]
        for col in averaged_table.columns
    },
    index=averaged_table.index,
    dtype=object,
)

formatted_table

Navigated to package root: /home/cyprien/CrystaLLMv2_PKV
Added package root to Python path


Unnamed: 0,Number of matched structures,Total number of structures,Mean RMS-d,Percent Matched (%),a MAE,b MAE,c MAE,Volume MAE,a R^2,b R^2,c R^2,Volume R^2,Average Score
cond-20perp,88.667 (±1.453),198.0,0.103 (±0.003),44.781 (±0.734),0.843 (±0.023),0.653 (±0.023),0.926 (±0.034),49.593 (±2.497),0.875 (±0.005),0.898 (±0.008),0.933 (±0.003),0.981 (±0.001),1.272 (±0.002)
cond-1perp,60.667 (±0.333),198.0,0.063 (±0.007),30.640 (±0.168),2.057 (±0.042),1.956 (±0.022),2.748 (±0.087),80.797 (±3.944),0.565 (±0.023),0.462 (±0.014),0.507 (±0.039),0.951 (±0.003),1.255 (±0.001)
uncond-20perp,83.667 (±1.202),198.0,0.100 (±0.006),42.256 (±0.607),1.071 (±0.006),0.815 (±0.036),1.322 (±0.052),61.243 (±2.620),0.834 (±0.005),0.872 (±0.005),0.835 (±0.029),0.976 (±0.002),1.235 (±0.001)
uncond-1perp,56.333 (±1.453),198.0,0.072 (±0.002),28.451 (±0.734),2.349 (±0.058),2.076 (±0.050),3.117 (±0.033),92.954 (±6.435),0.491 (±0.024),0.462 (±0.012),0.479 (±0.027),0.949 (±0.006),1.219 (±0.001)


> Note: the results here differ very slightly from the plots in the paper, because those plots calculate R^2 and MAE over all of the rows in the 3 runs (concatenated results), whereas here it's the average of the per-run results, with the associated standard error

### Testing on some real data

- We were given some XRD data calculated by a group. It was calculated for the brookite, anatase and rutile polymorphs of TiO2
- Anatase and rutile were seen during training and finetuning (in pretrain data and the mattergen xrd 1st pass finetune dataset), brookite was not
- Can the model generate the correct structures for experimental XRDs for materials seen in training, and one unseen?

1. First we make a dataset with the true structures as per their materials project structures
2. To this we add a prompt for each of the structures
3. And a condition vector as per below

In [None]:
import __init__
from _utils import process_xrd_to_condition_vector

# anatase_raw_data = """2θ [°] Cu	Intensity
# 25.2280719392351	281.55012
# 30.7984477760649	148.62471
# 36.4922566866002	122.62704
# 37.6908921523268	119.93007
# 41.9139352787292	114.27506
# 48.0759377770583	93.23776
# 55.0743148175043	135.06294
# 62.592362748181	    81.58042
# 65.9512366402823	78.86014
# 77.6190112038205	75.32634"""

# rutile_raw_data = """2θ [°] Cu	Intensity
# 23.4685203891387	203.0
# 27.4456323189006	922.0
# 30.8154418109335	163.0
# 36.1036065436626	473.0
# 39.224627779168	    112.0
# 41.2593176108602	270.0
# 44.0079436046877	133.0
# 46.2453487299531	109.0
# 54.3526968350901	450.0
# 56.6773553113041	186.0
# 62.8816130603703	127.0
# 64.1161640720417	117.0
# 69.0374099680916	164.0
# 69.9017250464323	127.0
# 82.4052975899525	90.0"""

# brookite_raw_data = """2θ [°] Cu	Intensity
# 21.7068951575405	197.0
# 25.4158554191106	615.0
# 30.8494290781386	362.0
# 36.357093687628	    159.0
# 37.387173876481	    130.0
# 40.2006181429811	139.0
# 42.4506080337455	123.0
# 46.2786823244331	116.0
# 48.1590101936239	180.0
# 49.2875816475943	124.0
# 54.3691099189334	142.0
# 55.3364355858856	166.0
# 57.3462424126941	95.0
# 62.2385043787467	93.0
# 63.7639034593297	107.0
# 65.1227905836474	107.0
# 68.8799626318172	80.0
# 84.4701610611719	103.0"""

# Anatase TiO2 peak list: a header row followed by one "2θ intensity" pair per
# line (the header uses a tab, the data rows a single space). Intensities are
# relative to the strongest peak (100.0). There are 19 peaks here, and the
# printed condition vector for anatase ends each half with a -100 padding slot.
anatase_raw_data = """2θ [°] Cu	Intensity
25.281 100.0
36.947 10.0
37.801 20.0
38.576 10.0
48.050 35.0
53.891 20.0
55.062 20.0
62.121 4.0
62.690 14.0
68.762 6.0
70.311 6.0
74.031 2.0
75.032 10.0
76.020 4.0
80.727 2.0
82.139 2.0
82.662 6.0
83.149 4.0
83.221 2.0
"""

# Rutile TiO2 peak list: a header row followed by one "2θ intensity" pair per
# line (the header uses a tab, the data rows a single space). Intensities are
# relative to the strongest peak (100.0). 20 peaks, all below 90° 2θ.
rutile_raw_data = """2θ [°] Cu	Intensity
27.447 100.0
36.086 50.0
39.188 8.0
41.226 25.0
44.052 10.0
54.323 60.0
56.642 20.0
62.742 10.0
64.040 10.0
65.480 2.0
69.010 20.0
69.790 12.0
72.410 2.0
74.411 1.0
76.510 4.0
79.822 2.0
82.335 6.0
84.260 4.0
87.464 2.0
89.557 8.0"""

# Brookite TiO2 peak list: a header row followed by one "2θ intensity" pair per
# line (the header uses a tab, the data rows a single space). Intensities are
# relative to the strongest peak (100.0). 20 peaks. Per the notes above,
# brookite is the polymorph that was not seen during training or finetuning.
brookite_raw_data = """2θ [°] Cu	Intensity
25.35 100.0
25.69 77.71
30.81 97.23
36.24 23.86
37.28 17.77
37.96 7.41
40.16 14.94
42.33 15.5
46.07 19.95
48.03 32.31
49.16 20.2
54.22 22.63
55.24 31.92
57.14 15.52
62.08 11.59
63.41 7.68
63.69 10.48
64.98 13.99
65.94 8.26
70.44 9.04
"""


# Convert each raw peak list into its condition-vector string, then show the
# three results. All conversions run before anything is printed, so the
# helper's own status messages appear ahead of the vectors.
anatase, rutile, brookite = (
    process_xrd_to_condition_vector(raw_pattern)
    for raw_pattern in (anatase_raw_data, rutile_raw_data, brookite_raw_data)
)

for condition_vector in (anatase, rutile, brookite):
    print(condition_vector)


Navigated to package root: /home/cyprien/CrystaLLMv2_PKV
Added package root to Python path
Theta scaled to [0,1] (0 to 90), Intensity scaled to [0,1] (relative to max in pattern), -100 for padding
Theta scaled to [0,1] (0 to 90), Intensity scaled to [0,1] (relative to max in pattern), -100 for padding
Theta scaled to [0,1] (0 to 90), Intensity scaled to [0,1] (relative to max in pattern), -100 for padding
0.281,0.534,0.42,0.599,0.612,0.697,0.411,0.429,0.834,0.764,0.781,0.918,0.69,0.845,0.924,0.823,0.897,0.913,0.925,-100,1.0,0.35,0.2,0.2,0.2,0.14,0.1,0.1,0.1,0.06,0.06,0.06,0.04,0.04,0.04,0.02,0.02,0.02,0.02,-100
0.305,0.604,0.401,0.458,0.629,0.767,0.775,0.489,0.697,0.712,0.435,0.995,0.915,0.85,0.936,0.728,0.805,0.887,0.972,0.827,1.0,0.6,0.5,0.25,0.2,0.2,0.12,0.1,0.1,0.1,0.08,0.08,0.06,0.04,0.04,0.02,0.02,0.02,0.02,0.01
0.282,0.342,0.285,0.534,0.614,0.403,0.602,0.546,0.512,0.414,0.635,0.47,0.446,0.722,0.69,0.708,0.783,0.733,0.705,0.422,1.0,0.972,0.777,0.323,0.319,0.239,0.226,0.202,0.199,

In [None]:
# Load and display the combined reference/prompt table for the TiO2 experiment
# ('nosc-nosg' variant). Per the saved output it has one row per polymorph with
# the true CIF, the prompt and the condition vector, so the same file can be
# used as both the prompt table and the reference table.
import pandas as pd
df_path = '_artifacts/cod-xrd/amil/amil-TiO2-nosc-nosg_ref_prompts.parquet'

df = pd.read_parquet(df_path)
df

Unnamed: 0,MP_ehull,CIF,is_in_train,Material ID,name,Prompt,condition_vector,Supercell,Spacegroup
0,0.043639,# generated using pymatgen\ndata_TiO2\n_symmet...,True,mp-2657,Rutile (P4_2/mnm),<bos>\ndata_[Ti2O4]\nloop_\n _atom_type_symbol...,"0.305,0.604,0.401,0.458,0.629,0.767,0.775,0.48...",False,False
1,0.0,# generated using pymatgen\ndata_TiO2\n_symmet...,True,mp-390,Anatase (I4_1/amd),<bos>\ndata_[Ti4O8]\nloop_\n _atom_type_symbol...,"0.281,0.534,0.42,0.599,0.612,0.697,0.411,0.429...",False,False
2,0.003041,# generated using pymatgen\ndata_TiO2\n_symmet...,False,mp-1840,Brookite (Pbca),<bos>\ndata_[Ti8O16]\nloop_\n _atom_type_symbo...,"0.282,0.342,0.285,0.534,0.614,0.403,0.602,0.54...",False,False


> Note: I added the condition vector to a dataframe where I stored the prompts, the true CIF of each material, and some information for the experiment. We can use this as both the prompt and the reference dataframe because it contains all the relevant columns

Here we want to probe a few things: can we recover structures from XRD data sent to us by a group we are collaborating with, for 3 different TiO2 polymorphs? Two of them (rutile, anatase) were seen in the training sets, though neither was in the experimental finetuning set, and one of them (brookite) hasn't been seen at all (training or finetuning).

We want to see if we can recover the structures in various possible experimental scenarios: sometimes we only have the composition, sometimes we also have the spacegroup (comp + spacegroup), and sometimes we don't really know the composition — so is the model able to match the material when prompted with the composition of its supercell?

In [21]:
import pandas as pd
import __init__

In [34]:
# File names for the four TiO2 prompting variants. The tags presumably mean
# sc/nosc = with/without supercell composition and sg/nosg = with/without
# spacegroup in the prompt (cf. the 'Supercell' and 'Spacegroup' columns of the
# prompt table) — confirm against how the prompt files were built.
# All four lists are derived from this one list, so they always stay aligned
# and in the same order when zipped together by the cells below.
amil_variants = ['nosc-nosg', 'sc-nosg', 'nosc-sg', 'sc-sg']

# Reference + prompt tables (input to generation, reference for the metrics)
prompt_files = [
    f'amil-TiO2-{variant}_ref_prompts.parquet' for variant in amil_variants
]

# Generation configs, one per variant
config_files = [
    f'_config_files/generation/conditional/xrd_studies/amil/cod-amil-xrd-{variant}_eval.jsonc'
    for variant in amil_variants
]

# Generated CIFs, one file per variant
output_gen_files = [
    f'amil-TiO2-{variant}_gen.parquet' for variant in amil_variants
]

# XRD metrics, one file per variant
output_metrics_files = [
    f'amil-TiO2-{variant}_metrics.parquet' for variant in amil_variants
]

In [35]:

for i, (prompt_file, config_file, output_file) in enumerate(zip(prompt_files, config_files, output_gen_files)):
    print(f"\nProcessing {i+1}/4: {prompt_file}")
    
    # Generate CIFs
    !python _utils/_generating/generate_CIFs.py --config '{config_file}'
    
    # Postprocess
    !python _utils/_generating/postprocess.py \
        --input_parquet '_artifacts/cod-xrd/amil/{output_file}' \
        --output_parquet '_artifacts/cod-xrd/amil/{output_file}' \
        --num_workers 32 \
        --column_name 'Generated CIF'

print("\nGeneration and postprocessing complete for all files!")


Processing 1/4: amil-TiO2-nosc-nosg_ref_prompts.parquet
Environment info
Available GPUs: 2
GPU 0: NVIDIA L4
GPU 1: NVIDIA L4

Generation settings
Total sequences per prompt-condition pair: 20
Will save generated CIFs to _artifacts/cod-xrd/amil/amil-TiO2-nosc-nosg_gen.parquet
Model's max_length: 1024
Tokenizer validation passed: token vocabulary is consistent.
Generation kwargs: {'max_length': 1024, 'pad_token_id': 371, 'eos_token_id': 373, 'renormalize_logits': True, 'remove_invalid_values': True, 'num_return_sequences': 20, 'do_sample': True, 'top_k': 10, 'top_p': 0.95, 'temperature': 0.75}

Generation Strategy
Number of condition-prompt pairs: 3
Target valid CIFs per prompt: 20
Will save all CIFs ranked by LOGP score (up to 20 per prompt)
Tokenizer validation passed: token vocabulary is consistent.
Generating CIFs...:   0%|                                | 0/60 [00:00<?, ?it/s]Tokenizer validation passed: token vocabulary is consistent.
Tokenizer validation passed: token vocabulary 

In [36]:
print("Running XRD metrics for all 4 generated files...")


for i, (gen_file, prompt_file, metrics_file) in enumerate(zip(output_gen_files, prompt_files, output_metrics_files)):
    print(f"\nCalculating metrics for {i+1}/4: {gen_file}")
    
    !python _utils/_metrics/XRD_metrics.py \
        --input_parquet '_artifacts/cod-xrd/amil/{gen_file}' \
        --num_gens 20 \
        --ref_parquet '_artifacts/cod-xrd/amil/{prompt_file}' \
        --output_parquet '_artifacts/cod-xrd/amil/{metrics_file}' \
        --num_workers 4 \
        --validity_check "none"

print("\nXRD metrics calculation complete for all files!")

Running XRD metrics for all 4 generated files...

Calculating metrics for 1/4: amil-TiO2-nosc-nosg_gen.parquet


Using 20 generation(s) per compound
Using 32 workers for parallel processing (based on input size)
Loaded 3 materials from _artifacts/cod-xrd/amil/amil-TiO2-nosc-nosg_gen.parquet
Using 3 matched materials from test DB
Parsing true CIFs: 100%|█████████████████████████| 3/3 [00:00<00:00, 318.97it/s]
Processing 60 CIFs across 3 materials
Parsing and sensible check for gen CIFs: 100%|█| 60/60 [00:00<00:00, 276.60it/s]
Materials processed: 3
Materials with sensible structures: 3
Comparing structures: 100%|███████████████████████| 3/3 [00:00<00:00,  5.67it/s]

Results saved to: _artifacts/cod-xrd/amil/amil-TiO2-nosc-nosg_metrics.parquet

Metrics:
  match_rate: 1.0000
  rms_dist: 0.1711
  n_matched: 3.0000
  a_diff: 0.1542
  b_diff: 1.4443
  c_diff: 1.3934

Calculating metrics for 2/4: amil-TiO2-sc-nosg_gen.parquet
Using 20 generation(s) per compound
Using 32 workers for parallel processing (based on input size)
Loaded 3 materials from _artifacts/cod-xrd/amil/amil-TiO2-sc-nosg_gen.parquet
Usi

In [37]:
# for all the 4 files, make a table with all the results
import pandas as pd
import __init__
from _utils import get_metrics_xrd

# Metrics files of the four prompting variants, in the order they are stacked.
output_metrics_files = [
    'amil-TiO2-nosc-nosg_metrics.parquet',
    'amil-TiO2-sc-nosg_metrics.parquet',
    'amil-TiO2-nosc-sg_metrics.parquet',
    'amil-TiO2-sc-sg_metrics.parquet',
]

# Read every per-variant table, then stack them into a single frame with a
# fresh 0..n-1 index. Rows keep file order, so each consecutive group of rows
# belongs to one variant.
per_variant_metrics = []
for metrics_name in output_metrics_files:
    per_variant_metrics.append(pd.read_parquet(f'_artifacts/cod-xrd/amil/{metrics_name}'))

all_metrics = pd.concat(per_variant_metrics, ignore_index=True)
all_metrics


  all_metrics = pd.concat([pd.read_parquet(f'_artifacts/cod-xrd/amil/{f}') for f in output_metrics_files], ignore_index=True)


Unnamed: 0,True Struct,Gen Struct,RMS-d,True a,True b,True c,True volume,Gen a,Gen b,Gen c,Gen volume,Sensible Num,Score,n_matched_struct
0,# generated using pymatgen\ndata_TiO2\n_symmet...,# generated using pymatgen\ndata_TiO2\n_symmet...,0.484867,5.149119,9.194203,5.459122,258.446023,5.53,4.9431,9.5662,225.626419,20,1.539175,2
1,# generated using pymatgen\ndata_TiO2\n_symmet...,# generated using pymatgen\ndata_TiO2\n_symmet...,5.2e-05,4.599837,4.599837,2.959214,62.61253,4.5405,4.5405,2.9388,60.586713,20,1.101199,15
2,# generated using pymatgen\ndata_TiO2\n_symmet...,# generated using pymatgen\ndata_TiO2\n_symmet...,0.028477,3.78254,3.78254,9.615022,137.567944,3.7602,3.7602,9.6678,136.69403,20,1.117505,1
3,# generated using pymatgen\ndata_TiO2\n_symmet...,# generated using pymatgen\ndata_TiO2\n_symmet...,,5.149119,9.194203,5.459122,258.446023,7.784,10.0451,7.755,605.559919,20,1.315744,0
4,# generated using pymatgen\ndata_TiO2\n_symmet...,# generated using pymatgen\ndata_TiO2\n_symmet...,,4.599837,4.599837,2.959214,62.61253,4.7023,4.7023,4.7023,103.975496,20,1.060813,0
5,# generated using pymatgen\ndata_TiO2\n_symmet...,# generated using pymatgen\ndata_TiO2\n_symmet...,,3.78254,3.78254,9.615022,137.567944,2.901,9.2199,9.6684,258.600017,20,1.138156,0
6,# generated using pymatgen\ndata_TiO2\n_symmet...,# generated using pymatgen\ndata_TiO2\n_symmet...,0.344375,5.149119,9.194203,5.459122,258.446023,4.783,9.5889,4.7898,219.677992,20,1.413454,19
7,# generated using pymatgen\ndata_TiO2\n_symmet...,# generated using pymatgen\ndata_TiO2\n_symmet...,0.000433,4.599837,4.599837,2.959214,62.61253,4.501,4.501,2.9515,59.794441,20,1.101461,20
8,# generated using pymatgen\ndata_TiO2\n_symmet...,# generated using pymatgen\ndata_TiO2\n_symmet...,0.000162,3.78254,3.78254,9.615022,137.567944,3.8377,3.8377,9.0201,132.847503,20,1.119842,20
9,# generated using pymatgen\ndata_TiO2\n_symmet...,# generated using pymatgen\ndata_TiO2\n_symmet...,,5.149119,9.194203,5.459122,258.446023,5.13,9.2265,9.6622,457.330719,20,1.448874,0
