### Full workflow of employing CrystaLLM-pi for materials discovery

In [None]:
import __init__ # this just ensures were in the right env
from pymatgen.io.jarvis import JarvisAtomsAdaptor as JAA
from jarvis.core.atoms import Atoms
from pymatgen.io.cif import CifWriter
import json
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt

### Step 1. Make the dataset in an appropriate format
- Data source
  - 1 json file that contains materials project material ids and their associated pmg structure objects
  - 1 json file that has the material ids and associated SLMEs
  > In the field of photovoltaics (PVs), SLME stands for Spectroscopic Limited Maximum Efficiency. It's a theoretical metric used to predict the maximum possible energy conversion efficiency of a solar cell material. It's more detailed and material-specific than the widely used Shockley-Queisser limit. See [this paper](https://arxiv.org/pdf/2507.13246) for more details on ML and SLMEs

- Columns we need in order to fine-tune CrystaLLM-pi
  - We want to condition generation on SLME, to try and find novel materials with high SLMEs
  - 'Material ID': unique identifier, here for traceability of training data
  - 'Reduced Formula': This will be used to speed up novelty metric calculations
  - 'CIF': text sequences that the language model will train on
  - 'SLME': the property that the model will condition its sequence generation on

#### Let's build the dataset

In [None]:
def _symmetrize_cif(struct):
    """Return the CIF text of the symmetrized form of ``struct``.

    The structure is first run through ``SpacegroupAnalyzer`` to obtain its
    symmetrized representation, which is then serialised with ``CifWriter``
    using ``symprec=0.1``. Nothing is caught here: any exception raised by
    the symmetry analysis or the writer propagates to the caller.
    """
    symmetrized = SpacegroupAnalyzer(struct).get_symmetrized_structure()
    writer = CifWriter(symmetrized, symprec=0.1)
    return str(writer)
def structure_to_cif(struct):
    """Serialise a structure to a symmetry-aware CIF string.

    Thin public wrapper that delegates to ``_symmetrize_cif``.
    """
    return _symmetrize_cif(struct)
def extract_formula(struct):
    """Return ``struct.composition.reduced_formula`` for the given structure."""
    composition = struct.composition
    return composition.reduced_formula

In [None]:
# Load the two raw inputs: material id -> serialised structure dict, and
# material id -> SLME value.
with open('HF-databases/mpdb-slme/all_structures.json', 'r') as f:
    structure_dict = json.load(f)
with open('HF-databases/mpdb-slme/All_corr_SLMEs.json', 'r') as f:
    slme_dict = json.load(f)

# Collect one record per material and build the DataFrame once at the end.
# Growing the frame with pd.concat inside the loop copies every existing row on
# each iteration, which is quadratic in the number of structures.
records = []
for material_id, struct_dict in tqdm(structure_dict.items()):
    # The stored dicts are read as JARVIS Atoms and converted to pymatgen
    # Structures through the adaptor.
    ats = Atoms.from_dict(struct_dict)
    structure = JAA.get_structure(ats)
    cif_str = structure_to_cif(structure)
    formula = extract_formula(structure)
    records.append({'Material ID': material_id, 'Reduced Formula': formula, 'CIF': cif_str})

df = pd.DataFrame(records, columns=['Material ID', 'Reduced Formula', 'CIF'])

# Attach the target property once, after all rows exist. Material ids with no
# entry in slme_dict get NaN.
df['SLME'] = df['Material ID'].map(slme_dict)

df.to_parquet('HF-databases/mpdb-slme/mpdb-slme.parquet', index=False)

#### We can visualise the distribution of SLMEs in the finetuning set 

In [None]:
# Histogram of the SLME values in the fine-tuning set. Rows without an SLME
# (NaN after the id -> SLME mapping) are dropped before binning.
plt.hist(df['SLME'].dropna(), bins=50)
plt.xlabel('SLME')  # stray trailing text on this line was a SyntaxError
plt.ylabel('Frequency')
plt.title('Distribution of SLME values in MPDB')
plt.show()

#### Clean and augment the CIFs
- We turn the CIFs from standard into the correct format for CrystaLLM-pi training (process is invertible)
- We normalise the property we train on to stabilise training
- We also filtered so no augmented CIFs exceed context length (but none did so can skip)
> Note: We don't deduplicate because the dataset is already curated and each structure has different SLMEs so they are of interest for training. If you want to do this, see the [README.md](README.md)

In [None]:
# Run the project's cleaning script on the raw parquet written earlier in this
# notebook. As passed here: SLME is the property column, normalised with the
# "linear" normaliser, using 8 workers.
# NOTE(review): --filter_to 1024 is presumably the context-length cap mentioned
# in the notes above - confirm against _cleaning.py.
!python _utils/_preprocessing/_cleaning.py \
    --input_parquet HF-databases/mpdb-slme/mpdb-slme.parquet \
    --output_parquet HF-databases/mpdb-slme/mpdb-slme_clean_filtered.parquet \
    --property_columns "['SLME']" \
    --property1_normaliser "linear" \
    --num_workers 8 \
    --filter_to 1024

### Save to HF
- Because we have a small dataset, we would like to train on all of it
- Datasets available on HuggingFace:
  - c-bone/mpdb-slme-full (100% - train)

In [None]:
# Package the cleaned parquet with validation and test fractions of 0.00, i.e.
# no rows are held out. --save_hub is set.
# NOTE(review): --save_hub presumably pushes the dataset to the HuggingFace Hub
# and needs credentials - confirm in _save_dataset_to_HF.py.
!python _utils/_preprocessing/_save_dataset_to_HF.py \
    --input_parquet 'HF-databases/mpdb-slme/mpdb-slme_clean_filtered.parquet' \
    --output_parquet 'HF-databases/mpdb-slme/mpdb-slme-full.parquet' \
    --valid_size 0.00 \
    --test_size 0.00 \
    --save_hub

### Train the model
- I performed a quick hyperparameter search using the same dataset but with a small validation set; the best params are stored in the config

In [None]:
# Launch fine-tuning through torchrun with 2 processes per node; all training
# settings come from the referenced .jsonc config file.
!torchrun --nproc_per_node=2 _train.py --config '_config_files/training/conditional/ft-slme/slme_ft-PKV-opt.jsonc'

#### Make Prompts
- Here the prompts are quite simple, we target any composition but the highest possible SLME

In [None]:
# Build the prompt parquet in manual mode with an empty composition string
# (no composition constraint) and a single condition value of 1.0.
# NOTE(review): 1.0 is presumably the top of the normalised SLME range -
# confirm against the "linear" normaliser used during cleaning.
!python _utils/_generating/make_prompts.py \
    --manual \
    --compositions "" \
    --condition_lists "1.0" \
    --output_parquet '_utils/_evaluation_files/conditional_studies/slme/slme-PKV-opt_prompt.parquet'

#### Materials Generation
- With default params that balance exploration and exploitation of learned patterns

In [None]:
# Generate CIFs; prompts, sampling settings and output location are all taken
# from the referenced generation config.
!python _utils/_generating/generate_CIFs.py --config '_config_files/generation/conditional/slme/slme-PKV-opt_eval.jsonc'

#### Metrics
- We want to flag materials that are valid, unique, novel, and predicted stable for further analysis
- We can screen this pretty fast with the metrics scripts set up

In [None]:
# Run the VUN metrics script on the generated CIFs, with the fine-tuning set
# (c-bone/mpdb-slme-full) as the reference dataset. Results are written to the
# *_post.parquet file, using 32 workers.
# NOTE(review): --load_processed_data looks like a cache of the pre-processed
# reference set - confirm in VUN_metrics.py.
!python _utils/_metrics/VUN_metrics.py \
    --input_parquet '_utils/_evaluation_files/conditional_studies/slme/slme-PKV-opt_gen.parquet' \
    --huggingface_dataset 'c-bone/mpdb-slme-full' \
    --load_processed_data 'HF-databases/mpdb-slme/mpdb-slme-full_proc.parquet' \
    --output_parquet '_utils/_evaluation_files/conditional_studies/slme/slme-PKV-opt_post.parquet' \
    --num_workers 32

In [None]:
# Run the MACE e-hull script on the VUN output, with the MP computed structure
# entries file passed as --mp_data; writes *_post-s.parquet.
# NOTE(review): this writes to _utils/_evaluation_files/conditional_studies/slme/,
# but the analysis cells that follow read slme-PKV-opt_post-s.parquet from
# _artifacts/slme/ - confirm the file is copied or moved between the two.
!python _utils/_metrics/mace_ehull.py \
    --post_parquet '_utils/_evaluation_files/conditional_studies/slme/slme-PKV-opt_post.parquet' \
    --output_parquet '_utils/_evaluation_files/conditional_studies/slme/slme-PKV-opt_post-s.parquet' \
    --mp_data 'mp_computed_structure_entries.json.gz' \
    --num_workers 16

In [None]:
import __init__
import pandas as pd

df = pd.read_parquet('_artifacts/slme/slme-PKV-opt_post-s.parquet')

# Cumulative filters: each mask narrows the previous one by one more criterion.
valid_unique_mask = df['is_valid'] & df['is_unique']
valid_unique_novel_mask = valid_unique_mask & df['is_novel']
low_ehull_mask = valid_unique_novel_mask & (df['ehull_mace_mp'] < 0.1)

valid_count = df['is_valid'].sum()
valid_unique_count = len(df[valid_unique_mask])
valid_unique_novel_count = len(df[valid_unique_novel_mask])
low_ehull_count = len(df[low_ehull_mask])

# Report the funnel from all generated entries down to the low-e-hull subset.
print(f"Valid entries: {valid_count}")
print(f"Valid and unique entries: {valid_unique_count}")
print(f"Valid, unique, and novel entries: {valid_unique_novel_count}")
print(f"Valid, unique, novel entries with ehull_mace_mp < 0.1: {low_ehull_count}")
print(f"Total entries: {len(df)}")

#### Surrogate Model Screening
- The valid, unique, novel entries were screened using an ALIGNN model trained on Hybrid bandgaps to look at the output materials
- They were also screened using an ALIGNN model that was trained on SLME data

- So for each of the VSUN materials above, we calculated a series of metrics (Surrogate BG, Surrogate SLME, Sustainability scores (via HHI scores)), then mapped them back to the structures in `_artifacts/slme/slme-PKV-opt_post-s.parquet`
  - How the surrogate metrics were obtained is not included here (the calculations were performed by someone else in the group), but the HHI score calculation is

In [None]:
import __init__
import pandas as pd

1. We get the Surrogate SLME and band-gap, and map structs to post-s df (material_id in the 'gen_merged' is the index of the post-s df)

In [None]:
# Load the post-s results table and the surrogate-metrics table; the two are
# related through the material_id column of gen_merged (see the note above).
df = pd.read_parquet('_artifacts/slme/slme-PKV-opt_post-s.parquet')
gen_merged = pd.read_parquet('_artifacts/slme/slme-PKV-opt_post-surrogate-metrics.parquet')

In [None]:
# Display the surrogate-metrics table for a quick visual check.
gen_merged

2. Add sustainability scores

In [None]:
from _utils import get_hhi_scores_from_cif

# Score every generated CIF; each result is expanded into a row so that the
# two values land in the HHI_p and HHI_r columns respectively.
hhi_scores = gen_merged['Generated CIF'].apply(
    lambda cif_text: pd.Series(get_hhi_scores_from_cif(cif_text))
)
gen_merged[['HHI_p', 'HHI_r']] = hhi_scores

# Euclidean distance of the (HHI_p, HHI_r) pair from the origin.
hhi_squared_sum = gen_merged['HHI_p'].pow(2) + gen_merged['HHI_r'].pow(2)
gen_merged['HHI_distance_to_0'] = hhi_squared_sum.pow(0.5)

In [None]:
# Persist the table with the HHI columns added (index not written), then
# display it.
gen_merged.to_parquet('_artifacts/slme/slme-PKV-opt_post-surrogate-metrics-hhi.parquet', index=False)
gen_merged

3. We have already filtered down to VSUN (using the FT set as the reference and calculating structural novelty). Here we want to look at further metrics, so let's use the gen_merged df and get novelty metrics for both structural and compositional novelty, using either the fine-tuning or the pre-training dataset as the reference dataset.

In [None]:
# Re-run the VUN metrics on the HHI-annotated table, with the fine-tuning set
# (c-bone/mpdb-slme-full) as reference and --check_comp_novelty switched on.
# Output goes to the *-hhi-ft-set.parquet file.
!python _utils/_metrics/VUN_metrics.py \
    --input_parquet '_artifacts/slme/slme-PKV-opt_post-surrogate-metrics-hhi.parquet' \
    --huggingface_dataset 'c-bone/mpdb-slme-full' \
    --load_processed_data 'HF-databases/mpdb-slme/mpdb-slme-full_proc.parquet' \
    --output_parquet '_artifacts/slme/slme-PKV-opt_post-surrogate-metrics-hhi-ft-set.parquet' \
    --num_workers 32 \
    --check_comp_novelty

In [None]:
# V+U+Novel: 16463 
# V+U+CompNovel: 15462

In [None]:
# Re-run the VUN metrics on the HHI-annotated table, this time with the
# pre-training set (c-bone/lematerial_clean) as reference and
# --check_comp_novelty switched on.
# The output name follows the same "-hhi-<ref>-set" pattern as the fine-tune
# run, so it matches the "-hhi-pt-set" file that the merge step reads back.
!python _utils/_metrics/VUN_metrics.py \
    --input_parquet '_artifacts/slme/slme-PKV-opt_post-surrogate-metrics-hhi.parquet' \
    --huggingface_dataset 'c-bone/lematerial_clean' \
    --load_processed_data 'HF-databases/lematerial/lematerial_dedup.parquet' \
    --output_parquet '_artifacts/slme/slme-PKV-opt_post-surrogate-metrics-hhi-pt-set.parquet' \
    --num_workers 32 \
    --check_comp_novelty

In [None]:
# V+U+Novel: 8712
# V+U+CompNovel: 1809

In [None]:
import __init__
import pandas as pd

# Novelty results against the fine-tuning reference set.
df_ft = pd.read_parquet('_artifacts/slme/slme-PKV-opt_post-surrogate-metrics-hhi-ft-set.parquet')

# Novelty results against the pre-training reference set; its flag columns get
# a "_pt" suffix so they can sit next to the fine-tune ones after merging.
df_pt = pd.read_parquet(
    '_artifacts/slme/slme-PKV-opt_post-surrogate-metrics-hhi-pt-set.parquet'
).rename(columns={'is_novel': 'is_novel_pt', 'is_comp_novel': 'is_comp_novel_pt'})

# Left-join only the pre-training flags onto the fine-tune table by material_id.
pt_flags = df_pt[['material_id', 'is_novel_pt', 'is_comp_novel_pt']]
df_merged = pd.merge(df_ft, pt_flags, on='material_id', how='left')

df_merged.to_parquet('_artifacts/slme/slme-PKV-opt_post-all-metrics', index=False)

In [None]:
# Display the merged table carrying both sets of novelty flags.
df_merged

4. Now we save the candidates as CIFs in a directory. We name them according to reduced formula, scoring (SLME is just the highest predicted SLME; Sustain-SLME is an SLME > 25 ranked by HHI_distance_to_0), their position in the ranking w.r.t. that metric, and what type of novelty they have.

In [None]:
import __init__
from _utils import run_material_selection

# The call is unpacked into two values. Binding both to the same name, as was
# done before, silently discarded the first one; it is now kept under its own
# name while `out` still holds the second value.
# NOTE(review): what each returned value contains is not visible in this
# notebook - check run_material_selection and rename accordingly.
first_out, out = run_material_selection(
    input_parquet='_artifacts/slme/slme-PKV-opt_post-all-metrics',
    output_dir='SLME_candidates_test',
    output_csv='SLME_candidates_test.csv',
    top_n_slme=15,
    top_n_sustain=15
)

In [None]:
import __init__

# Run the DFT e-hull script on the DFT results CSV; it is given a parquet
# output path and a directory for CIF output.
!python _utils/_metrics/dft_ehull.py \
    --input_csv '_artifacts/slme/cyprien-dft-slme.csv' \
    --output_parquet '_artifacts/slme/cyprien-dft-slme-ehull.parquet' \
    --output_cif_dir '_artifacts/slme/cyprien-dft-slme-cifs/'

In [None]:
import pandas as pd
# Load the parquet written by the DFT e-hull step and display it.
# Note: this rebinds `df`, replacing any frame loaded under that name earlier.
df = pd.read_parquet('_artifacts/slme/cyprien-dft-slme-ehull.parquet')
df