### Imports etc

In [None]:
import zipfile

import pandas as pd
import __init__

from _utils import build_challenge_dataframe, filter_df_to_context

### Pretrained model

#### Preprocessing
- Dataset: LeMatBulkUnique structures (April 2025)
  - Database (Material ID), Reduced Formula, CIF
- Deduplicated
- Cleaned for CIF augmentation
- Pushed to the Hugging Face Hub as `c-bone/lematerial_clean`

#### **Warning**: Pretraining takes an extremely long time (3 weeks on 3 × 24 GB GPUs)

In [None]:
# Trained on 2 GPUs
# Launches _train.py through torchrun with 2 processes on this node, using the
# unconditional "lematerial-small" training config.
# NOTE(review): the warning heading above quotes "3 24GB GPUs" while this command
# starts 2 processes — confirm which GPU count the timing estimate refers to.
!torchrun --nproc_per_node=2 _train.py \
    --config '_config_files/training/unconditional/lematerial-small.jsonc'

### Fetching Data

Download `challenge_set_v1.zip` from CrystaLLM into the working directory, then extract it

In [None]:
# Unpack the CrystaLLM challenge set archive (challenge_set_v1.zip, expected in the
# current working directory) into ./challenge_set_v1.
# NOTE(review): the dataframe-building step reads from 'artifacts/challenge_set/files',
# not from this extraction directory — confirm how the extracted files get there.
with zipfile.ZipFile('challenge_set_v1.zip', 'r') as zip_ref:
    zip_ref.extractall('challenge_set_v1')

Turn it into a dataframe. We need the CIF column and the Material ID column.
> The Material ID serves as a unique identifier for each structure, which is needed to assess structure-recovery capability.

In [None]:
# Build the true-structure table for the challenge set.
# Presumably the first path is the directory holding the challenge files and the
# second is the parquet to write (it is the input of the cleaning step) — verify
# against build_challenge_dataframe in _utils.
challenge_files_dir = 'artifacts/challenge_set/files'
true_structs_path = 'artifacts/challenge_set/true_structs.parquet'

build_challenge_dataframe(challenge_files_dir, true_structs_path)

### Making the Prompts

Augment the CIFs

In [None]:
# Run the cleaning script with true_structs.parquet as input and clean.parquet as
# output, using 8 workers.
!python _utils/_preprocessing/_cleaning.py \
    --input_parquet 'artifacts/challenge_set/true_structs.parquet' \
    --output_parquet 'artifacts/challenge_set/clean.parquet' \
    --num_workers 8

Remove any structure whose CIF exceeds the context length of the pretrained model (hard generation limit)

In [None]:
# Load the cleaned challenge set written by the cleaning step.
df = pd.read_parquet('artifacts/challenge_set/clean.parquet')

# Keep only the rows that fit within the pretrained model's context.
# NOTE(review): context=1024 is presumably a token count — confirm in filter_df_to_context.
filtered_df = filter_df_to_context(df, context=1024, cif_column="CIF")

# Report the row counts before and after the context filter.
print(f"Number of rows before filtering: {len(df)}")
print(f"Number of rows after filtering: {len(filtered_df)}")

# Persist the filtered set (without the index) for the prompt-building steps.
filtered_df.to_parquet('artifacts/challenge_set/clean_filtered.parquet', index=False)

### Generating and Metrics - Composition prompting

Make Prompts at level 3, with detailed composition info as per original benchmark

In [None]:
# Build prompts from the context-filtered challenge set: uses the 'CIF' column,
# prompt level 'level_3', and writes the result to challenge_prompt_test.parquet.
# NOTE(review): the effect of --automatic is defined in make_prompts.py (not shown here).
!python _utils/_generating/make_prompts.py \
    --automatic \
    --input_df 'artifacts/challenge_set/clean_filtered.parquet' \
    --cif_column 'CIF' \
    --level 'level_3' \
    --output_parquet 'artifacts/challenge_set/challenge_prompt_test.parquet'

Generate the CIFs (T=0.7, K=10, 100 generation attempts), as per the original benchmark

In [None]:
# Run CIF generation for the composition-prompted challenge set.
# All generation settings (prompt file, sampling parameters, output path) are
# presumably read from the referenced config file — verify there.
!python _utils/_generating/generate_CIFs.py \
    --config '_config_files/generation/unconditional/lematerial-challenge_eval.jsonc'

Post-process the generations to retrieve standardised CIFs

In [None]:
# Post-process gen_07T10K.parquet into gen_07T10K_processed.parquet using 8 workers.
# NOTE(review): these paths use the '_artifacts/' prefix, while the data-preparation
# cells earlier in this notebook use 'artifacts/'. The input here is presumably
# written by the generation config — confirm which directory is intended.
!python _utils/_evaluation_og/postprocess.py \
    --input_parquet '_artifacts/challenge_set/gen_07T10K.parquet' \
    --output_parquet '_artifacts/challenge_set/gen_07T10K_processed.parquet' \
    --num_workers 8

Calculate metrics (how many of the true structures are matched by at least one of the generated structures)

In [None]:
# Compute challenge-set metrics for the post-processed generations against the
# true-structure table, with --num_gens 20 and 16 workers.
# --path_to_db points at the true_structs.parquet written by the dataframe-building
# step of this notebook, which lives under 'artifacts/' (no leading underscore).
# NOTE(review): --input_parquet keeps the '_artifacts/' prefix used by the
# post-processing step — confirm which directory prefix is intended project-wide.
!python _utils/_metrics/challenge_set_metrics.py \
    --input_parquet '_artifacts/challenge_set/gen_07T10K_processed.parquet' \
    --path_to_db 'artifacts/challenge_set/true_structs.parquet' \
    --num_gens 20 \
    --num_workers 16

### Generating and Metrics - Composition + Spacegroup prompting

Same as above, but spacegroup information is also included in the prompts (level 4)

In [None]:
# Build prompts from the context-filtered challenge set: uses the 'CIF' column,
# prompt level 'level_4', and writes the result to challenge_prompt_test_sg.parquet.
# The input is the clean_filtered.parquet written by the context-filtering step of
# this notebook, which lives under 'artifacts/' (no leading underscore), matching
# this cell's own output path.
# NOTE(review): the effect of --automatic is defined in make_prompts.py (not shown here).
!python _utils/_generating/make_prompts.py \
    --automatic \
    --input_df 'artifacts/challenge_set/clean_filtered.parquet' \
    --cif_column 'CIF' \
    --level 'level_4' \
    --output_parquet 'artifacts/challenge_set/challenge_prompt_test_sg.parquet'

In [None]:
# Run CIF generation for the composition + spacegroup prompted challenge set.
# All generation settings (prompt file, sampling parameters, output path) are
# presumably read from the referenced config file — verify there.
!python _utils/_generating/generate_CIFs.py \
    --config '_config_files/generation/unconditional/lematerial-challenge-sg_eval.jsonc'

In [None]:
# Post-process gen_sg_07T10K.parquet into gen_sg_07T10K_processed.parquet using 8 workers.
# NOTE(review): these paths use the '_artifacts/' prefix, while the data-preparation
# cells earlier in this notebook use 'artifacts/'. The input here is presumably
# written by the generation config — confirm which directory is intended.
!python _utils/_evaluation_og/postprocess.py \
    --input_parquet '_artifacts/challenge_set/gen_sg_07T10K.parquet' \
    --output_parquet '_artifacts/challenge_set/gen_sg_07T10K_processed.parquet' \
    --num_workers 8

In [None]:
# Compute challenge-set metrics for the post-processed spacegroup-prompted
# generations against the true-structure table, with --num_gens 20 and 16 workers.
# --path_to_db points at the true_structs.parquet written by the dataframe-building
# step of this notebook, which lives under 'artifacts/' (no leading underscore).
# NOTE(review): --input_parquet keeps the '_artifacts/' prefix used by the
# post-processing step — confirm which directory prefix is intended project-wide.
!python _utils/_metrics/challenge_set_metrics.py \
    --input_parquet '_artifacts/challenge_set/gen_sg_07T10K_processed.parquet' \
    --path_to_db 'artifacts/challenge_set/true_structs.parquet' \
    --num_gens 20 \
    --num_workers 16