In [None]:
import __init__

#### Jarvis-DFT
- Dataset: Source [JDFT](https://jarvis-materials-design.github.io/dbdocs/thedownloads/)
- Columns:
  - Database (manual)
  - **Material ID** (from source) 
  - Reduced Formula (pmg structure.composition.reduced_formula)
  - CIF (pmg - CifWriter with symprec 0.1)

### Preprocess
- Random split 90:10 like the benchmark (no eval set)
- Generate XRD condition vectors using
  - pmg - XRDCalculator(wavelength="CuKa")
  - top 20 most intense peaks selected ($2\theta$ and intensity)
  - Normalisations
    - $2\theta$: min-max normalised with bounds 0 and 90
    - intensities: min-max normalised with bounds 0 and 100
- Cleaned for CIF augmentation
  - Note: the data is filtered to the context length for training, etc. For benchmarking, however, we compare against all the structures in the database, even though 16 were unparseable and 833 were above the context length.
- Saved to HuggingFace as `c-bone/jarvis-XRD`

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the split below is reproducible across re-runs
# (as long as the number of rows in the input parquet is unchanged).
np.random.seed(1)

# The unprocessed parquet is both read and overwritten by this cell.
UNPROC_PARQUET = Path('HF-databases/jarvis-XRD-processing-2/jarvis-XRD-unproc.parquet')
# Reference copy of the test rows, used later by the metrics step.
TEST_REF_PARQUET = Path('_artifacts/jarvis-XRD/jarvis-test_ref.parquet')

df = pd.read_parquet(UNPROC_PARQUET)

# Random split for benchmarking because no train/test split is provided in the DiffractGPT paper.
# 90% train, 10% test is the split used in the paper. Each row is drawn
# independently, so the realised ratio is only approximately 90:10.
df['Split'] = np.random.choice(['train', 'test'], size=len(df), p=[0.9, 0.1])
# Written back to the input path so the unprocessed parquet carries the Split column.
df.to_parquet(UNPROC_PARQUET, index=False)

# Make the test set parquet for benchmarking.
df_test = df.loc[df['Split'] == 'test'].copy()
# to_parquet does not create missing parent directories, so make sure the
# artifacts folder exists before writing (matters on a fresh checkout).
TEST_REF_PARQUET.parent.mkdir(parents=True, exist_ok=True)
df_test.to_parquet(TEST_REF_PARQUET, index=False)

In [None]:
# Run the XRD calculation script over the unprocessed parquet with 32 workers.
# Input and output paths are identical, so the file is overwritten in place.
# NOTE(review): presumably this adds the `condition_vector` column that the
# cleaning step consumes — confirm in _calculate_XRD.py.
!python _utils/_preprocessing/_calculate_XRD.py \
    --input_parquet HF-databases/jarvis-XRD-processing-2/jarvis-XRD-unproc.parquet \
    --output_parquet HF-databases/jarvis-XRD-processing-2/jarvis-XRD-unproc.parquet \
    --num_workers 32

In [None]:
# Run the cleaning script on the unprocessed parquet with 32 workers, writing
# the result to a separate `-clean` parquet. `condition_vector` is passed as
# the only property column.
# NOTE(review): `--filter_to 1024` is presumably the context-length filter
# mentioned in the Preprocess notes — confirm in _cleaning.py.
!python _utils/_preprocessing/_cleaning.py \
    --input_parquet HF-databases/jarvis-XRD-processing-2/jarvis-XRD-unproc.parquet \
    --output_parquet HF-databases/jarvis-XRD-processing-2/jarvis-XRD-clean.parquet \
    --property_columns "['condition_vector']" \
    --num_workers 32 \
    --filter_to 1024

In [None]:
# Convert the cleaned parquet into the final dataset parquet, with `--save_hub` set.
# NOTE(review): presumably `--save_hub` uploads the dataset to the HuggingFace
# Hub (the notes name it c-bone/jarvis-XRD) — confirm in _save_dataset_to_HF.py.
!python _utils/_preprocessing/_save_dataset_to_HF.py \
    --input_parquet HF-databases/jarvis-XRD-processing-2/jarvis-XRD-clean.parquet \
    --output_parquet HF-databases/jarvis-XRD-processing-2/jarvis-XRD.parquet \
    --save_hub

### Training

In [None]:
# Launch _train.py through torchrun with 2 processes per node, using the
# jarvis-xrd-slider-opt training config.
!torchrun --nproc_per_node=2 _train.py --config '_config_files/training/conditional/xrd_studies/jarvis-xrd-slider-opt.jsonc'

### Generating

In [None]:
# Build generation prompts from the `test` split of the c-bone/jarvis-XRD
# dataset, with `condition_vector` as the condition column, and write them to
# the artifacts directory.
# NOTE(review): the meaning of `--automatic` and `--level 'level_3'` is defined
# in make_prompts.py and is not visible from this notebook.
!python _utils/_generating/make_prompts.py \
    --HF_dataset 'c-bone/jarvis-XRD' \
    --split 'test' \
    --automatic \
    --output_parquet '_artifacts/jarvis-XRD/jarvis-test_prompts.parquet' \
    --level 'level_3' \
    --condition_columns 'condition_vector'


In [None]:
# Generate CIFs using the jarvis-xrd_eval generation config.
# NOTE(review): the prompt input and output paths are set inside the config;
# the post-processing cell reads
# '_artifacts/jarvis-XRD/jarvis-ft-20perp-test_gen.parquet' — confirm the
# config writes there.
!python _utils/_generating/generate_CIFs.py --config '_config_files/generation/conditional/xrd_studies/jarvis-xrd_eval.jsonc'

In [None]:
# Post-process the `Generated CIF` column of the generation output with 32
# workers, writing the `_post` parquet that the metrics cells take as input.
!python _utils/_generating/postprocess.py \
    --input_parquet '_artifacts/jarvis-XRD/jarvis-ft-20perp-test_gen.parquet' \
    --output_parquet '_artifacts/jarvis-XRD/jarvis-ft-20perp-test_post.parquet' \
    --num_workers 32 \
    --column_name 'Generated CIF'

In [None]:
# Compute XRD metrics for the post-processed generations against the reference
# test parquet with `--num_gens 20`, writing the `20perp` metrics parquet.
# `--validity_check 'none'` is passed; runs with 16 workers.
!python _utils/_metrics/XRD_metrics.py \
    --input_parquet '_artifacts/jarvis-XRD/jarvis-ft-20perp-test_post.parquet' \
    --num_gens 20 \
    --ref_parquet '_artifacts/jarvis-XRD/jarvis-test_ref.parquet' \
    --output_parquet '_artifacts/jarvis-XRD/jarvis-ft-20perp-test_metrics.parquet' \
    --num_workers 16 \
    --validity_check 'none'

In [None]:
# Compute XRD metrics again on the same post-processed `20perp` parquet, this
# time with `--num_gens 1`, writing the `1perp` metrics parquet.
# `--validity_check 'none'` is passed; runs with 24 workers.
!python _utils/_metrics/XRD_metrics.py \
    --input_parquet '_artifacts/jarvis-XRD/jarvis-ft-20perp-test_post.parquet' \
    --num_gens 1 \
    --ref_parquet '_artifacts/jarvis-XRD/jarvis-test_ref.parquet' \
    --output_parquet '_artifacts/jarvis-XRD/jarvis-ft-1perp-test_metrics.parquet' \
    --num_workers 24 \
    --validity_check 'none'

### Metrics

In [None]:
import __init__

In [None]:
import __init__
from _utils import get_metrics_xrd
import pandas as pd

# Reference test set; its row count is passed as `n_test` to both summaries.
df_test = pd.read_parquet('_artifacts/jarvis-XRD/jarvis-test_ref.parquet')
n_test = len(df_test)

# Metrics computed with --num_gens 20.
df_metrics_20perp = pd.read_parquet('_artifacts/jarvis-XRD/jarvis-ft-20perp-test_metrics.parquet')
metrics_20perp = get_metrics_xrd(df_metrics_20perp, n_test=n_test, only_matched=False)

# Metrics computed with --num_gens 1.
df_metrics_1perp = pd.read_parquet('_artifacts/jarvis-XRD/jarvis-ft-1perp-test_metrics.parquet')
metrics_1perp = get_metrics_xrd(df_metrics_1perp, n_test=n_test, only_matched=False)

# Both runs now stay available under their own names instead of the second
# overwriting the first. The original names remain bound to the last run
# (1 generation), exactly as before, so any later cell that uses them still works.
df_metrics, metrics = df_metrics_1perp, metrics_1perp