In [None]:
import __init__

Note: for this study, it's important that the Material ID is present in the base DataFrame.

In [None]:
import pandas as pd
# Load the raw MP-20 parquet (the same file the XRD calculation step takes as input)
# and display it so its columns can be inspected.
all_data_parquet = '/home/cyprien/Data_Gen/mp-20-data/all_data.parquet'
df = pd.read_parquet(all_data_parquet)
df

In [None]:
# Run the XRD calculation script on the raw MP-20 parquet with 16 workers and
# write the result to HF-databases/mp_20_pxrd/mp_20_pxrd.parquet.
# (The script is not shown here; presumably it adds the XRD-derived columns
# used by the later steps — TODO confirm.)
!python _utils/_preprocessing/_calculate_XRD.py \
    --input_parquet /home/cyprien/Data_Gen/mp-20-data/all_data.parquet \
    --output_parquet HF-databases/mp_20_pxrd/mp_20_pxrd.parquet \
    --num_workers 16

In [None]:
# Clean the parquet written by the XRD calculation step, passing
# 'Condition Vector' as the only property column, and write the *_clean.parquet.
# NOTE(review): --filter_to 1024 presumably drops entries above a length limit
# of 1024 — confirm the exact meaning against _cleaning.py.
!python _utils/_preprocessing/_cleaning.py \
    --input_parquet HF-databases/mp_20_pxrd/mp_20_pxrd.parquet \
    --output_parquet HF-databases/mp_20_pxrd/mp_20_pxrd_clean.parquet \
    --property_columns "['Condition Vector']" \
    --num_workers 16 \
    --filter_to 1024

In [None]:
# Build the dataset from the cleaned parquet and, because of --save_hub,
# also save it to the Hugging Face Hub.
# NOTE(review): --output_parquet is the same path that the XRD calculation step
# wrote and that the cleaning step reads as input, so this cell overwrites that
# file. Re-running the cleaning cell afterwards would consume this cell's output
# instead of the XRD step's — confirm this is intended.
!python _utils/_preprocessing/_save_dataset_to_HF.py \
    --input_parquet HF-databases/mp_20_pxrd/mp_20_pxrd_clean.parquet \
    --output_parquet HF-databases/mp_20_pxrd/mp_20_pxrd.parquet \
    --save_hub

In [None]:
# Create generation prompts from the 'test' split of the 'c-bone/mp_20_pxrd'
# Hugging Face dataset, using 'Condition Vector' as the condition column, and
# write them to _artifacts/mp-20-pxrd/mp-test_prompts.parquet.
# The --remove_ref_columns flag is intentionally left disabled (commented out below).
!python _utils/_generating/make_prompts.py \
    --HF_dataset 'c-bone/mp_20_pxrd' \
    --split 'test' \
    --automatic \
    --output_parquet '_artifacts/mp-20-pxrd/mp-test_prompts.parquet' \
    --level 'level_3' \
    --condition_columns 'Condition Vector'
    # --remove_ref_columns

# Ablation studies on a from-scratch model

In [None]:
import __init__

In [None]:
import optuna

# 1. Load the stored hyperparameter-search study from its SQLite database
study = optuna.load_study(
    storage='sqlite:///optuna_hpsearch_mp_20_pxrd_scratch_HPS.db',
    study_name='mp_20_pxrd_scratch_HPS',
)

# 2. Pick the best trial of the study
best_trial = study.best_trial

# 3. Pull out its objective value and its hyperparameters
best_value = best_trial.value
best_params = best_trial.params

# 4. Report both, one hyperparameter per line
print(f"Best trial value: {best_value}")
print("Best hyperparameters:")
for param_name, param_value in best_params.items():
    print(f"    {param_name}: {param_value}")

In [None]:
# Launch _train.py with torchrun on 2 processes per node, using the
# xrd_studies "slider-scratch-opt" training config.
# NOTE(review): the "-opt" suffix presumably means the config carries the best
# hyperparameters from the Optuna study above — confirm.
!torchrun --nproc_per_node=2 _train.py --config '_config_files/training/conditional/xrd_studies/mp-20-pxrd-slider-scratch-opt.jsonc'

In [None]:
import pandas as pd
import __init__
# Load the full set of test prompts (the output path of the make_prompts.py cell).
df = pd.read_parquet('_artifacts/mp-20-pxrd/mp-test_prompts.parquet')

# make a random subset of 1000 (fixed random_state so the same rows are drawn on every run)
df = df.sample(1000, random_state=1).reset_index(drop=True)
df.to_parquet('_artifacts/mp-20-pxrd/mp-test_prompts_1k.parquet', index=False)

# make a no condition version: every row gets the same placeholder string made of
# 40 comma-separated '-100' entries.
# NOTE(review): '-100' is presumably the "condition missing" sentinel and 40 the
# condition-vector length expected by the model — confirm both.
if 'condition_vector' not in df.columns:
    # Keep failing loudly (as the original column lookup did) instead of silently
    # creating a new column.
    raise KeyError('condition_vector')
# A scalar assignment broadcasts to all rows; no per-row lambda is needed.
df['condition_vector'] = ','.join(['-100'] * 40)
df.to_parquet('_artifacts/mp-20-pxrd/mp-test_prompts_1k_nocond.parquet', index=False)

#### Look at the impact of perplexity ranking and of feeding conditions on the outputs

1. Perplexity scoring and XRD conditions
   1. Testing at different temperatures with top-p = 0.95 and top-k = 10: T = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]

In [None]:
import __init__

In [None]:
# Temperature tags used in the config and artifact file names: the sampling
# temperature times 100, zero-padded to three digits (0.5 -> '050', 1.75 -> '175').
T = [f"{round(temperature * 100):03d}" for temperature in (0.5, 0.75, 1.0, 1.25, 1.5, 1.75)]

In [None]:
# Sweep over the temperature tags in T: generate CIFs, post-process them, and
# compute XRD metrics twice (with 20 generations and with 1 generation per prompt).
for temp in T:
    # Per-temperature config and artifact paths.
    # NOTE(review): gen_parquet has to match the output path set inside the
    # generation config, which is not visible here — confirm.
    gen_config = f"_config_files/generation/conditional/xrd_studies/temp/mp-scratch-20perp-{temp}T_eval.jsonc"
    gen_parquet = f"_artifacts/mp-20-pxrd/temp/mp-20-scratch-20perp-{temp}T_gen.parquet"
    post_parquet = f"_artifacts/mp-20-pxrd/temp/mp-20-scratch-20perp-{temp}_post.parquet"
    metric_20perp_parquet = f"_artifacts/mp-20-pxrd/temp/mp-20-scratch-20perp-{temp}_metrics.parquet"
    metrics_1perp_parquet = f"_artifacts/mp-20-pxrd/temp/mp-20-scratch-1perp-{temp}_metrics.parquet"

    !python _utils/_generating/generate_CIFs.py --config {gen_config}

    !python _utils/_generating/postprocess.py \
    --input_parquet {gen_parquet} \
    --output_parquet {post_parquet} \
    --num_workers 32

    # Metrics using 20 generations per prompt.
    !python _utils/_metrics/XRD_metrics.py \
    --input_parquet {post_parquet} \
    --num_gens 20 \
    --ref_parquet '_artifacts/mp-20-pxrd/mp-test_ref.parquet' \
    --output_parquet {metric_20perp_parquet} \
    --num_workers 24 \
    --validity_check 'diffcsp'

    # Metrics using a single generation per prompt.
    # The output must interpolate metrics_1perp_parquet (the name defined above):
    # IPython does not raise on an undefined name inside {...}; it passes the
    # command to the shell unexpanded, so a misspelled name silently breaks the run.
    !python _utils/_metrics/XRD_metrics.py \
    --input_parquet {post_parquet} \
    --num_gens 1 \
    --ref_parquet '_artifacts/mp-20-pxrd/mp-test_ref.parquet' \
    --output_parquet {metrics_1perp_parquet} \
    --num_workers 24 \
    --validity_check 'diffcsp'

In [None]:
import __init__
from _utils import get_metrics_xrd
import pandas as pd

# Collect summary metrics for every temperature tag in T, for both metric files
# written by the temperature sweep (20 generations and 1 generation per prompt).
results = {}
for temp in T:
    metric_20perp_parquet = f"_artifacts/mp-20-pxrd/temp/mp-20-scratch-20perp-{temp}_metrics.parquet"
    metrics_1perp_parquet = f"_artifacts/mp-20-pxrd/temp/mp-20-scratch-1perp-{temp}_metrics.parquet"

    df_20perp = pd.read_parquet(metric_20perp_parquet)
    df_1perp = pd.read_parquet(metrics_1perp_parquet)

    # n_test=1000 matches the size of the 1k prompt subset.
    results_20perp = get_metrics_xrd(df_20perp, n_test=1000, only_matched=False, verbose=False)
    results_1perp = get_metrics_xrd(df_1perp, n_test=1000, only_matched=False, verbose=False)
    results[temp] = (results_20perp, results_1perp)

# One row per temperature tag, one table per setting.
table_20perp = pd.DataFrame.from_dict({temp: results[temp][0] for temp in results}, orient='index')
table_1perp = pd.DataFrame.from_dict({temp: results[temp][1] for temp in results}, orient='index')

# concatenate the two tables 20perp and 1perp one on top of the other.
# Both tables share the same temperature index, so label each half with an outer
# index level; otherwise the stacked rows cannot be told apart.
final_table = pd.concat(
    [table_20perp, table_1perp],
    axis=0,
    keys=['20perp', '1perp'],
    names=['setting', 'temperature'],
)
final_table

2. No XRD conditions, no scoring

In [None]:
# Generate CIFs with the "uncond-20shot" ablation config.
# The output location is set inside the config file (not visible here); the
# post-processing cell below reads ablation/mp-20-scratch-uncond-20shot_gen.parquet,
# presumably written by this run — TODO confirm.
!python _utils/_generating/generate_CIFs.py --config '_config_files/generation/conditional/xrd_studies/ablation/mp-xrd-scratch-uncond-20shot_eval.jsonc'

In [None]:
# Post-process the "uncond-20shot" generations (column 'Generated CIF') with
# 32 workers and write the *_post.parquet consumed by the metric cells.
!python _utils/_generating/postprocess.py \
    --input_parquet '_artifacts/mp-20-pxrd/ablation/mp-20-scratch-uncond-20shot_gen.parquet' \
    --output_parquet '_artifacts/mp-20-pxrd/ablation/mp-20-scratch-uncond-20shot_post.parquet' \
    --num_workers 32 \
    --column_name 'Generated CIF'

In [None]:
# XRD metrics for the "uncond-20shot" generations with --num_gens 20, compared
# against mp-test_ref.parquet and using the 'diffcsp' validity check.
# NOTE(review): mp-test_ref.parquet is not written by any cell shown in this
# notebook — confirm where it comes from.
!python _utils/_metrics/XRD_metrics.py \
    --input_parquet '_artifacts/mp-20-pxrd/ablation/mp-20-scratch-uncond-20shot_post.parquet' \
    --num_gens 20 \
    --ref_parquet '_artifacts/mp-20-pxrd/mp-test_ref.parquet' \
    --output_parquet '_artifacts/mp-20-pxrd/ablation/mp-20-scratch-uncond-20shot_metrics.parquet' \
    --num_workers 24 \
    --validity_check 'diffcsp'

In [None]:
# Same post-processed "uncond-20shot" generations as the 20-gen metrics, but
# evaluated with --num_gens 1; saved as the "uncond-1shot" metrics file.
!python _utils/_metrics/XRD_metrics.py \
    --input_parquet '_artifacts/mp-20-pxrd/ablation/mp-20-scratch-uncond-20shot_post.parquet' \
    --num_gens 1 \
    --ref_parquet '_artifacts/mp-20-pxrd/mp-test_ref.parquet' \
    --output_parquet '_artifacts/mp-20-pxrd/ablation/mp-20-scratch-uncond-1shot_metrics.parquet' \
    --num_workers 24 \
    --validity_check 'diffcsp'

3. No scoring, XRD conditions

In [None]:
# Generate CIFs with the "20shot" ablation config (the variant without "uncond"
# in its name).
# The output location is set inside the config file (not visible here); the
# post-processing cell below reads ablation/mp-20-scratch-20shot_gen.parquet,
# presumably written by this run — TODO confirm.
!python _utils/_generating/generate_CIFs.py --config '_config_files/generation/conditional/xrd_studies/ablation/mp-xrd-scratch-20shot_eval.jsonc'

In [None]:
# Post-process the "20shot" generations (column 'Generated CIF') with 32 workers
# and write the *_post.parquet consumed by the metric cell.
!python _utils/_generating/postprocess.py \
    --input_parquet '_artifacts/mp-20-pxrd/ablation/mp-20-scratch-20shot_gen.parquet' \
    --output_parquet '_artifacts/mp-20-pxrd/ablation/mp-20-scratch-20shot_post.parquet' \
    --num_workers 32 \
    --column_name 'Generated CIF'

In [None]:
# XRD metrics for the "20shot" generations with --num_gens 20, compared against
# mp-test_ref.parquet and using the 'diffcsp' validity check.
!python _utils/_metrics/XRD_metrics.py \
    --input_parquet '_artifacts/mp-20-pxrd/ablation/mp-20-scratch-20shot_post.parquet' \
    --num_gens 20 \
    --ref_parquet '_artifacts/mp-20-pxrd/mp-test_ref.parquet' \
    --output_parquet '_artifacts/mp-20-pxrd/ablation/mp-20-scratch-20shot_metrics.parquet' \
    --num_workers 24 \
    --validity_check 'diffcsp'

In [None]:
# XRD metrics with --num_gens 1 and --sort_gens 'random', computed on the
# post-processed generations of the temperature sweep for the '075' tag.
# NOTE(review): this cell reads and writes under temp/ (…1rand-075_metrics.parquet),
# while the Results cell reads ablation/mp-20-scratch-1shot_metrics.parquet, which
# no cell shown in this notebook writes. Either this cell should target the
# ablation "20shot" files (as the uncond 1-shot cell does for its section), or the
# Results cell should read the 1rand file — confirm which is intended.
!python _utils/_metrics/XRD_metrics.py \
    --input_parquet '_artifacts/mp-20-pxrd/temp/mp-20-scratch-20perp-075_post.parquet' \
    --num_gens 1 \
    --ref_parquet '_artifacts/mp-20-pxrd/mp-test_ref.parquet' \
    --output_parquet '_artifacts/mp-20-pxrd/temp/mp-20-scratch-1rand-075_metrics.parquet' \
    --num_workers 24 \
    --validity_check 'diffcsp' \
    --sort_gens 'random'

### Results

In [None]:
# Metric files compared in the final table, grouped by the directory they live in.
temp_dir = "_artifacts/mp-20-pxrd/temp"
ablation_dir = "_artifacts/mp-20-pxrd/ablation"

# Temperature-sweep runs for the '075' tag.
metrics_1perp_parquet = f"{temp_dir}/mp-20-scratch-1perp-075_metrics.parquet"
metrics_20perp_parquet = f"{temp_dir}/mp-20-scratch-20perp-075_metrics.parquet"

# Ablation runs: "uncond" variants first, then the variants without "uncond".
# NOTE(review): no cell shown in this notebook writes mp-20-scratch-1shot_metrics.parquet
# — confirm where that file is produced.
metrics_uncond_1shot_parquet = f"{ablation_dir}/mp-20-scratch-uncond-1shot_metrics.parquet"
metrics_uncond_20shot_parquet = f"{ablation_dir}/mp-20-scratch-uncond-20shot_metrics.parquet"
metrics_cond_1shot_parquet = f"{ablation_dir}/mp-20-scratch-1shot_metrics.parquet"
metrics_cond_20shot_parquet = f"{ablation_dir}/mp-20-scratch-20shot_metrics.parquet"

# make a table with all the results
import __init__
from _utils import get_metrics_xrd
import pandas as pd
import numpy as np

# Row label -> metrics parquet for each setting shown in the final table.
paths = {
    'cond-20perp': metrics_20perp_parquet,
    'cond-1perp': metrics_1perp_parquet,
    'uncond-20shot': metrics_uncond_20shot_parquet,
    'uncond-1shot': metrics_uncond_1shot_parquet,
    'cond-20shot': metrics_cond_20shot_parquet,
    'cond-1shot': metrics_cond_1shot_parquet
}

# Read every metrics file and reduce it to its summary metrics (same
# get_metrics_xrd arguments for all settings).
results = {
    label: get_metrics_xrd(
        pd.read_parquet(parquet_path),
        n_test=1000,
        only_matched=False,
        verbose=False,
    )
    for label, parquet_path in paths.items()
}

# Final table: one row per setting, one column per metric.
final_table = pd.DataFrame.from_dict(results, orient='index')
final_table