In [2]:
# check current directory
import os
os.getcwd()

# change directory to where the data is
os.chdir('/home/uccacbo/CrystaLLM')
os.getcwd()

'/home/uccacbo/CrystaLLM'

### Visualising what the CIFs in og. crystallm should look like

In [1]:
!python bin/download.py cifs_v1_orig.pkl.gz

downloading to ./cifs_v1_orig.pkl.gz ...
100%|███████████████████████████████████████| 645M/645M [03:20<00:00, 3.22MiB/s]
done!


In [3]:
!python bin/download.py cifs_v1_dedup.pkl.gz

downloading to ./cifs_v1_dedup.pkl.gz ...
100%|███████████████████████████████████████| 418M/418M [00:56<00:00, 7.40MiB/s]
done!


In [4]:
import pickle
import gzip

# Function to load a small portion of the data from the compressed file
def load_partial_pickle(file_path, num_entries=100):
    with gzip.open(file_path, 'rb') as f:
        # Load the entire data but extract only a portion
        try:
            # Use pickle.load() to get the full object but extract a subset
            data = pickle.load(f)
            # If the data is a list or other iterable, slice it
            return data[:num_entries]  # Adjust the slice as needed for more entries
        except Exception as e:
            print(f"Error loading pickle file: {e}")
            return None

# Path to your .pkl.gz file
file_path = 'cifs_v1_dedup.pkl.gz'

# Load and inspect a small part of the data
sample_data = load_partial_pickle(file_path, num_entries=100)

In [6]:
# display the sample data
sample_data[:5]


[('NOMAD_fpjXjxoYA_8IDwuyKiDplQkHhIST',
  "# NOMAD Entry fpjXjxoYA_8IDwuyKiDplQkHhIST C2Si2 C2Si2 I4/mmm\n# generated using pymatgen\ndata_SiC\n_symmetry_space_group_name_H-M   Fm-3m\n_cell_length_a   4.04639975\n_cell_length_b   4.04639975\n_cell_length_c   4.04639975\n_cell_angle_alpha   90.00000000\n_cell_angle_beta   90.00000000\n_cell_angle_gamma   90.00000000\n_symmetry_Int_Tables_number   225\n_chemical_formula_structural   SiC\n_chemical_formula_sum   'Si4 C4'\n_cell_volume   66.25312330\n_cell_formula_units_Z   4\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\n  2  '-x, -y, -z'\n  3  '-y, x, z'\n  4  'y, -x, -z'\n  5  '-x, -y, z'\n  6  'x, y, -z'\n  7  'y, -x, z'\n  8  '-y, x, -z'\n  9  'x, -y, -z'\n  10  '-x, y, z'\n  11  '-y, -x, -z'\n  12  'y, x, z'\n  13  '-x, y, -z'\n  14  'x, -y, z'\n  15  'y, x, -z'\n  16  '-y, -x, z'\n  17  'z, x, y'\n  18  '-z, -x, -y'\n  19  'z, -y, x'\n  20  '-z, y, -x'\n  21  'z, -x, -y'\n  22  '-z, x, y'\n  23  '

### A dataset was created with pymatgen that has a cif file, with at the bottom of the CIF file the bandgap appended

#### Load the file

In [7]:
# turn custom cifs into tar.gz
!python bin/prepare_custom_more.py BG_CIFs/ BG_cifs_more.tar.gz

preparing CIF files...: 100%|█████████████████| 110/110 [00:03<00:00, 30.01it/s]
prepared CIF files have been saved to BG_cifs_more.tar.gz


#### turn it into pkl

In [4]:
# turn the tar.gz into pickle
!python bin/tar_to_pickle.py BG_datasets/BG_CIF_dataset_large.tar.gz BG_cifs_process_steps/BG_data_large.pkl.gz

loading data from BG_datasets/BG_CIF_dataset_large.tar.gz...
extracting files...: 100%|███████████| 126334/126334 [00:04<00:00, 30304.52it/s]
saving data to BG_cifs_process_steps/BG_data_large.pkl.gz...
conversion complete!


In [5]:
#check format still correct
import pickle
import gzip

# load the data
with gzip.open('BG_cifs_process_steps/BG_data_large.pkl.gz', 'rb') as f:
    BG_cifs_more = pickle.load(f)

# display the first few entries
BG_cifs_more[:1]

[('RbLaTa2O7_mp-541600',
  "# generated using pymatgen\ndata_RbLaTa2O7\n_symmetry_space_group_name_H-M   P4/mmm\n_cell_length_a   3.91039400\n_cell_length_b   3.91039400\n_cell_length_c   11.31445800\n_cell_angle_alpha   90.00000000\n_cell_angle_beta   90.00000000\n_cell_angle_gamma   90.00000000\n_symmetry_Int_Tables_number   123\n_chemical_formula_structural   RbLaTa2O7\n_chemical_formula_sum   'Rb1 La1 Ta2 O7'\n_cell_volume   173.01142786\n_cell_formula_units_Z   1\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\n  2  '-x, -y, -z'\n  3  '-y, x, z'\n  4  'y, -x, -z'\n  5  '-x, -y, z'\n  6  'x, y, -z'\n  7  'y, -x, z'\n  8  '-y, x, -z'\n  9  'x, -y, -z'\n  10  '-x, y, z'\n  11  '-y, -x, -z'\n  12  'y, x, z'\n  13  '-x, y, -z'\n  14  'x, -y, z'\n  15  'y, x, -z'\n  16  '-y, -x, z'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n 

#### make sure deduplicate still works

In [6]:
# deduplicate the data
!python bin/deduplicate.py BG_cifs_process_steps/BG_data_large.pkl.gz --out BG_cifs_process_steps/BG_large_dedup.pkl.gz

loading data from BG_cifs_process_steps/BG_data_large.pkl.gz...
number of CIFs to deduplicate: 126,334
100%|███████████████████████████████| 126334/126334 [00:00<00:00, 176292.31it/s]
number of entries to write: 114,751
saving data to BG_cifs_process_steps/BG_large_dedup.pkl.gz...


In [12]:
# check the format is still correct
import pickle
import gzip

# load the data
with gzip.open('BG_cifs_process_steps/BG_cifs_more_dedup.pkl.gz', 'rb') as f:
    BG_cifs_more_dedup = pickle.load(f)

# display the first few entries
BG_cifs_more_dedup[:1]

[('Fe(PO3)4',
  "# generated using pymatgen\ndata_Fe(PO3)4\n_symmetry_space_group_name_H-M   C2/c\n_cell_length_a   12.81796307\n_cell_length_b   7.94044656\n_cell_length_c   10.21684027\n_cell_angle_alpha   90.00000000\n_cell_angle_beta   117.82872259\n_cell_angle_gamma   90.00000000\n_symmetry_Int_Tables_number   15\n_chemical_formula_structural   Fe(PO3)4\n_chemical_formula_sum   'Fe4 P16 O48'\n_cell_volume   919.60915179\n_cell_formula_units_Z   4\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\n  2  '-x, -y, -z'\n  3  '-x, y, -z+1/2'\n  4  'x, -y, z+1/2'\n  5  'x+1/2, y+1/2, z'\n  6  '-x+1/2, -y+1/2, -z'\n  7  '-x+1/2, y+1/2, -z+1/2'\n  8  'x+1/2, -y+1/2, z+1/2'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n  Fe  Fe0  4  0.25000000  0.25000000  0.00000000  1.0\n  P  P1  8  0.00775150  0.25477350  0.01545650  1.0\n  P  P2  

#### Data augmentation with the preprocess script

In [7]:
#Data augmentation preprocess step
!python bin/preprocess_more.py BG_cifs_process_steps/BG_large_dedup.pkl.gz --out BG_cifs_process_steps/BG_large_prep.pkl.gz --workers 4

loading data from BG_cifs_process_steps/BG_large_dedup.pkl.gz...
100%|█████████████████████████████████| 114751/114751 [01:01<00:00, 1870.30it/s]
number of CIFs: 114744
saving data to BG_cifs_process_steps/BG_large_prep.pkl.gz...


In [9]:
# check the format is still correct
import pickle
import gzip

# load the data
with gzip.open('BG_cifs_process_steps/BG_large_prep.pkl.gz', 'rb') as f:
    BG_cifs_more_prep = pickle.load(f)

# display the first few entries
BG_cifs_more_prep[:1]

[('Te2Mo3W(SeS2)2_mp-1030462',
  "# generated using pymatgen\ndata_Te2Mo3W1Se2S4\nloop_\n _atom_type_symbol\n _atom_type_electronegativity\n _atom_type_radius\n _atom_type_ionic_radius\n  Te  2.1000  1.4000  1.2933\n  Mo  2.1600  1.4500  0.7750\n  W  2.3600  1.3500  0.7667\n  Se  2.5500  1.1500  1.0133\n  S  2.5800  1.0000  0.8800\n_symmetry_space_group_name_H-M   P3m1\n_cell_length_a   3.3105\n_cell_length_b   3.3105\n_cell_length_c   37.2684\n_cell_angle_alpha   90.0000\n_cell_angle_beta   90.0000\n_cell_angle_gamma   120.0000\n_symmetry_Int_Tables_number   156\n_chemical_formula_structural   Te2Mo3W(SeS2)2\n_chemical_formula_sum   'Te2 Mo3 W1 Se2 S4'\n_cell_volume   353.7148\n_cell_formula_units_Z   1\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n  Te  Te0  1  0.6667  0.3333  0

#### Split to the train/val/test sets

In [10]:
# split dataset into train and test and val
!python bin/split.py BG_cifs_process_steps/BG_large_prep.pkl.gz \
--train_out BG_cifs_process_steps/BG_large_train.pkl.gz \
--val_out BG_cifs_process_steps/BG_large_val.pkl.gz \
--test_out BG_cifs_process_steps/BG_large_test.pkl.gz \
--validation_size 0.1 \
--test_size 0.05

loading data from BG_cifs_process_steps/BG_large_prep.pkl.gz...
splitting dataset...
number of CIFs in train set: 98,105
number of CIFs in validation set: 10,901
number of CIFs in test set: 5,738
writing train set...
writing validation set...
writing test set...


In [16]:
# check current directory
import os
os.getcwd()

# change directory to where the data is
os.chdir('/home/uccacbo/CrystaLLM')
os.getcwd()

'/home/uccacbo/CrystaLLM'

In [11]:
#  check the format is still correct
import pickle
import gzip

# load the data
with gzip.open('BG_cifs_process_steps/BG_large_train.pkl.gz', 'rb') as f:
    BG_cifs_more_test = pickle.load(f)

# display the first few entries
BG_cifs_more_test[:1]

[('KFeCo(PO4)2_mp-1194524',
  "# generated using pymatgen\ndata_K4Fe4Co4P8O32\nloop_\n _atom_type_symbol\n _atom_type_electronegativity\n _atom_type_radius\n _atom_type_ionic_radius\n  K  0.8200  2.2000  1.5200\n  Fe  1.8300  1.4000  0.8525\n  Co  1.8800  1.3500  0.7683\n  P  2.1900  1.0000  0.5500\n  O  3.4400  0.6000  1.2600\n_symmetry_space_group_name_H-M   P2_1/c\n_cell_length_a   5.2211\n_cell_length_b   14.6317\n_cell_length_c   9.3914\n_cell_angle_alpha   90.0000\n_cell_angle_beta   104.7150\n_cell_angle_gamma   90.0000\n_symmetry_Int_Tables_number   14\n_chemical_formula_structural   KFeCo(PO4)2\n_chemical_formula_sum   'K4 Fe4 Co4 P8 O32'\n_cell_volume   693.9139\n_cell_formula_units_Z   4\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n  K  K0  4  0.1790  0.1749  0.3158  1

#### Tokenizing the data

In [12]:
# try to tokenize the data
!python bin/tokenize_cifs.py \
--train_fname BG_cifs_process_steps/BG_large_train.pkl.gz \
--val_fname BG_cifs_process_steps/BG_large_val.pkl.gz \
--out_dir BG_large_tokens/ \
--workers 4

loading data from BG_cifs_process_steps/BG_large_train.pkl.gz...
loading data from BG_cifs_process_steps/BG_large_val.pkl.gz...
preparing files...: 100%|██████████████| 98105/98105 [00:01<00:00, 80192.77it/s]
preparing files...: 100%|██████████████| 10901/10901 [00:00<00:00, 79395.89it/s]
tokenizing...: 100%|████████████████████| 98105/98105 [00:31<00:00, 3138.52it/s]
train min tokenized length: 197
train max tokenized length: 6,930
train mean tokenized length: 578.89 +/- 462.86
train total unk counts: 0
tokenizing...: 100%|████████████████████| 10901/10901 [00:10<00:00, 1057.63it/s]
val min tokenized length: 200
val max tokenized length: 6,451
val mean tokenized length: 580.75 +/- 488.00
val total unk counts: 0
concatenating train tokens...: 100%|██| 98105/98105 [00:00<00:00, 120095.20it/s]
concatenating val tokens...: 100%|████| 10901/10901 [00:00<00:00, 508113.75it/s]
encoding...
train has 56,791,747 tokens
val has 6,330,790 tokens
vocab size: 372
exporting to .bin files...
creating

In [15]:
# reload modules and import
import numpy as np
import pickle
import importlib
from crystallm import _tokenizer

# Reload the CIFTokenizer module to apply changes
importlib.reload(_tokenizer)

# Import the CIFTokenizer class
from crystallm._tokenizer import CIFTokenizer

# Load tokenized data
train_data_path = "BG_large_tokens/train.bin"
val_data_path = "BG_large_tokens/val.bin"
meta_path = "BG_large_tokens/meta.pkl"

# Load the tokenizer and tokenized data
tokenizer = CIFTokenizer()

# Load metadata (stoi, itos)
with open(meta_path, "rb") as f:
    meta = pickle.load(f)

stoi = meta['stoi']  # String to integer token mapping
itos = meta['itos']  # Integer to string token mapping

# Load tokenized train data
train_data = np.fromfile(train_data_path, dtype=np.uint16)
val_data = np.fromfile(val_data_path, dtype=np.uint16)

# Decode part of the train data to check for bandgap token
decoded_train = tokenizer.decode(train_data[:3000])  # Decode the first 500 tokens as an example
print("Decoded train data:", decoded_train[:2000])

# # Similarly, you can decode part of the val data
# decoded_val = tokenizer.decode(val_data[:500])
# print("Decoded val data:", decoded_val)


Decoded train data: data_Mg8V8Bi8O40
loop_
_atom_type_symbol
_atom_type_electronegativity
_atom_type_radius
_atom_type_ionic_radius
Mg 1.3100 1.5000 0.8600
V 1.6300 1.3500 0.7775
Bi 2.0200 1.6000 1.0350
O 3.4400 0.6000 1.2600
_symmetry_space_group_name_H-M Pbca
_cell_length_a 5.2794
_cell_length_b 15.6039
_cell_length_c 10.7107
_cell_angle_alpha 90.0000
_cell_angle_beta 90.0000
_cell_angle_gamma 90.0000
_symmetry_Int_Tables_number 61
_chemical_formula_structural MgVBiO5
_chemical_formula_sum 'Mg8 V8 Bi8 O40'
_cell_volume 882.3340
_cell_formula_units_Z 8
loop_
_symmetry_equiv_pos_site_id
_symmetry_equiv_pos_as_xyz
1 'x, y, z'
loop_
_atom_site_type_symbol
_atom_site_label
_atom_site_symmetry_multiplicity
_atom_site_fract_x
_atom_site_fract_y
_atom_site_fract_z
_atom_site_occupancy
Mg Mg0 8 0.0062 0.5464 0.8627 1.0
V V1 8 0.0146 0.0919 0.1011 1.0
Bi Bi2 8 0.0221 0.7259 0.6464 1.0
O O3 8 0.0732 0.1930 0.1719 1.0
O O4 8 0.1379 0.5202 0.3022 1.0
O O5 8 0.1882 0.6176 0.5250 1.0
O O6 8 0.1969 

#### downloaded from the original package where I should be at to check all formats are still good

In [71]:
!python bin/download.py cifs_v1_test.tar.gz

downloading to ./cifs_v1_test.tar.gz ...
100%|█████████████████████████████████████| 1.64M/1.64M [00:00<00:00, 5.42MiB/s]
done!


In [72]:
!python bin/tar_to_pickle.py cifs_v1_test.tar.gz cifs_v1_test.pkl.gz

loading data from cifs_v1_test.tar.gz...
extracting files...: 100%|████████████| 10286/10286 [00:00<00:00, 115485.19it/s]
saving data to cifs_v1_test.pkl.gz...
conversion complete!


In [73]:
!python bin/tokenize_cifs.py \
--train_fname cifs_v1_test.pkl.gz \
--out_dir tokens_v1_train_val/ \
--workers 4

loading data from cifs_v1_test.pkl.gz...
preparing files...: 100%|█████████████| 10286/10286 [00:00<00:00, 143487.29it/s]
tokenizing...: 100%|████████████████████| 10286/10286 [00:02<00:00, 4516.71it/s]
train min tokenized length: 191
train max tokenized length: 4,797
train mean tokenized length: 343.38 +/- 122.25
train total unk counts: 0
concatenating train tokens...: 100%|██| 10286/10286 [00:00<00:00, 513767.65it/s]
encoding...
train has 3,531,957 tokens
vocab size: 372
exporting to .bin files...
creating tar.gz archive...
tarball created at tokens_v1_train_val/tokens_v1_train_val.tar.gz


In [74]:
# reload modules and import
import numpy as np
import pickle
import importlib
from crystallm import _tokenizer

# Reload the CIFTokenizer module to apply changes
importlib.reload(_tokenizer)

# Import the CIFTokenizer class
from crystallm._tokenizer import CIFTokenizer

# Load tokenized data
train_data_path = "tokens_v1_train_val/train.bin"
meta_path = "tokens_v1_train_val/meta.pkl"

# Load the tokenizer and tokenized data
tokenizer = CIFTokenizer()

# Load metadata (stoi, itos)
with open(meta_path, "rb") as f:
    meta = pickle.load(f)

stoi = meta['stoi']  # String to integer token mapping
itos = meta['itos']  # Integer to string token mapping

# Load tokenized train data
train_data = np.fromfile(train_data_path, dtype=np.uint16)
val_data = np.fromfile(val_data_path, dtype=np.uint16)

# Decode part of the train data to check for bandgap token
decoded_train = tokenizer.decode(train_data[:600])  # Decode the first 500 tokens as an example
print("Decoded train data:", decoded_train)

# # Similarly, you can decode part of the val data
# decoded_val = tokenizer.decode(val_data[:500])
# print("Decoded val data:", decoded_val)


Decoded train data: data_Co4B2Os2
loop_
_atom_type_symbol
_atom_type_electronegativity
_atom_type_radius
_atom_type_ionic_radius
Co 1.8800 1.3500 0.7683
B 2.0400 0.8500 0.4100
Os 2.2000 1.3000 0.6730
_symmetry_space_group_name_H-M I-4m2
_cell_length_a 3.6393
_cell_length_b 3.6393
_cell_length_c 6.2877
_cell_angle_alpha 90.0000
_cell_angle_beta 90.0000
_cell_angle_gamma 90.0000
_symmetry_Int_Tables_number 119
_chemical_formula_structural Co2BOs
_chemical_formula_sum 'Co4 B2 Os2'
_cell_volume 83.2785
_cell_formula_units_Z 2
loop_
_symmetry_equiv_pos_site_id
_symmetry_equiv_pos_as_xyz
1 'x, y, z'
loop_
_atom_site_type_symbol
_atom_site_label
_atom_site_symmetry_multiplicity
_atom_site_fract_x
_atom_site_fract_y
_atom_site_fract_z
_atom_site_occupancy
Co Co0 2 0.0000 0.0000 0.0000 1
Co Co1 2 0.0000 0.5000 0.7500 1
B B2 2 0.0000 0.5000 0.2500 1
Os Os3 2 0.0000 0.0000 0.5000 1

data_Rb16Ni8O32
loop_
_atom_type_symbol
_atom_type_electronegativity
_atom_type_radius
_atom_type_ionic_radius
Rb 0

#### make sure the check start indices script still works

In [3]:
# Identifying start indices
!python bin/identify_starts.py \
--dataset_fname compressed_datasets_models/BG_large_tokens.tar.gz \
--out_fname CIF_BG_proj/BG_large_tokens/starts.pkl

identifying starts...: 100%|████| 56791747/56791747 [01:29<00:00, 633529.56it/s]
writing start indices...


In [4]:
# view the start indices
import pickle

# Load the start indices with pickle not gzip
with open("CIF_BG_proj/BG_large_tokens/starts.pkl", "rb") as f:
    starts = pickle.load(f)

# Display the first few start indices
starts[:5]


[0, 519, 849, 1245, 1509]

#### data tokenized - all the adapted scriptsd are 'script'_more.py