### Visualising what the CIFs in og. crystallm should look like

In [1]:
!python bin/download.py cifs_v1_orig.pkl.gz

downloading to ./cifs_v1_orig.pkl.gz ...
100%|███████████████████████████████████████| 645M/645M [03:20<00:00, 3.22MiB/s]
done!


In [1]:
!python bin/download.py cifs_v1_dedup.pkl.gz

downloading to ./cifs_v1_dedup.pkl.gz ...
100%|███████████████████████████████████████| 418M/418M [02:04<00:00, 3.35MiB/s]
done!


In [3]:
import pickle
import gzip

# Function to load a small portion of the data from the compressed file
def load_partial_pickle(file_path, num_entries=100):
    with gzip.open(file_path, 'rb') as f:
        # Load the entire data but extract only a portion
        try:
            # Use pickle.load() to get the full object but extract a subset
            data = pickle.load(f)
            # If the data is a list or other iterable, slice it
            return data[:num_entries]  # Adjust the slice as needed for more entries
        except Exception as e:
            print(f"Error loading pickle file: {e}")
            return None

# Path to your .pkl.gz file
file_path = 'cifs_v1_dedup.pkl.gz'

# Load and inspect a small part of the data
sample_data = load_partial_pickle(file_path, num_entries=100)

In [6]:
# display the sample data
sample_data[:5]


[('NOMAD_fpjXjxoYA_8IDwuyKiDplQkHhIST',
  "# NOMAD Entry fpjXjxoYA_8IDwuyKiDplQkHhIST C2Si2 C2Si2 I4/mmm\n# generated using pymatgen\ndata_SiC\n_symmetry_space_group_name_H-M   Fm-3m\n_cell_length_a   4.04639975\n_cell_length_b   4.04639975\n_cell_length_c   4.04639975\n_cell_angle_alpha   90.00000000\n_cell_angle_beta   90.00000000\n_cell_angle_gamma   90.00000000\n_symmetry_Int_Tables_number   225\n_chemical_formula_structural   SiC\n_chemical_formula_sum   'Si4 C4'\n_cell_volume   66.25312330\n_cell_formula_units_Z   4\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\n  2  '-x, -y, -z'\n  3  '-y, x, z'\n  4  'y, -x, -z'\n  5  '-x, -y, z'\n  6  'x, y, -z'\n  7  'y, -x, z'\n  8  '-y, x, -z'\n  9  'x, -y, -z'\n  10  '-x, y, z'\n  11  '-y, -x, -z'\n  12  'y, x, z'\n  13  '-x, y, -z'\n  14  'x, -y, z'\n  15  'y, x, -z'\n  16  '-y, -x, z'\n  17  'z, x, y'\n  18  '-z, -x, -y'\n  19  'z, -y, x'\n  20  '-z, y, -x'\n  21  'z, -x, -y'\n  22  '-z, x, y'\n  23  '

### A dataset was created with pymatgen that has a cif file, with at the bottom of the CIF file the bandgap appended

#### Load the file

In [7]:
# turn custom cifs into tar.gz
!python bin/prepare_custom_more.py BG_CIFs/ BG_cifs_more.tar.gz

preparing CIF files...: 100%|█████████████████| 110/110 [00:03<00:00, 30.01it/s]
prepared CIF files have been saved to BG_cifs_more.tar.gz


#### turn it into pkl

In [8]:
# turn the tar.gz into pickle
!python bin/tar_to_pickle.py BG_cifs_more.tar.gz BG_cifs_more.pkl.gz

loading data from BG_cifs_more.tar.gz...
extracting files...: 100%|█████████████████| 110/110 [00:00<00:00, 61937.63it/s]
saving data to BG_cifs_more.pkl.gz...
conversion complete!


In [9]:
#check format still correct
import pickle
import gzip

# load the data
with gzip.open('BG_cifs_more.pkl.gz', 'rb') as f:
    BG_cifs_more = pickle.load(f)

# display the first few entries
BG_cifs_more[:1]

[('Fe(PO3)4',
  "# generated using pymatgen\ndata_Fe(PO3)4\n_symmetry_space_group_name_H-M   C2/c\n_cell_length_a   12.81796307\n_cell_length_b   7.94044656\n_cell_length_c   10.21684027\n_cell_angle_alpha   90.00000000\n_cell_angle_beta   117.82872259\n_cell_angle_gamma   90.00000000\n_symmetry_Int_Tables_number   15\n_chemical_formula_structural   Fe(PO3)4\n_chemical_formula_sum   'Fe4 P16 O48'\n_cell_volume   919.60915179\n_cell_formula_units_Z   4\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\n  2  '-x, -y, -z'\n  3  '-x, y, -z+1/2'\n  4  'x, -y, z+1/2'\n  5  'x+1/2, y+1/2, z'\n  6  '-x+1/2, -y+1/2, -z'\n  7  '-x+1/2, y+1/2, -z+1/2'\n  8  'x+1/2, -y+1/2, z+1/2'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n  Fe  Fe0  4  0.25000000  0.25000000  0.00000000  1.0\n  P  P1  8  0.00775150  0.25477350  0.01545650  1.0\n  P  P2  

#### make sure deduplicate still works

In [10]:
# deduplicate the data
!python bin/deduplicate.py BG_cifs_process_steps/BG_cifs_more.pkl.gz --out BG_cifs_process_steps/BG_cifs_more_dedup.pkl.gz

loading data from BG_cifs_process_steps/BG_cifs_more.pkl.gz...
number of CIFs to deduplicate: 110
100%|█████████████████████████████████████| 110/110 [00:00<00:00, 116892.18it/s]
number of entries to write: 110
saving data to BG_cifs_process_steps/BG_cifs_more_dedup.pkl.gz...


In [12]:
# check the format is still correct
import pickle
import gzip

# load the data
with gzip.open('BG_cifs_process_steps/BG_cifs_more_dedup.pkl.gz', 'rb') as f:
    BG_cifs_more_dedup = pickle.load(f)

# display the first few entries
BG_cifs_more_dedup[:1]

[('Fe(PO3)4',
  "# generated using pymatgen\ndata_Fe(PO3)4\n_symmetry_space_group_name_H-M   C2/c\n_cell_length_a   12.81796307\n_cell_length_b   7.94044656\n_cell_length_c   10.21684027\n_cell_angle_alpha   90.00000000\n_cell_angle_beta   117.82872259\n_cell_angle_gamma   90.00000000\n_symmetry_Int_Tables_number   15\n_chemical_formula_structural   Fe(PO3)4\n_chemical_formula_sum   'Fe4 P16 O48'\n_cell_volume   919.60915179\n_cell_formula_units_Z   4\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\n  2  '-x, -y, -z'\n  3  '-x, y, -z+1/2'\n  4  'x, -y, z+1/2'\n  5  'x+1/2, y+1/2, z'\n  6  '-x+1/2, -y+1/2, -z'\n  7  '-x+1/2, y+1/2, -z+1/2'\n  8  'x+1/2, -y+1/2, z+1/2'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n  Fe  Fe0  4  0.25000000  0.25000000  0.00000000  1.0\n  P  P1  8  0.00775150  0.25477350  0.01545650  1.0\n  P  P2  

#### Data augmentation with the preprocess script

In [13]:
#Data augmentation preprocess step
!python bin/preprocess_more.py BG_cifs_process_steps/BG_cifs_more_dedup.pkl.gz --out BG_cifs_process_steps/BG_cifs_more_prep.pkl.gz --workers 4

loading data from BG_cifs_process_steps/BG_cifs_more_dedup.pkl.gz...
100%|███████████████████████████████████████| 110/110 [00:00<00:00, 2035.06it/s]
number of CIFs: 110
saving data to BG_cifs_process_steps/BG_cifs_more_prep.pkl.gz...


In [14]:
# check the format is still correct
import pickle
import gzip

# load the data
with gzip.open('BG_cifs_process_steps/BG_cifs_more_prep.pkl.gz', 'rb') as f:
    BG_cifs_more_prep = pickle.load(f)

# display the first few entries
BG_cifs_more_prep[:1]

[('LiVF5',
  "# generated using pymatgen\ndata_Li4V4F20\nloop_\n _atom_type_symbol\n _atom_type_electronegativity\n _atom_type_radius\n _atom_type_ionic_radius\n  Li  0.9800  1.4500  0.9000\n  V  1.6300  1.3500  0.7775\n  F  3.9800  0.5000  0.7050\n_symmetry_space_group_name_H-M   P2_1/c\n_cell_length_a   5.2258\n_cell_length_b   10.0573\n_cell_length_c   6.6457\n_cell_angle_alpha   90.0000\n_cell_angle_beta   106.5155\n_cell_angle_gamma   90.0000\n_symmetry_Int_Tables_number   14\n_chemical_formula_structural   LiVF5\n_chemical_formula_sum   'Li4 V4 F20'\n_cell_volume   334.8689\n_cell_formula_units_Z   4\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n  Li  Li0  4  0.0550  0.2435  0.3843  1.0\n  V  V1  4  0.4457  0.0312  0.7439  1.0\n  F  F2  4  0.1567  0.5843  0.1951  1.0\n  F  F

#### Split to the train/val/test sets

In [15]:
# split dataset into train and test and val
!python bin/split.py BG_cifs_process_steps/BG_cifs_more_prep.pkl.gz \
--train_out BG_cifs_process_steps/BG_cifs_more_train.pkl.gz \
--val_out BG_cifs_process_steps/BG_cifs_more_val.pkl.gz \
--test_out BG_cifs_process_steps/BG_cifs_more_test.pkl.gz \
--validation_size 0.1 \
--test_size 0.05

loading data from BG_cifs_process_steps/BG_cifs_more_prep.pkl.gz...
splitting dataset...
number of CIFs in train set: 93
number of CIFs in validation set: 11
number of CIFs in test set: 6
writing train set...
writing validation set...
writing test set...


In [16]:
# check current directory
import os
os.getcwd()

# change directory to where the data is
os.chdir('/home/uccacbo/CrystaLLM')
os.getcwd()

'/home/uccacbo/CrystaLLM'

In [27]:
#  check the format is still correct
import pickle
import gzip

# load the data
with gzip.open('BG_cifs_process_steps/BG_cifs_more_train.pkl.gz', 'rb') as f:
    BG_cifs_more_test = pickle.load(f)

# display the first few entries
BG_cifs_more_test[:1]

[('CeCoGe2Ru',
  "# generated using pymatgen\ndata_Ce2Co2Ge4Ru2\nloop_\n _atom_type_symbol\n _atom_type_electronegativity\n _atom_type_radius\n _atom_type_ionic_radius\n  Ce  1.1200  1.8500  1.0800\n  Co  1.8800  1.3500  0.7683\n  Ge  2.0100  1.2500  0.7700\n  Ru  2.2000  1.3000  0.6610\n_symmetry_space_group_name_H-M   I-4m2\n_cell_length_a   4.1441\n_cell_length_b   4.1441\n_cell_length_c   10.2253\n_cell_angle_alpha   90.0000\n_cell_angle_beta   90.0000\n_cell_angle_gamma   90.0000\n_symmetry_Int_Tables_number   119\n_chemical_formula_structural   CeCoGe2Ru\n_chemical_formula_sum   'Ce2 Co2 Ge4 Ru2'\n_cell_volume   175.6043\n_cell_formula_units_Z   2\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n  Ce  Ce0  2  0.0000  0.0000  0.0000  1.0\n  Co  Co1  2  0.0000  0.5000  0.2500  1.

#### Tokenizing the data

In [24]:
# try to tokenize the data
!python bin/tokenize_cifs.py \
--train_fname BG_cifs_process_steps/BG_cifs_more_train.pkl.gz \
--val_fname BG_cifs_process_steps/BG_cifs_more_val.pkl.gz \
--out_dir BG_cifs_more_tokens/ \
--workers 4

loading data from BG_cifs_process_steps/BG_cifs_more_train.pkl.gz...
loading data from BG_cifs_process_steps/BG_cifs_more_val.pkl.gz...
preparing files...: 100%|████████████████████| 93/93 [00:00<00:00, 90756.23it/s]
preparing files...: 100%|████████████████████| 11/11 [00:00<00:00, 71977.14it/s]
tokenizing...: 100%|██████████████████████████| 93/93 [00:00<00:00, 1616.05it/s]
train min tokenized length: 264
train max tokenized length: 3,204
train mean tokenized length: 548.66 +/- 430.39
train total unk counts: 0
tokenizing...: 100%|███████████████████████████| 11/11 [00:00<00:00, 530.36it/s]
val min tokenized length: 326
val max tokenized length: 1,576
val mean tokenized length: 621.45 +/- 387.84
val total unk counts: 0
concatenating train tokens...: 100%|████████| 93/93 [00:00<00:00, 206059.31it/s]
concatenating val tokens...: 100%|██████████| 11/11 [00:00<00:00, 189865.61it/s]
encoding...
train has 51,025 tokens
val has 6,836 tokens
vocab size: 372
exporting to .bin files...
creating

In [28]:
# reload modules and import
import numpy as np
import pickle
import importlib
from crystallm import _tokenizer

# Reload the CIFTokenizer module to apply changes
importlib.reload(_tokenizer)

# Import the CIFTokenizer class
from crystallm._tokenizer import CIFTokenizer

# Load tokenized data
train_data_path = "BG_cifs_more_tokens/train.bin"
val_data_path = "BG_cifs_more_tokens/val.bin"
meta_path = "BG_cifs_more_tokens/meta.pkl"

# Load the tokenizer and tokenized data
tokenizer = CIFTokenizer()

# Load metadata (stoi, itos)
with open(meta_path, "rb") as f:
    meta = pickle.load(f)

stoi = meta['stoi']  # String to integer token mapping
itos = meta['itos']  # Integer to string token mapping

# Load tokenized train data
train_data = np.fromfile(train_data_path, dtype=np.uint16)
val_data = np.fromfile(val_data_path, dtype=np.uint16)

# Decode part of the train data to check for bandgap token
decoded_train = tokenizer.decode(train_data[:3000])  # Decode the first 500 tokens as an example
print("Decoded train data:", train_data[:1000])

# # Similarly, you can decode part of the val data
# decoded_val = tokenizer.decode(val_data[:500])
# print("Decoded val data:", decoded_val)


Decoded train data: [124  64  92   1  95  12  98 143 123 143 126 143 127 143 128 143 129 143
  64 142  89 134  96  98  89  89 142  91 134  95  89  89  89 142  90 134
  97  90  89  89 143   1 142  91 134  94  94  89  89 142  89 134  96  89
  89  89 142  89 134  92  89  89  89 143  12 142  92 134  89  93  89  89
 142  89 134  95  94  89  89 142  89 134  95  92  89  89 143 107 142 198
 143 102 142  93 134  97  98  94  97 143  99 142  93 134  97  98  94  97
 143 116 142  91  92 134  93  97  93  97 143 121 142  98  89 134  89  89
  89  89 143 103 142  98  89 134  89  89  89  89 143 105 142  90  91  89
 134  89  89  89  89 143 108 142  90  94  91 143 109 142  64   1  91  12
  92 143 113 142 140  64  92 142   1  95 142  12  98 140 143 119 142  93
  97  96 134  93  98  98  95 143 122 142  92 143 123 143 118 143 104 143
  90 142 140 131 141 142 132 141 142 133 140 143 123 143 115 143 114 143
 112 143 106 143 111 143 120 143 100 143  64 142  64  89 142  92 142  89
 134  89  89  89  89 142  89 13

In [29]:
# Assuming `CIFTokenizer` class has been defined as you provided

# Initialize tokenizer
tokenizer = CIFTokenizer()

# Load your first 1000 tokens of CIF data as a string
# Replace 'first_1000_tokens_data' with the actual string representing the first 1000 tokens
first_1000_tokens_data = """
124  64  92   1  95  12  98 143 123 143 126 143 127 143 128 143 129 143
  64 142  89 134  96  98  89  89 142  91 134  95  89  89  89 142  90 134
  97  90  89  89 143   1 142  91 134  94  94  89  89 142  89 134  96  89
  89  89 142  89 134  92  89  89  89 143  12 142  92 134  89  93  89  89
 142  89 134  95  94  89  89 142  89 134  95  92  89  89 143 107 142 198
 143 102 142  93 134  97  98  94  97 143  99 142  93 134  97  98  94  97
 143 116 142  91  92 134  93  97  93  97 143 121 142  98  89 134  89  89
  89  89 143 103 142  98  89 134  89  89  89  89 143 105 142  90  91  89
 134  89  89  89  89 143 108 142  90  94  91 143 109 142  64   1  91  12
  92 143 113 142 140  64  92 142   1  95 142  12  98 140 143 119 142  93
  97  96 134  93  98  98  95 143 122 142  92 143 123 143 118 143 104 143
  90 142 140 131 141 142 132 141 142 133 140 143 123 143 115 143 114 143
 112 143 106 143 111 143 120 143 100 143  64 142  64  89 142  92 142  89
 134  89  89  89  89 142  89 134  92  89  89  94 142  89 134  95  95  95
  96 142  90 134  89 143   1 142   1  90 142  95 142  89 134  89  91  89
  89 142  89 134  92  89  90  98 142  89 134  90  90  96  89 142  90 134
  89 143  12 142  12  91 142  95 142  89 134  89  92  97  92 142  89 134
  92  97  91  91 142  89 134  89  95  97  96 142  90 134  89 143  12 142
  12  92 142  92 142  89 134  89  89  89  89 142  89 134  90  97  89  98
 142  89 134  90  95  95  96 142  90 134  89 143 125 142  93 134  89  96
 143 143 124  63  91  96  22  98  32  90  97   7  96  91 143 123 143 126
 143 127 143 128 143 129 143  63 142  89 134  98  97  89  89 142  90 134
  93  94  89  89 142  89 134  98  89  89  89 143  22 142  90 134  95  95
  89  89 142  90 134  93  89  89  89 142  89 134  98  93  89  89 143  32
 142  91 134  90  98  89  89 142  90 134  89  89  89  89 142  89 134  94
  94  89  89 143   7 142  92 134  93  93  89  89 142  89 134  95  89  89
  89 142  90 134  91  95  89  89 143 107 142 334 143 102 142  97 134  97
  97  93  96 143  99 142  97 134  97  97  93  96 143 116 142  91  89 134
  89  92  95  96 143 121 142  98  89 134  89  89  89  89 143 103 142  98
  89 134  89  89  89  89 143 105 142  90  91  89 134  89  89  89  89 143
 108 142  90  93  95 143 109 142  63  92  22 135  32   7  93 136  91 143
 113 142 140  63  91  96 142  22  98 142  32  90  97 142   7  96  91 140
 143 119 142  90  92  95  98 134  96  94  90  92 143 122 142  98 143 123
 143 118 143 104 143  90 142 140 131 141 142 132 141 142 133 140 143 123
 143 115 143 114 143 112 143 106 143 111 143 120 143 100 143  63 142  63
  89 142  98 142  89 134  89  93  96  90 142  89 134  95  90  94  93 142
  89 134  96  94  96  97 142  90 134  89 143  63 142  63  90 142  98 142
  89 134  89  96  89  98 142  89 134  92  92  97  97 142  89 134  94  97
  91  93 142  90 134  89 143  63 142  63  91 142  92 142  89 134  89  89
  89  89 142  89 134  89  89  89  89 142  89 134  89  89  98  94 142  90
 134  89 143  63 142  63  92 142  92 142  89 134  89  89  89  89 142  89
 134  89  89  89  89 142  89 134  90  94  91  92 142  90 134  89 143  63
 142  63  93 142  92 142  89 134  89  89  89  89 142  89 134  89  89  89
  89 142  89 134  93  98  96  95 142  90 134  89 143  22 142  22  94 142
  92 142  89 134  89  89  89  89 142  89 134  89  89  89  89 142  89 134
  92  94  95  95 142  90 134  89 143  22 142  22  95 142  92 142  89 134
  89  89  89  89 142  89 134  89  89  89  89 142  89 134  95  93  90  97
 142  90 134  89 143  22 142  22  96 142  92 142  89 134  89  89  89  89
 142  89 134  89  89  89  89 142  89 134  97  94  96  91 142  90 134  89
 143  32 142  32  97 142  98 142  89 134  89  92  94  89 142  89 134  95
  95  97  92 142  89 134  98  90  93  98 142  90 134  89 143  32 142  32
  98 142  98 142  89 134  89  92  97  95 142  89 134  92  95  97  98 142
  89 134  93  90  94  89 142  90 134  89 143   7 142   7  90  89 142  98
 142  89 134  89  89  89  96 142  89 134  97  91  93  90 142  89 134  94
  96  97  92 142  90 134  89 143   7 142   7  90  90 142  98 142  89 134
  89  89  93  95 142  89 134  96  98  91
"""

# Tokenize the CIF data
tokens = tokenizer.tokenize_cif(first_1000_tokens_data)

# Print the tokens to check for double newlines
print("Tokens:", tokens)

# Check for consecutive newlines in the tokens
for i in range(1, len(tokens)):
    if tokens[i] == '\n' and tokens[i - 1] == '\n':
        print(f"Found consecutive newlines at token index: {i-1} and {i}")
        break
else:
    print("No consecutive newlines found in the first 1000 tokens.")


Tokens: ['\n', '1', '2', '4', ' ', '6', '4', ' ', '9', '2', ' ', '1', ' ', '9', '5', ' ', '1', '2', ' ', '9', '8', ' ', '1', '4', '3', ' ', '1', '2', '3', ' ', '1', '4', '3', ' ', '1', '2', '6', ' ', '1', '4', '3', ' ', '1', '2', '7', ' ', '1', '4', '3', ' ', '1', '2', '8', ' ', '1', '4', '3', ' ', '1', '2', '9', ' ', '1', '4', '3', '\n', ' ', '6', '4', ' ', '1', '4', '2', ' ', '8', '9', ' ', '1', '3', '4', ' ', '9', '6', ' ', '9', '8', ' ', '8', '9', ' ', '8', '9', ' ', '1', '4', '2', ' ', '9', '1', ' ', '1', '3', '4', ' ', '9', '5', ' ', '8', '9', ' ', '8', '9', ' ', '8', '9', ' ', '1', '4', '2', ' ', '9', '0', ' ', '1', '3', '4', '\n', ' ', '9', '7', ' ', '9', '0', ' ', '8', '9', ' ', '8', '9', ' ', '1', '4', '3', ' ', '1', ' ', '1', '4', '2', ' ', '9', '1', ' ', '1', '3', '4', ' ', '9', '4', ' ', '9', '4', ' ', '8', '9', ' ', '8', '9', ' ', '1', '4', '2', ' ', '8', '9', ' ', '1', '3', '4', ' ', '9', '6', ' ', '8', '9', '\n', ' ', '8', '9', ' ', '8', '9', ' ', '1', '4', '2', ' ', '8

#### downloaded from the original package where I should be at to check all formats are still good

In [71]:
!python bin/download.py cifs_v1_test.tar.gz

downloading to ./cifs_v1_test.tar.gz ...
100%|█████████████████████████████████████| 1.64M/1.64M [00:00<00:00, 5.42MiB/s]
done!


In [72]:
!python bin/tar_to_pickle.py cifs_v1_test.tar.gz cifs_v1_test.pkl.gz

loading data from cifs_v1_test.tar.gz...
extracting files...: 100%|████████████| 10286/10286 [00:00<00:00, 115485.19it/s]
saving data to cifs_v1_test.pkl.gz...
conversion complete!


In [73]:
!python bin/tokenize_cifs.py \
--train_fname cifs_v1_test.pkl.gz \
--out_dir tokens_v1_train_val/ \
--workers 4

loading data from cifs_v1_test.pkl.gz...
preparing files...: 100%|█████████████| 10286/10286 [00:00<00:00, 143487.29it/s]
tokenizing...: 100%|████████████████████| 10286/10286 [00:02<00:00, 4516.71it/s]
train min tokenized length: 191
train max tokenized length: 4,797
train mean tokenized length: 343.38 +/- 122.25
train total unk counts: 0
concatenating train tokens...: 100%|██| 10286/10286 [00:00<00:00, 513767.65it/s]
encoding...
train has 3,531,957 tokens
vocab size: 372
exporting to .bin files...
creating tar.gz archive...
tarball created at tokens_v1_train_val/tokens_v1_train_val.tar.gz


In [74]:
# reload modules and import
import numpy as np
import pickle
import importlib
from crystallm import _tokenizer

# Reload the CIFTokenizer module to apply changes
importlib.reload(_tokenizer)

# Import the CIFTokenizer class
from crystallm._tokenizer import CIFTokenizer

# Load tokenized data
train_data_path = "tokens_v1_train_val/train.bin"
meta_path = "tokens_v1_train_val/meta.pkl"

# Load the tokenizer and tokenized data
tokenizer = CIFTokenizer()

# Load metadata (stoi, itos)
with open(meta_path, "rb") as f:
    meta = pickle.load(f)

stoi = meta['stoi']  # String to integer token mapping
itos = meta['itos']  # Integer to string token mapping

# Load tokenized train data
train_data = np.fromfile(train_data_path, dtype=np.uint16)
val_data = np.fromfile(val_data_path, dtype=np.uint16)

# Decode part of the train data to check for bandgap token
decoded_train = tokenizer.decode(train_data[:600])  # Decode the first 500 tokens as an example
print("Decoded train data:", decoded_train)

# # Similarly, you can decode part of the val data
# decoded_val = tokenizer.decode(val_data[:500])
# print("Decoded val data:", decoded_val)


Decoded train data: data_Co4B2Os2
loop_
_atom_type_symbol
_atom_type_electronegativity
_atom_type_radius
_atom_type_ionic_radius
Co 1.8800 1.3500 0.7683
B 2.0400 0.8500 0.4100
Os 2.2000 1.3000 0.6730
_symmetry_space_group_name_H-M I-4m2
_cell_length_a 3.6393
_cell_length_b 3.6393
_cell_length_c 6.2877
_cell_angle_alpha 90.0000
_cell_angle_beta 90.0000
_cell_angle_gamma 90.0000
_symmetry_Int_Tables_number 119
_chemical_formula_structural Co2BOs
_chemical_formula_sum 'Co4 B2 Os2'
_cell_volume 83.2785
_cell_formula_units_Z 2
loop_
_symmetry_equiv_pos_site_id
_symmetry_equiv_pos_as_xyz
1 'x, y, z'
loop_
_atom_site_type_symbol
_atom_site_label
_atom_site_symmetry_multiplicity
_atom_site_fract_x
_atom_site_fract_y
_atom_site_fract_z
_atom_site_occupancy
Co Co0 2 0.0000 0.0000 0.0000 1
Co Co1 2 0.0000 0.5000 0.7500 1
B B2 2 0.0000 0.5000 0.2500 1
Os Os3 2 0.0000 0.0000 0.5000 1

data_Rb16Ni8O32
loop_
_atom_type_symbol
_atom_type_electronegativity
_atom_type_radius
_atom_type_ionic_radius
Rb 0

#### make sure the check start indices script still works

In [1]:
# Identifying start indices
!python bin/identify_starts.py \
--dataset_fname BG_cifs_more_tokens/BG_cifs_more_all.tar.gz \
--out_fname BG_cifs_more_tokens/starts_v1_train.pkl

identifying starts...: 100%|██████████| 52582/52582 [00:00<00:00, 660951.26it/s]
writing start indices...


In [3]:
# view the start indices
import pickle

# Load the start indices with pickle not gzip
with open("BG_cifs_more_tokens/starts_v1_train.pkl", "rb") as f:
    starts = pickle.load(f)

# Display the first few start indices
starts[:5]


[0, 457, 785, 1144, 2240]

#### data tokenized - all the adapted scriptsd are 'script'_more.py