#### Create .cif files from .json files > build a CIF dataset from JSON files

In [16]:
import yaml
import os
import json

import pandas as pd
import numpy as np
import tensorflow as tf

from pathlib import Path
from pymatgen.io import cif
from pymatgen.core import Structure



In [17]:
import os

# os.listdir returns entries in arbitrary, filesystem-dependent order; sort
# them so everything derived from this listing has a reproducible row order.
files = sorted(os.listdir("../data/dichalcogenides_private/structures"))

In [19]:
# Build one id per structure file from its name without the ".json" suffix.
# Path(...).stem drops only the final extension, so ids that contain dots stay
# intact; non-JSON entries (e.g. hidden checkpoint folders) are skipped
# instead of producing empty or bogus ids.
file_ids = [Path(name).stem for name in files if name.endswith('.json')]
# Placeholder targets: every structure is given the label 0.
label_col = [0] * len(file_ids)
df = pd.DataFrame(data={'_id': file_ids, 'targets': label_col})
df.head()

Unnamed: 0,_id,targets
0,6141cf0efbfd4bd9ab2c2f7e,0
1,6141cf0fe689ecc4c43cdd4b,0
2,6141cf10b842c2e72e2f2d44,0
3,6141cf10b842c2e72e2f2d46,0
4,6141cf1302d926221cabc549,0


In [4]:
# Persist the id/target table; index=False leaves the positional index out of
# the CSV, so '_id' is the first column of the file.
df.to_csv('../data/dichalcogenides_private/targets.csv', index=False)

In [5]:
def read_pymatgen_dict(file):
    """Load a JSON file and rebuild the structure it describes.

    Args:
        file: Path of a JSON file whose content is a dict understood by
            ``Structure.from_dict``.

    Returns:
        The object produced by ``Structure.from_dict`` for the parsed JSON.
    """
    with open(file, "r") as handle:
        payload = json.load(handle)
    structure = Structure.from_dict(payload)
    return structure

In [6]:
def prepare_dataset(dataset_path):
    """Load every structure JSON under ``<dataset_path>/structures`` into a DataFrame.

    Args:
        dataset_path: Dataset root directory (str or path-like) containing a
            ``structures`` sub-directory with one JSON file per structure.

    Returns:
        A DataFrame with a single ``structures`` column holding the objects
        returned by ``read_pymatgen_dict``, indexed by the file name without
        its ``.json`` suffix. Rows are ordered by file name.

    Raises:
        FileNotFoundError: If the ``structures`` directory does not exist.
    """
    structures_dir = Path(dataset_path) / "structures"

    # Path.stem removes exactly the final suffix. str.strip(".json") would
    # instead strip any of the characters '.', 'j', 's', 'o', 'n' from both
    # ends of the name and corrupt ids that start or end with those letters.
    # Entries that are not *.json files are skipped rather than fed to the
    # JSON parser; sorting makes the row order reproducible.
    struct = {
        item.stem: read_pymatgen_dict(item)
        for item in sorted(structures_dir.iterdir())
        if item.suffix == ".json"
    }

    data = pd.DataFrame(columns=["structures"], index=list(struct))
    data = data.assign(structures=list(struct.values()))

    return data

In [7]:
# Load all structures of the dataset rooted at this path; point the argument
# at another dataset root to convert a different dataset.
sample_df = prepare_dataset('../data/dichalcogenides_private')  ## changeable param

In [8]:
# Preview the first rows of the loaded structures table.
sample_df.head()

Unnamed: 0,structures
6141cf0efbfd4bd9ab2c2f7e,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...
6141cf0fe689ecc4c43cdd4b,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...
6141cf10b842c2e72e2f2d44,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...
6141cf10b842c2e72e2f2d46,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...
6141cf1302d926221cabc549,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...


In [9]:
# Show the index values, i.e. the ids derived from the structure file names.
sample_df.index.values

array(['6141cf0efbfd4bd9ab2c2f7e', '6141cf0fe689ecc4c43cdd4b',
       '6141cf10b842c2e72e2f2d44', ..., '6149c48031cf3ef3d4a9f84a',
       '6149f3853ac25c70a5c6ce01', '615083823ac25c70a5c6ce03'],
      dtype=object)

In [10]:
def save_cif(struct, sid, cif_root='../data/dichalcogenides_private/cifs'):
    """Write one structure to ``<cif_root>/<sid>.cif``.

    Args:
        struct: Structure object passed to ``cif.CifWriter``.
        sid: Identifier used as the output file name (without extension).
        cif_root: Output directory (changeable param). It is created,
            including missing parents, if it does not exist yet.

    Returns:
        None. The only effect is the file written to disk.
    """
    out_dir = Path(cif_root)
    # Make sure the target directory exists before the writer opens the file.
    out_dir.mkdir(parents=True, exist_ok=True)
    cif_obj = cif.CifWriter(struct)
    cif_obj.write_file(str(out_dir / f"{sid}.cif"))


# otypes is given explicitly so numpy does not make an extra trial call on the
# first element to infer the output type (which would write the first file
# twice), and so that empty inputs are accepted. cif_root is excluded from
# vectorisation so it reaches save_cif as a plain value.
vsave_cif = np.vectorize(save_cif, otypes=[object], excluded={2, 'cif_root'})

In [11]:
# Write one CIF file per row, pairing each structure with its index value as
# the file id. save_cif has no return statement, so the displayed result is
# an array of None.
vsave_cif(sample_df['structures'].values, sample_df.index.values)

array([None, None, None, ..., None, None, None], dtype=object)