In [4]:
from pymatgen.io import cif
import pymatgen.core as mg
import glob
from tqdm import tqdm
import pandas as pd

# read cif files and extract prototype information

In [None]:
# Inspect a single CIF file by hand to see which tags are available.
# The path is chosen here so this cell runs on a fresh kernel instead of relying
# on a `cif_path` left over from the loop in a later cell.
# Sorted so the same file is picked on every run.
cif_path = sorted(glob.glob('../0_retrieve_icsd_data/final_cifs_for_aflow/*'))[0]
cif_file = cif.CifParser(cif_path)
cf = cif_file.as_dict()
# Keep only the entry stored under the first key
# (presumably the first data block of the file -- confirm against pymatgen docs).
cif_dict = cf[list(cf.keys())[0]]

In [2]:
# Directory that is scanned for CIF files (every entry in it is processed).
cif_dir = '../0_retrieve_icsd_data/final_cifs_for_aflow'

# Parallel lists: position i in each list describes the same CIF file.
prototype_list = []
collection_code_list = []
formula_list = []


# Sorted so the row order of the resulting table is reproducible between runs;
# glob itself returns paths in arbitrary order.
for cif_path in tqdm(sorted(glob.glob(f'{cif_dir}/*'))):

    # Read the raw tags stored under the first key of the parsed file.
    # A file that cannot be parsed yields an empty tag dict, so the two tag
    # lookups below fall back to their sentinels instead of aborting the scan.
    try:
        cif_file = cif.CifParser(cif_path)
        cf = cif_file.as_dict()
        cif_dict = next(iter(cf.values()), {})
    except Exception:
        cif_dict = {}

    # Sentinel values ('ERROR' / -1) mark fields that could not be extracted;
    # they are filtered out when the error statistics are computed.
    try:
        prototype = cif_dict['_chemical_name_structure_type']
    except (KeyError, TypeError):
        prototype = 'ERROR'

    try:
        collection_code = int(cif_dict['_database_code_ICSD'])
    except (KeyError, TypeError, ValueError):
        collection_code = -1

    # Building the structure can fail in many ways, so catch broadly here --
    # but not with a bare `except`, which would also swallow KeyboardInterrupt
    # and make the long-running loop impossible to stop.
    try:
        formula = mg.Structure.from_file(cif_path).composition.reduced_formula
    except Exception:
        formula = 'ERROR'

    formula_list.append(formula)
    collection_code_list.append(collection_code)
    prototype_list.append(prototype)

_database_code_ICSD   86611
_audit_creation_date   2000-07-15
_audit_update_record   2008-08-01
_chemical_name_common
'Zirconium yttrium tin oxide (0.88/0.04/0.08/2.02)'
_chemical_formula_structural   '(Zr0.88 Y0.04 Sn0.08) O2.024'
_chemical_formula_sum   'O2.024 Sn0.08 Y0.04 Zr0.88'
_chemical_name_structure_type   Zirconia#ZrO2(HT)
_exptl_crystal_density_diffrn   6.18
_citation_title
;
Neutron diffraction study of tetragonal zirconias containing
tetravalent dopants
;
loop_
 _citation_id
 _citation_journal_full
 _citation_year
 _citation_journal_volume
 _citation_page_first
 _citation_page_last
 _citation_journal_id_ASTM
  primary  'Australian Journal of Physics'  1998  51  539  545
  AUJPAS
loop_
 _citation_author_citation_id
 _citation_author_name
  primary  'Hunter, B.A.'
  primary  'Howard, C.J.'
  primary  'Kim, D.J.'
_cell_length_a   3.6031(1)
_cell_length_b   3.6031(1)
_cell_length_c   5.2029(2)
_cell_angle_alpha   90.
_cell_angle_beta   90.
_cell_angle_gamma   90.
_cell_volume 

# write results to dataframe

In [15]:
# Assemble the three parallel lists into one table, one row per CIF file.
results = pd.DataFrame(
    {
        'Collection Code': collection_code_list,
        'Formula': formula_list,
        'Prototype': prototype_list,
    }
)
# Persist the table (including its integer index) next to the notebook.
results.to_csv('cif_files_prototypes_list.csv')

# count all entries and those extracted without errors

In [11]:
print(f'There are {len(results)} entries.')

# filter out errors: a row is kept only if none of its three fields holds
# the sentinel written by the extraction loop (-1 or 'ERROR')
has_collection_code = results['Collection Code'].ne(-1)
has_formula = results['Formula'].ne('ERROR')
has_prototype = results['Prototype'].ne('ERROR')
non_error_results = results[has_collection_code & has_formula & has_prototype]

print(f'There are {len(non_error_results)} entries with no errors.')

There are 14679 entries.
There are 13918 entries with no errors.


# analysis: most common structure prototypes

In [14]:
# Frequency of each prototype label among the error-free entries,
# most common first; display the ten most frequent.
prototype_counts = non_error_results['Prototype'].value_counts()
prototype_counts.head(10)

Laves(cub)#MgCu2     2580
Laves(2H)#MgZn2      1244
Fluorite#CaF2         868
AlB2                  746
Rutile#TiO2           671
Pyrite#FeS2(cP12)     387
CeCu2                 345
CuAl2                 317
ThSi2                 254
CdI2(hP3)             254
Name: Prototype, dtype: int64