In [2]:
import pandas as pd
import pymatgen.core as pmg

In [8]:
### Import data
df = pd.read_csv('../1_prototype_from_cif/cif_files_prototypes_list.csv', index_col=0)
print(f'{len(df)} entries before filtering')

### Parse every formula once into a pymatgen Composition.
# The temporary 'composition' column stays row-aligned through each filter
# below and is dropped again before the table is displayed.
df['composition'] = [pmg.composition.Composition(formula) for formula in df.Formula]

### Filter entries with >2 species
# len(Composition) counts the distinct species present in the formula.
binary_mask = [len(comp) < 3 for comp in df.composition]
df = df[binary_mask].copy()
print(f'{len(df)} entries after filtering out >2 species')

### Ensure stoich is 1:2
# Amounts are compared exactly as parsed (the composition is not reduced
# first), so only formulas whose sorted amounts are exactly [1, 2] pass.
stoich_mask = [sorted(comp.as_dict().values()) == [1.0, 2.0] for comp in df.composition]
df = df[stoich_mask]
print(f'{len(df)} entries after filtering out incorrect stoich ratio')

### Remove dummy species
# isinstance (rather than an exact type comparison) also catches subclasses
# of DummySpecies, i.e. any placeholder symbol that is not a real element.
real_species_mask = [
    not any(isinstance(elem, pmg.periodic_table.DummySpecies) for elem in comp.elements)
    for comp in df.composition
]
df = df[real_species_mask]
print(f'{len(df)} entries after filtering out dummy species')

# The helper column was only needed for filtering.
df = df.drop(columns=['composition'])
df

13918 entries before filtering
9041 entries after filtering >3 species
8829 entries after filtering out incorrect stoich ratio
8758 entries after filtering out incorrect stoich ratio


Unnamed: 0,Collection_Code,Formula,Prototype
1,159910,TiO2,Anatase#TiO2
3,180903,SiO2,Cristobalite#SiO2
4,246888,MnO2,Rutile#TiO2
8,108587,MgZn2,Laves(2H)#MgZn2
9,89278,SiO2,Quartz(low)#SiO2
...,...,...,...
14672,56165,Ag2F,Ag2F
14673,88619,SbO2,HgMoO4
14674,638612,HfMo2,Laves(cub)#MgCu2
14675,99714,MoO2,Rutile#TiO2


In [9]:
### Write the filtered entries to disk for the featurization step.
# index=False: the row index is not written to the CSV.
output_csv = 'filtered_entries_for_featurization.csv'
df.to_csv(output_csv, index=False)