# Imports and definitions

In [1]:
from XRDXRFutils.notebook_utils import *
from os.path import basename


# Root folder of the shared data (relative path).
# Alternative absolute location: path_shared = '/home/shared/'
path_shared = '../../shared/'

# Folder containing the CIF files of the tabulated phases.
path_database = f'{path_shared}Database16_42_da_sistemare/'


def read_phases(path_database, verbose = False):
    """
    Read the chemical name and formula tags of every CIF file in a folder.

    The files matching `path_database + '/*.cif'` are processed in sorted
    order. For each file a `Phase` is created with `name` set to the file path,
    and the base name of the file is stored under the key 'filename'. Every
    line whose first whitespace-separated token is one of
    '_chemical_formula_sum', '_chemical_name_mineral', '_chemical_name_common'
    is stored under that key: the value is the rest of the line, joined with
    single spaces, with all single quotes removed and surrounding whitespace
    stripped. Empty values are not stored; if a tag appears on several lines,
    the last non-empty value is kept.

    Parameters
    ----------
    path_database : str
        Folder that contains the '.cif' files.
    verbose : bool, optional
        If True, print the number of files found and of phases read.

    Returns
    -------
    list
        One `Phase` per file; empty if no file is found (a warning is issued).
    """
    wanted_tags = ('_chemical_formula_sum', '_chemical_name_mineral', '_chemical_name_common')

    filenames = sorted(glob(path_database + '/*.cif'))
    if not filenames:
        warnings.warn('No files found')

    phases = []
    for path in filenames:
        phase = Phase(name = path)
        phase['filename'] = basename(path)
        with open(path, 'r') as cif_file:
            for line in cif_file:
                tokens = line.split()
                # Skip blank lines and lines that do not start with a wanted tag.
                if not tokens or tokens[0] not in wanted_tags:
                    continue
                # Everything after the tag is the value; all single quotes are dropped.
                value = ' '.join(tokens[1:]).replace("'", '').strip()
                if value:
                    phase[tokens[0]] = value
        phases.append(phase)

    if verbose:
        print('Filenames:', len(filenames))
        print('Phases:', len(phases))
    return phases

# Check phases: list files whose name is missing or equal to the chemical formula

In [2]:
phases = read_phases(path_database, verbose = True)

# Collect the files whose phase name needs fixing: one of the two names
# (common or mineral) exists and is identical to the chemical formula,
# or neither name exists.
# Alternative criteria used at other times: '_chemical_formula_sum' missing,
# '_chemical_name_mineral' missing, or the common name being a permutation of
# the characters of the chemical formula.
filenames_to_fix = []
for phase in phases:
    tags = phase.keys()
    # Values of the name tags that are present in this phase.
    names = [phase[tag] for tag in ('_chemical_name_common', '_chemical_name_mineral') if tag in tags]
    name_is_formula = ('_chemical_formula_sum' in tags) and any(
        name == phase['_chemical_formula_sum'] for name in names
    )
    if name_is_formula or not names:
        filenames_to_fix.append(phase['filename'])

print(len(filenames_to_fix))
for filename in filenames_to_fix:
    print(filename)

Filenames: 1337
Phases: 1337
4
Entry_96-231-0420.cif
Entry_96-500-0326.cif
Entry_96-705-2298.cif
Entry_96-901-5197.cif


# View phase database

In [157]:
phases = read_phases(path_database)

# One row per CIF file. A tag that is missing from a file is stored as None.
df_phases = DataFrame()
df_phases['filename'] = [p['filename'] for p in phases]
df_phases['label'] = [p.label for p in phases]
col_names_old = ['_chemical_name_mineral', '_chemical_name_common', '_chemical_formula_sum']
col_names_new = ['name mineral', 'name common', 'chemical formula']
for col_name_old, col_name_new in zip(col_names_old, col_names_new):
    df_phases[col_name_new] = [p[col_name_old] if (col_name_old in p.keys()) else None for p in phases]
print('Null labels:', df_phases['label'].isna().sum())
display(df_phases)

# One row per label, with the list of distinct chemical formulas found under it.
# dict.fromkeys keeps the unique values in order of first appearance, so the
# lists are the same at every run; list(set(...)) would give an order that
# depends on string hashing and can change between kernel sessions.
df_phases_by_label = (
    df_phases
        .groupby('label')[['chemical formula']]
        .aggregate(lambda x: list(dict.fromkeys(x)))
)
df_phases_by_label['n. formulas'] = [len(x) for x in df_phases_by_label['chemical formula']]
# The counts are indexed by label, so they align with the rows of the table.
df_phases_by_label['n. samples'] = df_phases.groupby('label')['filename'].count()
df_phases_by_label = df_phases_by_label.reset_index()
display(df_phases_by_label)

Null labels: 0


Unnamed: 0,filename,label,name mineral,name common,chemical formula
0,AegirineFine_2_range_16-42_0.cif,V AegirineFine,V AegirineFine,AegirineFine,Si Fe
1,AegirineFine_3_range_16-42_0.cif,V AegirineFine,V AegirineFine,AegirineFine,Si Fe
2,AegirineFine_4_range_16-42_0.cif,V AegirineFine,V AegirineFine,AegirineFine,Si Fe
3,AegirineFine_range_16-42_0.cif,V AegirineFine,V AegirineFine,AegirineFine,Si Fe
4,AlizarineCrimsonDark_2_range_16-42_0.cif,V Anthraquinone,V Anthraquinone,AlizarineCrimsonDark,C H O
...,...,...,...,...,...
1332,Yellow_Ochre_iron_oxide_range_16-42_0.cif,V Ochre,V Ochre,Yellow Ochre iron oxide,Fe2 O3
1333,Yellow_lemon_reseda_range_16-42_0.cif,Yellow_lemon_reseda,Yellow_lemon_reseda,Yellow_lemon_reseda,Ba Cr O4
1334,ZincSulfide_range_16-42_0.cif,V Zinc Sulfide,V Zinc Sulfide,Zinc Sulfide,Zn S
1335,ZincWhite_range_16-42_0.cif,V Zinc White,V Zinc White,Zinc White,Zn O


Unnamed: 0,label,chemical formula,n. formulas,n. samples
0,((H2 N)2 (Fe (C O)3)2),[C6 H4 Fe2 N2 O6],1,1
1,(N H4)4 (Fe (C N)6) (H2 O)1.5,[C6 H19 Fe N10 O1.5],1,1
2,4-Ferrocenylmethylene-2-phenyl-4H-oxazol-5-one,[C20 H15 Fe N O2],1,1
3,Akaganeite,"[Cl0.42 Fe2 H2.32 O4, Cl0.675 Fe4 O8]",2,2
4,Alum-K,[Al H24 K O20 S2],1,1
...,...,...,...,...
359,"bis(mu!4$-N,N-Diethylcarbamato-O,O,O,O)-tetrak...",[C60 H120 Fe6 N12 O24],1,1
360,eta- Al-oxid_fit2gnn,[Al2 O3],1,1
361,lithuim chloro berylloarsenate sodalite,[As3 Be3 Cl Li4 O12],1,1
362,lithuim chloro beryllophosphate sodalite,[Be3 Cl Li4 O12 P3],1,1


### View phases with same chemical formula and multiple names

In [139]:
# For each chemical formula, count the distinct labels it appears under and
# keep only the formulas associated with more than one label.
df_stats = (
    df_phases
        .groupby('chemical formula')[['label']]
        .aggregate(lambda labels: len(set(labels)))
        .loc[lambda counts: counts['label'] > 1]
        .reset_index()
)
print(f'Length: {len(df_stats)}')

# Show a window of 10 rows starting at idx_start.
idx_start = 50
df_stats.iloc[idx_start : idx_start + 10]

Length: 54


Unnamed: 0,chemical formula,label
50,O2 Ti,3
51,O4 Pb S,2
52,S Zn,7
53,S2 Sn,2


In [140]:
# Row of df_stats (see the table above) whose chemical formula is inspected.
idx_chemical_formula = 53

chemical_formula_sel = df_stats.loc[idx_chemical_formula, 'chemical formula']
print(f'Selected chemical formula:\n{chemical_formula_sel}')

# Phases with the selected formula: number of files per label, then all the rows.
has_selected_formula = df_phases['chemical formula'] == chemical_formula_sel
df_same_formula = df_phases[has_selected_formula]
display(df_same_formula.groupby('label')[['filename']].count().reset_index())
display(df_same_formula)

Selected chemical formula:
S2 Sn


Unnamed: 0,label,filename
0,Berndtite,1
1,Berndtite-2T,4


Unnamed: 0,filename,label,name mineral,name common,chemical formula
164,Entry_96-101-1331.cif,Berndtite,Berndtite,Tin(IV) sulfide,S2 Sn
504,Entry_96-900-0614.cif,Berndtite-2T,Berndtite-2T,,S2 Sn
505,Entry_96-900-0615.cif,Berndtite-2T,Berndtite-2T,,S2 Sn
506,Entry_96-900-0616.cif,Berndtite-2T,Berndtite-2T,,S2 Sn
805,Entry_96-900-9121.cif,Berndtite-2T,Berndtite-2T,,S2 Sn


### View phases with same name and multiple chemical formulas

In [146]:
# For each label, count the distinct chemical formulas it appears with and
# keep only the labels associated with more than one formula.
df_stats = (
    df_phases
        .groupby('label')[['chemical formula']]
        .aggregate(lambda formulas: len(set(formulas)))
        .loc[lambda counts: counts['chemical formula'] > 1]
        .reset_index()
)
print(f'Length: {len(df_stats)}')

# Show a window of 10 rows starting at idx_start.
idx_start = 0
df_stats.iloc[idx_start : idx_start + 10]

Length: 28


Unnamed: 0,label,chemical formula
0,Akaganeite,2
1,Aluminoceladonite,2
2,Arsenic(III) trioxide,2
3,Bixbyite,4
4,Calcite,2
5,Celadonite,4
6,Celestine,4
7,Cerussite,3
8,Clinoatacamite,3
9,Conichalcite,3


In [149]:
# Row of df_stats (see the table above) whose label is inspected.
idx_label = 6

label_sel = df_stats.loc[idx_label, 'label']
print(f'Selected label:\n{label_sel}')

# Phases with the selected label: number of files per formula, then all the rows.
has_selected_label = df_phases['label'] == label_sel
df_same_label = df_phases[has_selected_label]
display(df_same_label.groupby('chemical formula')[['filename']].count().reset_index())
display(df_same_label)

Selected label:
Celestine


Unnamed: 0,chemical formula,filename
0,Ba0.001 Fe0.001 O4 S Sr0.966,1
1,Ba0.13 O4 S Sr0.87,1
2,O4 S Sr,6
3,S Sr,1


Unnamed: 0,filename,label,name mineral,name common,chemical formula
445,Entry_96-500-0043.cif,Celestine,Celestine,Strontium sulfate,O4 S Sr
457,Entry_96-600-0387.cif,Celestine,Celestine,Strontium Sulfate,O4 S Sr
507,Entry_96-900-0652.cif,Celestine,Celestine,,O4 S Sr
578,Entry_96-900-4092.cif,Celestine,Celestine,,O4 S Sr
590,Entry_96-900-4484.cif,Celestine,Celestine,,O4 S Sr
762,Entry_96-900-8107.cif,Celestine,Celestine,,O4 S Sr
809,Entry_96-900-9507.cif,Celestine,Celestine,,Ba0.13 O4 S Sr0.87
1099,Entry_96-901-6566.cif,Celestine,Celestine,,S Sr
1100,Entry_96-901-6594.cif,Celestine,Celestine,,Ba0.001 Fe0.001 O4 S Sr0.966
