# Imports and definitions

In [39]:
import pandas
# Show the installed pandas version, so the tables below can be tied to it.
pandas.__version__

'1.5.3'

In [1]:
from XRDXRFutils.notebook_utils import *
from os.path import basename


# Root of the shared folder, given relative to the working directory.
# The commented-out line is the absolute alternative.
#path_shared = '/home/shared/'
path_shared = '../../shared/'

# Folder that read_phases() searches for '*.cif' files
# ("da sistemare" is Italian for "to be tidied up").
path_database = path_shared + 'Database16_42_da_sistemare/'   # tabulated phases


def read_phases(path_database, verbose = False):
    """
    Build one Phase for every '*.cif' file found directly inside `path_database`.

    Each file is scanned line by line. When the first whitespace-separated token
    of a line is one of the tags listed in `tags`, the remainder of that line is
    stored in the phase under that tag, with quote characters removed. Empty
    values are skipped; if a tag occurs more than once, the last non-empty
    value wins.

    NOTE: only the remainder of the tag's own line is read, so a value written
    on the following line(s) is not picked up.

    Parameters
    ----------
    path_database : str
        Folder to search; the pattern '/*.cif' is appended to it.
    verbose : bool, default False
        If True, print the number of files found and of phases built.

    Returns
    -------
    list
        The phases, in sorted order of file path. The list is empty when no
        file matches, in which case the warning 'No files found' is issued.
    """
    tags = ('_chemical_formula_sum', '_chemical_name_mineral', '_chemical_name_common')

    filenames = sorted(glob(path_database + '/*.cif'))
    if not filenames:
        warnings.warn('No files found')

    phases = []
    for filename in filenames:
        phase = Phase(name = filename)
        phase['filename'] = basename(filename)
        with open(filename, 'r') as f:
            for line in f:
                tokens = line.split()
                if not tokens or tokens[0] not in tags:
                    continue
                # CIF values may be delimited by single OR double quotes. Both are
                # removed, so that "Calcite" and 'Calcite' yield the same name.
                value = ' '.join(tokens[1:]).replace("'", '').replace('"', '').strip()
                if value != '':
                    phase[tokens[0]] = value
        phases.append(phase)

    if verbose:
        print('Filenames:', len(filenames))
        print('Phases:', len(phases))
    return phases

# Check phases

In [2]:
# Load every phase; verbose mode prints how many files and phases were read.
phases = read_phases(path_database, verbose = True)

name_keys = ('_chemical_name_common', '_chemical_name_mineral')
formula_key = '_chemical_formula_sum'

# A file is flagged when it carries no name at all, or when one of its names
# is identical to its chemical formula.
# (Other criteria tried earlier and left out: missing chemical formula, missing
# mineral name, common name being a character permutation of the formula.)
filenames_to_fix = []
for phase in phases:
    present_keys = phase.keys()
    names = [phase[key] for key in name_keys if key in present_keys]
    if names:
        needs_fix = (formula_key in present_keys) and any(
            name == phase[formula_key] for name in names
        )
    else:
        needs_fix = True
    if needs_fix:
        filenames_to_fix.append(phase['filename'])

print(len(filenames_to_fix))
for filename_to_fix in filenames_to_fix:
    print(filename_to_fix)

Filenames: 1337
Phases: 1337
4
Entry_96-231-0420.cif
Entry_96-500-0326.cif
Entry_96-705-2298.cif
Entry_96-901-5197.cif


# View phase database

In [36]:
# Re-read the database so that this cell can be run without the check cell.
phases = read_phases(path_database)

# One row per file: file name, the phase label, and the three CIF fields
# (None where the field is absent from the phase).
col_names_old = ['_chemical_name_mineral', '_chemical_name_common', '_chemical_formula_sum']
col_names_new = ['name mineral', 'name common', 'chemical formula']
columns = {
    'filename': [p['filename'] for p in phases],
    'label': [p.label for p in phases],
}
for col_name_old, col_name_new in zip(col_names_old, col_names_new):
    columns[col_name_new] = [p[col_name_old] if (col_name_old in p.keys()) else None for p in phases]
df_phases = DataFrame(columns)
print('Null labels:', df_phases['label'].isna().sum())
display(df_phases)

# One row per label: the distinct chemical formulas seen under that label,
# how many they are, and how many files carry the label.
grouped_by_label = df_phases.groupby('label')
df_phases_by_label = (
    grouped_by_label[['chemical formula']]
        # dict.fromkeys de-duplicates in first-seen order; list(set(...)) would
        # give an ordering that changes from one kernel session to the next.
        .aggregate(lambda x: list(dict.fromkeys(x)))
)
df_phases_by_label['n. formulas'] = [len(x) for x in df_phases_by_label['chemical formula']]
# Assigned by index alignment on 'label', hence done before reset_index().
df_phases_by_label['n. samples'] = grouped_by_label['filename'].count()
df_phases_by_label = df_phases_by_label.reset_index()
display(df_phases_by_label)

Null labels: 0


Unnamed: 0,filename,label,name mineral,name common,chemical formula
0,AegirineFine_2_range_16-42_0.cif,V AegirineFine,V AegirineFine,AegirineFine,Si Fe
1,AegirineFine_3_range_16-42_0.cif,V AegirineFine,V AegirineFine,AegirineFine,Si Fe
2,AegirineFine_4_range_16-42_0.cif,V AegirineFine,V AegirineFine,AegirineFine,Si Fe
3,AegirineFine_range_16-42_0.cif,V AegirineFine,V AegirineFine,AegirineFine,Si Fe
4,AlizarineCrimsonDark_2_range_16-42_0.cif,V Anthraquinone,V Anthraquinone,AlizarineCrimsonDark,C H O
...,...,...,...,...,...
1332,Yellow_Ochre_iron_oxide_range_16-42_0.cif,V Ochre,V Ochre,Yellow Ochre iron oxide,Fe2 O3
1333,Yellow_lemon_reseda_range_16-42_0.cif,Yellow_lemon_reseda,Yellow_lemon_reseda,Yellow_lemon_reseda,Ba Cr O4
1334,ZincSulfide_range_16-42_0.cif,V Zinc Sulfide,V Zinc Sulfide,Zinc Sulfide,Zn S
1335,ZincWhite_range_16-42_0.cif,V Zinc White,V Zinc White,Zinc White,Zn O


Unnamed: 0,label,chemical formula,n. formulas,n. samples
0,((H2 N)2 (Fe (C O)3)2),[C6 H4 Fe2 N2 O6],1,1
1,(N H4)4 (Fe (C N)6) (H2 O)1.5,[C6 H19 Fe N10 O1.5],1,1
2,(Pb O) (Pb (Cr O4)),[Cr O5 Pb2],1,1
3,4-Ferrocenylmethylene-2-phenyl-4H-oxazol-5-one,[C20 H15 Fe N O2],1,1
4,Akaganeite,"[Cl0.42 Fe2 H2.32 O4, Cl0.675 Fe4 O8]",2,2
...,...,...,...,...
383,magnesium sulfate,[Mg O4 S],1,4
384,shattuckite,[Cu5 H2 O14 Si4],1,1
385,zinc_oxide,[O Zn],1,5
386,"zincian paratacamite, syn","[Cl2 Cu3.67 H6 O6 Zn0.33, Cl2 Cu3.58 H6 O6 Zn0...",2,2


### View phases with same chemical formula and multiple names

In [37]:
# For each chemical formula, count the distinct labels attached to it and keep
# only the formulas that have more than one.
n_labels_by_formula = df_phases.groupby('chemical formula')[['label']].aggregate(lambda x: len(set(x)))
has_multiple_labels = n_labels_by_formula['label'] > 1
df_stats = n_labels_by_formula[has_multiple_labels].reset_index()
print(f'Length: {df_stats.shape[0]}')

# Show a window of 10 rows starting at idx_start.
idx_start = 30
df_stats.iloc[idx_start : (idx_start + 10)]

Length: 62


Unnamed: 0,chemical formula,label
30,Cl2 Cu,2
31,Co Li O4 P,3
32,Co3 O8 P2,2
33,Cr O4 Pb,2
34,Cr O4 Sr,3
35,Cr O5 Pb2,3
36,Cr2 O3,3
37,Cu,4
38,Cu O4 S,3
39,Cu5 H2 O14 Si4,2


In [38]:
# Row of df_stats to inspect. df_stats must be the per-formula table here;
# the "same name, multiple formulas" cell further down rebinds the same name.
idx_chemical_formula = 31

chemical_formula_sel = df_stats.loc[idx_chemical_formula, 'chemical formula']
print(f'Selected chemical formula:\n{chemical_formula_sel}')

# All files with the selected formula: first counted per label, then listed in full.
df_phases_sel = df_phases[df_phases['chemical formula'] == chemical_formula_sel]
display(df_phases_sel.groupby('label')[['filename']].count().reset_index())
display(df_phases_sel)

Selected chemical formula:
Co Li O4 P


Unnamed: 0,label,filename
0,Cobalt lithium phosphate,1
1,Lithium cobalt phosphate,1
2,lithium cobalt phosphate,3


Unnamed: 0,filename,label,name mineral,name common,chemical formula
327,Entry_96-230-0247.cif,Lithium cobalt phosphate,,Lithium cobalt phosphate,Co Li O4 P
473,Entry_96-722-6351.cif,lithium cobalt phosphate,,lithium cobalt phosphate,Co Li O4 P
474,Entry_96-722-6352.cif,lithium cobalt phosphate,,lithium cobalt phosphate,Co Li O4 P
475,Entry_96-722-6353.cif,lithium cobalt phosphate,,lithium cobalt phosphate,Co Li O4 P
479,Entry_96-810-3502.cif,Cobalt lithium phosphate,,Cobalt lithium phosphate,Co Li O4 P


In [32]:
# Show the raw string of one entry; as the cell's last expression it is rendered
# with repr, which exposes non-printing characters such as '\xad'.
df_phases.loc[1123, 'name common']

'Hexa\xadaqua\xadcopper(II) diperchlorate dihydrate'

### View phases with same name and multiple chemical formulas

In [15]:
# For each label, count the distinct chemical formulas found under it and keep
# only the labels that have more than one.
n_formulas_by_label = df_phases.groupby('label')[['chemical formula']].aggregate(lambda x: len(set(x)))
has_multiple_formulas = n_formulas_by_label['chemical formula'] > 1
df_stats = n_formulas_by_label[has_multiple_formulas].reset_index()
print(f'Length: {df_stats.shape[0]}')

# Show a window of 10 rows starting at idx_start.
idx_start = 0
df_stats.iloc[idx_start : (idx_start + 10)]

Length: 30


Unnamed: 0,label,chemical formula
0,Akaganeite,2
1,Aluminoceladonite,2
2,Anglesite,2
3,Arsenic(III) trioxide,2
4,Bixbyite,4
5,Cadmium carbonate,2
6,Calcite,2
7,Celadonite,4
8,Celestine,4
9,Cerussite,3


In [9]:
# Label to inspect.
label_sel = 'Wuestite'

# All files with the selected label: first counted per chemical formula, then listed in full.
df_phases_sel = df_phases[df_phases['label'] == label_sel]
display(df_phases_sel.groupby('chemical formula')[['filename']].count().reset_index())
display(df_phases_sel)

Unnamed: 0,chemical formula,filename
0,Fe O,5
1,Fe0.911 O,1
2,Fe0.918 O,1
3,Fe0.929 O,1
4,Fe0.932 O,1
5,Fe0.944 O,2


Unnamed: 0,filename,label,name mineral,name common,chemical formula
137,Entry_96-101-1164.cif,Wuestite,Wuestite,Iron oxide (0.91/1),Fe0.911 O
138,Entry_96-101-1165.cif,Wuestite,Wuestite,Iron oxide (0.92/1),Fe0.918 O
139,Entry_96-101-1166.cif,Wuestite,Wuestite,Iron oxide (0.93/1),Fe0.929 O
140,Entry_96-101-1167.cif,Wuestite,Wuestite,Iron oxide (0.93/1),Fe0.932 O
141,Entry_96-101-1168.cif,Wuestite,Wuestite,Iron oxide (0.94/1),Fe0.944 O
142,Entry_96-101-1169.cif,Wuestite,Wuestite,Iron oxide (0.94/1),Fe0.944 O
143,Entry_96-101-1170.cif,Wuestite,Wuestite,Iron oxide,Fe O
150,Entry_96-101-1199.cif,Wuestite,Wuestite,Iron oxide,Fe O
793,Entry_96-900-8637.cif,Wuestite,Wuestite,,Fe O
832,Entry_96-900-9767.cif,Wuestite,Wuestite,,Fe O
