In [1]:
# importando pacotes
from pymatgen.core.composition import *
import numpy as np
import pandas as pd
import ase.db
import json
import re

In [2]:
# Load the CSV file holding the atomic properties into a dataframe and display it
caminho_tabela_atomica = '../datasets/project2/Schleder2019_AtomicTable.csv'
df_atoms = pd.read_csv(caminho_tabela_atomica)
df_atoms

Unnamed: 0,Element,Z,Electronegativity,IonizationPotential,ElectronAffinity,HOMO,LUMO,r_s_orbital,r_p_orbital,r_d_orbital,r_atomic_nonbonded,r_valence_lastorbital,r_covalent,Valence,PeriodicColumn,PeriodicColumn_upto18,NumberUnfilledOrbitals,Polarizability
0,H,1,2.20,-12.6833,-1.5273,-6.4925,0.7250,0.3865,,,0.37,0.3865,0.31,1.0,1.0,1.0,1.0,4.507107
1,He,2,,-26.7499,3.0204,-15.7610,1.5714,0.2964,1.0292,0.4176,0.32,0.2964,0.28,2.0,8.0,18.0,0.0,1.383746
2,Li,3,0.98,-5.3606,-0.5863,-2.8744,-0.9074,1.6578,1.8874,2.0869,1.34,1.6578,1.28,1.0,1.0,1.0,1.0,164.000000
3,Be,4,1.57,-9.5007,0.7972,-5.6097,-2.0104,1.0805,1.2128,1.9594,0.90,1.0805,0.96,2.0,2.0,2.0,0.0,37.710000
4,B,5,2.04,-8.1261,0.0312,-3.6067,2.4547,0.8025,0.8348,1.3619,0.82,0.8348,0.84,3.0,3.0,13.0,5.0,20.530000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,Fl,114,,,,,,,,,,,,,,,,30.590000
114,Mc,115,,,,,,,,,,,,,,,,
115,Lv,116,,,,,,,,,,,,,,,,
116,Ts,117,,,,,,,,,,,,,,,,


In [3]:
## Turn the dataframe into a dictionary whose keys are the element symbols:
## {symbol: {property name: value}}
# Index by 'Element' only when that has not happened yet. Once the column has
# become the index it no longer exists as a column, so an unconditional
# set_index would raise KeyError the second time this cell is executed.
if df_atoms.index.name != 'Element':
    df_atoms = df_atoms.set_index('Element')
dicio = df_atoms.to_dict('index')

In [4]:
# Column names of the atomic table that are used as per-element properties;
# one block of statistical features is later generated for each entry.
prop = [
    'Z', 'Electronegativity', 'IonizationPotential', 'ElectronAffinity',
    'HOMO', 'LUMO',
    'r_s_orbital', 'r_p_orbital', 'r_d_orbital',
    'r_atomic_nonbonded', 'r_valence_lastorbital', 'r_covalent',
    'Valence', 'PeriodicColumn', 'PeriodicColumn_upto18',
    'NumberUnfilledOrbitals', 'Polarizability',
]

In [5]:
# Connect to the materials database
data = ase.db.connect('../datasets/project2/c2db-2021-06-24.db')


# Select the non-magnetic materials. No band-gap filter is applied here, so
# entries with a zero gap are still part of the selection.
rows = data.select(is_magnetic=False)

## Stoichiometry label of every binary material that made it into the dataset ##
stch = []

## One dictionary (descriptive fields + statistical features) per material,
## later loaded into a dataframe ##
lista_completa = []

## (formula, error) for every database row that could not be featurised ##
ignorados = []


for row in rows:

    try:
        comp = Composition(row.formula).as_dict()
        elem = list(comp.items())

        # A fresh dictionary per material: chemical formula, space group,
        # crystal type, band gap and stoichiometry label come first, the
        # statistical features are added below.
        media_interm = {
            'Material': row.formula,
            'Space group': row.spacegroup,
            'Crystal Type': row.crystal_type,
            'Band gap': row.gap,
            'stoichiometry': row.stoichiometry,
        }

        # The atom counts (weights) are the same for every property
        pesos = [quantidade for _, quantidade in elem]

        for i in prop:
            # Value of the property for each element of the compound. The list
            # is rebuilt for every property, so a failure half-way through one
            # material can never leak values into the next material.
            lista = [dicio[simbolo][i] for simbolo, _ in elem]

            ## Plain mean ##
            media_interm[f'media_{i}'] = np.mean(lista)

            ## Mean weighted by the number of atoms of each element ##
            avg = np.average(lista, weights=pesos)
            media_interm[f'media_pon_{i}'] = avg

            ## Maximum and minimum ##
            media_interm[f'max_{i}'] = max(lista)
            media_interm[f'min_{i}'] = min(lista)

            ## Standard deviation around the plain mean ##
            media_interm[f'desvio_{i}'] = np.std(lista)

            ## Root-mean-square deviation around the weighted mean ##
            sum_prop = sum((valor - avg)**2 for valor in lista)
            media_interm[f'desvio_pon_{i}'] = np.sqrt(sum_prop/len(lista))

        # Record the stoichiometry once per binary material
        if len(elem) == 2:
            stch.append(row.stoichiometry)

        lista_completa.append(media_interm)
    except Exception as erro:
        # Best effort: a row with a missing key or an element absent from the
        # atomic table is skipped, but it is recorded instead of being
        # silently discarded.
        ignorados.append((getattr(row, 'formula', None), repr(erro)))

print(set(stch))

print(len(lista_completa))
if ignorados:
    print(f'{len(ignorados)} materiais ignorados (detalhes em `ignorados`)')
df = pd.DataFrame(lista_completa)
df.sample(20, random_state=100)

{'AB2', 'A2B5', 'AB5', 'AB', 'AB4', 'AB12', 'AB3', 'A2B3', 'A3B4'}
3227


Unnamed: 0,Material,Space group,Crystal Type,Band gap,stoichiometry,media_Z,media_pon_Z,max_Z,min_Z,desvio_Z,...,max_NumberUnfilledOrbitals,min_NumberUnfilledOrbitals,desvio_NumberUnfilledOrbitals,desvio_pon_NumberUnfilledOrbitals,media_Polarizability,media_pon_Polarizability,max_Polarizability,min_Polarizability,desvio_Polarizability,desvio_pon_Polarizability
533,HgH2S2,P1,AB2C2-1-a,2.352384,AB2C2,32.333333,22.8,80,1,34.257197,...,2.0,0.0,0.816497,0.840635,19.382369,16.404843,34.27,4.507107,12.150653,12.510157
2104,TlI2,P-3m1,AB2-164-bd,0.0,AB2,67.0,62.333333,81,53,14.0,...,5.0,1.0,2.0,2.108185,43.1,40.266667,51.6,34.6,8.5,8.959787
2452,Hg3B2O6,P1,A2B3C6-1-a,3.521617,A2B3C6,31.0,27.090909,80,5,34.669872,...,5.0,0.0,2.054805,2.081666,20.013333,15.937273,34.27,5.24,11.857078,12.538124
480,TaSe2,P-6m2,AB2-187-bi,0.0,AB2,53.5,47.0,73,34,19.5,...,7.0,2.0,2.5,2.635231,57.12,46.826667,88.0,26.24,30.88,32.550378
3223,Hf2Zr2Te8,P1,ABC4-1-a,0.165737,ABC4,54.666667,53.333333,72,40,13.199327,...,8.0,2.0,2.828427,3.464102,89.0,63.0,121.0,37.0,37.094474,45.299007
828,Bi2Cu2S4,Pmc2_1,ABC2-26-ab,0.13558,ABC2,42.666667,36.0,83,16,29.009577,...,3.0,1.0,0.816497,0.816497,40.936667,35.545,53.44,19.37,15.314464,16.235852
1354,AgSr2Br2O2,Amm2,AB2C2D2-38-bce,0.0,AB2C2D2,32.0,29.857143,47,8,14.54304,...,2.0,0.0,0.707107,0.707107,69.185,71.568571,197.2,5.24,75.829629,75.867082
1689,Na2B2H8O8,P-1,ABC4D4-2-i,4.681448,ABC4D4,6.25,5.2,11,1,3.699662,...,5.0,1.0,1.63936,1.7,48.244277,22.221843,162.7,4.507107,66.389949,71.307731
29,Au2CaF12,P-4m2,AB2C12-115-dgl,1.566427,AB12C2,36.0,19.066667,79,9,30.73543,...,1.0,0.0,0.471405,0.541603,66.856667,18.491333,160.77,3.7,67.711277,83.210712
105,V3C2H2S2,Pm,A2B2C2D3-6-ac,0.0,A2B2C2D3,11.5,12.777778,23,1,8.558621,...,7.0,1.0,2.291288,2.324056,29.034277,34.808246,81.0,4.507107,30.460391,31.002808


In [6]:
# Export the full feature dataset to CSV without the dataframe index.
# `index` is a boolean flag, so False is passed explicitly rather than
# relying on None being falsy.
df.to_csv('../datasets/project2/dataset_full.csv', index=False)


Criando o dataset de produção

In [7]:
# Search space of the production dataset: one candidate row is generated for
# every (stoichiometry template, metal, halogen, prototype) combination.
STCH=['A2B3']
PROT=['P-3m1','P-6m2','Pmmn','P1',]

TM=['Sc','Ti',]
HL=['F','Cl',]


def montar_formula(modelo, metal, halogenio):
    """Fill a binary stoichiometry template with real element symbols.

    Parameters
    ----------
    modelo : str
        Template made of two upper-case placeholder letters, each optionally
        followed by an integer count, e.g. 'AB', 'A2B', 'AB12' or 'A2B3'.
    metal, halogenio : str
        Symbols that replace the first and the second placeholder.

    Returns
    -------
    str
        The chemical formula. A count of 1 is left implicit, so
        ('A2B', 'Sc', 'F') gives 'Sc2F' and ('AB', 'Ti', 'Cl') gives 'TiCl'.

    Raises
    ------
    ValueError
        If `modelo` is not a binary template of the form described above.
    """
    casado = re.fullmatch(r'[A-Z](\d*)[A-Z](\d*)', modelo)
    if casado is None:
        raise ValueError(f'estequiometria binária inválida: {modelo!r}')

    sufixos = []
    for texto in casado.groups():
        # A missing count means 1, and 1 is never written in a formula
        quantidade = int(texto) if texto else 1
        sufixos.append('' if quantidade == 1 else str(quantidade))

    return f'{metal}{sufixos[0]}{halogenio}{sufixos[1]}'


# One record per candidate; the loop order (stoichiometry, metal, halogen,
# prototype) fixes the row order of df2.
lista = []
for modelo in STCH:
    for metal in TM:
        for halogenio in HL:
            material = montar_formula(modelo, metal, halogenio)
            for prototipo in PROT:
                lista.append({'Material': material, 'Prototype': prototipo})

df2 = pd.DataFrame(lista)
df2

Unnamed: 0,Material,Prototype
0,Sc2F3,P-3m1
1,Sc2F3,P-6m2
2,Sc2F3,Pmmn
3,Sc2F3,P1
4,Sc2Cl3,P-3m1
5,Sc2Cl3,P-6m2
6,Sc2Cl3,Pmmn
7,Sc2Cl3,P1
8,Ti2F3,P-3m1
9,Ti2F3,P-6m2


In [8]:
## One dictionary of statistical features per candidate material, later
## loaded into a dataframe ##
lista_completa = []

## (formula, error) for every candidate whose features could not be computed ##
ignorados = []


# Iterate over the rows that actually exist in df2 instead of probing a fixed
# range of positions and waiting for an IndexError.
for formula, prototipo in zip(df2['Material'], df2['Prototype']):
    try:
        comp = Composition(formula).as_dict()
        elem = list(comp.items())

        # A fresh dictionary per candidate: chemical formula and prototype
        # first, the statistical features are added below.
        media_interm = {'Material': formula, 'Prototype': prototipo}

        # The atom counts (weights) are the same for every property
        pesos = [quantidade for _, quantidade in elem]

        for nome_prop in prop:
            # Value of the property for each element of the compound. The list
            # is rebuilt for every property, so a failure half-way through one
            # candidate can never leak values into the next one.
            lista = [dicio[simbolo][nome_prop] for simbolo, _ in elem]

            ## Plain mean ##
            media_interm[f'media_{nome_prop}'] = np.mean(lista)

            ## Mean weighted by the number of atoms of each element ##
            avg = np.average(lista, weights=pesos)
            media_interm[f'media_pon_{nome_prop}'] = avg

            ## Maximum and minimum ##
            media_interm[f'max_{nome_prop}'] = max(lista)
            media_interm[f'min_{nome_prop}'] = min(lista)

            ## Standard deviation around the plain mean ##
            media_interm[f'desvio_{nome_prop}'] = np.std(lista)

            ## Root-mean-square deviation around the weighted mean ##
            sum_prop = sum((valor - avg)**2 for valor in lista)
            media_interm[f'desvio_pon_{nome_prop}'] = np.sqrt(sum_prop/len(lista))

        lista_completa.append(media_interm)
    except Exception as erro:
        # Best effort: a candidate that cannot be parsed or that contains an
        # element absent from the atomic table is skipped, but it is recorded
        # instead of being silently discarded.
        ignorados.append((formula, repr(erro)))

if ignorados:
    print(f'{len(ignorados)} materiais ignorados (detalhes em `ignorados`)')

df_new = pd.DataFrame(lista_completa)
df_new.sample(10, random_state=100)

Unnamed: 0,Material,Prototype,media_Z,media_pon_Z,max_Z,min_Z,desvio_Z,desvio_pon_Z,media_Electronegativity,media_pon_Electronegativity,...,max_NumberUnfilledOrbitals,min_NumberUnfilledOrbitals,desvio_NumberUnfilledOrbitals,desvio_pon_NumberUnfilledOrbitals,media_Polarizability,media_pon_Polarizability,max_Polarizability,min_Polarizability,desvio_Polarizability,desvio_pon_Polarizability
13,Ti2Cl3,P-6m2,19.5,19.0,22,17,2.5,2.54951,2.35,2.512,...,8.0,1.0,3.5,3.569314,53.285,45.542,92.0,14.57,38.715,39.481708
9,Ti2F3,P-6m2,15.5,14.2,22,9,6.5,6.628725,2.76,3.004,...,8.0,1.0,3.5,3.569314,47.85,39.02,92.0,3.7,44.15,45.024342
10,Ti2F3,Pmmn,15.5,14.2,22,9,6.5,6.628725,2.76,3.004,...,8.0,1.0,3.5,3.569314,47.85,39.02,92.0,3.7,44.15,45.024342
1,Sc2F3,P-6m2,15.0,13.8,21,9,6.0,6.118823,2.67,2.932,...,9.0,1.0,4.0,4.079216,55.35,45.02,107.0,3.7,51.65,52.672872
11,Ti2F3,P1,15.5,14.2,22,9,6.5,6.628725,2.76,3.004,...,8.0,1.0,3.5,3.569314,47.85,39.02,92.0,3.7,44.15,45.024342
6,Sc2Cl3,Pmmn,19.0,18.6,21,17,2.0,2.039608,2.26,2.44,...,9.0,1.0,4.0,4.079216,60.785,51.542,107.0,14.57,46.215,47.130237
14,Ti2Cl3,Pmmn,19.5,19.0,22,17,2.5,2.54951,2.35,2.512,...,8.0,1.0,3.5,3.569314,53.285,45.542,92.0,14.57,38.715,39.481708
5,Sc2Cl3,P-6m2,19.0,18.6,21,17,2.0,2.039608,2.26,2.44,...,9.0,1.0,4.0,4.079216,60.785,51.542,107.0,14.57,46.215,47.130237
2,Sc2F3,Pmmn,15.0,13.8,21,9,6.0,6.118823,2.67,2.932,...,9.0,1.0,4.0,4.079216,55.35,45.02,107.0,3.7,51.65,52.672872
4,Sc2Cl3,P-3m1,19.0,18.6,21,17,2.0,2.039608,2.26,2.44,...,9.0,1.0,4.0,4.079216,60.785,51.542,107.0,14.57,46.215,47.130237


In [9]:
## Export the production dataset to CSV without the dataframe index ##
# `index` is a boolean flag, so False is passed explicitly rather than
# relying on None being falsy.
df_new.to_csv('../datasets/project2/dataset_producao.csv', index=False)