In [1]:
import collections
import pandas as pd
import os

from xenonpy.descriptor import Compositions

In [2]:
# Load the pickled perovskite table.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
data_path = "data/perovskites.pkl"
data = pd.read_pickle(data_path)

# Preview the first rows to sanity-check what was loaded.
data.head(n=5)

Unnamed: 0,Formula,Atoms_Object,Volume
0,AgCl3K,"(Atom('K', [2.546784180928994, 2.5467841809289...",132.452162
1,La2N6W2,"(Atom('La', [2.665318694121161, 2.857224133724...",131.78859
2,O3SrV,"(Atom('Sr', [1.9322684339857643, 1.93226843398...",57.715487
3,Cl6Cs2Tl2,"(Atom('Cs', [7.926562986915786, 0.0, 0.0289390...",353.25151
4,F3MnRb,"(Atom('Rb', [2.1409860345006186, 2.14098603450...",78.511177


In [3]:
# Build composition descriptors with XenonPy; only the "WeightedAverage"
# featurizer is requested.
cal = Compositions(featurizers=["WeightedAverage"])


def _count_symbols(atoms):
    """Return a Counter mapping each chemical symbol of `atoms` to its count."""
    return collections.Counter(atoms.get_chemical_symbols())


# Store the per-row symbol counts on `data`; this column is what gets handed to
# the featurizer below.
data["Symbols"] = data["Atoms_Object"].apply(_count_symbols)

# Put the descriptor columns side by side with the original columns.
descriptors = cal.transform(data["Symbols"])
featurized_data = pd.concat([data, descriptors], axis=1)

In [5]:
# Drop the columns that are not part of the numeric table: the formula string,
# the atoms object and the symbol counter.
non_feature_columns = ["Formula", "Atoms_Object", "Symbols"]
data_scaled = featurized_data.drop(columns=non_feature_columns)

# Per-column mean and (sample) standard deviation of the remaining columns.
mean, std = data_scaled.mean(), data_scaled.std()

# Standardize every column as (x - mean) / std, then discard any column that
# contains a NaN after scaling.
data_scaled = data_scaled.sub(mean).div(std).dropna(axis="columns")
data_scaled

Unnamed: 0,Volume,ave:atomic_number,ave:atomic_radius,ave:atomic_radius_rahm,ave:atomic_volume,ave:atomic_weight,ave:boiling_point,ave:bulk_modulus,ave:c6_gb,ave:covalent_radius_cordero,...,ave:num_s_valence,ave:period,ave:specific_heat,ave:thermal_conductivity,ave:vdw_radius,ave:vdw_radius_alvarez,ave:vdw_radius_mm3,ave:vdw_radius_uff,ave:sound_velocity,ave:Polarizability
0,-0.474279,-0.427501,0.133590,0.374631,-0.098436,-0.537008,-0.176743,-0.818983,-0.248242,0.552301,...,-1.514123,0.054815,-0.984724,2.346884,0.390002,0.425232,0.437845,0.086591,-1.310404,0.062042
1,-0.479340,0.332501,-3.339858,0.336125,-1.061550,0.434733,3.315297,2.007024,-0.239394,-0.174658,...,1.432278,0.054815,-1.722028,-0.004043,-0.593642,0.192638,-0.447910,-0.661291,-0.401258,-0.454899
2,-1.044276,-1.122361,-0.401797,-0.048935,-1.057389,-1.119077,0.645951,1.661934,-0.372924,-0.538137,...,1.432278,-0.931858,0.079805,-0.796819,-0.755655,-0.749980,-0.689479,-0.855877,-0.039132,-0.601904
3,1.209702,1.092503,1.077716,0.785361,0.997509,1.122398,-0.702863,-1.228504,0.936077,0.966263,...,-0.040922,1.041489,-1.798664,-0.688763,1.003334,1.269916,1.024513,1.270270,-1.735525,0.878314
4,-0.885673,-1.035503,-0.442337,-0.985913,0.001794,-1.024540,-0.669683,0.276551,0.069673,-0.588621,...,-0.040922,-0.931858,0.794103,-0.798152,-0.327480,-0.407210,-0.839022,-0.929235,1.262681,0.153789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,-0.298743,-0.774931,-1.754512,-0.908901,-1.392415,-0.768633,0.191510,0.813429,-1.560692,-1.376159,...,1.432278,-0.931858,0.628956,-0.635528,-1.600432,-1.362070,-1.540724,-0.712269,1.240163,-1.606089
140,-0.780413,-1.144075,0.549796,-0.626524,0.782482,-1.154106,-0.965409,-0.444486,0.717030,-0.326108,...,-0.040922,-0.931858,1.024785,0.504195,-0.026600,-0.199099,-0.183334,-0.656939,0.907690,0.894991
141,-1.033821,-1.296075,-0.721840,-0.844724,-1.367791,-1.290160,1.100119,1.688672,-1.016222,-0.820843,...,-1.514123,-1.260750,0.980251,0.646468,-0.894522,-1.129476,-0.919545,-1.322137,0.047170,-0.748719
142,-1.010236,-0.427501,0.030261,-0.613689,-0.614155,-0.383389,1.679135,2.226299,-0.177296,-0.366494,...,-0.040922,-0.602967,0.210874,0.585484,-0.327480,-0.701013,-0.470916,-0.779410,-0.539889,0.070879


In [6]:
# Persist the standardized table as CSV. No `index` argument is passed, so
# pandas' default of writing the DataFrame index as the first column applies.
output_file = "data/scaled_perovskite_data.csv"
data_scaled.to_csv(path_or_buf=output_file)