In [1]:
import collections
import os
import random

import numpy as np
import pandas as pd

import sklearn.model_selection
from xenonpy.descriptor import Compositions

In [2]:
# Read data
# NOTE(review): pd.read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load this file if it comes from a trusted source.
data_path = "raw_data/perovskites.pkl"
data = pd.read_pickle(data_path)
# Preview the first rows; the rendered output shows the Formula, Atoms_Object
# and Volume columns.
data.head()

Unnamed: 0,Formula,Atoms_Object,Volume
0,AgCl3K,"(Atom('K', [2.546784180928994, 2.5467841809289...",132.452162
1,La2N6W2,"(Atom('La', [2.665318694121161, 2.857224133724...",131.78859
2,O3SrV,"(Atom('Sr', [1.9322684339857643, 1.93226843398...",57.715487
3,Cl6Cs2Tl2,"(Atom('Cs', [7.926562986915786, 0.0, 0.0289390...",353.25151
4,F3MnRb,"(Atom('Rb', [2.1409860345006186, 2.14098603450...",78.511177


In [3]:
# Featurize with XenonPy
def count_elements(atoms):
    """Return a Counter mapping each chemical symbol in ``atoms`` to its count.

    ``atoms`` must expose ``get_chemical_symbols()`` (presumably an ASE
    ``Atoms`` object, judging by the ``Atom(...)`` entries shown in the data
    preview -- TODO confirm).
    """
    return collections.Counter(atoms.get_chemical_symbols())


# Only the "WeightedAverage" featurizer is requested; the resulting descriptor
# columns are the ones prefixed with "ave:" in the frames displayed below.
cal = Compositions(featurizers=["WeightedAverage"])

# One composition mapping (element symbol -> count) per structure.
data["Symbols"] = data["Atoms_Object"].map(count_elements)

# Compute the descriptors, then attach them column-wise next to the raw columns.
composition_features = cal.transform(data["Symbols"])
featurized_data = pd.concat([data, composition_features], axis=1)

In [4]:
# Remove the string/object columns (formula, atoms object, element counter) so
# that only Volume and the XenonPy descriptor columns remain.
dropped_columns = ["Formula", "Atoms_Object", "Symbols"]
data = featurized_data.drop(columns=dropped_columns)

# Train/Test Split
# Seed both the standard-library and the NumPy global RNGs. train_test_split is
# called without random_state, so it draws from NumPy's global RNG and the
# split is only reproducible when this cell is run right after seeding.
random.seed(1234)
np.random.seed(1234)

# Hold out 20% of the rows as the test set.
train, test = sklearn.model_selection.train_test_split(data, test_size=0.2)

# Per-column statistics are taken from the training rows only.
mean, std = train.mean(), train.std()

In [5]:
# Standardize both splits with the training-set mean and standard deviation,
# then discard every column that contains a NaN after scaling.
train_complete = ((train - mean) / std).dropna(axis=1)
test_complete = ((test - mean) / std).dropna(axis=1)

# dropna() runs on each split independently, so on its own it can remove
# different columns from train and test. For example, a feature that is constant
# in train has std == 0: it becomes 0/0 = NaN in train (dropped) but +/-inf in
# test (kept). Keep only the columns that survived in BOTH splits, in the
# training frame's column order, so the two frames always share one schema.
shared_columns = [col for col in train_complete.columns if col in test_complete.columns]

train_scaled = train_complete[shared_columns]
test_scaled = test_complete[shared_columns]

In [9]:
# Persist the training-set statistics and the standardized splits.
# pandas' writers do not create missing parent directories, so make sure the
# output folders exist first (a no-op when they are already there).
for output_dir in ("dataset_means_stds", "scaled_featurized_train", "scaled_featurized_test"):
    os.makedirs(output_dir, exist_ok=True)

# Training mean/std, stored so the same scaling can be re-applied or inverted later.
mean.to_pickle("dataset_means_stds/perov_mean.pkl")
std.to_pickle("dataset_means_stds/perov_std.pkl")

# The DataFrame index is written as the first CSV column (to_csv default).
train_scaled.to_csv("scaled_featurized_train/scaled_perovskite_train.csv")
test_scaled.to_csv("scaled_featurized_test/scaled_perovskite_test.csv")

In [10]:
# Inspect the unscaled training split. As the cell's last expression the frame
# is rendered as output (pandas elides the middle rows/columns with "...").
train

Unnamed: 0,Volume,ave:atomic_number,ave:atomic_radius,ave:atomic_radius_rahm,ave:atomic_volume,ave:atomic_weight,ave:boiling_point,ave:bulk_modulus,ave:c6_gb,ave:covalent_radius_cordero,...,ave:num_s_valence,ave:period,ave:specific_heat,ave:thermal_conductivity,ave:vdw_radius,ave:vdw_radius_alvarez,ave:vdw_radius_mm3,ave:vdw_radius_uff,ave:sound_velocity,ave:Polarizability
8,413.931387,20.8,151.812290,217.2,17.100,46.400000,875.390,14.460000,483.26,114.6,...,1.8,3.2,1.186475,30.405340,184.8,200.0,227.0,373.68,1823.600000,7.4300
64,196.123416,28.8,161.414169,191.6,24.400,68.611002,369.152,33.579364,991.72,104.6,...,1.8,3.4,0.845908,13.276620,193.4,200.8,218.2,338.22,2237.549530,10.7862
24,294.298611,22.8,167.468770,198.2,24.720,52.073854,337.814,47.703625,1019.82,111.4,...,1.8,3.2,0.915876,11.705748,191.4,195.0,221.4,382.28,1108.342987,10.9992
22,178.850171,31.2,145.414169,193.8,16.320,74.757842,604.606,45.079364,188.92,92.0,...,2.0,3.4,0.818308,28.616620,171.0,186.8,204.4,345.74,2321.749530,3.3262
73,101.350853,26.2,167.014169,196.8,27.400,60.943732,711.926,44.237949,1466.72,111.4,...,1.8,3.4,0.841308,23.616620,195.4,205.8,224.2,381.44,2374.605124,14.2582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23,355.223125,31.2,174.212290,223.0,28.480,71.593090,842.080,12.580000,1533.26,137.8,...,1.8,4.0,0.536875,20.605340,217.0,227.2,244.8,415.00,1059.055594,14.4480
116,69.283838,24.0,146.614169,194.2,16.500,57.136042,669.606,45.679364,233.72,91.4,...,2.0,3.0,0.976908,41.216620,162.0,187.2,203.0,349.20,2780.149530,3.9742
53,191.203661,41.8,181.133185,230.4,31.240,97.487090,860.060,17.298586,1572.80,149.2,...,1.8,4.6,0.782544,23.672000,218.2,229.8,254.8,430.94,2159.023128,15.7540
38,428.467218,48.4,182.933185,231.0,31.760,115.963490,792.060,10.660000,1551.00,150.0,...,1.8,4.8,0.767544,14.272000,220.0,233.2,256.8,427.62,2168.023128,15.1100
