In [1]:
import collections
import random
from pathlib import Path

import numpy as np
import pandas as pd

import sklearn.model_selection
from xenonpy.descriptor import Compositions

In [2]:
# Load the pickled perovskite table from disk and preview its first rows.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
data_path = "raw_data/perovskites.pkl"
data = pd.read_pickle(data_path)
data.head(n=5)

Unnamed: 0,Formula,Atoms_Object,Volume
0,AgCl3K,"(Atom('K', [2.546784180928994, 2.5467841809289...",132.452162
1,La2N6W2,"(Atom('La', [2.665318694121161, 2.857224133724...",131.78859
2,O3SrV,"(Atom('Sr', [1.9322684339857643, 1.93226843398...",57.715487
3,Cl6Cs2Tl2,"(Atom('Cs', [7.926562986915786, 0.0, 0.0289390...",353.25151
4,F3MnRb,"(Atom('Rb', [2.1409860345006186, 2.14098603450...",78.511177


In [3]:
# Number of atoms the cell volume is normalized by.
# NOTE(review): presumably 5 = atoms in one ABX3 perovskite formula unit — confirm.
# Cells whose atom count is not a multiple of 5 are floored, and cells with fewer
# than 5 atoms would divide by zero (inf volume) — verify the dataset has neither.
ATOMS_PER_FORMULA_UNIT = 5

# Per-row divisor and per-row element counts, both derived from the Atoms objects.
formula_units = data["Atoms_Object"].apply(lambda atoms: len(atoms) // ATOMS_PER_FORMULA_UNIT)
symbol_counts = data["Atoms_Object"].apply(
    lambda atoms: collections.Counter(atoms.get_chemical_symbols())
)

# Build a derived frame instead of updating `data` in place: an in-place `/=` would
# divide the volume again every time this cell is re-run.
data_per_formula_unit = data.assign(
    Volume=data["Volume"] / formula_units,
    Symbols=symbol_counts,
)

# Featurize with XenonPy
cal = Compositions(featurizers=["WeightedAverage"])
featurized_data = pd.concat(
    [data_per_formula_unit, cal.transform(data_per_formula_unit["Symbols"])],
    axis=1,
)

In [4]:
# Drop the columns that were only inputs to featurization; what remains is used
# as the modelling table.
data = featurized_data.drop(columns=["Formula", "Atoms_Object", "Symbols"])

# Train/Test Split
# The global generators are still seeded for any later stochastic code, but the
# split itself is pinned with `random_state` so it does not depend on whatever
# has consumed the global NumPy RNG before this call.
SEED = 1234
np.random.seed(SEED)
random.seed(SEED)

train, test = sklearn.model_selection.train_test_split(
    data, test_size=0.2, random_state=SEED
)

# Every row must land in exactly one of the two splits.
assert len(train) + len(test) == len(data)

# Scaling statistics are taken from the training split only.
mean = train.mean()
std = train.std()

In [5]:
# Standardize both splits with the training-set mean and standard deviation.
train_standardized = (train - mean) / std
test_standardized = (test - mean) / std

# Dropping NaN columns from each split independently can leave train and test with
# different columns: a zero-variance training column becomes NaN in train (0/0) but
# +/-inf in test (x/0), and a value may be missing in only one of the splits.
# Treat inf as missing and keep only the columns that are complete in BOTH splits,
# so the two frames always share the same columns in the same order.
train_complete = train_standardized.replace([np.inf, -np.inf], np.nan).notna().all()
test_complete = test_standardized.replace([np.inf, -np.inf], np.nan).notna().all()
usable = train_complete & test_complete
usable_columns = usable.index[usable]

train_scaled = train_standardized.loc[:, usable_columns]
test_scaled = test_standardized.loc[:, usable_columns]

assert list(train_scaled.columns) == list(test_scaled.columns)

In [6]:
# Destinations for the scaling statistics and the scaled splits.
mean_path = Path("dataset_means_stds/perov_mean.pkl")
std_path = Path("dataset_means_stds/perov_std.pkl")
train_path = Path("scaled_featurized_train/scaled_perovskite_train.csv")
test_path = Path("scaled_featurized_test/scaled_perovskite_test.csv")

# pandas does not create missing directories, so make sure they exist before
# writing; otherwise this cell fails on a fresh checkout.
for output_path in (mean_path, std_path, train_path, test_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the statistics so the scaling can be undone or re-applied later.
mean.to_pickle(mean_path)
std.to_pickle(std_path)

train_scaled.to_csv(train_path)
test_scaled.to_csv(test_path)