In [17]:
# Import Libraries

import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from CBFV import composition

In [2]:
# Load the preprocessed data

# Keep the input location in one named variable so it is easy to spot and change.
preprocessed_csv = "../../data/d2_data_preprocessing/preprocessed_data.csv"
df = pd.read_csv(preprocessed_csv)

In [5]:
# Rename Columns

# The featurizing cell passes this frame to CBFV's generate_features;
# NOTE(review): 'formula' and 'target' are presumably the column names CBFV
# requires -- confirm against the CBFV documentation.
column_names = {'Formula': 'formula', 'Is Metal': 'target'}

# Rebind the name to the renamed frame instead of mutating with inplace=True:
# in-place mutation gives no benefit here and hides state changes between cells.
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,formula,target
0,Cs3Nd(MoO4)3,False
1,Ba(PdS2)2,False
2,Na7Ta3Zn(SiO4)6,False
3,TiZnBiO5,True
4,Na2YNb3Si4(SO12)2,False


In [6]:
# Encoding Target

# Fit the encoder on the label column and overwrite that column with the
# integer class codes it produces. `le` is kept so the codes can be mapped
# back to the original labels later (le.classes_ / le.inverse_transform).
le = preprocessing.LabelEncoder()
df["target"] = le.fit_transform(df["target"])

In [7]:
# Split the data

# Hold out 30% of the rows; the remaining 70% is the training set.
train, test_val = train_test_split(df, test_size=0.3, random_state=42)

# Halve the held-out rows into a testing set and a validation set
# (each ends up as 15% of the full data).
test, val = train_test_split(test_val, test_size=0.5, random_state=42)

# Report the shape of every split.
split_labels = {
    "Training set Shape:": train,
    "Testing set Shape:": test,
    "Validation set Shape:": val,
}
for shape_label, split_frame in split_labels.items():
    print(shape_label, split_frame.shape)

# Write all three splits (train, test and val) to their own CSV files,
# without the pandas index.
split_files = {
    "../../data/d3_data_featurization/train.csv": train,
    "../../data/d3_data_featurization/test.csv": test,
    "../../data/d3_data_featurization/val.csv": val,
}
for split_path, split_frame in split_files.items():
    split_frame.to_csv(split_path, index=False)


Training set Shape: (72826, 2)
Testing set Shape: (15606, 2)
Validation set Shape: (15606, 2)


In [8]:
# Featurizing

# One shared set of options so the three splits are featurized identically.
featurize_options = dict(
    elem_prop='magpie',
    drop_duplicates=False,
    extend_features=True,
    sum_feat=True,
)

# Each call returns four values, unpacked here as the (unscaled) feature
# table, the targets, the formulas and the skipped formulas for that split.
X_train_unscaled, y_train, formulas_train, skipped_train = composition.generate_features(train, **featurize_options)

X_val_unscaled, y_val, formulas_val, skipped_val = composition.generate_features(val, **featurize_options)

X_test_unscaled, y_test, formulas_test, skipped_test = composition.generate_features(test, **featurize_options)

Processing Input Data: 100%|██████████| 72826/72826 [00:03<00:00, 18917.97it/s]


	Featurizing Compositions...


Assigning Features...: 100%|██████████| 72826/72826 [00:07<00:00, 10175.00it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|██████████| 15606/15606 [00:00<00:00, 18665.56it/s]


	Featurizing Compositions...


Assigning Features...: 100%|██████████| 15606/15606 [00:01<00:00, 10063.09it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|██████████| 15606/15606 [00:01<00:00, 11101.31it/s]


	Featurizing Compositions...


Assigning Features...: 100%|██████████| 15606/15606 [00:01<00:00, 10212.57it/s]


	Creating Pandas Objects...


In [16]:
# Save the featurized data

# Destination file -> object to write. Insertion order is preserved, so the
# feature tables are written first (train, val, test), then the labels.
featurized_outputs = {
    '../../data/d3_data_featurization/train_featurized.csv': X_train_unscaled,
    '../../data/d3_data_featurization/val_featurized.csv': X_val_unscaled,
    '../../data/d3_data_featurization/test_featurized.csv': X_test_unscaled,
    '../../data/d3_data_featurization/train_labels.csv': y_train,
    '../../data/d3_data_featurization/val_labels.csv': y_val,
    '../../data/d3_data_featurization/test_labels.csv': y_test,
}

# The pandas index is not written to any of the files.
for output_path, output_table in featurized_outputs.items():
    output_table.to_csv(output_path, index=False)