In [None]:
"""
This section is a material informatics project aiming to build a quantitative composition-structure-property (QSPR) relationship between palladium-based alloys membranes and their hydrogen permeability.
...
"""

'\nThis section is a material informatics project aiming to build a quantitative composition-structure-property (QSPR) relationship between palladium-based alloys membranes and their hydrogen permeability.\n...\n'

In [None]:
# !pip install --upgrade numpy

In [None]:
# Fresh Colab notebook
# !pip install rdkit
# !pip install xenonpy
!pip install pymatgen matminer
!pip install ydata-profiling
!pip install CBFV

Collecting pymatgen
  Downloading pymatgen-2025.6.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting matminer
  Downloading matminer-0.9.3-py3-none-any.whl.metadata (4.9 kB)
Collecting bibtexparser>=1.4.0 (from pymatgen)
  Downloading bibtexparser-1.4.3.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting monty>=2025.1.9 (from pymatgen)
  Downloading monty-2025.3.3-py3-none-any.whl.metadata (3.6 kB)
Collecting palettable>=3.3.3 (from pymatgen)
  Downloading palettable-3.3.3-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting ruamel.yaml>=0.17.0 (from pymatgen)
  Downloading ruamel.yaml-0.18.14-py3-none-any.whl.metadata (24 kB)
Collecting spglib>=2.5 (from pymatgen)
  Downloading spglib-2.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting uncertainties>=3.1.4 (

In [None]:
# Import useful packages and dependencies
import os
import pandas as pd
import numpy as np
import scipy as sc
import re
from ydata_profiling import ProfileReport # This library is handy at generating automatic EDA report
import CBFV
from CBFV.composition import generate_features
import unicodedata
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%config InlineBackend.figure_format='retina' # Allow image to have high resolution
np.random.seed(42) # Ensure reproducibility

In [None]:
print(CBFV.__file__)

/usr/local/lib/python3.11/dist-packages/CBFV/__init__.py


In [None]:
from pymatgen.core import Element, Composition
from matminer.featurizers.composition import ElementFraction
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition.packing import AtomicPackingEfficiency
from matminer.featurizers.composition.alloy import YangSolidSolution
from matminer.featurizers.composition.alloy import WenAlloys

In [None]:
# Mount Google Drive (Colab only) so that files can be read from and written to it.
# The mount point is /content/drive; the data paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make pandas display every column and every row; by default only a few of each are shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
%%time
PATH_train_70= '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/df_train_70.pkl'  # In reality 80/20
PATH_test_70= '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/df_test_70.pkl'

PATH_train_80= '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/df_train_80.pkl'   # In reality 85/15
PATH_test_80= '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/df_test_80.pkl'

PATH_train_90= '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/df_train_90.pkl'    # In reality 95/5
PATH_test_90= '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/df_test_90.pkl'

#%%
df_train_70=pd.read_pickle(PATH_train_70)
df_test_70=pd.read_pickle(PATH_test_70)

df_train_80=pd.read_pickle(PATH_train_80)
df_test_80=pd.read_pickle(PATH_test_80)

df_train_90=pd.read_pickle(PATH_train_90)
df_test_90=pd.read_pickle(PATH_test_90)

#%%
print(f'The shape of df_train_70 is {df_train_70.shape}')
print(f'The shape of df_test_70 is {df_test_70.shape}')

print(f'The shape of df_train_80 is {df_train_80.shape}')
print(f'The shape of df_test_80 is {df_test_80.shape}')

print(f'The shape of df_train_90 is {df_train_90.shape}')
print(f'The shape of df_test_90 is {df_test_90.shape}')

The shape of df_train_70 is (1788, 50)
The shape of df_test_70 is (435, 50)
The shape of df_train_80 is (1883, 50)
The shape of df_test_80 is (340, 50)
The shape of df_train_90 is (2092, 50)
The shape of df_test_90 is (131, 50)
CPU times: user 38.1 ms, sys: 8.26 ms, total: 46.3 ms
Wall time: 3.07 s


In [None]:
df_train_90.head()

Unnamed: 0,Composition in mole percent,Thickness,Lattice parameter,Bravais lattice,Chemical group,Temperature,Pressure difference,Permeability,Composition_matrix,B,Al,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Ga,Y,Zr,Nb,Mo,Ru,Rh,Pd,Ag,In,Sn,La,Ce,Pr,Sm,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu,Ta,W,Ir,Pt,Au,Pb,Stratify Group
0,Pd,0.00025,3.887e-10,fcc,G10,737.15,325.7194,9.24e-09,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,G10
1,Pd,0.0007,3.89e-10,fcc,G10,673.15,730.0685,1.32e-08,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,G10
2,Pd,0.0007,3.89e-10,fcc,G10,673.15,632.4555,1.26e-08,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,G10
3,Pd,0.0007,3.89e-10,fcc,G10,673.15,516.7204,1.26e-08,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,G10
4,Pd,0.0007,3.89e-10,fcc,G10,623.15,816.7007,1.12e-08,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,G10


In [None]:
# Change the place of the Stratify group data column
def change_place(dataframe, column, loc):
    """
    Return a copy of ``dataframe`` in which ``column`` sits at position ``loc``.

    The input DataFrame is left untouched; all work happens on a copy.

    Parameters:
    ----------
    dataframe : pd.DataFrame
        Input DataFrame.
    column : str
        Name of the column to relocate.
    loc : int
        0-indexed target position, counted after the column has been removed.

    Returns:
    -------
    pd.DataFrame
        Copy of the input with the column relocated.
    """
    reordered = dataframe.copy()
    # pop() detaches the column, insert() re-attaches it at the requested index.
    reordered.insert(loc, column, reordered.pop(column))
    return reordered

In [None]:
%%time
# Move 'Stratify Group' to column index 5 in every train/test split
(
    df_train_70, df_test_70,
    df_train_80, df_test_80,
    df_train_90, df_test_90,
) = [
    change_place(frame, column='Stratify Group', loc=5)
    for frame in (
        df_train_70, df_test_70,
        df_train_80, df_test_80,
        df_train_90, df_test_90,
    )
]

CPU times: user 12.4 ms, sys: 938 µs, total: 13.3 ms
Wall time: 21 ms


In [None]:
%%time
# check if the function worked (Yes :))
df_train_90.head()

CPU times: user 167 µs, sys: 0 ns, total: 167 µs
Wall time: 174 µs


Unnamed: 0,Composition in mole percent,Thickness,Lattice parameter,Bravais lattice,Chemical group,Stratify Group,Temperature,Pressure difference,Permeability,Composition_matrix,B,Al,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Ga,Y,Zr,Nb,Mo,Ru,Rh,Pd,Ag,In,Sn,La,Ce,Pr,Sm,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu,Ta,W,Ir,Pt,Au,Pb
0,Pd,0.00025,3.887e-10,fcc,G10,G10,737.15,325.7194,9.24e-09,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Pd,0.0007,3.89e-10,fcc,G10,G10,673.15,730.0685,1.32e-08,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Pd,0.0007,3.89e-10,fcc,G10,G10,673.15,632.4555,1.26e-08,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Pd,0.0007,3.89e-10,fcc,G10,G10,673.15,516.7204,1.26e-08,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Pd,0.0007,3.89e-10,fcc,G10,G10,623.15,816.7007,1.12e-08,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
%%time
# Average_lattice_Pd = np.sum(df_train_70[df_train_70['Composition in mole percent']=='Pd']['Lattice parameter'].unique())

def replace_lattice(dataframe: pd.DataFrame, composition_col: str, lattice_col: str) -> pd.DataFrame:
    """
    Replaces the lattice parameter for rows where composition is 'Pd'
    with the mean lattice parameter of all 'Pd' rows.

    The input DataFrame is not modified; the replacement is done on a copy,
    so re-running a cell that calls this function always starts from the
    same data.

    Parameters:
    ----------
    dataframe : pd.DataFrame
        Input DataFrame.
    composition_col : str
        Name of the composition column.
    lattice_col : str
        Name of the lattice parameter column.

    Returns:
    -------
    pd.DataFrame
        Copy of the input with the 'Pd' lattice parameters replaced by their mean.
        If no row has composition 'Pd', the copy is returned unchanged.
    """
    result = dataframe.copy()

    # Boolean mask selecting the pure-Pd rows (exact string match)
    pd_rows = result[composition_col] == 'Pd'

    # Without any Pd row the mean would be NaN; there is nothing to replace.
    if pd_rows.any():
        result.loc[pd_rows, lattice_col] = result.loc[pd_rows, lattice_col].mean()

    return result

CPU times: user 8 µs, sys: 2 µs, total: 10 µs
Wall time: 13.8 µs


In [None]:
%%time
# Harmonise the pure-Pd lattice parameter inside every train/test split
# NOTE(review): each split (test sets included) is averaged over its own Pd rows - confirm this is intended.
(
    df_train_70, df_test_70,
    df_train_80, df_test_80,
    df_train_90, df_test_90,
) = [
    replace_lattice(
        dataframe=frame,
        composition_col='Composition in mole percent',
        lattice_col='Lattice parameter',
    )
    for frame in (
        df_train_70, df_test_70,
        df_train_80, df_test_80,
        df_train_90, df_test_90,
    )
]

CPU times: user 8.68 ms, sys: 0 ns, total: 8.68 ms
Wall time: 8.81 ms


In [None]:
%%time
# Inspect the result of the lattice-parameter replacement
df_test_70.head()

CPU times: user 127 µs, sys: 0 ns, total: 127 µs
Wall time: 131 µs


Unnamed: 0,Composition in mole percent,Thickness,Lattice parameter,Bravais lattice,Chemical group,Stratify Group,Temperature,Pressure difference,Permeability,Composition_matrix,B,Al,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Ga,Y,Zr,Nb,Mo,Ru,Rh,Pd,Ag,In,Sn,La,Ce,Pr,Sm,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu,Ta,W,Ir,Pt,Au,Pb
137,Pd20.22Ag79.78,2.7e-05,4.041e-10,fcc,G10-G11,G10-G11,623.15,1154.6921,0.0,"(Pd, Ag)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2022,0.7978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138,Pd20.22Ag79.78,2.7e-05,4.041e-10,fcc,G10-G11,G10-G11,623.15,468.1478,0.0,"(Pd, Ag)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2022,0.7978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139,Pd28.48Cu71.52,0.0001,3.321517e-10,bcc/fcc,G10-G11,G10-G11,1173.28,1612.4516,1.41e-09,"(Pd, Cu)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140,Pd28.48Cu71.52,0.0001,3.321517e-10,bcc/fcc,G10-G11,G10-G11,1040.11,1612.4516,8.04e-10,"(Pd, Cu)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
141,Pd28.48Cu71.52,2.4e-05,3.321517e-10,bcc/fcc,G10-G11,G10-G11,623.15,1154.6921,0.0,"(Pd, Cu)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
%%time
#  # Compute average lattice parameter of pure Pd
# Pd_mask = df_train_90['Composition in mole percent'] == 'Pd'
# Pd_lattice_mean = float(df_train_90.loc[Pd_mask, 'Lattice parameter'].unique().mean())
# Pd_lattice_mean

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.68 µs


In [None]:
%%time
# Calculate lattice parameter difference relative to palladium

def calculate_lattice_parameter_deviation(
    df: pd.DataFrame,
    pd_lattice_fallback: float = 3.8901037037037046e-10,
) -> pd.DataFrame:
    """
    Calculates relative lattice parameter deviation from Pd ('Δa_ss/a_Pd').

    For every row the deviation is (a - a_Pd) / a_Pd rounded to 5 decimals,
    where a_Pd is the mean 'Lattice parameter' of the rows whose composition
    is exactly 'Pd'. Rows with a missing lattice parameter and the Pd rows
    themselves get 0.0. The result is stored in a new column 'Δa_ss/a_Pd'
    placed at position 3 (or last, if the frame has fewer than 3 other columns).
    The input DataFrame is not modified.

    Parameters:
    ----------
    df : pd.DataFrame
        Must contain 'Composition in mole percent' and 'Lattice parameter' columns.
    pd_lattice_fallback : float, optional
        Reference Pd lattice parameter used when it cannot be derived from `df`,
        i.e. when there is no 'Pd' row or all 'Pd' rows have a missing lattice
        parameter. Must be in the same units as the 'Lattice parameter' column.

    Returns:
    -------
    dfc : pd.DataFrame
        Copy of the input with the new column.

    Raises:
    ------
    ValueError
        If one of the required columns is missing.
    """
    required_cols = ('Composition in mole percent', 'Lattice parameter')

    # Validate before copying so that a bad input fails fast
    if not all(col in df.columns for col in required_cols):
        raise ValueError("Required columns 'Composition in mole percent' and 'Lattice parameter' not found.")

    dfc = df.copy()  # To avoid modifying original

    # Identify Pd rows
    Pd_mask = dfc['Composition in mole percent'] == 'Pd'

    # Reference value: mean over the Pd rows, or the fallback when that mean
    # is unavailable (no Pd rows, or only missing values among them)
    Pd_lattice_mean = dfc.loc[Pd_mask, 'Lattice parameter'].mean() if Pd_mask.any() else float('nan')
    if pd.isna(Pd_lattice_mean):
        Pd_lattice_mean = pd_lattice_fallback

    # Compute deviation (missing lattice parameters map to 0.0)
    dfc['Δa_ss/a_Pd'] = dfc['Lattice parameter'].apply(
        lambda x: 0.0 if pd.isna(x) else round(((x - Pd_lattice_mean) / Pd_lattice_mean), 5)
    )

    # Force Pd rows to zero (safety)
    dfc.loc[Pd_mask, 'Δa_ss/a_Pd'] = 0.0

    # Move column to position 3, clamped so that narrow frames do not raise
    temp_col = dfc.pop('Δa_ss/a_Pd')
    dfc.insert(min(3, dfc.shape[1]), 'Δa_ss/a_Pd', temp_col)

    return dfc

CPU times: user 7 µs, sys: 1e+03 ns, total: 8 µs
Wall time: 10.5 µs


In [None]:
%%time
# Add the 'Δa_ss/a_Pd' feature to every train/test split
(
    df_train_70, df_test_70,
    df_train_80, df_test_80,
    df_train_90, df_test_90,
) = [
    calculate_lattice_parameter_deviation(df=frame)
    for frame in (
        df_train_70, df_test_70,
        df_train_80, df_test_80,
        df_train_90, df_test_90,
    )
]

CPU times: user 79.2 ms, sys: 5.56 ms, total: 84.8 ms
Wall time: 97.6 ms


In [None]:
%%time
df_train_70.head()
# Pd_lattice=3.890102941176471e-10

CPU times: user 144 µs, sys: 25 µs, total: 169 µs
Wall time: 175 µs


Unnamed: 0,Composition in mole percent,Thickness,Lattice parameter,Δa_ss/a_Pd,Bravais lattice,Chemical group,Stratify Group,Temperature,Pressure difference,Permeability,Composition_matrix,B,Al,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Ga,Y,Zr,Nb,Mo,Ru,Rh,Pd,Ag,In,Sn,La,Ce,Pr,Sm,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu,Ta,W,Ir,Pt,Au,Pb
0,Pd,0.00025,3.890104e-10,0.0,fcc,G10,G10,737.15,325.7194,9.24e-09,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Pd,0.0007,3.890104e-10,0.0,fcc,G10,G10,673.15,730.0685,1.32e-08,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Pd,0.0007,3.890104e-10,0.0,fcc,G10,G10,673.15,632.4555,1.26e-08,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Pd,0.0007,3.890104e-10,0.0,fcc,G10,G10,673.15,516.7204,1.26e-08,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Pd,0.0007,3.890104e-10,0.0,fcc,G10,G10,623.15,816.7007,1.12e-08,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Featurize the dataframe to create bond properties
%%time
# """
# This code is modified after the peer-reviewed work of Upadesh Subedi. Note that some of the features computed using our method can be directly computed using Matminer alloy module
# Reference paper: https://doi.org/10.3390/met12060964
# Link to repo: https://github.com/subediupadesh/AutomaticFeaturizerMPEA/blob/main/AutomaticFeaturizerMPEA.ipynb
# Link to theory about the parameters (Yang&Zhang's paper): https://doi.org/10.1016/j.matchemphys.2011.11.021
# Ref_data_source for various methods used: https://hackingmaterials.lbl.gov/matminer/matminer.featurizers.composition.html#module-matminer.featurizers.composition.alloy
# Reference_Pymatgen: https://pymatgen.org/pymatgen.core.html
# """
# # Silence parts of the original code not needed for our calculation

# # from pymatgen.core.composition import Composition, Element # Already imported
# from matminer.featurizers.composition.alloy import Miedema, WenAlloys, YangSolidSolution #Importing featurizers
# # from matminer.featurizers.composition import ElementFraction # Already imported
# # from matminer.featurizers.conversions import StrToComposition #This line is not needed since we already have already applied the pymatgen.core.composition.Composition class which does same as StrToComposition
# from matminer.utils.data import MixingEnthalpy, DemlData # Importing databases
# from matminer.utils import data_files #for importing "Miedema.csv" present inside package of Matminer library
# from pymatgen.core.periodic_table import Element
# from pymatgen.core.composition import Composition, Element

# # Import the Guo_Element_property dataset to obtain the correct radius for 'atomic size difference' calculation
# Guo_Element_property_data = pd.read_csv('/content/drive/MyDrive/PhD_Pd alloy prediction/Guo_Element_property.csv') # Please replace your own path to the Guo_Element_property data


# # elem_prop_data = pd.read_csv(os.path.dirname(data_files.__file__) +'/Miedema.csv', na_filter = False) #for Miedema.csv present inside package of Matminer library
# # VEC_elements = elem_prop_data.set_index('element')['valence_electrons'].to_dict()

# #A Function to featurize the dataframe

# # Composition_matrix = dfc['Composition_matrix']

# def Genuine_YanGWen_featurizer(composition_matrix): #Takes a pd.Series() of the composition object we created earlier in the first notebook
#     elem_prop_data = pd.read_csv(os.path.dirname(data_files.__file__) +'/Miedema.csv', na_filter = False) #for Miedema.csv present inside package of Matminer library
#     VEC_elements = elem_prop_data.set_index('element')['valence_electrons'].to_dict()
#     properties = []
#     for index, value in composition_matrix.items():
#       # Access the value directly
#       Pd_alloy = list(value.as_dict().keys())
#       Pd_alloy_ = list(value.as_dict().values())
#       total_mole = sum(Pd_alloy_) # This line is not needed per se

#       mole_fraction = []
#       X_i = []
#       r_i = []
#       Tm_i = []
#       VEC_i =[]
#       R = 8.314


#       for i in Pd_alloy:
#         # Use 'index' to access the current composition
#         mole_fraction.append(composition_matrix[index].get_atomic_fraction(i)) # Calculates mole fraction of each atom in a Pd_alloy record using .get_atomic_fraction()
#         X_i.append(Element(i).X) # Calculates individual electronegativity using "Element" function
#         r_i.append(Element(i).atomic_radius) if Element(i).atomic_radius_calculated is None else r_i.append(Element(i).atomic_radius_calculated) # There are two functions present in Element class of pymatgen, so here checking using 'if conditional' in both functions to not miss any value.
#         Tm_i.append(Element(i).melting_point) # Calculating melting point of every element using "Element" class and function

#         try: VEC_i.append(DemlData().get_elemental_property(Element(i), "valence")) # VEC is also present in 2 locations in matminer, first is the function "DemlData()"
#         except KeyError:
#           #This part of the code was not working because the VEC_elements variable was not defined.  I've commented it out.  You'll need to define it or remove this exception handling
#           if i in VEC_elements: VEC_i.append(float(VEC_elements.get(i))) # VEC_elements is defined above
#         #     VEC_i.append(DemlData().get_elemental_property(Element(i), "valence")) #Using DemlData of Matminer package (consists 60 elements)
#         #     if i in VEC_elements: VEC_i.append(float(VEC_elements.get(i)))  # Using Miedema.csv dataset inside matminer package (consists 71 elements)


#       # Calculation of Atomic Radius Difference
#       r_bar = sum(np.multiply(mole_fraction, r_i))
#       term = (1-np.divide(r_i, r_bar))**2
#       atomic_size_difference = sum(np.multiply(mole_fraction, term))**0.5
#       # atomic_size_difference = YangSolidSolution().compute_delta(Composition(mpea)) # We could have applied this code to calculate it

#       # Electronegativity # This is the electonegativity difference
#       X_bar = sum(np.multiply(mole_fraction, X_i))
#       del_Chi = (sum(np.multiply(mole_fraction, (np.subtract(X_i, X_bar))**2)))**0.5 #This is the electronegativity

#       # Entropy of mixing
#       # del_Smix = -WenAlloys().compute_configuration_entropy(mole_fraction)*1000
#       del_Smix = -R*sum(np.multiply(mole_fraction, np.log(mole_fraction)))

      # Enthalpy of mixing
      # AB = []
      # C_i_C_j = []
      # del_Hab = []
      # for i in range(len(Pd_alloy)):
      #   for j in range(i, len(Pd_alloy)-1):
      #    AB.append(Pd_alloy[i] + Pd_alloy[j+1])
      #    C_i_C_j.append(mole_fraction[i]*mole_fraction[j+1])
      #    del_Hab.append(round(Miedema().deltaH_chem([Pd_alloy[i], Pd_alloy[j+1]], [0.5, 0.5], 'ss'),3)) # Calculating binary enthalpy of mixing pure component at 0.5-0.5 (equal) composition using Miedema class of "matminer" library

      # del_Hab.append(MixingEnthalpy().get_mixing_enthalpy(Element(Pd_alloy[i]), Element(Pd_alloy[j+1]))) # Matminer MixingEnthalpy
#       omega = np.multiply(del_Hab, 4)
#       del_Hmix = sum(np.multiply(omega, C_i_C_j))
#       # del_Hmix = 0 #Placeholder

#       # Average Melting Temperature
#       Tm = sum(np.multiply(mole_fraction, Tm_i))

#       # Omega Parameter # The del_Hmix is in KJ/mol therefore conversion is needed
#       Omega = (Tm*del_Smix)/abs(del_Hmix*1000)
#       # Omega = 0 #Placeholder
#       # Omega = YangSolidSolution().compute_omega(Composition(mpea))

#       # Valence Electron Concentration
#       VEC = sum(np.multiply(mole_fraction, VEC_i))

#       # Collecting all values
#       properties.append([len(Pd_alloy), " ".join(Pd_alloy), " ".join(list(map(str, Pd_alloy_))), total_mole, round(sum(mole_fraction),2), atomic_size_difference, round(del_Hmix, 4), round(del_Smix, 4), round(Omega, 4), round(del_Chi, 4), VEC, Tm])


#     prop_data = pd.DataFrame(properties, columns=['No of Components',
#                                                   'Component',
#                                                   'Moles of individual Components',
#                                                   'Total Moles',
#                                                   'Sum of individual Mole Fractions',
#                                                   'Atomic size diff (δ)',
#                                                   'ΔHmix',
#                                                   'ΔSmix',
#                                                   'Omega (Ω)',
#                                                   'Δχ',
#                                                   'VEC',
#                                                   'Tm(K)']
#                              )

#     # processed_data_2 = pd.concat([Processed_data_1, prop_data], axis = 1)
#     return prop_data

#1) Correct the code to recognize that pure metals such as 'Pd' should have mixing enthalpy =0, thus Omega parameter equals zero


#################################################################################################################################################
#################################################################################################################################################
#################################################################################################################################################
from matminer.featurizers.composition.alloy import Miedema, WenAlloys, YangSolidSolution #Importing featurizers
from matminer.utils.data import MixingEnthalpy, CohesiveEnergyData, DemlData # Importing databases
from matminer.utils import data_files #for importing "Miedema.csv" present inside package of Matminer library
from pymatgen.core.periodic_table import Element
from pymatgen.core.composition import Composition, Element
from matminer.featurizers.composition.packing import AtomicPackingEfficiency


# Load the Guo_Element_property dataset, which supplies the radii used for the 'atomic size difference' calculation
Guo_Element_property_data = pd.read_csv('/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/Guo_Element_property.csv', encoding='latin1') # Please replace this with your own path to the Guo_Element_property data
# Load the Miedema.csv table located next to matminer's data_files module (na_filter=False disables missing-value detection on read)
elem_prop_data = pd.read_csv(os.path.dirname(data_files.__file__) +'/Miedema.csv', na_filter = False)
# Lookup: 'element' column -> 'shear_modulus' column of the Miedema table
shear_modulus_element = dict(zip(elem_prop_data['element'], elem_prop_data['shear_modulus']))


# Lookup: element symbol -> radius ('Radius/ Å' column of the Guo dataset)
kittel_radius = dict(zip(Guo_Element_property_data['Symbol'], Guo_Element_property_data['Radius/ Å']))
# iti_e = dict(zip(Guo_Element_property_data['Symbol'], Guo_Element_property_data['e/a']))


def compute_gamma_radii(r_i, r_bar):
    r"""
    Compute the gamma parameter of the radii.

    Gamma is the ratio of the solid-angle terms of the atomic packing for
    the elements with the smallest and the largest atomic size:

    :math:`\frac{1 - \sqrt{\frac{(r + r_{min})^2 - r^2}{(r + r_{min})^2}}}{1 - \sqrt{\frac{(r + r_{max})^2 - r^2}{(r + r_{max})^2}}}`

    where :math:`r`, :math:`r_{min}` and :math:`r_{max}` are the mean radius,
    the minimum radius and the maximum radius.

    Args:
        r_i (list): radii of the elements of one compound
        r_bar (float): weighted average of the radii

    Returns:
        (float) gamma
    """
    r_i_min = np.min(r_i)
    r_i_max = np.max(r_i)

    def _solid_angle_term(r_edge):
        # 1 - sqrt(((r_bar + r_edge)^2 - r_bar^2) / (r_bar + r_edge)^2)
        return 1 - np.sqrt(((r_bar + r_edge)**2 - r_bar**2) / ((r_bar + r_edge)**2))

    return _solid_angle_term(r_i_min) / _solid_angle_term(r_i_max)

def compute_enthalpy(elements, fractions):
    """
    Compute mixing enthalpy.

    The result is 4 * sum over all element pairs (i, j) of
    fractions[i] * fractions[j] * (pairwise mixing enthalpy of the two elements),
    with the pairwise values looked up through matminer's MixingEnthalpy.

    Args:
        elements ([pymatgen.Element or str]): List of elements
        fractions [float]: Fractions of elements in composition

    Returns:
        (float) H_mixing (signed value). 0 for a single element; a result of
        exactly zero for several elements is replaced by 1e-6.
    """
    # A single element has no pair to mix: answer directly, without loading
    # the pairwise lookup table.
    if len(elements) == 1:
        return 0

    mixing_enthalpy_source = MixingEnthalpy()

    enthalpy = 0
    for i, e1 in enumerate(elements):
        # elements[:i] pairs e1 with every element listed before it, so each
        # unordered pair is visited exactly once
        for j, e2 in enumerate(elements[:i]):
            enthalpy += (
                fractions[i]
                * fractions[j]
                * mixing_enthalpy_source.get_mixing_enthalpy(Element(e1), Element(e2))
            )
    enthalpy *= 4

    if enthalpy == 0:
        enthalpy = 1e-6  # Avoid zero if needed for division later

    return enthalpy

def Alloy_Featurizer(composition_matrix):
    """
    Compute alloy descriptors for a pd.Series of composition objects.

    The function takes a composition object column (in our case, the composition objects are located in the
    'Composition_matrix' column created in the first notebook) and calculates descriptors according to
    Wen et al. (2019): 'Machine learning assisted design of high entropy alloys with desired property'
    Reference:  https://doi.org/10.1016/j.actamat.2019.03.010.
    More details can be found in the supplementary file of the paper.
    Link:  https://ars.els-cdn.com/content/image/1-s2.0-S1359645419301430-mmc1.pdf

    This code is strictly for crystalline solid solutions such as those in our Pd-alloys dataset.
    For amorphous compounds, such as alloy membranes with Group IIIB and IVB as principal solution,
    the WenAlloys() featurizer from Matminer can be used directly.
    We aim to stay consistent with values published by Magnone et al. (2023) for Pd-alloys
    https://doi.org/10.1016/j.memsci.2023.121513.
    Note that matminer converts units (e.g. J/... to kJ/...) most of the time, so values found there might
    differ from those published by Magnone by a factor of 1000.
    Reference to Matminer source code: https://github.com/hackingmaterials/matminer/blob/main/matminer/featurizers/composition/alloy.py
    Reference Miracle's radius: https://doi.org/10.1179/095066010X12646898728200

    Radii come from the dataset by Guo et al. (2011), the same one used by Magnone et al. (2023).
    Reference: https://doi.org/10.1016/S1002-0071(12)60080-X

    Finally, unit scales are converted to stay consistent with Matminer WenAlloys().

    Args:
        composition_matrix (pd.Series): composition objects; each one must provide
            ``as_dict()`` and ``get_atomic_fraction()``.

    Returns:
        pd.DataFrame: one row per composition, in input order, with a default 0..n-1 index
        (the labels of ``composition_matrix`` are not carried over) and 22 columns.

    Raises:
        ValueError: if no valence-electron value can be found for an element.
    """

    waf = WenAlloys(impute_nan=True)
    ced = CohesiveEnergyData()
    deml = DemlData()
    # Loop-invariant featurizer: build it once rather than once per composition
    ape = AtomicPackingEfficiency(impute_nan=False)
    VEC_elements = elem_prop_data.set_index('element')['valence_electrons'].to_dict()
    R = 8.314  # Gas constant (J/K/mol)
    properties = []

    for index, value in composition_matrix.items():

        # Access elements and amounts
        amounts_by_element = value.as_dict()
        Pd_alloy = list(amounts_by_element.keys())
        Pd_alloy_ = list(amounts_by_element.values())
        total_mole = sum(Pd_alloy_)

        mole_fraction = []  # List to get mole fractions
        X_i = []  # List to get Pauling electronegativity values
        r_i = []  # List to get radii from the Guo lookup (kittel_radius)
        Tm_i = []  # List to get melting temperature
        VEC_i = []  # List to get valence electron concentration
        Cohesive_i = []  # List to get cohesive energy
        Shear_modulus_i = []  # List to get shear moduli (GPa)

        for i in Pd_alloy:
            # Use the composition yielded by the loop. Looking it up again with
            # composition_matrix[index] returns a Series when labels are duplicated.
            mole_fraction.append(value.get_atomic_fraction(i))
            X_i.append(Element(i).X)  # Individual electronegativity from the "Element" class
            r_i.append(kittel_radius[i])
            Tm_i.append(Element(i).melting_point)  # Melting point from the "Element" class
            Cohesive_i.append(ced.get_elemental_property(elem=i, property_name='cohesive energy'))
            Shear_modulus_i.append(shear_modulus_element[i])

            try:
                VEC_i.append(deml.get_elemental_property(Element(i), "valence"))  # Try the "DemlData" dataset first
            except KeyError:
                # Fall back to the elem_prop_data table. Fail loudly if it is missing there too:
                # skipping the element would leave VEC_i shorter than mole_fraction and only
                # surface later as an obscure shape error in np.dot.
                if i not in VEC_elements:
                    raise ValueError(
                        f"No valence electron value found for element '{i}' (row label {index!r})"
                    )
                VEC_i.append(float(VEC_elements.get(i)))

        # Calculation of Atomic Radius Difference
        r_bar = np.dot(mole_fraction, r_i)  # Unit in Angstrom
        term = (1-np.divide(r_i, r_bar))**2
        atomic_size_difference = (np.dot(mole_fraction, term)**0.5)

        # Electronegativity difference
        X_bar = np.dot(mole_fraction, X_i)
        del_Chi = np.dot(mole_fraction, (np.subtract(X_i, X_bar))**2)**0.5

        # Compute chi_local_mismatch
        chi_local_mismatch = waf.compute_local_mismatch(variable = X_i, fractions = mole_fraction)

        # Valence Electron Concentration
        VEC = np.dot(mole_fraction, VEC_i)

        # Compute average cohesive energy
        mean_cohesive_energy = np.dot(mole_fraction, Cohesive_i)*96.4853 # to convert eV/atom ---> KJ/mol multiply by 96.4853

        # Mean shear modulus: keep the GPa value for unitless ratios, report the Pa value
        mean_shear_modulus_GPa = np.dot(Shear_modulus_i, mole_fraction)
        mean_shear_modulus = mean_shear_modulus_GPa*10**9  # GPa ---> Pa

        # Average Melting Temperature (K)
        Tm = np.dot(mole_fraction, Tm_i)

        # Compute delta_H_mix using modified Matminer's code. We force the code not to give us absolute value
        delta_H_mix = compute_enthalpy(Pd_alloy, mole_fraction) # Returns enthalpy in KJ/mol, not abs(enthalpy) like in the original code

        # Entropy of mixing or Configuration entropy
        del_Smix = -R*np.dot(mole_fraction, np.log(mole_fraction))/1000 # J/K/mol --> KJ/K/mol

        # Omega Parameter (Unitless): Tm * del_Smix / |delta_H_mix|, both in KJ-based units
        if del_Smix==0:
            Omega=0
        else:
            Omega = (Tm*del_Smix)/abs(delta_H_mix)

        # Lambda parameter (Entropy of mixing divided by squared atomic_size_difference)
        if atomic_size_difference != 0:
            Lambda_param = del_Smix/(atomic_size_difference**2)
        else:
            Lambda_param = 0   # KJ/K/mol

        # Gamma radii: Geometric parameter introduced originally by Wang(2015), https://doi.org/10.1016/j.scriptamat.2014.09.010
        gamma_radii = compute_gamma_radii(r_i, r_bar) # Unitless

        # Compute radii_local_mismatch (meter)
        radii_local_mismatch = waf.compute_local_mismatch(variable = r_i, fractions = mole_fraction)*1.0e-10 # Angstrom --> meter

        # Compute shear_modulus_local_mismatch (Pa)
        shear_modulus_local_mismatch = waf.compute_local_mismatch(variable = Shear_modulus_i, fractions = mole_fraction)*1.0e9 # GPa --> Pa

        # Compute modulus mismatch in strengthening model (Unitless).
        # The model is built on the ratio (G_i - G)/(G_i + G), so G must be in the same unit as the
        # per-element moduli (GPa). Passing the Pa value makes every ratio collapse to about -1.
        shear_modulus_strength_model = waf.compute_strength_local_mismatch_shear(Shear_modulus_i, mean_shear_modulus_GPa, mole_fraction)

        # Compute shear modulus delta (Unitless)
        shear_modulus_delta = waf.compute_delta(Shear_modulus_i, mole_fraction)

        # Compute atomic packing efficiency  (Unitless)
        mean_APE = ape.compute_simultaneous_packing_efficiency(value)[0]

        # Itinerant electrons per atom (e/a) are deliberately left out: the e/a data is inconsistent.
        # Ref: https://new.math.uiuc.edu/oldnew/quasicrystals/IonBaianu/MizutaniLect-CMA1EU.pdf#page=4.00

        # Collecting all values (order must match the column list below)
        properties.append([len(Pd_alloy),
                           " ".join(Pd_alloy),
                           " ".join(list(map(str, Pd_alloy_))),
                           total_mole,
                           round(sum(mole_fraction),2),
                           atomic_size_difference,
                           del_Chi,
                           delta_H_mix,
                           del_Smix,
                           Omega,
                           VEC,
                           Tm,
                           Lambda_param,
                           gamma_radii,
                           mean_cohesive_energy,
                           radii_local_mismatch,
                           chi_local_mismatch,
                           mean_shear_modulus,
                           shear_modulus_strength_model,
                           shear_modulus_delta,
                           shear_modulus_local_mismatch,
                           mean_APE
                           ])

    prop_data = pd.DataFrame(properties, columns=[
                                                  'No of Components',
                                                  'Component',
                                                  'Moles of individual Components',
                                                  'Total Moles',
                                                  'Sum of individual Mole Fractions',
                                                  'Atomic Size diff. (δ)',
                                                  'Δχ',  # electronegativity difference
                                                  'ΔHmix', # (KJ/mol), signed
                                                  'ΔSmix', # (KJ/K/mol)
                                                  'Omega (Ω)', # (unitless)
                                                  'VEC', # mole-fraction-weighted valence electron count
                                                  'Melting Tempurature', # (unit: Kelvin)
                                                  'Λ',  # Entropy divided by squared atomic_size_difference (KJ/K/mol)
                                                  'γ', # gamma_radii (unitless)
                                                  'Mean Cohesive Energy', # (KJ/mol), converted from eV/atom with the factor 96.4853
                                                  'D⋅r',  # radii local mismatch (m), converted from Å
                                                  'D⋅χ',  # Electronegativity local mismatch
                                                  'G',  # mean_shear_modulus (Pa), converted from GPa
                                                  'η',  # shear_modulus_strength_model (unitless)
                                                  'δG',   # shear_modulus_delta
                                                  'D⋅G', # Shear modulus local mismatch (Pa)
                                                  'Mean APE', # Mean atomic packing efficiency
                                                  # 'e/a' is not included (inconsistent e/a data in the literature)
                                                  ]
                             )
    return prop_data
#################################################################################################################################################
#################################################################################################################################################
#################################################################################################################################################

CPU times: user 10.2 ms, sys: 691 µs, total: 10.9 ms
Wall time: 455 ms


In [None]:
%%time
# It takes up to 40 min for entire featurization!!!

#%%
# Pull the composition objects out of every train/test split.
(column_train_70, column_test_70,
 column_train_80, column_test_80,
 column_train_90, column_test_90) = [
    split_frame['Composition_matrix']
    for split_frame in (df_train_70, df_test_70,
                        df_train_80, df_test_80,
                        df_train_90, df_test_90)
]

#%%
# Run the alloy featurizer on each split, in the same order as above.
list_bond_properties = [
    Alloy_Featurizer(composition_column)
    for composition_column in (column_train_70, column_test_70,
                               column_train_80, column_test_80,
                               column_train_90, column_test_90)
]

(bond_properties_train_70, bond_properties_test_70,
 bond_properties_train_80, bond_properties_test_80,
 bond_properties_train_90, bond_properties_test_90) = list_bond_properties

#%%
# Sanity check: one (rows, columns) tuple per descriptor table.
for bp in list_bond_properties:
    print(bp.shape)


(1788, 22)
(435, 22)
(1883, 22)
(340, 22)
(2092, 22)
(131, 22)
CPU times: user 7min 41s, sys: 8.84 s, total: 7min 50s
Wall time: 8min


In [None]:
%%time
# Give every descriptor table the row labels of the split it was computed from
# (safety measure: the featurizer returns a fresh 0..n-1 index).
for _bp_frame, _split_frame in (
    (bond_properties_train_70, df_train_70),
    (bond_properties_test_70, df_test_70),
    (bond_properties_train_80, df_train_80),
    (bond_properties_test_80, df_test_80),
    (bond_properties_train_90, df_train_90),
    (bond_properties_test_90, df_test_90),
):
    _bp_frame.index = _split_frame.index

CPU times: user 85 µs, sys: 2 µs, total: 87 µs
Wall time: 89.4 µs


In [None]:
# Preview the first rows of the descriptor table computed for the `test_90` split.
bond_properties_test_90.head()

Unnamed: 0,No of Components,Component,Moles of individual Components,Total Moles,Sum of individual Mole Fractions,Atomic Size diff. (δ),Δχ,ΔHmix,ΔSmix,Omega (Ω),VEC,Melting Tempurature,Λ,γ,Mean Cohesive Energy,D⋅r,D⋅χ,G,η,δG,D⋅G,Mean APE
168,3,Pd Cu Al,40.21 57.86 1.93,100.0,1.0,0.037721,0.156779,-14.501289,0.006311,0.669657,10.4435,1538.680598,4.435554,1.131804,352.066176,2.472961e-12,0.077614,47176740000.0,-1.344492,0.086944,1788601000.0,-0.007495
205,3,Pd Cu Al,44.5 55.0 0.5,100.0,1.0,0.036868,0.151767,-14.1264,0.00595,0.659095,10.515,1564.9231,4.377115,1.13172,353.859838,2.429107e-12,0.075535,47698400000.0,-1.335545,0.069645,1569147000.0,-0.009566
206,3,Pd Cu Al,44.5 55.0 0.5,100.0,1.0,0.036868,0.151767,-14.1264,0.00595,0.659095,10.515,1564.9231,4.377115,1.13172,353.859838,2.429107e-12,0.075535,47698400000.0,-1.335545,0.069645,1569147000.0,-0.009566
207,3,Pd Cu Al,44.5 55.0 0.5,100.0,1.0,0.036868,0.151767,-14.1264,0.00595,0.659095,10.515,1564.9231,4.377115,1.13172,353.859838,2.429107e-12,0.075535,47698400000.0,-1.335545,0.069645,1569147000.0,-0.009566
208,3,Pd Cu Al,44.5 55.0 0.5,100.0,1.0,0.036868,0.151767,-14.1264,0.00595,0.659095,10.515,1564.9231,4.377115,1.13172,353.859838,2.429107e-12,0.075535,47698400000.0,-1.335545,0.069645,1569147000.0,-0.009566


In [None]:
%%time
# Add atomic packing efficiency to the descriptor list.
# def APE_calculator(df:pd.DataFrame, column_name:str)->pd.DataFrame:
#   at_pack_ef = AtomicPackingEfficiency()
#   df=at_pack_ef.featurize_dataframe(df,column_name, ignore_errors=True)
#   df.drop(columns = ['mean abs simul. packing efficiency', 'dist from 1 clusters |APE| < 0.010', 'dist from 3 clusters |APE| < 0.010', 'dist from 5 clusters |APE| < 0.010'], axis = 0, inplace = True)
#   # Renaming the 'mean simul. packing efficiency' column into Average APE
#   df.rename(columns = {'mean simul. packing efficiency': 'Average APE'}, inplace = True)
#   return df

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs


In [None]:
%%time
# Applying the function
# df_train_70=APE_calculator(df_train_70, column_name="Composition_matrix")
# df_test_70=APE_calculator(df_test_70, column_name="Composition_matrix")
# df_train_80=APE_calculator(df_train_80, column_name="Composition_matrix")
# df_test_80=APE_calculator(df_test_80, column_name="Composition_matrix")
# df_train_90=APE_calculator(df_train_90, column_name="Composition_matrix")
# df_test_90=APE_calculator(df_test_90, column_name="Composition_matrix")

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs


In [None]:
%%time
# Gather the split frames and their descriptor tables, and name the bookkeeping
# columns that get removed from the descriptor tables.
to_drop = [
    'Component',
    'Moles of individual Components',
    'Total Moles',
    'Sum of individual Mole Fractions',
]

data_list = [df_train_70, df_test_70, df_train_80, df_test_80, df_train_90, df_test_90]

list_bond_properties = [
    bond_properties_train_70, bond_properties_test_70,
    bond_properties_train_80, bond_properties_test_80,
    bond_properties_train_90, bond_properties_test_90,
]

# def column_reappender(data_list, bond_properties_list, to_drop):

#     """
#     Transfers the 'Average APE' column from each DataFrame in data_list
#     and appends it to the corresponding DataFrame in bond_properties_list.

#     Parameters:
#     ----------
#     data_list : list of pd.DataFrame
#         List of DataFrames containing the 'Average APE' column to remove.
#     bond_properties_list : list of pd.DataFrame
#         List of DataFrames to which the column will be appended.

#     Returns:
#     -------
#     updated_data_list : list of pd.DataFrame
#         DataFrames with 'Average APE' removed.
#     updated_bond_properties_list : list of pd.DataFrame
#         DataFrames with 'Average APE' appended.
#     """

#     updated_data_list = []
#     updated_bond_properties_list = []

#     for df_data, df_bond in zip(data_list, bond_properties_list):
#         df_data = df_data.copy()
#         df_bond = df_bond.copy()

#         # Pop 'Average APE' from the data frame
#         avg_ape_col = df_data.pop('Average APE')

#         # Append it to the bond properties frame
#         df_bond['Average APE'] = avg_ape_col.values
#         df_bond=df_bond.drop(columns=to_drop, axis=1)

#         updated_data_list.append(df_data)
#         updated_bond_properties_list.append(df_bond)

#     return updated_data_list, updated_bond_properties_list


def column_reappender(bond_properties_list, to_drop):
    """
    Return copies of the given tables with the listed columns removed.

    Parameters:
    ----------
    bond_properties_list : list of pd.DataFrame
        Tables to process; they are not modified.
    to_drop : list of str
        Names of the columns to remove from every table.

    Returns:
    -------
    updated_bond_properties_list : list of pd.DataFrame
        One new DataFrame per input table, in the same order, without the
        ``to_drop`` columns.
    """
    # Copy first, then drop, so each result is fully independent of its input.
    return [table.copy().drop(columns=to_drop) for table in bond_properties_list]

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 9.3 µs


In [None]:
%%time
# data_list = [df_train_70, df_test_70, df_train_80, df_test_80, df_train_90, df_test_90]
# list_bond_properties = [bond_properties_train_70, bond_properties_test_70, bond_properties_train_80, bond_properties_test_80, bond_properties_train_90, bond_properties_test_90]

# to_drop = ['Component','Moles of individual Components', 'Total Moles',	'Sum of individual Mole Fractions']

# # Applying the function
# updated_data_list, updated_bond_properties_list = column_reappender(data_list, list_bond_properties, to_drop)
# df_train_70, df_test_70, df_train_80, df_test_80, df_train_90, df_test_90 = updated_data_list
# bond_properties_train_70, bond_properties_test_70, bond_properties_train_80, bond_properties_test_80, bond_properties_train_90, bond_properties_test_90 = updated_bond_properties_list


# data_list = [df_train_70, df_test_70, df_train_80, df_test_80, df_train_90, df_test_90]

# Bookkeeping columns that are not descriptors.
to_drop = [
    'Component',
    'Moles of individual Components',
    'Total Moles',
    'Sum of individual Mole Fractions',
]

list_bond_properties = [
    bond_properties_train_70, bond_properties_test_70,
    bond_properties_train_80, bond_properties_test_80,
    bond_properties_train_90, bond_properties_test_90,
]

# Drop them from every table and rebind the per-split names to the trimmed copies.
updated_bond_properties_list = column_reappender(list_bond_properties, to_drop)
(bond_properties_train_70, bond_properties_test_70,
 bond_properties_train_80, bond_properties_test_80,
 bond_properties_train_90, bond_properties_test_90) = updated_bond_properties_list

CPU times: user 5.65 ms, sys: 0 ns, total: 5.65 ms
Wall time: 5.8 ms


In [None]:
# Replace the NAN in the parameter 'Omega (Ω)' column by 0
# for bp in [bond_properties_train_70, bond_properties_train_80, bond_properties_train_90]:
#   bp.loc[bp['Omega (Ω)'].isna(), 'Omega (Ω)'] = 0

In [None]:
%%time
# Print the shape and the per-column count of missing values of every descriptor table.
for bp in (
    bond_properties_train_70, bond_properties_test_70,
    bond_properties_train_80, bond_properties_test_80,
    bond_properties_train_90, bond_properties_test_90,
):
    print(f'Shape={bp.shape}')
    print(bp.isnull().sum())
    print('*' * 10, end='\n\n')

Shape=(1788, 18)
No of Components         0
Atomic Size diff. (δ)    0
Δχ                       0
ΔHmix                    0
ΔSmix                    0
Omega (Ω)                0
VEC                      0
Melting Tempurature      0
Λ                        0
γ                        0
Mean Cohesive Energy     0
D⋅r                      0
D⋅χ                      0
G                        0
η                        0
δG                       0
D⋅G                      0
Mean APE                 0
dtype: int64
**********

Shape=(435, 18)
No of Components         0
Atomic Size diff. (δ)    0
Δχ                       0
ΔHmix                    0
ΔSmix                    0
Omega (Ω)                0
VEC                      0
Melting Tempurature      0
Λ                        0
γ                        0
Mean Cohesive Energy     0
D⋅r                      0
D⋅χ                      0
G                        0
η                        0
δG                       0
D⋅G                     

In [None]:
%%time
# Preview the first rows of the `train_70` descriptor table.
bond_properties_train_70.head()

CPU times: user 102 µs, sys: 3 µs, total: 105 µs
Wall time: 108 µs


Unnamed: 0,No of Components,Atomic Size diff. (δ),Δχ,ΔHmix,ΔSmix,Omega (Ω),VEC,Melting Tempurature,Λ,γ,Mean Cohesive Energy,D⋅r,D⋅χ,G,η,δG,D⋅G,Mean APE
0,1,0.0,0.0,0.0,-0.0,0.0,10.0,1828.05,0.0,1.0,375.327817,0.0,0.0,51110000000.0,0.0,0.0,0.0,0.023994
1,1,0.0,0.0,0.0,-0.0,0.0,10.0,1828.05,0.0,1.0,375.327817,0.0,0.0,51110000000.0,0.0,0.0,0.0,0.023994
2,1,0.0,0.0,0.0,-0.0,0.0,10.0,1828.05,0.0,1.0,375.327817,0.0,0.0,51110000000.0,0.0,0.0,0.0,0.023994
3,1,0.0,0.0,0.0,-0.0,0.0,10.0,1828.05,0.0,1.0,375.327817,0.0,0.0,51110000000.0,0.0,0.0,0.0,0.023994
4,1,0.0,0.0,0.0,-0.0,0.0,10.0,1828.05,0.0,1.0,375.327817,0.0,0.0,51110000000.0,0.0,0.0,0.0,0.023994


In [None]:
%%time
# Featurize the dataframe

def change_place(dataframe, column='Temperature', loc=2):
    """
    Return a copy of ``dataframe`` in which ``column`` sits at position ``loc``.

    The input frame is left untouched; ``loc`` is the insertion position
    counted after the column has been taken out.
    """
    reordered = dataframe.copy()
    # pop() is evaluated first, so the column is removed before it is re-inserted
    reordered.insert(loc, column, reordered.pop(column))
    return reordered

def cbfv_featurization(dataframe, elem_prop, column_list=['formula', 'target']):
    """
    Featurize a dataset with CBFV and keep only the averaged ('avg...') features.

    Parameters:
    ----------
    dataframe : pd.DataFrame
        Input DataFrame. Its 'Composition in mole percent' and 'Permeability'
        columns are renamed to 'formula' and 'target'; the input itself is
        not modified.
    elem_prop : str
        Name of the element property set passed to ``generate_features``.
    column_list : list of str, optional
        Columns (after renaming) handed to ``generate_features``.

    Returns:
    -------
    X : pd.DataFrame
        Feature matrix restricted to the columns whose name starts with 'avg'.
    y, formulae, skipped
        The remaining three values returned by ``generate_features``, unchanged.
    """
    cbfv_input = dataframe.copy().rename(
        columns={
            'Composition in mole percent': 'formula',
            'Permeability': 'target',
        }
    )[column_list]

    X, y, formulae, skipped = generate_features(
        cbfv_input,
        elem_prop=elem_prop,
        drop_duplicates=False,
        extend_features=True,
        sum_feat=False,
    )

    # Only the averaged features are retained, to stay consistent with WenAlloys()
    # and to avoid carrying every other statistic of each element property.
    avg_columns = [name for name in X.columns.to_list() if str(name).startswith('avg')]

    return X[avg_columns], y, formulae, skipped

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.44 µs


In [None]:
%%time
#Featurization using cbfv
#%%

# Source: the unit-annotated element property table stored on Google Drive.
PATH_1 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/oliynyk_extended_std_unit.csv'
# Destination: the element_properties folder of the CBFV package that is actually imported.
# It is resolved from the module location instead of a hard-coded
# '/usr/local/lib/python3.11/dist-packages/...' path, which stops working as soon as the
# runtime's Python version or install location changes.
PATH = os.path.join(os.path.dirname(CBFV.composition.__file__), 'element_properties', 'oliynyk_extended_std_unit.csv')

oliynyk_extended_dataset = pd.read_csv(PATH_1)

# Maps each unit-annotated column header of the source table to the same name
# without the unit suffix. Columns that are not listed keep their name.
rename_dict = {
 'Atomic weight (g/mol)':'Atomic weight',
 'Ionization energy (kJ/mol)':'Ionization energy',
 'Electron affinity (kJ/mol)':'Electron affinity',
 'Atomic radius calculated (m)':'Atomic radius calculated',
 'Covalent radius(m)':'Covalent radius',
 'Effective ionic radius(m)':'Effective ionic radius',
 'Miracle radius (m)':'Miracle radius',
 'van der Waals radius (m)':'van der Waals radius',
 'Slater radius(m)':'Slater radius',
 'Pauling, R(CN12) (m)':'Pauling, R(CN12)',
 'Pauling, R(1) (m)':'Pauling, R(1)',
 'Polarizability (m^3)':'Polarizability',
 'Boiling point (K)':'Boiling point',
 'Density (g/L)':'Density',
 'Specific heat (kJ/g K)':'Specific heat',
 'Heat of fusion (kJ/mol)':'Heat of fusion',
 'Heat of vaporization (kJ/mol)': 'Heat of vaporization',
 'Heat of atomization (kJ/mol)': 'Heat of atomization',
 'Thermal conductivity (W/m K)':'Thermal conductivity',
 'Thermal expansion (1/K)': 'Thermal expansion',
 'Cohesive energy (kJ/mol)':'Cohesive energy',
 'Bulk modulus (Pa)':'Bulk modulus',
 'Shear modulus (Pa)':'Shear modulus',
 "Young's modulus (Pa)":"Young's modulus",
 'Hardness Vickers (Pa)':'Hardness Vickers',
 'Hardness Brinell (Pa)':'Hardness Brinell',
 'Resistivity (Ω)':'Resistivity',
 'Electrical Conductivity (S/m)':'Electrical Conductivity',
 'DFT LDA Etot (kJ/mol)':'DFT LDA Etot',
 'DFT LDA Ekin (kJ/mol)':'DFT LDA Ekin',
 'DFT LDA Ecoul (kJ/mol)':'DFT LDA Ecoul',
 'DFT LDA Eenuc (kJ/mol)':'DFT LDA Eenuc',
 'DFT LDA Exc (kJ/mol)':'DFT LDA Exc',
 'DFT LSD Etot (kJ/mol)':'DFT LSD Etot',
 'DFT LSD Ekin (kJ/mol)':'DFT LSD Ekin',
 'DFT LSD Ecoul (kJ/mol)':'DFT LSD Ecoul',
 'DFT LSD Eenuc (kJ/mol)':'DFT LSD Eenuc',
 'DFT LSD Exc (kJ/mol)':'DFT LSD Exc',
 'DFT RLDA Etot (kJ/mol)':'DFT RLDA Etot',
 'DFT RLDA Ekin (kJ/mol)':'DFT RLDA Ekin',
 'DFT RLDA Ecoul (kJ/mol)':'DFT RLDA Ecoul',
 'DFT RLDA Eenuc (kJ/mol)':'DFT RLDA Eenuc',
 'DFT RLDA Exc (kJ/mol)':'DFT RLDA Exc',
 'DFT ScRLDA Etot (kJ/mol)':'DFT ScRLDA Etot',
 'DFT ScRLDA Ekin (kJ/mol)':'DFT ScRLDA Ekin',
 'DFT ScRLDA Ecoul (kJ/mol)':'DFT ScRLDA Ecoul',
 'DFT ScRLDA Eenuc (kJ/mol)':'DFT ScRLDA Eenuc',
 'DFT ScRLDA Exc (kJ/mol)':'DFT ScRLDA Exc'
}

# Apply the unit-free headers; the result replaces the frame that was read from PATH_1.
oliynyk_extended_dataset = oliynyk_extended_dataset.rename(columns=rename_dict)

# Write the renamed table to PATH (no index column). This is a side effect on the file system:
# any existing file at PATH is overwritten.
# NOTE(review): presumably this is what lets elem_prop='oliynyk_extended_std_unit' be resolved by CBFV — confirm.
oliynyk_extended_dataset.to_csv(PATH, index=False)

# CBFV featurization of every split with the custom element property table.
# Each call yields (features, target, formulae, skipped rows).
(cbfv_df_train_70, permeability_train_70,
 formulae_train_70, skipped_train_70) = cbfv_featurization(
    df_train_70,
    elem_prop='oliynyk_extended_std_unit',
    column_list=['formula', 'target'],
)

(cbfv_df_test_70, permeability_test_70,
 formulae_test_70, skipped_test_70) = cbfv_featurization(
    df_test_70,
    elem_prop='oliynyk_extended_std_unit',
    column_list=['formula', 'target'],
)

(cbfv_df_train_80, permeability_train_80,
 formulae_train_80, skipped_train_80) = cbfv_featurization(
    df_train_80,
    elem_prop='oliynyk_extended_std_unit',
    column_list=['formula', 'target'],
)

(cbfv_df_test_80, permeability_test_80,
 formulae_test_80, skipped_test_80) = cbfv_featurization(
    df_test_80,
    elem_prop='oliynyk_extended_std_unit',
    column_list=['formula', 'target'],
)

(cbfv_df_train_90, permeability_train_90,
 formulae_train_90, skipped_train_90) = cbfv_featurization(
    df_train_90,
    elem_prop='oliynyk_extended_std_unit',
    column_list=['formula', 'target'],
)

(cbfv_df_test_90, permeability_test_90,
 formulae_test_90, skipped_test_90) = cbfv_featurization(
    df_test_90,
    elem_prop='oliynyk_extended_std_unit',
    column_list=['formula', 'target'],
)

Processing Input Data: 100%|██████████| 1788/1788 [00:00<00:00, 6985.15it/s]


	Featurizing Compositions...


Assigning Features...: 100%|██████████| 1788/1788 [00:00<00:00, 11100.44it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|██████████| 435/435 [00:00<00:00, 18691.96it/s]


	Featurizing Compositions...


Assigning Features...: 100%|██████████| 435/435 [00:00<00:00, 13201.56it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|██████████| 1883/1883 [00:00<00:00, 26034.57it/s]


	Featurizing Compositions...


Assigning Features...: 100%|██████████| 1883/1883 [00:00<00:00, 12501.80it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|██████████| 340/340 [00:00<00:00, 20535.74it/s]


	Featurizing Compositions...


Assigning Features...: 100%|██████████| 340/340 [00:00<00:00, 13084.59it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|██████████| 2092/2092 [00:00<00:00, 23846.95it/s]


	Featurizing Compositions...


Assigning Features...: 100%|██████████| 2092/2092 [00:00<00:00, 14341.28it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|██████████| 131/131 [00:00<00:00, 17734.61it/s]


	Featurizing Compositions...


Assigning Features...: 100%|██████████| 131/131 [00:00<00:00, 9710.07it/s]


	Creating Pandas Objects...
CPU times: user 2.19 s, sys: 87.4 ms, total: 2.28 s
Wall time: 3.01 s


In [None]:
%%time
# Give every CBFV feature table the row index of the split it was built from.
# pandas raises if a feature table and its source split differ in length.
for _cbfv_frame, _source_frame in (
    (cbfv_df_train_70, df_train_70),
    (cbfv_df_test_70, df_test_70),
    (cbfv_df_train_80, df_train_80),
    (cbfv_df_test_80, df_test_80),
    (cbfv_df_train_90, df_train_90),
    (cbfv_df_test_90, df_test_90),
):
    _cbfv_frame.index = _source_frame.index

CPU times: user 106 µs, sys: 0 ns, total: 106 µs
Wall time: 110 µs


In [None]:
# Preview the first rows of the CBFV feature table for the train_70 split
cbfv_df_train_70.head()

Unnamed: 0,avg_Atomic weight,avg_Atomic number,avg_Period,avg_Group,avg_Quantum number ℓ,avg_Metal(1)/metalloid(2)/nonmetal(3),avg_Mendeleev_number,avg_Families,avg_Valence_s,avg_Valence_p,avg_Valence_d,avg_Valence_f,avg_Unfilled_s,avg_Unfilled_p,avg_Unfilled_d,avg_Unfilled_f,avg_No. of valence electrons,avg_Outer shell electrons,avg_Gilman no. of valence electrons,avg_Metallic valence,avg_Zeff,avg_Ionization energy,avg_Electron affinity,avg_Pauling EN,avg_Martynov Batsanov EN,avg_Mulliken EN,avg_Allred EN,avg_Allred Rockow EN,avg_Nagle EN,avg_Ghosh EN,avg_Atomic radius calculated,avg_Covalent radius,avg_Effective ionic radius,avg_Miracle radius,avg_van der Waals radius,avg_Slater radius,"avg_Pauling, R(CN12)","avg_Pauling, R(1)",avg_Polarizability,avg_Boiling point,avg_Density,avg_Specific heat,avg_Heat of fusion,avg_Heat of vaporization,avg_Heat of atomization,avg_Thermal conductivity,avg_Thermal expansion,avg_Cohesive energy,avg_Bulk modulus,avg_Shear modulus,avg_Young's modulus,avg_Hardness Vickers,avg_Hardness Brinell,avg_Resistivity,avg_Electrical Conductivity,avg_DFT LDA Etot,avg_DFT LDA Ekin,avg_DFT LDA Ecoul,avg_DFT LDA Eenuc,avg_DFT LDA Exc,avg_DFT LSD Etot,avg_DFT LSD Ekin,avg_DFT LSD Ecoul,avg_DFT LSD Eenuc,avg_DFT LSD Exc,avg_DFT RLDA Etot,avg_DFT RLDA Ekin,avg_DFT RLDA Ecoul,avg_DFT RLDA Eenuc,avg_DFT RLDA Exc,avg_DFT ScRLDA Etot,avg_DFT ScRLDA Ekin,avg_DFT ScRLDA Ecoul,avg_DFT ScRLDA Eenuc,avg_DFT ScRLDA Exc
0,106.42,46.0,5.0,10.0,2.0,1.0,62.0,4.0,0.0,0.0,10.0,0.0,2.0,6.0,0.0,14.0,10.0,10.0,3.0,5.78,3.101,804.397946,54.031768,2.2,2.08,4.45,2.2,1.59,1.61,2.13,1.69e-10,1.31e-10,5.9e-11,1.42e-10,1.63e-10,1.4e-10,1.373e-10,1.278e-10,4.8e-30,3413.15,12000.0,0.00024,17.6,357.0,378.0,71.8,1.2e-05,375.327817,187000000000.0,44000000000.0,121000000000.0,461000000.0,37200000.0,1e-10,10000000.0,-12957809.75,12949581.72,5334086.148,-30883430.44,-358047.1934,-12957809.75,12949581.72,5334086.148,-30883430.44,-358047.1934,-13221486.93,13773244.16,5401468.776,-32047348.43,-348851.4479,-13218769.46,13675220.75,5395353.24,-31941090.88,-348252.5739
1,106.42,46.0,5.0,10.0,2.0,1.0,62.0,4.0,0.0,0.0,10.0,0.0,2.0,6.0,0.0,14.0,10.0,10.0,3.0,5.78,3.101,804.397946,54.031768,2.2,2.08,4.45,2.2,1.59,1.61,2.13,1.69e-10,1.31e-10,5.9e-11,1.42e-10,1.63e-10,1.4e-10,1.373e-10,1.278e-10,4.8e-30,3413.15,12000.0,0.00024,17.6,357.0,378.0,71.8,1.2e-05,375.327817,187000000000.0,44000000000.0,121000000000.0,461000000.0,37200000.0,1e-10,10000000.0,-12957809.75,12949581.72,5334086.148,-30883430.44,-358047.1934,-12957809.75,12949581.72,5334086.148,-30883430.44,-358047.1934,-13221486.93,13773244.16,5401468.776,-32047348.43,-348851.4479,-13218769.46,13675220.75,5395353.24,-31941090.88,-348252.5739
2,106.42,46.0,5.0,10.0,2.0,1.0,62.0,4.0,0.0,0.0,10.0,0.0,2.0,6.0,0.0,14.0,10.0,10.0,3.0,5.78,3.101,804.397946,54.031768,2.2,2.08,4.45,2.2,1.59,1.61,2.13,1.69e-10,1.31e-10,5.9e-11,1.42e-10,1.63e-10,1.4e-10,1.373e-10,1.278e-10,4.8e-30,3413.15,12000.0,0.00024,17.6,357.0,378.0,71.8,1.2e-05,375.327817,187000000000.0,44000000000.0,121000000000.0,461000000.0,37200000.0,1e-10,10000000.0,-12957809.75,12949581.72,5334086.148,-30883430.44,-358047.1934,-12957809.75,12949581.72,5334086.148,-30883430.44,-358047.1934,-13221486.93,13773244.16,5401468.776,-32047348.43,-348851.4479,-13218769.46,13675220.75,5395353.24,-31941090.88,-348252.5739
3,106.42,46.0,5.0,10.0,2.0,1.0,62.0,4.0,0.0,0.0,10.0,0.0,2.0,6.0,0.0,14.0,10.0,10.0,3.0,5.78,3.101,804.397946,54.031768,2.2,2.08,4.45,2.2,1.59,1.61,2.13,1.69e-10,1.31e-10,5.9e-11,1.42e-10,1.63e-10,1.4e-10,1.373e-10,1.278e-10,4.8e-30,3413.15,12000.0,0.00024,17.6,357.0,378.0,71.8,1.2e-05,375.327817,187000000000.0,44000000000.0,121000000000.0,461000000.0,37200000.0,1e-10,10000000.0,-12957809.75,12949581.72,5334086.148,-30883430.44,-358047.1934,-12957809.75,12949581.72,5334086.148,-30883430.44,-358047.1934,-13221486.93,13773244.16,5401468.776,-32047348.43,-348851.4479,-13218769.46,13675220.75,5395353.24,-31941090.88,-348252.5739
4,106.42,46.0,5.0,10.0,2.0,1.0,62.0,4.0,0.0,0.0,10.0,0.0,2.0,6.0,0.0,14.0,10.0,10.0,3.0,5.78,3.101,804.397946,54.031768,2.2,2.08,4.45,2.2,1.59,1.61,2.13,1.69e-10,1.31e-10,5.9e-11,1.42e-10,1.63e-10,1.4e-10,1.373e-10,1.278e-10,4.8e-30,3413.15,12000.0,0.00024,17.6,357.0,378.0,71.8,1.2e-05,375.327817,187000000000.0,44000000000.0,121000000000.0,461000000.0,37200000.0,1e-10,10000000.0,-12957809.75,12949581.72,5334086.148,-30883430.44,-358047.1934,-12957809.75,12949581.72,5334086.148,-30883430.44,-358047.1934,-13221486.93,13773244.16,5401468.776,-32047348.43,-348851.4479,-13218769.46,13675220.75,5395353.24,-31941090.88,-348252.5739


In [None]:
%%time
# Count the missing values in each feature column of the test_90 CBFV table
cbfv_df_test_90.isna().sum()

CPU times: user 4.23 ms, sys: 6 µs, total: 4.23 ms
Wall time: 4.28 ms


Unnamed: 0,0
avg_Atomic weight,0
avg_Atomic number,0
avg_Period,0
avg_Group,0
avg_Quantum number ℓ,0
avg_Metal(1)/metalloid(2)/nonmetal(3),0
avg_Mendeleev_number,0
avg_Families,0
avg_Valence_s,0
avg_Valence_p,0


In [None]:
%%time
def get_elemental_block(dataframe:pd.DataFrame, elements:"list[str] | None"=None)->pd.DataFrame:

  """
  Extract the chemical-element columns of a dataframe.

  Parameters:
  ----------
  dataframe : pd.DataFrame
      Input table; only its column labels are inspected.
  elements : list of str, optional
      Element symbols to look for among the column labels. When omitted,
      the built-in list of elements present in this project's dataset is used.

  Returns:
  -------
  element_block : pd.DataFrame
      The columns of ``dataframe`` whose label is one of ``elements``, kept in
      the order they appear in ``dataframe``. Empty (zero columns) when no
      label matches.
  """

  if elements is None:
    # Default: the element symbols used as column names in this dataset
    elements = ['Ag', 'Al', 'Au', 'B', 'Ce', 'Co', 'Cr', 'Cu', 'Dy', 'Er', 'Fe', 'Ga', 'Gd', 'Ho', 'In', 'Ir', 'La', 'Lu', 'Mn', 'Mo', 'Nb', 'Ni', 'Pb', 'Pd', 'Pr', 'Pt', 'Rh', 'Ru', 'Sm', 'Sn', 'Ta', 'Tb', 'Ti', 'Tm', 'V', 'W', 'Y', 'Yb', 'Zn', 'Zr']
  elif isinstance(elements, str):
    # A bare symbol such as 'Pd' would otherwise be split into characters
    elements = [elements]

  # Set membership keeps the column filter linear in the number of columns
  element_lookup = set(elements)

  element_block_column_names = [label for label in dataframe.columns if label in element_lookup]

  return dataframe[element_block_column_names]

# Get the elements automatically
# unique_elements = list({el.symbol for comp in pd.concat([df_train_70['Composition_matrix'], df_test_70['Composition_matrix']]) for el in comp.keys()})
# sorted(unique_elements)

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 9.3 µs


In [None]:
%%time
# Extract the chemical-element columns from every train/test split
(
    element_block_train_70,
    element_block_test_70,
    element_block_train_80,
    element_block_test_80,
    element_block_train_90,
    element_block_test_90,
) = [
    get_elemental_block(split_frame)
    for split_frame in (
        df_train_70,
        df_test_70,
        df_train_80,
        df_test_80,
        df_train_90,
        df_test_90,
    )
]

CPU times: user 5.27 ms, sys: 6 µs, total: 5.27 ms
Wall time: 5.07 ms


In [None]:
%%time
def drop_columns(data_list: list[pd.DataFrame], columns_to_keep: list) -> list[pd.DataFrame]:
    """
    Restrict every DataFrame in a list to the same subset of columns.

    Despite the name, the argument lists the columns that are *kept*;
    every other column is discarded. The input frames are not modified.

    Parameters:
    ----------
    data_list : list of pd.DataFrame
        Frames to trim.
    columns_to_keep : list
        Column labels to retain, in the order they should appear.

    Returns:
    -------
    list of pd.DataFrame
        One trimmed frame per input frame, in the same order.
    """
    # Copy first so the selection never touches the caller's frame
    return [frame.copy().loc[:, columns_to_keep] for frame in data_list]

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 11.9 µs


In [None]:
%%time
# Trim every split down to the columns listed below.
# NOTE: the split frames are overwritten here, so the chemical-element columns
# are gone afterwards; get_elemental_block must have been run on the splits
# before this cell.
columns_to_keep = [
    'Composition in mole percent',
    'Thickness',
    'Lattice parameter',
    'Δa_ss/a_Pd',
    'Bravais lattice',
    'Chemical group',
    'Stratify Group',
    'Temperature',
    'Pressure difference',
    'Permeability',
]

data_list = [
    df_train_70, df_test_70,
    df_train_80, df_test_80,
    df_train_90, df_test_90,
]

(
    df_train_70, df_test_70,
    df_train_80, df_test_80,
    df_train_90, df_test_90,
) = drop_columns(data_list, columns_to_keep)

CPU times: user 9.45 ms, sys: 0 ns, total: 9.45 ms
Wall time: 9.33 ms


In [None]:
# Preview the first rows of the trimmed train_70 split
df_train_70.head()

Unnamed: 0,Composition in mole percent,Thickness,Lattice parameter,Δa_ss/a_Pd,Bravais lattice,Chemical group,Stratify Group,Temperature,Pressure difference,Permeability
0,Pd,0.00025,3.890104e-10,0.0,fcc,G10,G10,737.15,325.7194,9.24e-09
1,Pd,0.0007,3.890104e-10,0.0,fcc,G10,G10,673.15,730.0685,1.32e-08
2,Pd,0.0007,3.890104e-10,0.0,fcc,G10,G10,673.15,632.4555,1.26e-08
3,Pd,0.0007,3.890104e-10,0.0,fcc,G10,G10,673.15,516.7204,1.26e-08
4,Pd,0.0007,3.890104e-10,0.0,fcc,G10,G10,623.15,816.7007,1.12e-08


In [None]:
%%time
# %%
# Destination pickle files for the trimmed split frames (one per split).
# The 'Resut_Data_cleaning' spelling is kept as-is; presumably it matches the
# existing Drive folder -- verify before renaming.
PATH_train_70 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/Experimental_cond_train_70.pkl'
PATH_test_70 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/Experimental_cond_test_70.pkl'
PATH_train_80 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/Experimental_cond_train_80.pkl'
PATH_test_80 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/Experimental_cond_test_80.pkl'
PATH_train_90 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/Experimental_cond_train_90.pkl'
PATH_test_90 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/Experimental_cond_test_90.pkl'
#%%
# Write each split frame to its pickle file
for _frame, _destination in (
    (df_train_70, PATH_train_70),
    (df_test_70, PATH_test_70),
    (df_train_80, PATH_train_80),
    (df_test_80, PATH_test_80),
    (df_train_90, PATH_train_90),
    (df_test_90, PATH_test_90),
):
    _frame.to_pickle(_destination)

CPU times: user 17.9 ms, sys: 3 ms, total: 20.9 ms
Wall time: 2.11 s


In [None]:
# Recap
# A modelling dataset can be assembled by combining the saved blocks, e.g.:
# df_train_70 = experimental_properties + bond_properties + cbfv_features + elemental_block

In [None]:
%%time
#%%
# Destination pickle files for the bond-property frames (one per split)
path_bond_properties_train_70 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/bond_properties_train_70.pkl'
path_bond_properties_test_70 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/bond_properties_test_70.pkl'
path_bond_properties_train_80 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/bond_properties_train_80.pkl'
path_bond_properties_test_80 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/bond_properties_test_80.pkl'
path_bond_properties_train_90 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/bond_properties_train_90.pkl'
path_bond_properties_test_90 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/bond_properties_test_90.pkl'

#%%
# Write each bond-property frame to its pickle file
for _frame, _destination in (
    (bond_properties_train_70, path_bond_properties_train_70),
    (bond_properties_test_70, path_bond_properties_test_70),
    (bond_properties_train_80, path_bond_properties_train_80),
    (bond_properties_test_80, path_bond_properties_test_80),
    (bond_properties_train_90, path_bond_properties_train_90),
    (bond_properties_test_90, path_bond_properties_test_90),
):
    _frame.to_pickle(_destination)

CPU times: user 13.6 ms, sys: 6.93 ms, total: 20.5 ms
Wall time: 2.43 s


In [None]:
%%time
#%%
# Destination pickle files for the CBFV feature tables (one per split)
path_cbfv_train_70 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/cbfv_df_train_70.pkl'
path_cbfv_test_70 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/cbfv_df_test_70.pkl'
path_cbfv_train_80 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/cbfv_df_train_80.pkl'
path_cbfv_test_80 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/cbfv_df_test_80.pkl'
path_cbfv_train_90 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/cbfv_df_train_90.pkl'
path_cbfv_test_90 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/cbfv_df_test_90.pkl'

#%%
# Write each CBFV feature table to its pickle file
for _frame, _destination in (
    (cbfv_df_train_70, path_cbfv_train_70),
    (cbfv_df_test_70, path_cbfv_test_70),
    (cbfv_df_train_80, path_cbfv_train_80),
    (cbfv_df_test_80, path_cbfv_test_80),
    (cbfv_df_train_90, path_cbfv_train_90),
    (cbfv_df_test_90, path_cbfv_test_90),
):
    _frame.to_pickle(_destination)

CPU times: user 28.1 ms, sys: 5.99 ms, total: 34.1 ms
Wall time: 2.89 s


In [None]:
%%time
#%%
# Destination pickle files for the chemical-element blocks (one per split)
path_element_block_train_70 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/element_block_train_70.pkl'
path_element_block_test_70 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/element_block_test_70.pkl'
path_element_block_train_80 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/element_block_train_80.pkl'
path_element_block_test_80 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/element_block_test_80.pkl'
path_element_block_train_90 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/element_block_train_90.pkl'
path_element_block_test_90 = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/element_block_test_90.pkl'

#%%
# Write each element block to its pickle file
for _frame, _destination in (
    (element_block_train_70, path_element_block_train_70),
    (element_block_test_70, path_element_block_test_70),
    (element_block_train_80, path_element_block_train_80),
    (element_block_test_80, path_element_block_test_80),
    (element_block_train_90, path_element_block_train_90),
    (element_block_test_90, path_element_block_test_90),
):
    _frame.to_pickle(_destination)

CPU times: user 16.2 ms, sys: 4.03 ms, total: 20.3 ms
Wall time: 2.48 s


In [None]:
# Preview the first rows of the CBFV feature table for the test_70 split
cbfv_df_test_70.head()

Unnamed: 0,avg_Atomic weight,avg_Atomic number,avg_Period,avg_Group,avg_Quantum number ℓ,avg_Metal(1)/metalloid(2)/nonmetal(3),avg_Mendeleev_number,avg_Families,avg_Valence_s,avg_Valence_p,avg_Valence_d,avg_Valence_f,avg_Unfilled_s,avg_Unfilled_p,avg_Unfilled_d,avg_Unfilled_f,avg_No. of valence electrons,avg_Outer shell electrons,avg_Gilman no. of valence electrons,avg_Metallic valence,avg_Zeff,avg_Ionization energy,avg_Electron affinity,avg_Pauling EN,avg_Martynov Batsanov EN,avg_Mulliken EN,avg_Allred EN,avg_Allred Rockow EN,avg_Nagle EN,avg_Ghosh EN,avg_Atomic radius calculated,avg_Covalent radius,avg_Effective ionic radius,avg_Miracle radius,avg_van der Waals radius,avg_Slater radius,"avg_Pauling, R(CN12)","avg_Pauling, R(1)",avg_Polarizability,avg_Boiling point,avg_Density,avg_Specific heat,avg_Heat of fusion,avg_Heat of vaporization,avg_Heat of atomization,avg_Thermal conductivity,avg_Thermal expansion,avg_Cohesive energy,avg_Bulk modulus,avg_Shear modulus,avg_Young's modulus,avg_Hardness Vickers,avg_Hardness Brinell,avg_Resistivity,avg_Electrical Conductivity,avg_DFT LDA Etot,avg_DFT LDA Ekin,avg_DFT LDA Ecoul,avg_DFT LDA Eenuc,avg_DFT LDA Exc,avg_DFT LSD Etot,avg_DFT LSD Ekin,avg_DFT LSD Ecoul,avg_DFT LSD Eenuc,avg_DFT LSD Exc,avg_DFT RLDA Etot,avg_DFT RLDA Ekin,avg_DFT RLDA Ecoul,avg_DFT RLDA Eenuc,avg_DFT RLDA Exc,avg_DFT ScRLDA Etot,avg_DFT ScRLDA Ekin,avg_DFT ScRLDA Ecoul,avg_DFT ScRLDA Eenuc,avg_DFT ScRLDA Exc
137,107.575214,46.7978,5.0,10.7978,0.4044,1.0,64.3934,4.0,0.7978,0.0,10.0,0.0,1.2022,6.0,0.0,14.0,10.7978,2.8198,2.2022,5.508748,3.602816,745.819231,111.455843,1.984594,1.274222,4.442022,1.984594,1.813384,1.482352,2.32945,1.658088e-10,1.485516e-10,1.036768e-10,1.435956e-10,1.701802e-10,1.55956e-10,1.428048e-10,1.326666e-10,7.273179999999999e-30,2672.7916,10804.0978,0.000236,12.57386,272.098124,303.0068,356.77416,1.7e-05,302.970403,120782600000.0,32830800000.0,92279200000.0,293462000.0,27067940.0,3.29848e-11,51485600.0,-13501710.0,13493310.0,5543491.0,-32170410.0,-368103.145434,-13501720.0,13493330.0,5543666.0,-32170590.0,-368121.404284,-13786390.0,14383860.0,5616967.0,-33428770.0,-358446.151226,-13783350.0,14277140.0,5610360.0,-33313040.0,-357810.690517
138,107.575214,46.7978,5.0,10.7978,0.4044,1.0,64.3934,4.0,0.7978,0.0,10.0,0.0,1.2022,6.0,0.0,14.0,10.7978,2.8198,2.2022,5.508748,3.602816,745.819231,111.455843,1.984594,1.274222,4.442022,1.984594,1.813384,1.482352,2.32945,1.658088e-10,1.485516e-10,1.036768e-10,1.435956e-10,1.701802e-10,1.55956e-10,1.428048e-10,1.326666e-10,7.273179999999999e-30,2672.7916,10804.0978,0.000236,12.57386,272.098124,303.0068,356.77416,1.7e-05,302.970403,120782600000.0,32830800000.0,92279200000.0,293462000.0,27067940.0,3.29848e-11,51485600.0,-13501710.0,13493310.0,5543491.0,-32170410.0,-368103.145434,-13501720.0,13493330.0,5543666.0,-32170590.0,-368121.404284,-13786390.0,14383860.0,5616967.0,-33428770.0,-358446.151226,-13783350.0,14277140.0,5610360.0,-33313040.0,-357810.690517
139,75.756515,33.8416,4.2848,10.7152,2.0,1.0,63.4304,4.0,0.7152,0.0,10.0,0.0,1.2848,6.0,0.0,14.0,10.7152,3.5632,2.2848,5.536832,3.038062,762.235105,100.542005,1.98544,1.3648,4.471456,1.98544,1.775952,1.53848,2.931024,1.518352e-10,1.360064e-10,7.18736e-11,1.31272e-10,1.465504e-10,1.36424e-10,1.303626e-10,1.202904e-10,6.15888e-30,3003.3404,9806.4816,0.00034,14.34584,316.44816,349.392,307.24384,1.5e-05,347.725302,151955200000.0,46860800000.0,127436800000.0,395201600.0,635679360.0,4.06384e-11,45044800.0,-6765749.0,6760020.0,2859732.0,-16162660.0,-222838.88372,-6765763.0,6760038.0,2859866.0,-16162810.0,-222859.099635,-6865489.0,7069790.0,2885720.0,-16602420.0,-218584.194473,-6864617.0,7034191.0,2883384.0,-16563860.0,-218328.689137
140,75.756515,33.8416,4.2848,10.7152,2.0,1.0,63.4304,4.0,0.7152,0.0,10.0,0.0,1.2848,6.0,0.0,14.0,10.7152,3.5632,2.2848,5.536832,3.038062,762.235105,100.542005,1.98544,1.3648,4.471456,1.98544,1.775952,1.53848,2.931024,1.518352e-10,1.360064e-10,7.18736e-11,1.31272e-10,1.465504e-10,1.36424e-10,1.303626e-10,1.202904e-10,6.15888e-30,3003.3404,9806.4816,0.00034,14.34584,316.44816,349.392,307.24384,1.5e-05,347.725302,151955200000.0,46860800000.0,127436800000.0,395201600.0,635679360.0,4.06384e-11,45044800.0,-6765749.0,6760020.0,2859732.0,-16162660.0,-222838.88372,-6765763.0,6760038.0,2859866.0,-16162810.0,-222859.099635,-6865489.0,7069790.0,2885720.0,-16602420.0,-218584.194473,-6864617.0,7034191.0,2883384.0,-16563860.0,-218328.689137
141,75.756515,33.8416,4.2848,10.7152,2.0,1.0,63.4304,4.0,0.7152,0.0,10.0,0.0,1.2848,6.0,0.0,14.0,10.7152,3.5632,2.2848,5.536832,3.038062,762.235105,100.542005,1.98544,1.3648,4.471456,1.98544,1.775952,1.53848,2.931024,1.518352e-10,1.360064e-10,7.18736e-11,1.31272e-10,1.465504e-10,1.36424e-10,1.303626e-10,1.202904e-10,6.15888e-30,3003.3404,9806.4816,0.00034,14.34584,316.44816,349.392,307.24384,1.5e-05,347.725302,151955200000.0,46860800000.0,127436800000.0,395201600.0,635679360.0,4.06384e-11,45044800.0,-6765749.0,6760020.0,2859732.0,-16162660.0,-222838.88372,-6765763.0,6760038.0,2859866.0,-16162810.0,-222859.099635,-6865489.0,7069790.0,2885720.0,-16602420.0,-218584.194473,-6864617.0,7034191.0,2883384.0,-16563860.0,-218328.689137
