<a href="https://colab.research.google.com/github/AldoKwamibar/Pd-membranes-permeability/blob/main/Palladium_Membrane_ML_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# """
# This section is a materials informatics project that aims to build a quantitative structure-property relationship (QSPR) linking the composition and structure of palladium-based alloy membranes to their hydrogen permeability.
# ...
# """

In [None]:
%%time
# Utilities to visualize periodic table and Element Counts in a material Informatics database
!pip install pymatgen matminer

CPU times: user 1.9 s, sys: 312 ms, total: 2.21 s
Wall time: 5.2 s


In [None]:
%%time
# Import useful packages and dependencies
import os
import pandas as pd
import numpy as np
import scipy as sc
import re
import unicodedata
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format='retina' #Allow image to have high resolution
np.random.seed(42) # Ensure reproducibility

CPU times: user 3.87 ms, sys: 999 µs, total: 4.87 ms
Wall time: 5.14 ms


In [None]:
%%time
from pymatgen.core import Element, Composition
from matminer.featurizers.composition import ElementFraction
from matminer.featurizers.conversions import StrToComposition

CPU times: user 24 µs, sys: 0 ns, total: 24 µs
Wall time: 28.1 µs


In [None]:
%%time
# Mount Google Drive under /content/drive so that files can be imported from and exported to it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
CPU times: user 28.6 ms, sys: 3.07 ms, total: 31.7 ms
Wall time: 2.31 s


In [None]:
# def  create_folder(path):
#   if not os.path.exists(path):
#     os.makedirs(path)
#     print(f'Directory {path} created.')
#   else:
#     print(f'Directory {path} already exists.')
# create_folder('/content/drive/Dataset_MaterialInformatics_PhD')
# create_folder('/content/drive/result_MaterialInformatics_PhD')

# os.chdir('/content/drive/Colab Notebooks/')
# print('Work space:', os.getcwd())

In [None]:
%%time
# Lift pandas' display limits so that DataFrames are shown with all of their columns and rows
# (by default only a truncated view is displayed).
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

CPU times: user 74 µs, sys: 1e+03 ns, total: 75 µs
Wall time: 78.7 µs


In [None]:
%%time
# Location of the raw dataset (CSV file) on the mounted Google Drive.
data_path = '/content/drive/MyDrive/PhD_Pd alloy prediction/Palladium_Alloy_Membranes dataset_QSPR_ML.csv'
# Load the CSV into a DataFrame; this is the starting point of all the preprocessing below.
raw_data_1 = pd.read_csv(data_path)
# Report (rows, columns) of the freshly loaded data.
print(f'Original DataFrame shape: {raw_data_1.shape}')

Original DataFrame shape: (2798, 32)
CPU times: user 99.3 ms, sys: 4.04 ms, total: 103 ms
Wall time: 105 ms


In [None]:
%%time
# List every column with its non-null count and dtype, which shows how sparsely each column is populated.
raw_data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2798 entries, 0 to 2797
Data columns (total 32 columns):
 #   Column                                                        Non-Null Count  Dtype  
---  ------                                                        --------------  -----  
 0   Alloy Formula                                                 2419 non-null   object 
 1   Composition
 type                                             2419 non-null   object 
 2   Form                                                          2419 non-null   object 
 3   Composition in mole percent                                   2419 non-null   object 
 4   Thickness 
(micron)                                           2419 non-null   object 
 5   fcc lattice parameter for Cu alloys                           5 non-null      object 
 6   Bravais lattice in as-annealed form for PdCu based membranes  1 non-null      float64
 7   Latice parameter (nm)                                         2136 no

In [None]:
# Displaying columns-name-corrected dataset
%%time
raw_data_1.head()

CPU times: user 162 µs, sys: 0 ns, total: 162 µs
Wall time: 167 µs


Unnamed: 0,Alloy Formula,Composition\n type,Form,Composition in mole percent,Thickness \n(micron),fcc lattice parameter for Cu alloys,Bravais lattice in as-annealed form for PdCu based membranes,Latice parameter (nm),Bravais lattice in as-manufactured state,Temperature (°C),Inverse temperature \n1E3/T (1E3 K-1),Diffusion \ncoefficient\n(cm^2.s^-1),Fabrication technique,Pressure \ndifference P1^n- P2^n (Pa^n),Pressure \nexponent \n(n),Effective area (cm^2),Membrane support,Type of support,Gases,Solubility constant Ks (Pa^-0.5),Hydrogen permeability (mol.m^-1.s^-1.Pa^-n),Hydrogen Permeation Flux\n(mol.m^-2.s^-1),Permeation \nActivation Energy (KJ/mol),Hydrogen Permeance (mol.m^-2.s^-1.Pa^-0.5),Feed Flow rate (mL.min^-1),Hydrogen transfer rate s.c.f.h.(mil).(sq.ft.)^-1,Conversion factor,Number,General comments,Reference,DOI/URL,Unnamed: 31
0,Pd65.1Cu34.9,mole fraction,foil,Pd65.10Cu34.90,97,,,0.3805,fcc,300.0,1.75,1.98e-05,arc melting/cold rolling/CFUBMSIP (Close Field...,469.902,0.5,2.54,No,,H2',,1.46e-09,,,,,,,1,"In inverse T, the T unit is 1E-3K. The permeab...",(Nayebossadri et al.2017),http://dx.doi.org/10.1021/acsami.6b12752,Use this link for unit conversion https://www....
1,Pd65.1Cu34.9,mole fraction,foil,Pd65.10Cu34.90,97,,,0.3805,fcc,325.0,1.67,2.6e-05,arc melting/cold rolling/CFUBMSIP (Close Field...,469.902,0.5,2.54,No,,H2',,1.73e-09,,,,,,,1,"In inverse T, the T unit is 1E-3K. The permeab...",(Nayebossadri et al.2017),http://dx.doi.org/10.1021/acsami.6b12752,
2,Pd65.1Cu34.9,mole fraction,foil,Pd65.10Cu34.90,97,,,0.3805,fcc,350.0,1.61,3.07e-05,arc melting/cold rolling/CFUBMSIP (Close Field...,469.902,0.5,2.54,No,,H2',,2.08e-09,,,,,,,1,"In inverse T, the T unit is 1E-3K. The permeab...",(Nayebossadri et al.2017),http://dx.doi.org/10.1021/acsami.6b12752,
3,Pd65.1Cu34.9,mole fraction,foil,Pd65.10Cu34.90,97,,,0.3805,fcc,375.0,,,arc melting/cold rolling/CFUBMSIP (Close Field...,469.902,0.5,2.54,No,,H2',,2.39e-09,,,,,,,1,"In inverse T, the T unit is 1E-3K. The permeab...",(Nayebossadri et al.2017),http://dx.doi.org/10.1021/acsami.6b12752,
4,Pd65.1Cu34.9,mole fraction,foil,Pd65.10Cu34.90,97,,,0.3805,fcc,400.0,1.49,4.24e-05,arc melting/cold rolling/CFUBMSIP (Close Field...,469.902,0.5,2.54,No,,H2',,2.81e-09,,,,,,,1,"In inverse T, the T unit is 1E-3K. The permeab...",(Nayebossadri et al.2017),http://dx.doi.org/10.1021/acsami.6b12752,


In [None]:
%%time
# Summary statistics restricted to the numeric columns of the raw dataset.
raw_data_1.describe(include=[np.number])

CPU times: user 22.6 ms, sys: 997 µs, total: 23.6 ms
Wall time: 28.9 ms


Unnamed: 0,Bravais lattice in as-annealed form for PdCu based membranes,Latice parameter (nm),Temperature (°C),Inverse temperature \n1E3/T (1E3 K-1),Pressure \ndifference P1^n- P2^n (Pa^n),Pressure \nexponent \n(n),Solubility constant Ks (Pa^-0.5),Hydrogen Permeance (mol.m^-2.s^-1.Pa^-0.5),Feed Flow rate (mL.min^-1),Hydrogen transfer rate s.c.f.h.(mil).(sq.ft.)^-1,Conversion factor
count,1.0,2136.0,2419.0,211.0,2419.0,2420.0,1.0,30.0,24.0,67.0,13.0
mean,99.99,0.383902,348.298929,2.678246,5504.194709,0.518273,1.03e-08,4.01122e-05,0.533333,63.280597,12.85635
std,,0.024859,136.523592,27.500045,41597.726788,0.095637,,3.253153e-05,0.376097,65.824331,46.21441
min,99.99,0.2874,19.18,0.0,6.5,0.5,1.03e-08,3e-09,0.1,0.0,1.24e-08
25%,99.99,0.3885,250.0,0.0,223.6923,0.5,1.03e-08,9.5475e-06,0.1,12.0,1.24e-08
50%,99.99,0.39165,350.0,0.0,350.8554,0.5,1.03e-08,3.07e-05,0.5,48.0,1.41e-05
75%,99.99,0.3935,440.21,1.61,570.692,0.5,1.03e-08,7.465e-05,1.0,83.5,0.007431273
max,99.99,0.4041,901.41,400.0,704197.226,1.23,1.03e-08,9.53e-05,1.0,294.0,166.6667


In [None]:
# Renaming some column names in the dataset to remove newline '\n' sign
%%time
rename_dict = {'Composition\n type': 'Composition type',
              'Thickness \n(micron)':'Thickness (micron)',
              'Latice parameter (nm)': 'Lattice parameter (nm)',
              'Diffusion \ncoefficient\n(cm^2.s^-1)': 'Diffusion coefficient(cm^2.s^-1)',
              raw_data_1.columns.values[13]: 'Pressure difference P1^n-P2^n (Pa^n)',
              'Inverse temperature \n1E3/T (1E3 K-1)': 'Inverse temperature (1000/T in (1000 K^-1))',
              raw_data_1.columns.values[14]: 'Pressure exponent (n)',
              'Hydrogen Permeation Flux\n(mol.m^-2.s^-1)': 'Hydrogen Permeation Flux (mol.m^-2.s^-1)',
              'Permeation \nActivation Energy (KJ/mol)': 'Permeation Activation Energy (KJ/mol)',
              'Bravais lattice in as-manufactured state':'Bravais lattice'}
raw_data_1=raw_data_1.rename(columns=rename_dict)

CPU times: user 1.39 ms, sys: 0 ns, total: 1.39 ms
Wall time: 1.4 ms


In [None]:
# Dropping some columns
%%time
df = raw_data_1.copy()
columns_to_drop = [#Not sufficient data were recovered from the literature to populate this columns (diffusivity, solubility, Activation Energy, etc.)
                  #They would have been very helpfull.
    'fcc lattice parameter for Cu alloys', 'Bravais lattice in as-annealed form for PdCu based membranes', 'Inverse temperature (1000/T in (1000 K^-1))', 'Diffusion coefficient(cm^2.s^-1)', 'Effective area (cm^2)', 'Membrane support', 'Type of support', 'Gases', 'Solubility constant Ks (Pa^-0.5)', 'Hydrogen Permeation Flux (mol.m^-2.s^-1)', 'Permeation Activation Energy (KJ/mol)', 'Hydrogen Permeance (mol.m^-2.s^-1.Pa^-0.5)', 'Feed Flow rate (mL.min^-1)', 'Hydrogen transfer rate s.c.f.h.(mil).(sq.ft.)^-1', 'Conversion factor', 'Number','Unnamed: 31']
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
raw_data_1_col_dropped = df

CPU times: user 1.62 ms, sys: 579 µs, total: 2.2 ms
Wall time: 2.21 ms


In [None]:
# Ckecking NaN in the 'Alloy Formula' column. If there are unfilled rows in that column, so they are considered null and can be removed.
%%time
df2 = raw_data_1_col_dropped.copy()
bool_nans_formula=df2['Alloy Formula'].isnull()
df2 = df2.drop(df2.loc[bool_nans_formula].index, axis=0).reset_index(drop=True)
raw_data_1_col_nan_dropped=df2
# Drop the rows of the DataFrame which contain NaNs
print(f'DataFrame shape before dropping NaNs: {raw_data_1_col_dropped.shape}')
print(f'DataFrame shape after NaNs are dropped: {df2.shape}')
print(f'raw_data_1_col_nan_dropped shape after dropping NaNs: {raw_data_1_col_nan_dropped.shape}')

DataFrame shape before dropping NaNs: (2798, 15)
DataFrame shape after NaNs are dropped: (2419, 15)
raw_data_1_col_nan_dropped shape after dropping NaNs: (2419, 15)
CPU times: user 2.94 ms, sys: 0 ns, total: 2.94 ms
Wall time: 2.87 ms


In [None]:
# Changing data types: Here, for example the thickness that should be a numeric is in object type
%%time
raw_data_1_col_nan_dropped.dtypes

CPU times: user 238 µs, sys: 3 µs, total: 241 µs
Wall time: 528 µs


Unnamed: 0,0
Alloy Formula,object
Composition type,object
Form,object
Composition in mole percent,object
Thickness (micron),object
Lattice parameter (nm),float64
Bravais lattice,object
Temperature (°C),float64
Fabrication technique,object
Pressure difference P1^n-P2^n (Pa^n),float64


In [None]:
# Converting data types to the proper category. Numerical datatypes are converted to numerics np.float64 and string types remains 'str'
%%time
type_dict={'Alloy Formula':str,
           'Composition type':str,
           'Form':str,
           'Composition in mole percent': str,
           'Thickness (micron)': np.float64,
           'Lattice parameter (nm)':np.float64,
           'Bravais lattice':str,
           'Temperature (°C)':np.float64,
           'Fabrication technique':str,
           'Pressure difference P1^n-P2^n (Pa^n)':np.float64,
           'Pressure exponent (n)':np.float64,
           'Hydrogen permeability (mol.m^-1.s^-1.Pa^-n)':np.float64}

raw_data_1_col_nan_dropped = raw_data_1_col_nan_dropped.astype(type_dict)

CPU times: user 3.55 ms, sys: 0 ns, total: 3.55 ms
Wall time: 3.56 ms


In [None]:
# Checking if everything went correctly
%%time
raw_data_1_col_nan_dropped.dtypes

CPU times: user 377 µs, sys: 0 ns, total: 377 µs
Wall time: 393 µs


Unnamed: 0,0
Alloy Formula,object
Composition type,object
Form,object
Composition in mole percent,object
Thickness (micron),float64
Lattice parameter (nm),float64
Bravais lattice,object
Temperature (°C),float64
Fabrication technique,object
Pressure difference P1^n-P2^n (Pa^n),float64


In [None]:
# The columns ['Composition type', 'Form', 'Bravais lattice in as-manufactured state', 'Fabrication technique'] contains some inconsitencies
#  Homogenizing the Composition type column
%%time
print(raw_data_1_col_nan_dropped['Composition type'].unique())

ctinstance_dict={'mole fraction':'mole percent',
                 'mole fraction ':'mole percent',
                 'weight fraction':'weight percent',
                 'mole fractiion': 'mole percent',
                 'weight fraction ': 'weight percent',
                 'weight':'weight percent'}

raw_data_1_col_nan_dropped['Composition type'] = raw_data_1_col_nan_dropped['Composition type'].replace(ctinstance_dict)
print(raw_data_1_col_nan_dropped['Composition type'].unique())
# The nan represent empty lines

['mole fraction' 'weight fraction' 'mole fraction ' 'mole fractiion'
 'weight fraction ' 'weight']
['mole percent' 'weight percent']
CPU times: user 4.27 ms, sys: 0 ns, total: 4.27 ms
Wall time: 6.28 ms


In [None]:
# Homogenizing the Form column
%%time
print(raw_data_1_col_nan_dropped['Form'].unique())

raw_data_1_col_nan_dropped['Form'] = raw_data_1_col_nan_dropped['Form'].replace({'foils':'foil',
                                                                                 'Self-standing':'self-standing foil',
                                                                                 'Condensed foils':'condensed foil',
                                                                                 'free-hanging':'free-hanging foil',
                                                                                 'foils//ecthed foils':'untouched foil and etched foil',
                                                                                 'foils on silicon wafer': 'foil prepared on Silicon wafer',
                                                                                 'foils/self-standing': 'self-standing foil'})

# Replace the nan values with 'foil'
raw_data_1_col_nan_dropped.loc[raw_data_1_col_nan_dropped['Form'].isnull(), 'Form'] = 'foil'
# Reprint the unique values
print(raw_data_1_col_nan_dropped['Form'].unique())

['foil' 'free-hanging' 'Self-standing' 'disc' 'Condensed foils' 'tube'
 'foils/self-standing' 'foils//ecthed foils' 'foils on silicon wafer']
['foil' 'free-hanging foil' 'self-standing foil' 'disc' 'condensed foil'
 'tube' 'untouched foil and etched foil' 'foil prepared on Silicon wafer']
CPU times: user 4.98 ms, sys: 0 ns, total: 4.98 ms
Wall time: 5.18 ms


In [None]:
# Homogenizing the Fabrication technique column
%%time

raw_data_1_col_nan_dropped['Fabrication technique'] = raw_data_1_col_nan_dropped['Fabrication technique'].str.capitalize()

raw_data_1_col_nan_dropped['Fabrication technique'] = raw_data_1_col_nan_dropped['Fabrication technique'].replace({
    'Arc melting in inert atmoosphere/cold rolling/intermediate vacuum annealing':'Arc melting in inert atmosphere/Cold rolling/intermediate vacuum annealing',
    'Commercial alfa aesar': 'Commercial Alfa Aesar',
    'Commercial/goodfellow co.':'Commercial Goodfellow Co.',
    'Melting casrting an cold rolling':'Melt casting and cold rolling',
    'Pellet from commercial pd powder/cold rolling':'Pellet from commercial Pd powder/Cold rolling',
    'Arc melting/cold rolling/cfubmsip (close field unballanced magnetron sputtering ion plating)':'Arc melting/cold rolling/CFUBMSIP (Close Field Unballanced Magnetron Sputtering Ion Plating)',
    'Dc magnetron sputtering':'DC-magnetron sputtering'})

print(raw_data_1_col_nan_dropped['Fabrication technique'].unique())

['Arc melting/cold rolling/CFUBMSIP (Close Field Unballanced Magnetron Sputtering Ion Plating)'
 'Cold rolling into foils from cast ingots of commercial alloys'
 'Cold working' 'Magnetron sputtering' 'Cold rolling' 'Metallurgical'
 'Microtechnology/co-sputtering' 'Electroless plating' 'Commercial'
 'Microsystem/sputtering' 'Arc melting/cold working'
 'Arc melting in inert atmosphere/Cold rolling/intermediate vacuum annealing'
 'Arc melting/cold rolling' 'Magnetron sputtering/annealing'
 'Magnetron-sputtering techniques/annealing'
 'Arc melting/vacuum annealing/cold rolling'
 'Arc melting in helium/vacuum annealing/cold rolling'
 'Pulsed electrodeposition/nanostrutured alloys' 'Melting/cold rolling'
 'DC-magnetron sputtering'
 'Arc melting/cold rolling in nitrogen or vacuum'
 'Arc melting/annealing/rolling'
 'Cold rolling/quenching at high temperature' 'Commercial Goodfellow Co.'
 'Arc melting/rolling' 'Metallurgical/cold rolling'
 'Vacuum-arc melting/ rolling' 'Arc melting/ annealing/r

In [None]:
# Homogenizing the Bravais lattice in as-manufactured state columns
%%time
raw_data_1_col_nan_dropped['Bravais lattice']=raw_data_1_col_nan_dropped['Bravais lattice'].replace({'bcc + fcc': 'bcc/fcc', 'fcc+bcc': 'bcc/fcc', 'fcc+bcc/bcc': 'bcc/fcc', 'bcc+fcc':'bcc/fcc'})
print(raw_data_1_col_nan_dropped['Bravais lattice'].unique())
# The nan values belongs to PdCu alloys, and we will be calculated those later on automatically

['fcc' 'bcc' 'bcc/fcc' 'nan']
CPU times: user 3.24 ms, sys: 0 ns, total: 3.24 ms
Wall time: 3.19 ms


In [None]:
%%time
# We are going to parse a little bit the formula to make them clean and ready for conversion to composition object using pymatgen or matminer packages
def clean_formula(formula):
    """
    Strip everything from a formula string except letters, digits and dots.

    Unicode separator characters (categories starting with 'Z') are removed
    first; then every character outside A-Z, a-z, 0-9 and '.' is deleted.
    Values that are not strings are returned unchanged.
    """
    # Leave non-string values (e.g. NaN or numbers) exactly as they are.
    if not isinstance(formula, str):
        return formula
    # Drop all unicode separators (spaces, no-break spaces, line/paragraph separators).
    without_separators = ''.join(
        char for char in formula if unicodedata.category(char)[0] != 'Z'
    )
    # Keep only the characters that can appear in a formula such as 'Pd65.10Cu34.90'.
    return re.sub(r"[^A-Za-z0-9.]", "", without_separators)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.58 µs


In [None]:
%%time
# Apply clean_formula to every entry of the two formula columns.
# DataFrame.apply hands each column to the function as a whole Series; clean_formula returns
# non-string values unchanged, so it has to be mapped over the individual cells of each column.
raw_data_1_col_nan_dropped[['Composition in mole percent', 'Alloy Formula']] = raw_data_1_col_nan_dropped[['Composition in mole percent', 'Alloy Formula']].apply(lambda column: column.map(clean_formula))

CPU times: user 3.06 ms, sys: 1 µs, total: 3.06 ms
Wall time: 3.17 ms


In [None]:
%%time
# List the distinct values of 'Composition in mole percent' to check the formulas visually.
raw_data_1_col_nan_dropped['Composition in mole percent'].unique()

CPU times: user 657 µs, sys: 0 ns, total: 657 µs
Wall time: 523 µs


array(['Pd65.10Cu34.90', 'Pd61.80Cu36.40Ti1.80', 'Pd61.10Cu37.20Nb1.70',
       'Pd61.10Cu36.30Ni2.60', 'Pd69.00Cu29.90V1.10',
       'Pd61.30Cu37.10Y1.60', 'Pd61.00Cu37.20Zr1.80', 'Pd46.60Cu53.40',
       'Pd53.10Cu46.90', 'Pd45.80Cu51.90Ag2.30', 'Pd45.10Cu51.00Ag3.90',
       'Pd', 'Pd90.13Ag9.87', 'Pd73.28Ag26.72', 'Pd60.34Ag39.66',
       'Pd48.35Ag51.65', 'Pd20.22Ag79.78', 'Pd97.24Au2.76',
       'Pd88.10Au11.90', 'Pd73.53Au26.47', 'Pd60.24Au39.76',
       'Pd95.29B4.71', 'Pd84.31Cu15.69', 'Pd47.25Cu52.75',
       'Pd83.23Ni16.77', 'Pd58.22Cu41.78', 'Pd49.89Cu50.11',
       'Pd45.20Cu54.80', 'Pd42.20Cu57.80', 'Pd32.83Cu67.17',
       'Pd28.48Cu71.52', 'Pd12.99Cu87.01', 'Pd6.22Cu93.78',
       'Pd94.20Cu5.80', 'Pd95.10Ru4.90', 'Pd94.70Mo5.30', 'Pd95.00Nb5.00',
       'Pd94.80Ta5.20', 'Pd94.90Au5.10', 'Pd95.00Y5.00', 'Pd70.00Cu30.00',
       'Pd70.00Cu26.00Ru4.00', 'Pd75.00Cu24.00Mo1.00',
       'Pd70.00Cu26.00Mo4.00', 'Pd73.00Cu26.00Ta1.00',
       'Pd70.00Cu26.00Ta4.00', 'Pd72.00C

In [None]:
# Convert The Formula from weight percent to mole percent
# Define atomic weights globally for reuse
# Reference for periodic table: https://periodic.lanl.gov/index.shtml
# Reference on regular expressions with re module: https://docs.python.org/3/library/re.html
# Reference for formula to convert weight percent to atomic percent: https://plasmaterials.com/converting-atomic-percent-to-weight-percent-and-vice-versa/
%%time
# Atomic weights used for the weight percent -> atomic percent conversion, keyed by element symbol.
ATOMIC_WEIGHTS = {
    'B': 10.81, 'Sc': 44.96, 'Ti': 47.88, 'V': 50.94, 'Cr': 52.00, 'Mn': 54.94, 'Fe': 55.85,
    'Co': 58.93, 'Ni': 58.69, 'Cu': 63.55, 'Zn': 65.39, 'Ga': 69.72, 'Y': 88.91, 'Zr': 91.22,
    'Nb': 92.91, 'Mo': 95.96, 'Tc': 98, 'Ru': 101.1, 'Rh': 102.9, 'Pd': 106.4, 'Ag': 107.9,
    'Cd': 112.4, 'In': 114.8, 'Sn': 118.7, 'Pb': 207.2, 'La': 138.9, 'Ce': 140.1, 'Pr': 140.9,
    'Nd': 144.2, 'Pm': 145, 'Sm': 150.4, 'Eu': 152.0, 'Gd': 157.2, 'Tb': 158.9, 'Dy': 162.5,
    'Ho': 164.9, 'Er': 167.3, 'Tm': 168.9, 'Yb': 173.0, 'Lu': 175.0, 'Hf': 178.5, 'Ta': 180.9,
    'W': 183.9, 'Re': 186.2, 'Os': 190.2, 'Ir': 192.2, 'Pt': 195.1, 'Au': 197.0, 'Al':26.98
}

def weight_percent_to_atomic_percent(formula: str) -> str:
    """
    Convert a chemical formula from weight percentage (e.g., 'Pd60.00Cu40.00')
    to atomic percent (e.g., 'Pd47.25Cu52.75').

    Only formulas of the form AxBy... are handled (binary, ternary or more),
    where x, y, ... are the weight percentages of the elements A, B, ...
    Characters that are neither element symbols nor numbers are ignored.

    Parameters:
        formula (str): Chemical formula with weight percentages.

    Returns:
        str: Reformatted chemical formula in atomic percent, each value rounded
        to two decimals. A formula that consists of a single element symbol
        without an amount (e.g. 'Pd') is returned unchanged, since a pure
        element is the same in weight and atomic percent. An empty string is
        returned when the formula contains no element symbol and no number.

    Raises:
        ValueError: If an element with an amount has no entry in ATOMIC_WEIGHTS,
        if an element of a multi-element formula has no amount, if a number is
        not preceded by an element symbol, or if all amounts are zero.
    """
    # Split the formula into a flat list of element symbols and numbers.
    tokens = re.findall(r'[A-Z][a-z]?|\d+\.?\d*', formula)
    if not tokens:
        return ''

    # element -> (weight percent, atomic weight)
    element_data = {}
    # Elements that are not followed by a number.
    elements_without_amount = []

    position = 0
    while position < len(tokens):
        element = tokens[position]
        if element[0].isdigit():
            raise ValueError(
                f"Number '{element}' in formula '{formula}' is not preceded by an element symbol."
            )

        next_is_number = position + 1 < len(tokens) and tokens[position + 1][0].isdigit()
        if not next_is_number:
            elements_without_amount.append(element)
            position += 1
            continue

        atomic_weight = ATOMIC_WEIGHTS.get(element)
        if atomic_weight is None:
            raise ValueError(f"Atomic weight for element '{element}' is not defined.")

        element_data[element] = (float(tokens[position + 1]), atomic_weight)
        position += 2

    if elements_without_amount:
        # A bare single symbol is a pure element: nothing to convert.
        if not element_data and formula.strip() == elements_without_amount[0]:
            return elements_without_amount[0]
        # Otherwise the element would silently vanish from the result, so fail loudly instead.
        raise ValueError(
            f"No weight percentage given for element(s) {elements_without_amount} in formula '{formula}'."
        )

    # Convert weight % to atomic %: moles of each element divided by the total number of moles.
    denominator = sum(wt / at_wt for wt, at_wt in element_data.values())
    if denominator == 0:
        raise ValueError(f"All weight percentages are zero in formula '{formula}'.")

    atomic_percent = {
        element: round((wt / at_wt) / denominator * 100, 2)
        for element, (wt, at_wt) in element_data.items()
    }

    # Create output string in the form: Pd47.25Cu52.75
    return ''.join(f"{el}{val}" for el, val in atomic_percent.items())

CPU times: user 19 µs, sys: 0 ns, total: 19 µs
Wall time: 22.9 µs


In [None]:
%%time
# Work on a copy of the current dataframe
df3 = raw_data_1_col_nan_dropped.copy()

# Rows given in weight percent: 'Composition in mole percent' is computed from 'Alloy Formula';
# the 'Composition type' label of these rows is kept as 'weight percent'.
df3_weight = df3.loc[df3['Composition type'] == 'weight percent'].assign(**{
    'Composition in mole percent': lambda frame: frame['Alloy Formula'].apply(weight_percent_to_atomic_percent),
    'Composition type': 'weight percent',
})

# Rows already given in mole percent: the formula is used as it is.
df3_atomic = df3.loc[df3['Composition type'] == 'mole percent'].assign(**{
    'Composition in mole percent': lambda frame: frame['Alloy Formula'],
})

# Stack the two groups again (mole-percent rows first, then the converted weight-percent rows)
df3_cleaned = pd.concat([df3_atomic, df3_weight], axis=0, ignore_index=True)

# Sort by composition in ascending order (default) and renumber the index.
# This means we will have PdAg, PdAu and PdB, etc. before PdCe, PdCu etc.
df4 = df3_cleaned.sort_values(by='Composition in mole percent').reset_index(drop=True)
raw_data_1_col_nan_dropped = df4

CPU times: user 24.2 ms, sys: 992 µs, total: 25.2 ms
Wall time: 30 ms


In [None]:
# We reparse the formulas for security
%%time
raw_data_1_col_nan_dropped['Composition in mole percent'] = raw_data_1_col_nan_dropped['Composition in mole percent'].apply(clean_formula)
raw_data_1_col_nan_dropped['Composition in mole percent'].unique()

CPU times: user 21.5 ms, sys: 0 ns, total: 21.5 ms
Wall time: 25.4 ms


array(['Pd', 'Pd12.99Cu87.01', 'Pd20.22Ag79.78', 'Pd28.48Cu71.52',
       'Pd32.83Cu67.17', 'Pd33.0Ag59.0Ni8.0', 'Pd38.62Cu59.41Al1.97',
       'Pd38.89Cu60.58Al0.53', 'Pd40.0Cu49.0Au11.0', 'Pd40.0Cu53.0Au7.0',
       'Pd40.0Cu57.0Au3.0', 'Pd40.0Cu60.0', 'Pd40.21Cu57.86Al1.93',
       'Pd40.25Cu59.75', 'Pd40.57Cu57.18Al2.25', 'Pd41.00Cu58.51Al0.49',
       'Pd41.02Cu58.78Al0.20', 'Pd41.0Cu57Al2.0', 'Pd41.0Cu58.2Al0.8',
       'Pd41.0Cu58.8Al0.2', 'Pd41.22Cu58.78', 'Pd41.30Cu58.70',
       'Pd41.38Cu58.57Al0.05', 'Pd42.2Cu57.8', 'Pd43.0Cu56.2Al0.8',
       'Pd43.0Cu57.0', 'Pd43.19Cu56.81', 'Pd43.24Cu56.76',
       'Pd44.50Cu55.00Al0.5', 'Pd44.66Cu55.34', 'Pd44.74Cu52.10Al3.16',
       'Pd44.77Cu53.20Al2.03', 'Pd44.99Cu55.01', 'Pd44.9Cu55.1',
       'Pd45.0Cu53Al2.0', 'Pd45.0Cu53Ga2.0', 'Pd45.0Cu53In2.0',
       'Pd45.0Cu54.2Al0.8', 'Pd45.0Cu54.2Ga0.8', 'Pd45.0Cu54.2In0.8',
       'Pd45.0Cu54.8Al0.2', 'Pd45.0Cu54.8Ga0.2', 'Pd45.0Cu54.8In0.2',
       'Pd45.1Cu51.0Ag3.9', 'Pd45.1Cu51Ag3.9'

In [None]:
# Removing invalid n values i.e n>1 because they are not a frequent trend from the literature
%%time
df5 = raw_data_1_col_nan_dropped.copy()
bool_invalid_n = df5['Pressure exponent (n)']>0.5
print(f'The number of datapoints with n>0.5 is {raw_data_1_col_nan_dropped[bool_invalid_n].shape[0]}')

df5 = df5.drop(df5.loc[bool_invalid_n].index, axis = 0).reset_index(drop=True)
raw_data_1_col_nan_dropped = df5

print(f'The shape of the dataset with n<=0.5 is {raw_data_1_col_nan_dropped.shape[0]}')

The number of datapoints with n>0.5 is 131
The shape of the dataset with n<=0.5 is 2288
CPU times: user 3.92 ms, sys: 0 ns, total: 3.92 ms
Wall time: 4.04 ms


In [None]:
# Remove Study by Li because it was done using plasma heating, and El Shafie
%%time
df6=raw_data_1_col_nan_dropped.copy()
bool_invalid_ref = df6['Reference'].isin(['(Li et al., 2023)', '(El Shafie et al., 2020) '])
print(f'Total number of points in refs (Li et al., 2023) and (El Shafie et al., 2020) are: {raw_data_1_col_nan_dropped[bool_invalid_ref].shape[0]}')
print(f'The shape of the dataset before refs (Li et al., 2023) and (El Shafie et al., 2020) were removed is: {raw_data_1_col_nan_dropped.shape[0]}')
df6=df6.drop(df6.loc[bool_invalid_ref].index, axis = 0).reset_index(drop=True)
raw_data_1_col_nan_dropped=df6
print(f'The shape of the dataset after the removal of refs (Li et al., 2023) and (El Shafie et al., 2020): {raw_data_1_col_nan_dropped.shape[0]}')

Total number of points in refs (Li et al., 2023) and (El Shafie et al., 2020) are: 24
The shape of the dataset before refs (Li et al., 2023) and (El Shafie et al., 2020) were removed is: 2288
The shape of the dataset after the removal of refs (Li et al., 2023) and (El Shafie et al., 2020): 2264
CPU times: user 3.95 ms, sys: 991 µs, total: 4.94 ms
Wall time: 5.12 ms


In [None]:
# Remove the data points with hydrogen permeability >=1.0e-7
# Those data belong to (Ievlev et al. 2012) and are about PdCu, but it is rare that the permeability of PdCu40 reaches that level. This is due to their surface cleaning method.
# Keeping these values would distort the distribution of the data
# Start from the current dataset instead of relying on a df6 left over from the previous cell,
# so that the mask and the frame it is applied to always share the same index.
df6 = raw_data_1_col_nan_dropped.copy()
bool_perm_higher_than_e_power_minus_7 = df6['Hydrogen permeability (mol.m^-1.s^-1.Pa^-n)']>=1.0e-7
print(f'The number of data points with permeability value superior >=1.0e-7 are removed is: {df6[bool_perm_higher_than_e_power_minus_7].shape[0]}')
print(f'The size of the starting dataset is {df6.shape[0]}')
# Keep the rows below the threshold (rows with a missing permeability are kept as well) and renumber the index.
df6 = df6.loc[~bool_perm_higher_than_e_power_minus_7].reset_index(drop=True)
raw_data_1_col_nan_dropped=df6

# Verify that the concerned data points have been removed successfully
print(f'The shape of the dataset after data points with permeability>=1.0e-7 are removed is: {raw_data_1_col_nan_dropped.shape[0]}')

The number of data points with permeability value superior >=1.0e-7 are removed is: 32
The size of the starting dataset is 2264
The shape of the dataset after data points with permeability>=1.0e-7 are removed is: 2232


In [None]:
%%time
# Count the remaining missing (NaN) values in each column.
raw_data_1_col_nan_dropped.isnull().sum()

CPU times: user 1.94 ms, sys: 969 µs, total: 2.9 ms
Wall time: 2.84 ms


Unnamed: 0,0
Alloy Formula,0
Composition type,0
Form,0
Composition in mole percent,0
Thickness (micron),0
Lattice parameter (nm),242
Bravais lattice,0
Temperature (°C),0
Fabrication technique,0
Pressure difference P1^n-P2^n (Pa^n),0


In [None]:
%%time
# List the distinct formulas of the rows whose 'Lattice parameter (nm)' is missing,
# to make sure that all of them belong to the PdCu system.
raw_data_1_col_nan_dropped.loc[raw_data_1_col_nan_dropped['Lattice parameter (nm)'].isnull(), 'Alloy Formula'].unique()

CPU times: user 1.87 ms, sys: 0 ns, total: 1.87 ms
Wall time: 5.05 ms


array(['Pd20.0Cu80.0', 'Pd40.0Cu60.0', 'Pd45.0Cu55.0', 'Pd53.0Cu47.0',
       'Pd54.0Cu46.0', 'Pd41.30Cu58.70', 'Pd55.0Cu45.0', 'Pd55.81Cu44.19',
       'Pd56.0Cu44.0', 'Pd56.05Cu43.95', 'Pd57.47Cu42.53',
       'Pd57.79Cu42.21', 'Pd44.9Cu55.1', 'Pd58.0Cu42.0', 'Pd46.50Cu53.50',
       'Pd59.29Cu40.71', 'Pd59.6Cu40.4', 'Pd47.0Cu53.0', 'Pd47.15Cu52.85',
       'Pd60.0Cu40.0', 'Pd60Cu40', 'Pd47.4Cu52.6', 'Pd48.0Cu52.0',
       'Pd61Cu39', 'Pd48.5Cu51.5', 'Pd61.66Cu38.34', 'Pd62.21Cu37.79',
       'Pd62.50Cu37.50', 'Pd50.20Cu49.80', 'Pd63.79Cu36.21',
       'Pd64.11Cu35.89', 'Pd70.0Cu30.0', 'Pd10.0Cu90.0', 'Pd80.0Cu20.0',
       'Pd86.88Cu13.12', 'Pd88.46Cu11.54', 'Pd89.72Cu10.28',
       'Pd89.96Cu10.04', 'Pd90.0Cu10.0', 'Pd91.54Cu8.46', 'Pd92.09Cu7.91',
       'Pd94.0Cu6.0', 'Pd94.23Cu5.77', 'Pd95.57Cu4.43', 'Pd96.36Cu3.64',
       'Pd94.2Cu5.8', 'Pd97.47Cu2.53', 'Pd97.94Cu2.06'], dtype=object)

In [None]:
%%time
# Step 1: Parse string formulas into Composition objects, stored in the new column 'Composition_matrix'.
# ignore_errors=True makes a formula that cannot be parsed yield NaN instead of raising,
# which is what allows Step 2 to drop the rows where parsing failed.
stc = StrToComposition(target_col_id="Composition_matrix")
print('Shape before formulas parsing into composition object:', raw_data_1_col_nan_dropped.shape)
raw_data_1_col_nan_dropped = stc.featurize_dataframe(
    raw_data_1_col_nan_dropped,
    col_id="Composition in mole percent",
    ignore_errors=True
)

# Step 2: Drop rows where parsing failed (Composition_matrix is NaN)
raw_data_1_col_nan_dropped = raw_data_1_col_nan_dropped.dropna(subset=["Composition_matrix"]).reset_index(drop=True)
print('Shape after formulas parsing into composition object:', raw_data_1_col_nan_dropped.shape)

# Step 3: Transform the composition object to element fraction matrix using Matminer's ElementFraction
ef = ElementFraction()
raw_data_1_col_nan_dropped = ef.featurize_dataframe(
    raw_data_1_col_nan_dropped,
    col_id="Composition_matrix",
    ignore_errors=True
)

# Step 4: Remove the columns that contain only zeros, i.e. the elements that appear in no alloy.
# A column is kept as soon as at least one of its values differs from 0.
raw_data_1_col_nan_dropped_matm = raw_data_1_col_nan_dropped.loc[:, (raw_data_1_col_nan_dropped != 0).any(axis=0)]
print('The shape of the dataframe with 0 element column dropped is:', raw_data_1_col_nan_dropped_matm.shape)

Shape before formulas parsing into composition object: (2232, 15)


StrToComposition:   0%|          | 0/2232 [00:00<?, ?it/s]

Shape after formulas parsing into composition object: (2232, 16)


ElementFraction:   0%|          | 0/2232 [00:00<?, ?it/s]

The shape of the dataframe with 0 element column dropped is: (2232, 56)
CPU times: user 2.08 s, sys: 5.74 s, total: 7.82 s
Wall time: 15.9 s


In [None]:
%%time
# Print the type returned by as_dict() for the Composition object of the first row, to confirm that it is a dictionary.
print(type(raw_data_1_col_nan_dropped_matm.loc[0, 'Composition_matrix'].as_dict()))

<class 'dict'>
CPU times: user 706 µs, sys: 0 ns, total: 706 µs
Wall time: 721 µs


In [None]:
%%time
# In order to fill the NaNs in the lattice parameter column, we rely on Matminer, which makes this much easier to handle
# Estimating the lattice parameter of the alloys for which it is missing

# Write a function to estimate the lattice parameter of PdCu alloy systems based on their atomic composition
# The lattice parameter of FCC PdCu obeys Vegard's law of a = (2.75*1.0e-3)*X_Pd + 3.62 (in Angström) where X_Pd is the mole percentage
# The lattice paramter of BCC PdCu obeys Vegard's law of a = (1.57*1.0e-3)*X_Pd + 2.90 (in Angström) where X_Pd is the mole percentage
# Reference in (Al-Mufachi et al., 2015), doi: https://doi.org/10.1016/j.memsci.2015.07.015

# Determine the boundaries of the phase diagram of PdCu
# Reference in: (Subramanian and Laughlin, 1991), https://www.andrew.cmu.edu/user/dl0p/laughlin/pdf/128.pdf
# fcc (%Pd): 0-23.411513859275052
# B2+fcc (%Pd): 23.411513859275052-31.194029850746263
# B2 (%Pd): 31.194029850746263-49.53091684434967
# B2+fcc (%Pd): 49.53091684434967-65.09594882729209
# fcc(%Pd): 65.09594882729209

def PdCu_lattice(raw_data_1_col_nan_dropped_matm):
    """
    Estimate the missing lattice parameters (and Bravais lattice) of binary Pd-Cu alloys.

    Only rows whose 'Lattice parameter (nm)' is null are considered. Among those, only rows
    whose 'Composition_matrix' holds exactly the two elements Pd and Cu are estimated; every
    other row keeps its current values.

    The Pd content is expressed in mole percent by normalising the Pd amount with the total
    amount of the composition, so 'Pd40Cu60' and 'Pd0.4Cu0.6' are treated identically.
    The phase is chosen from the Pd-Cu phase-diagram boundaries (in % Pd):

    * fcc     : 0-23.4115 and 65.0959-100  -> a = 2.75e-3 * X_Pd + 3.62 (Angstrom)
    * B2/bcc  : 31.1940-49.5309            -> a = 1.57e-3 * X_Pd + 2.90 (Angstrom)
    * bcc/fcc : 23.4115-31.1940 and 49.5309-65.0959 -> equal-weight mean of the two
      estimates above (the two-phase region is NOT collapsed to pure B2).

    Boundaries are inclusive and tested in the order fcc, B2, mixed, so a value lying exactly
    on a boundary is assigned to the first matching phase. Results are converted to nm.

    Parameters
    ----------
    raw_data_1_col_nan_dropped_matm : pandas.DataFrame
        Frame with the columns 'Composition_matrix' (objects exposing ``as_dict()``),
        'Bravais lattice' and 'Lattice parameter (nm)'.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in; it is updated in place. Pass a copy if the
        original frame must stay untouched.
    """
    lattice_col = 'Lattice parameter (nm)'
    phase_col = 'Bravais lattice'

    # Identify rows with missing lattice parameters
    missing_rows = raw_data_1_col_nan_dropped_matm.loc[
        raw_data_1_col_nan_dropped_matm[lattice_col].isnull()
    ]
    # Nothing to estimate: with an empty frame, `apply(..., result_type="expand")` hands back
    # a copy of the whole (empty) frame, which cannot be assigned to two columns.
    if missing_rows.empty:
        return raw_data_1_col_nan_dropped_matm

    # Pd-Cu phase diagram ranges, in mole percent of Pd (inclusive bounds)
    fcc_ranges = ((0, 23.4115), (65.0959, 100))
    b2_ranges = ((31.1940, 49.5309),)
    mixed_ranges = ((23.4115, 31.1940), (49.5309, 65.0959))

    def _within(value, ranges):
        """Return True when value lies inside at least one (low, high) range."""
        return any(low <= value <= high for low, high in ranges)

    def _fcc_angstrom(pd_percent):
        """Vegard's law for fcc PdCu, in Angstrom."""
        return (2.75e-3) * pd_percent + 3.62

    def _b2_angstrom(pd_percent):
        """Vegard's law for B2 (bcc) PdCu, in Angstrom."""
        return (1.57e-3) * pd_percent + 2.90

    def estimate_lattice(row):
        """Return (Bravais lattice, lattice parameter in nm) for one row."""
        amounts = row['Composition_matrix'].as_dict()
        # Anything that is not a pure Pd-Cu binary keeps its current values
        if set(amounts.keys()) != {'Pd', 'Cu'}:
            return row[phase_col], row[lattice_col]

        # Mole percent of Pd, independent of how the formula amounts were scaled
        pd_percent = 100.0 * amounts['Pd'] / sum(amounts.values())

        # Dividing by 10 converts Angstrom to nm
        if _within(pd_percent, fcc_ranges):
            return 'fcc', _fcc_angstrom(pd_percent) / 10
        if _within(pd_percent, b2_ranges):
            return 'bcc', _b2_angstrom(pd_percent) / 10
        if _within(pd_percent, mixed_ranges):
            return 'bcc/fcc', (0.5 * _fcc_angstrom(pd_percent) + 0.5 * _b2_angstrom(pd_percent)) / 10
        # Only reachable when the Pd content is not a finite number in [0, 100]
        return 'Unknown', np.nan

    # One (phase, lattice parameter) pair per row with a missing lattice parameter
    estimates = missing_rows.apply(estimate_lattice, axis=1, result_type="expand")
    estimates.columns = [phase_col, lattice_col]

    # `update` aligns on the index and only overwrites with non-null estimates
    raw_data_1_col_nan_dropped_matm.update(estimates)
    return raw_data_1_col_nan_dropped_matm

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 12.6 µs


In [None]:
%%time
# Applying the function to the dataframe
# PdCu_lattice updates the frame it receives in place and returns it, so it is given
# a copy (df7) and the result is bound back to the working name.
df7=raw_data_1_col_nan_dropped_matm.copy()
raw_data_1_col_nan_dropped_matm = PdCu_lattice(df7)

CPU times: user 78 ms, sys: 982 µs, total: 79 ms
Wall time: 80.3 ms


In [None]:
# The application was successfull
%%time
raw_data_1_col_nan_dropped_matm.isnull().sum()

CPU times: user 4.43 ms, sys: 0 ns, total: 4.43 ms
Wall time: 6.95 ms


Unnamed: 0,0
Alloy Formula,0
Composition type,0
Form,0
Composition in mole percent,0
Thickness (micron),0
Lattice parameter (nm),0
Bravais lattice,0
Temperature (°C),0
Fabrication technique,0
Pressure difference P1^n-P2^n (Pa^n),0


In [None]:
%%time
def pymatcomp_obj_to_mpformula(x):
    """
    Build a formula string from a composition object.

    Each element symbol returned by ``x.as_dict()`` is followed by its amount rounded
    to two decimals, and the pieces are concatenated in the dictionary's iteration order.

    Parameters:
    - x: object exposing ``as_dict()`` (e.g. the pymatgen Composition created by
      StrToComposition), mapping element symbol -> amount

    Returns:
    - str: formula such as 'Pd47.25Cu52.75' (a single-element composition with
      amount 1.0 gives 'Pd1.0')
    """
    pieces = []
    for symbol, amount in x.as_dict().items():
        pieces.append(f"{symbol}{round(amount, 2)}")
    return ''.join(pieces)

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 11.4 µs


In [None]:
%%time
# We create a new column "Membrane Formula" from the composition objects.
# The column is first appended, then popped and re-inserted so that it ends up
# at column position 4.
raw_data_1_col_nan_dropped_matm['Membrane Formula']=raw_data_1_col_nan_dropped_matm['Composition_matrix'].apply(pymatcomp_obj_to_mpformula)
s=raw_data_1_col_nan_dropped_matm['Membrane Formula']
raw_data_1_col_nan_dropped_matm.pop('Membrane Formula')
raw_data_1_col_nan_dropped_matm.insert(4,'Membrane Formula', s)

CPU times: user 23.8 ms, sys: 997 µs, total: 24.8 ms
Wall time: 48.2 ms


In [None]:
%%time
# Pure Pd rows were rendered as 'Pd1.0', which is not elegant. We replace that value by 'Pd'.
# `where` keeps every entry that differs from 'Pd1.0' and substitutes 'Pd' for the others.
raw_data_1_col_nan_dropped_matm['Membrane Formula']=raw_data_1_col_nan_dropped_matm['Membrane Formula'].where(raw_data_1_col_nan_dropped_matm['Membrane Formula']!='Pd1.0', 'Pd')

CPU times: user 2.09 ms, sys: 0 ns, total: 2.09 ms
Wall time: 5.93 ms


In [None]:
%%time
# Check the result: list the distinct formulas now stored in 'Membrane Formula'
raw_data_1_col_nan_dropped_matm['Membrane Formula'].unique()

CPU times: user 792 µs, sys: 0 ns, total: 792 µs
Wall time: 800 µs


array(['Pd', 'Pd12.99Cu87.01', 'Pd20.22Ag79.78', 'Pd28.48Cu71.52',
       'Pd32.83Cu67.17', 'Pd33.0Ag59.0Ni8.0', 'Pd38.62Cu59.41Al1.97',
       'Pd38.89Cu60.58Al0.53', 'Pd40.0Cu49.0Au11.0', 'Pd40.0Cu53.0Au7.0',
       'Pd40.0Cu57.0Au3.0', 'Pd40.0Cu60.0', 'Pd40.21Cu57.86Al1.93',
       'Pd40.25Cu59.75', 'Pd40.57Cu57.18Al2.25', 'Pd41.0Cu58.51Al0.49',
       'Pd41.02Cu58.78Al0.2', 'Pd41.0Cu57.0Al2.0', 'Pd41.0Cu58.2Al0.8',
       'Pd41.0Cu58.8Al0.2', 'Pd41.22Cu58.78', 'Pd41.3Cu58.7',
       'Pd41.38Cu58.57Al0.05', 'Pd42.2Cu57.8', 'Pd43.0Cu56.2Al0.8',
       'Pd43.0Cu57.0', 'Pd43.19Cu56.81', 'Pd43.24Cu56.76',
       'Pd44.5Cu55.0Al0.5', 'Pd44.66Cu55.34', 'Pd44.74Cu52.1Al3.16',
       'Pd44.77Cu53.2Al2.03', 'Pd44.99Cu55.01', 'Pd44.9Cu55.1',
       'Pd45.0Cu53.0Al2.0', 'Pd45.0Cu53.0Ga2.0', 'Pd45.0Cu53.0In2.0',
       'Pd45.0Cu54.2Al0.8', 'Pd45.0Cu54.2Ga0.8', 'Pd45.0Cu54.2In0.8',
       'Pd45.0Cu54.8Al0.2', 'Pd45.0Cu54.8Ga0.2', 'Pd45.0Cu54.8In0.2',
       'Pd45.1Cu51.0Ag3.9', 'Pd45.25Cu53.77Al

In [None]:
# Drop duplicated rows, if there are still any
%%time
print('The shape of the dataframe before duplicated rows dropping:', raw_data_1_col_nan_dropped_matm.shape)
df8=raw_data_1_col_nan_dropped_matm.copy()
# df8[df8.iloc[:,5::].duplicated()==True]
to_show = ['Thickness (micron)', 'Lattice parameter (nm)', 'Temperature (°C)', 'Pressure difference P1^n-P2^n (Pa^n)', 'Pressure exponent (n)', 'Hydrogen permeability (mol.m^-1.s^-1.Pa^-n)']
df8=df8.drop_duplicates(subset=to_show, ignore_index=True)
raw_data_1_col_nan_dropped_matm=df8
print('The shape of the dataframe after duplicated rows dropped:', raw_data_1_col_nan_dropped_matm.shape)

The shape of the dataframe before duplicated rows dropping: (2232, 57)
The shape of the dataframe after duplicated rows dropped: (2223, 57)
CPU times: user 8.47 ms, sys: 0 ns, total: 8.47 ms
Wall time: 12 ms


In [None]:
# Utility function to convert unit into SI
%%time
def micron_to_meter(x):
    """Convert a scalar length from microns to meters; null input gives NaN."""
    if pd.isnull(x):
        return np.nan
    return x / 1e6

def celsius_to_kelvin(x):
    """Convert a scalar temperature from degrees Celsius to Kelvin; null input gives NaN."""
    if pd.isnull(x):
        return np.nan
    return x + 273.15

def nanometer_to_meter(x):
    """Convert a scalar length from nanometers to meters; null input gives NaN."""
    if pd.isnull(x):
        return np.nan
    return x / 1e9

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 11.7 µs


In [None]:
# Unit conversion
%%time
raw_data_1_col_nan_dropped_matm[['Thickness (micron)', 'Temperature (°C)',  'Lattice parameter (nm)']]= raw_data_1_col_nan_dropped_matm.transform({
    'Thickness (micron)':micron_to_meter,
    'Temperature (°C)':celsius_to_kelvin,
    'Lattice parameter (nm)':nanometer_to_meter
}
)
rename_dict = {'Thickness (micron)':'Thickness (m)', 'Temperature (°C)':'Temperature (K)', 'Lattice parameter (nm)':'Lattice parameter (m)', 'Pressure exponent (n)':'Pressure exponent n', 'Hydrogen permeability (mol.m^-1.s^-1.Pa^-n)':'Permeability (mol/m/s/Pa^n)'}

raw_data_1_col_nan_dropped_matm=raw_data_1_col_nan_dropped_matm.rename(columns=rename_dict)
raw_data_1_col_nan_dropped_matm.head()

CPU times: user 15.2 ms, sys: 0 ns, total: 15.2 ms
Wall time: 43.7 ms


Unnamed: 0,Alloy Formula,Composition type,Form,Composition in mole percent,Membrane Formula,Thickness (m),Lattice parameter (m),Bravais lattice,Temperature (K),Fabrication technique,Pressure difference P1^n-P2^n (Pa^n),Pressure exponent n,Permeability (mol/m/s/Pa^n),General comments,Reference,DOI/URL,Composition_matrix,B,Al,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Ga,Y,Zr,Nb,Mo,Ru,Rh,Pd,Ag,In,Sn,La,Ce,Pr,Sm,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu,Ta,W,Ir,Pt,Au,Pb
0,Pd,mole percent,foil,Pd,Pd,0.00025,3.887e-10,fcc,737.15,Commercial Alfa Aesar,325.7194,0.5,9.24e-09,Membrane A see the supplementary materials in ...,"(Guerreiro et al., 2016)",https://doi.org/10.1016/j.memsci.2016.02.040,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Pd,mole percent,disc,Pd,Pd,0.0007,3.89e-10,fcc,673.15,Cold rolling,730.0685,0.5,1.32e-08,Table 1 data were extracted for equation 7. Th...,"(Sakamoto et al., 1992)",https://doi.org/10.1016/0925-8388(92)90468-O,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Pd,mole percent,disc,Pd,Pd,0.0007,3.89e-10,fcc,673.15,Cold rolling,632.4555,0.5,1.26e-08,Table 1 data were extracted for equation 7. Th...,"(Sakamoto et al., 1992)",https://doi.org/10.1016/0925-8388(92)90468-O,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Pd,mole percent,disc,Pd,Pd,0.0007,3.89e-10,fcc,673.15,Cold rolling,516.7204,0.5,1.26e-08,Table 1 data were extracted for equation 7. Th...,"(Sakamoto et al., 1992)",https://doi.org/10.1016/0925-8388(92)90468-O,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Pd,mole percent,disc,Pd,Pd,0.0007,3.89e-10,fcc,623.15,Cold rolling,816.7007,0.5,1.12e-08,Table 1 data were extracted for equation 7. Th...,"(Sakamoto et al., 1992)",https://doi.org/10.1016/0925-8388(92)90468-O,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
%%time
# Show the rows where the original composition string differs from the regenerated
# 'Membrane Formula' (string comparison, so formatting differences such as trailing
# zeros also count as a mismatch).
bool_mask = raw_data_1_col_nan_dropped_matm['Composition in mole percent']!=raw_data_1_col_nan_dropped_matm['Membrane Formula']
raw_data_1_col_nan_dropped_matm[bool_mask]

CPU times: user 2.51 ms, sys: 0 ns, total: 2.51 ms
Wall time: 5.04 ms


Unnamed: 0,Alloy Formula,Composition type,Form,Composition in mole percent,Membrane Formula,Thickness (m),Lattice parameter (m),Bravais lattice,Temperature (K),Fabrication technique,Pressure difference P1^n-P2^n (Pa^n),Pressure exponent n,Permeability (mol/m/s/Pa^n),General comments,Reference,DOI/URL,Composition_matrix,B,Al,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Ga,Y,Zr,Nb,Mo,Ru,Rh,Pd,Ag,In,Sn,La,Ce,Pr,Sm,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu,Ta,W,Ir,Pt,Au,Pb
175,Pd41.00Cu58.51Al0.49,mole percent,foil,Pd41.00Cu58.51Al0.49,Pd41.0Cu58.51Al0.49,2.5e-05,2.952e-10,bcc/fcc,873.15,Arc melting and rolling,130.3881,0.5,8.9e-09,The primary side pressure = 0.1 MPaG and the s...,JP2010072926A,https://patents.google.com/patent/JP2011202258...,"(Pd, Cu, Al)",0.0,0.0049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
176,Pd41.02Cu58.78Al0.20,mole percent,foil,Pd41.02Cu58.78Al0.20,Pd41.02Cu58.78Al0.2,2.5e-05,2.959e-10,bcc/fcc,873.15,Arc melting and rolling,130.3881,0.5,8.1e-09,The primary side pressure = 0.1 MPaG and the s...,JP2010072926A,https://patents.google.com/patent/JP2011202258...,"(Pd, Cu, Al)",0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
177,Pd41.0Cu57Al2.0,mole percent,foil,Pd41.0Cu57Al2.0,Pd41.0Cu57.0Al2.0,2.5e-05,2.914e-10,bcc/fcc,873.15,Cold rolling,315.2058,0.5,6.5e-09,The primary side was 0.3 MPaG and the secondar...,Japanese Patent JP 2012-201974,https://patents.google.com/patent/JP2012201974...,"(Pd, Cu, Al)",0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,Pd41.30Cu58.70,mole percent,foil,Pd41.30Cu58.70,Pd41.3Cu58.7,2.5e-05,2.964841e-10,bcc,873.15,Arc melting and rolling,130.3881,0.5,3.2e-09,The primary side pressure = 0.1 MPaG and the s...,JP2010072926A,https://patents.google.com/patent/JP2011202258...,"(Pd, Cu)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
184,Pd41.30Cu58.70,mole percent,foil,Pd41.30Cu58.70,Pd41.3Cu58.7,2.5e-05,2.964841e-10,bcc,623.53,Arc melting and rolling,130.3881,0.5,1.35e-08,The primary side pressure = 0.1 MPaG and the s...,JP2010072926A,https://patents.google.com/patent/JP2011202258...,"(Pd, Cu)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
185,Pd41.30Cu58.70,mole percent,foil,Pd41.30Cu58.70,Pd41.3Cu58.7,2.5e-05,2.964841e-10,bcc,773.52,Arc melting and rolling,130.3881,0.5,1.51e-08,The primary side pressure = 0.1 MPaG and the s...,JP2010072926A,https://patents.google.com/patent/JP2011202258...,"(Pd, Cu)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
186,Pd41.30Cu58.70,mole percent,foil,Pd41.30Cu58.70,Pd41.3Cu58.7,2.5e-05,2.964841e-10,bcc,673.17,Arc melting and rolling,130.3881,0.5,1.43e-08,The primary side pressure = 0.1 MPaG and the s...,JP2010072926A,https://patents.google.com/patent/JP2011202258...,"(Pd, Cu)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
187,Pd41.30Cu58.70,mole percent,foil,Pd41.30Cu58.70,Pd41.3Cu58.7,2.5e-05,2.964841e-10,bcc,723.17,Arc melting and rolling,130.3881,0.5,1.51e-08,The primary side pressure = 0.1 MPaG and the s...,JP2010072926A,https://patents.google.com/patent/JP2011202258...,"(Pd, Cu)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
188,Pd41.30Cu58.70,mole percent,foil,Pd41.30Cu58.70,Pd41.3Cu58.7,2.5e-05,2.964841e-10,bcc,873.52,Arc melting and rolling,130.3881,0.5,3.86e-09,The primary side pressure = 0.1 MPaG and the s...,JP2010072926A,https://patents.google.com/patent/JP2011202258...,"(Pd, Cu)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
205,Pd44.50Cu55.00Al0.5,mole percent,foil,Pd44.50Cu55.00Al0.5,Pd44.5Cu55.0Al0.5,2.5e-05,2.957e-10,bcc/fcc,873.15,Arc melting and rolling,130.3881,0.5,1.08e-08,The primary side pressure = 0.1 MPaG and the s...,JP2010072926A,https://patents.google.com/patent/JP2011202258...,"(Pd, Cu, Al)",0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.445,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
%%time
# Drop the 'Composition in mole percent' column: it carries the same information as
# 'Membrane Formula', and the latter is preferred because it was generated from the
# pymatgen composition object.
df9=raw_data_1_col_nan_dropped_matm.copy()
df9=df9.drop(columns=['Composition in mole percent'])
raw_data_1_col_nan_dropped_matm=df9

CPU times: user 2.45 ms, sys: 6 µs, total: 2.46 ms
Wall time: 2.4 ms


In [None]:
%%time
# Rename the 'Membrane Formula' column to 'Composition in mole percent',
# i.e. it takes over the name of the column dropped in the previous step.
raw_data_1_col_nan_dropped_matm=raw_data_1_col_nan_dropped_matm.rename(columns={'Membrane Formula':'Composition in mole percent'})

CPU times: user 1.63 ms, sys: 7 µs, total: 1.63 ms
Wall time: 4.72 ms


In [None]:
%%time
# Check a few rows to verify that everything worked as expected
raw_data_1_col_nan_dropped_matm.head()

CPU times: user 159 µs, sys: 0 ns, total: 159 µs
Wall time: 162 µs


Unnamed: 0,Alloy Formula,Composition type,Form,Composition in mole percent,Thickness (m),Lattice parameter (m),Bravais lattice,Temperature (K),Fabrication technique,Pressure difference P1^n-P2^n (Pa^n),Pressure exponent n,Permeability (mol/m/s/Pa^n),General comments,Reference,DOI/URL,Composition_matrix,B,Al,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Ga,Y,Zr,Nb,Mo,Ru,Rh,Pd,Ag,In,Sn,La,Ce,Pr,Sm,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu,Ta,W,Ir,Pt,Au,Pb
0,Pd,mole percent,foil,Pd,0.00025,3.887e-10,fcc,737.15,Commercial Alfa Aesar,325.7194,0.5,9.24e-09,Membrane A see the supplementary materials in ...,"(Guerreiro et al., 2016)",https://doi.org/10.1016/j.memsci.2016.02.040,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Pd,mole percent,disc,Pd,0.0007,3.89e-10,fcc,673.15,Cold rolling,730.0685,0.5,1.32e-08,Table 1 data were extracted for equation 7. Th...,"(Sakamoto et al., 1992)",https://doi.org/10.1016/0925-8388(92)90468-O,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Pd,mole percent,disc,Pd,0.0007,3.89e-10,fcc,673.15,Cold rolling,632.4555,0.5,1.26e-08,Table 1 data were extracted for equation 7. Th...,"(Sakamoto et al., 1992)",https://doi.org/10.1016/0925-8388(92)90468-O,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Pd,mole percent,disc,Pd,0.0007,3.89e-10,fcc,673.15,Cold rolling,516.7204,0.5,1.26e-08,Table 1 data were extracted for equation 7. Th...,"(Sakamoto et al., 1992)",https://doi.org/10.1016/0925-8388(92)90468-O,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Pd,mole percent,disc,Pd,0.0007,3.89e-10,fcc,623.15,Cold rolling,816.7007,0.5,1.12e-08,Table 1 data were extracted for equation 7. Th...,"(Sakamoto et al., 1992)",https://doi.org/10.1016/0925-8388(92)90468-O,(Pd),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Number of entries in the final composition column
raw_data_1_col_nan_dropped_matm['Composition in mole percent'].shape

(2223,)

In [None]:
# len(raw_data_1_col_nan_dropped_matm['Reference'].unique())

In [None]:
# Define your directory (I save using pickle because it conserved my datatype from notebooks to notebooks, while csv can't)
%%time

# Define filename
filename = 'Pd_membrane_data_preprocessed.pkl'

# Define save directory
out_path = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning'

# Build the full path safely
file_path = os.path.join(out_path, filename)

# Save the DataFrame
raw_data_1_col_nan_dropped_matm.to_pickle(file_path)

# Load it later
# df_loaded = pd.read_pickle(file_path)

CPU times: user 12.7 ms, sys: 3.02 ms, total: 15.7 ms
Wall time: 83.6 ms


In [None]:
# # Saving the dataset
# %%time
# # Define output path
# out_path = '/content/drive/MyDrive/PhD_Pd alloy prediction/Resut_Data_cleaning/Pd_Membrane_Preprocessed.csv'

# # Save DataFrame to CSV
# raw_data_1_col_nan_dropped_matm.to_csv(out_path, index=False)

# # Optional: Confirm it was saved
# print(f"Data successfully saved to: {out_path}")