### Beginning with pandas df

In [None]:
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')

data = pd.read_csv(r'/Users/emiljaffal/Desktop/input.txt', header=None)
data.columns = ['Compound']
data

## splitting into binary & ternary

Found the following to help define parsing compounds: 
    
    https://stackoverflow.com/questions/9782835/break-string-into-list-elements-based-on-keywords
    
Retrieved:

    print(re.findall(r'[A-Z][a-z]*|\d+', re.sub('[A-Z][a-z]*(?![\da-z])', r'\g<0>1', molecule)))
 
Applying to a dataframe gave an error as string/bytes was the only acceptable input based on above.

In [2]:
import re

df = pd.DataFrame(data)

# Setup new dataframes
binary = pd.DataFrame(columns=['Compound'])
ternary = pd.DataFrame(columns=['Compound'])

# Define fxn to count elements in a compound
def count_elements(compound_str):
    # From above
    elements = re.findall(r'[A-Z][a-z]*\d*|\d+', re.sub(r'([A-Z][a-z]*)', r'\g<0>1', compound_str))
    return len(elements)

# Filter rows
binary['Compound'] = df[df['Compound'].apply(count_elements) == 2]['Compound']
ternary['Compound'] = df[df['Compound'].apply(count_elements) == 3]['Compound']

Found a function to view pandas dataframes side by side:

    https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side

In [3]:
from IPython.display import display_html
from itertools import chain,cycle
def display_side_by_side(*args,titles=cycle([''])):
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:center"><td style="vertical-align:top">'
        html_str+=f'<h2 style="text-align: center;">{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)

In [4]:
display_side_by_side(binary,ternary, titles=['binary','ternary'])

Unnamed: 0,Compound
3,GaPd
4,Y5In4

Unnamed: 0,Compound
0,ScPtGe
1,Rh3Ru7Th2
2,Ga2Os9Ce
5,Gd3In5Fe2


### Parse the formula (element - index, element - index...)

Ended up getting stuck here, so I deferred to chatGPT. Never really defined functions so this is a weak point of mine.

In [27]:
# Define a function to parse compound-index pairs
def parse_compound_index(compound_str):
    # This line parses the compound string into elements and indices
    compounds = re.findall(r'([A-Z][a-z]*)(\d*)', compound_str)
    
    # Create lists to store element names and indices
    elements = []
    indices = []
    
    # Iterate over the parsed compounds
    for compound, index in compounds:
        # Append element and index to the respective lists
        elements.append(compound)
        indices.append(index if index else '1')  # Use '1' if index is empty, meaning 1 element in compound
    
    # Create an empty dictionary with the separate columns
    parsed_dict = {}
    for i, (element, index) in enumerate(zip(elements, indices), start=1): #start count of e/i @ 1
        parsed_dict[f'element_{i}'] = element # element [1]
        parsed_dict[f'index_{i}'] = index #index [1]
    
    return parsed_dict

# Apply the function to the 'Compound' column and add new columns to the DataFrames
parsed_binary = pd.concat([binary['Compound'],
                           binary['Compound'].apply(parse_compound_index).apply(pd.Series)],
                          axis=1)

parsed_ternary = pd.concat([ternary['Compound'],
                           ternary['Compound'].apply(parse_compound_index).apply(pd.Series)],
                          axis=1)

In [101]:
display_side_by_side(parsed_binary,parsed_ternary, titles=['P. binary','P. ternary'])

Unnamed: 0,Compound,element_1,index_1,element_2,index_2,reordered_compound
3,GaPd,Ga,1,Pd,1,Ga1Pd1
4,Y5In4,Y,5,In,4,In4Y5

Unnamed: 0,Compound,element_1,index_1,element_2,index_2,element_3,index_3,reordered_compound
0,ScPtGe,Sc,1,Pt,1,Ge,1,Sc1Pt1Ge1
1,Rh3Ru7Th2,Rh,3,Ru,7,Th,2,Th2Rh3Ru7
2,Ga2Os9Ce,Ga,2,Os,9,Ce,1,Ce1Ga2Os9
5,Gd3In5Fe2,Gd,3,In,5,Fe,2,Fe2Gd3In5


### Rearrange the formula based on the index value (small to large)

In [97]:
# Function to reorder the compound formula
def reorder_compound(row):
    elements = [row['element_1'], row['element_2']]
    indices = [row['index_1'], row['index_2']]
    reordered_elements = [element + str(index) for index, element in sorted(zip(indices, elements), key=lambda x: x[0])]
    return ''.join(reordered_elements)

# Create a new column 'reordered_compound'
parsed_binary['reordered_compound'] = parsed_binary.apply(reorder_compound, axis=1)

# Rename the DataFrame to parsed_binary_index
parsed_binary_index = parsed_binary.rename(columns={'reordered_compound': 'parsed_binary_index'})

In [98]:
# Function to reorder the compound formula for parsed_ternary
def reorder_compound_ternary(row):
    elements = [row['element_1'], row['element_2'], row['element_3']]
    indices = [row['index_1'], row['index_2'], row['index_3']]
    reordered_elements = [element + str(index) for index, element in sorted(zip(indices, elements), key=lambda x: x[0])]
    return ''.join(reordered_elements)

# Create a new column 'reordered_compound' for parsed_ternary
parsed_ternary['reordered_compound'] = parsed_ternary.apply(reorder_compound_ternary, axis=1)

# Rename the DataFrame to parsed_ternary_index
parsed_ternary_index = parsed_ternary.rename(columns={'reordered_compound': 'parsed_ternary_index'})

In [100]:
display_side_by_side(parsed_binary_index,parsed_ternary_index, titles=['P.I. binary','P.I. ternary'])

Unnamed: 0,Compound,element_1,index_1,element_2,index_2,parsed_binary_index
3,GaPd,Ga,1,Pd,1,Ga1Pd1
4,Y5In4,Y,5,In,4,In4Y5

Unnamed: 0,Compound,element_1,index_1,element_2,index_2,element_3,index_3,parsed_ternary_index
0,ScPtGe,Sc,1,Pt,1,Ge,1,Sc1Pt1Ge1
1,Rh3Ru7Th2,Rh,3,Ru,7,Th,2,Th2Rh3Ru7
2,Ga2Os9Ce,Ga,2,Os,9,Ce,1,Ce1Ga2Os9
5,Gd3In5Fe2,Gd,3,In,5,Fe,2,Fe2Gd3In5


### Rearrange the formula based on EN values (from the least electronegative to the most electronegative elements, using the file with element data provided)

Following API:
    
    https://hackingmaterials.lbl.gov/matminer/installation.html
    
Getting the electronegativities was simple enough, but the API was difficult to follow along with

In [31]:
from matminer.featurizers.composition.composite import ElementProperty
from pymatgen.core import Composition

# Define composition
composition_str = "GaPd"
composition = Composition(composition_str)

# Initialize the ElementProperty featurizer
ep_featurizer = ElementProperty.from_preset(preset_name="magpie")

# Get the electronegativity for each element in the composition
electronegativity_values = ep_featurizer.featurize(composition)

# Display the results
for element, electronegativity in zip(composition.elements, electronegativity_values):
    print(f"{element.symbol}: {electronegativity}")

Ga: 31.0
Pd: 46.0


The rest was difficult, so I once again opted to use chatGPT after some trouble in searching specifics on reordering a column based on certain corresponding rows on stackflow

In [103]:
from pymatgen.core import Element

# Initialize the ElementProperty featurizer
ep_featurizer = ElementProperty.from_preset(preset_name="magpie")

# Define a function to get electronegativity values
def get_electronegativity(row):
    element_1 = Element(row['element_1'])
    element_2 = Element(row['element_2'])
    comp = Composition({element_1.symbol: 1, element_2.symbol: 1})
    en_1 = ep_featurizer.featurize(comp)[0]
    en_2 = ep_featurizer.featurize(comp)[1]
    return en_1, en_2

# Apply the function to each row
en_values1 = parsed_binary_sort_index.apply(get_electronegativity, axis=1, result_type='expand')

# Rename the columns
en_values1.columns = ['EN_1', 'EN_2']

# Concatenate the DataFrames and reorder columns
parsed_binary_sort_index1 = pd.concat([parsed_binary_sort_index, en_values1], axis=1)[['Compound', 'element_1', 'EN_1', 'index_1', 'element_2', 'EN_2', 'index_2']]

In [105]:
# Initialize the ElementProperty featurizer
ep_featurizer = ElementProperty.from_preset(preset_name="magpie")

# Define a function to get electronegativity values
def get_electronegativity(row, element_col, index_col):
    element = Element(row[element_col])
    comp = Composition({element.symbol: 1})
    en = ep_featurizer.featurize(comp)[0]
    return en

# Create electronegativity columns dynamically
for i in range(1, 4):  # Assuming up to 3 elements
    element_col = f'element_{i}'
    index_col = f'index_{i}'
    en_col = f'EN_{i}'
    parsed_ternary_sort_index[en_col] = parsed_ternary_sort_index.apply(get_electronegativity, axis=1, element_col=element_col, index_col=index_col)

# Reorder columns
column_order = ['Compound']
for i in range(1, 4):
    column_order.extend([f'element_{i}', f'EN_{i}', f'index_{i}'])
column_order.extend(list(parsed_ternary_sort_index.columns[10:]))  # Adjust the index range based on the number of additional columns

parsed_ternary_en = parsed_ternary_sort_index[column_order]

Getting ChatGPT to reorder was also somewhat difficult

In [106]:
# Create a function to reorder elements based on electronegativity
def reorder_compound(row):
    elements = [f"EN_{i}" for i in range(1, 4)]
    elements_with_values = [(element, row[element], f'element_{i}', f'index_{i}') for i, element in enumerate(elements, start=1) if not pd.isnull(row[element])]
    sorted_elements = sorted(elements_with_values, key=lambda x: (x[1], x[3]))  # Sort by electronegativity, preserving original order when values are equal
    reordered_compound = ''.join([row[element] + str(int(row[index])) for _, _, element, index in sorted_elements])
    return reordered_compound

# Apply the function to create the 'reordered' column
parsed_ternary_en['reordered'] = parsed_ternary_en.apply(reorder_compound, axis=1)

# Reorder columns
column_order_with_reordered = list(parsed_ternary_en.columns[:-1]) + ['reordered']
parsed_ternary_sort_en = parsed_ternary_en[column_order_with_reordered]

In [107]:
display_side_by_side(parsed_binary_sort_index1,parsed_ternary_sort_en, titles=['P.E.N. binary','P.E.N. ternary'])

Unnamed: 0,Compound,element_1,EN_1,index_1,element_2,EN_2,index_2
4,Y5In4,Y,39.0,5,In,49.0,4
3,GaPd,Ga,31.0,1,Pd,46.0,1

Unnamed: 0,Compound,element_1,EN_1,index_1,element_2,EN_2,index_2,element_3,EN_3,index_3,reordered
1,Rh3Ru7Th2,Rh,45.0,3,Ru,44.0,7,Th,90.0,2,Ru7Rh3Th2
5,Gd3In5Fe2,Gd,64.0,3,In,49.0,5,Fe,26.0,2,Fe2In5Gd3
2,Ga2Os9Ce,Ga,31.0,2,Os,76.0,9,Ce,58.0,1,Ga2Ce1Os9
0,ScPtGe,Sc,21.0,1,Pt,78.0,1,Ge,32.0,1,Sc1Ge1Pt1


Below was an attempt at manually inputting the electronegativities as it wasn't working with matminer earlier, also not sure what methodology they're using above? Below are Pauling electronegativities:

In [12]:
# Define a dictionary with electronegativity values for the first 50 elements
electronegativity_first50 = {
    "H": 2.20,
    "He": 0,
    "Li": 0.98,
    "Be": 1.57,
    "B": 2.04,
    "C": 2.55,
    "N": 3.04,
    "O": 3.50,
    "F": 3.98,
    "Ne": 0,
    "Na": 0.93,
    "Mg": 1.31,
    "Al": 1.61,
    "Si": 1.90,
    "P": 2.19,
    "S": 2.58,
    "Cl": 3.16,
    "Ar": 0,
    "K": 0.82,
    "Ca": 1.00,
    "Sc": 1.36,
    "Ti": 1.54,
    "V": 1.63,
    "Cr": 1.66,
    "Mn": 1.55,
    "Fe": 1.83,
    "Co": 1.88,
    "Ni": 1.91,
    "Cu": 1.90,
    "Zn": 1.65,
    "Ga": 1.81,
    "Ge": 2.01,
    "As": 2.18,
    "Se": 2.55,
    "Br": 2.96,
    "Kr": 3.00,
    "Rb": 0.82,
    "Sr": 0.95,
    "Y": 1.22,
    "Zr": 1.33,
    "Nb": 1.60,
    "Mo": 2.16,
    "Tc": 1.90,
    "Ru": 2.20,
    "Rh": 2.28,
    "Pd": 2.20,
    "Ag": 1.93,
    "Cd": 1.69,
    "In": 1.78,
    "Sn": 1.96,
    "Sb": 2.05,
}

# Define composition
composition_str = "GaPd"
composition = Composition(composition_str)

# Get the electronegativity for each element in the composition
electronegativity_values = [electronegativity_first50.get(element.symbol, None) for element in composition.elements]

# Display the results
for element, electronegativity in zip(composition.elements, electronegativity_values):
    print(f"{element.symbol}: {electronegativity}")

Ga: 1.81
Pd: 2.2
