In [1]:
import pandas as pd
import numpy as np
import re
import ast
import collections
from operator import itemgetter
from pymatgen.core.periodic_table import Element

# Get a list of all element symbols from pymatgen
elements_list = [str(e) for e in Element]

# Function to check whether a composition is invalid.
# An invalid composition is one that is NaN, not a string, or an empty string.
def is_invalid_composition(x):
    """Return True when ``x`` cannot be used as a composition string.

    A value is rejected when it is missing (as judged by ``pd.isnull``),
    when it is not a ``str``, or when it holds nothing but whitespace.
    """
    if pd.isnull(x):
        return True
    if not isinstance(x, str):
        return True
    # Only real strings reach this point; blank ones are invalid too.
    return x.strip() == ""

# Function to extract category from sample information.
# If the sample information is a dict and has the 'Form' key, the category is extracted from the 'Form' dict.
# If the category is not available, the function returns None.
def extract_category(x):
    """Return the ``Form`` -> ``category`` value stored in a sample-info cell.

    Parameters:
        x: The raw cell value. It is expected to be the textual form of a
            Python literal dict such as ``"{'Form': {'category': 'Powder'}}"``.

    Returns:
        The value under ``['Form']['category']`` when the cell parses to a
        dict with that structure, otherwise ``None`` (missing cell, empty
        string, unparsable text, or a different structure).
    """
    if pd.isnull(x) or x == '':
        return None
    try:
        sample_dict = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval raises any of these for malformed input; free-text
        # cells in particular raise SyntaxError. Treat them as "no category"
        # rather than aborting the whole column.
        return None
    if not isinstance(sample_dict, dict):
        return None
    form_dict = sample_dict.get('Form')
    if isinstance(form_dict, dict):
        return form_dict.get('category')
    return None

# Function to check for special symbols in a composition.
# Returns True if a special symbol is found.
def check_special_symbols(composition):
    """Tell whether ``composition`` holds a character outside the allowed set.

    Allowed characters are ASCII letters, digits, the dot and round
    parentheses. Returns True as soon as any other character is present.
    """
    offending = re.search(r'[^A-Za-z0-9\.()]', composition)
    return offending is not None

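# Function to flag rows that should be removed (the caller does the actual filtering).
# Returns True when a lowercase letter directly follows a digit or a closing parenthesis,
# i.e. a lowercase letter that is not attached to an uppercase letter and so cannot belong to an element symbol.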
def remove_rows(composition):
    """Flag compositions with a lowercase letter right after a digit or ')'.

    Returns True when such a letter exists (the row is meant to be dropped
    by the caller) and False otherwise.
    """
    stray_lowercase = re.search(r'(?<=[0-9\)])(?<![A-Za-z])[a-z]', composition)
    return stray_lowercase is not None

# Function to check if parentheses in a composition are balanced.
# Returns NaN if the parentheses are unbalanced.
def check_parentheses_balance(composition):
    """Return ``composition`` if its round parentheses are balanced, else NaN.

    Balanced means every ')' closes an earlier '(' and no '(' is left open.
    Comparing the two counts alone is not enough: a string such as ")Fe("
    has equal counts but is not a valid grouping.

    Parameters:
        composition: The composition string to check.

    Returns:
        The unchanged string when balanced, ``np.nan`` otherwise.
    """
    depth = 0
    for char in composition:
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
            if depth < 0:
                # A ')' appeared before its matching '('.
                return np.nan
    return composition if depth == 0 else np.nan

# Function to assign a coefficient of 1 where no coefficient is provided in a composition.
# Assigns 1 to elements followed by another element, a parenthesis, or the end of the string.
# Assigns 1 to closing parentheses followed by another parenthesis (opening or closing), an element, or the end of the string.
def assign_one(composition):
    """Insert an explicit ``1`` wherever a count or multiplier is implied.

    Parameters:
        composition: A composition string, or a non-string placeholder
            (NaN / None) left by an earlier cleaning step.

    Returns:
        The string with ``1`` appended to every element symbol and every
        closing parenthesis that is directly followed by an uppercase
        letter, a parenthesis, or the end of the string. Non-string input
        is returned unchanged.
    """
    # Test the type rather than `is np.nan`: NaN values coming back out of
    # pandas are not guaranteed to be the np.nan singleton, and identity
    # would then let a float reach re.sub and raise TypeError.
    if not isinstance(composition, str):
        return composition
    # Element symbol with no count after it -> count of 1.
    composition = re.sub(r"([A-Z][a-z]*)(?=[A-Z()]|$)", r"\g<1>1", composition)
    # Closing parenthesis with no multiplier after it -> multiplier of 1.
    composition = re.sub(r"(?<=\))(?=[A-Z()]|$)", "1", composition)
    return composition

# Function to validate the composition based on a provided list of elements. 
# It checks whether all elements in the composition are in the provided list,
# verifies that all coefficients and multipliers are valid non-zero numbers, 
# and returns NaN for invalid compositions.
def _is_nonzero_number(text):
    """Return True when ``text`` parses as a number different from zero."""
    try:
        return float(text) != 0
    except ValueError:
        # Empty text or something like "1.2.3".
        return False


def check_composition_validity(composition, elements_list):
    """Return ``composition`` if it is well formed, otherwise ``np.nan``.

    Parameters:
        composition: Composition string in which every element and every
            closing parenthesis is expected to carry an explicit number.
            Non-string values (NaN, None, ...) are treated as invalid.
        elements_list: Container of accepted element symbols.

    Returns:
        The unchanged string when every letter run is a known element
        followed by a parsable non-zero number and every ')' is followed by
        a parsable non-zero number; ``np.nan`` in all other cases.
    """
    # Type test instead of `is np.nan`: a NaN that is not the np.nan
    # singleton would otherwise reach re.findall and raise TypeError.
    if not isinstance(composition, str):
        return np.nan
    for element, coefficient in re.findall(r"([A-Za-z]+)([\d\.]*)", composition):
        if element not in elements_list or not _is_nonzero_number(coefficient):
            return np.nan
    for multiplier in re.findall(r"\)([\d\.]*)", composition):
        if not _is_nonzero_number(multiplier):
            return np.nan
    return composition

# Function to expand brackets in a composition.
# It repeatedly searches for the rightmost opening parenthesis and expands the elements and counts within the parentheses.
def expand_brackets(composition):
    """Multiply out every parenthesised group of a composition string.

    The rightmost '(' is always an innermost group, so groups are expanded
    from the inside out until no parenthesis is left.

    Parameters:
        composition: Composition string such as ``"(Fe1O1)2Si1"``.
            Non-string values (NaN, None, ...) are treated as invalid.

    Returns:
        The flattened string (``"Fe2.0O2.0Si1"`` for the example above),
        or ``np.nan`` when the input is not a string or a '(' has no
        matching ')'. A group without a multiplier directly after its ')'
        and an element without a count are both taken as 1.
    """
    if not isinstance(composition, str):
        return np.nan

    number = r"\d+\.?\d*|\.\d+"
    multiplier_re = re.compile(number)
    element_re = re.compile(rf"([A-Z][a-z]*)({number})?")

    while True:
        opening_index = composition.rfind('(')
        if opening_index == -1:
            return composition
        closing_index = composition.find(')', opening_index)
        if closing_index == -1:
            return np.nan

        # The multiplier must sit immediately after ')'; a number further
        # along the string belongs to something else.
        tail = composition[closing_index + 1:]
        match = multiplier_re.match(tail)
        if match:
            multiplier = float(match.group())
            tail = tail[match.end():]
        else:
            multiplier = 1.0

        inner = composition[opening_index + 1:closing_index]
        # Positional formatting keeps tiny/huge products out of scientific
        # notation ("1e-05"), which the downstream regexes cannot read.
        expanded = ''.join(
            element
            + np.format_float_positional(float(count or 1) * multiplier, trim='0')
            for element, count in element_re.findall(inner)
        )
        composition = composition[:opening_index] + expanded + tail

# Function to merge duplicate elements in a formula.
# It sums the counts of duplicate elements and returns a new formula.
def merge_formula(formula):
    """Sum the counts of repeated elements and rebuild the formula.

    Parameters:
        formula: Flat composition string made of ``<Element><number>``
            tokens, e.g. ``"Fe1O1Fe2"``.

    Returns:
        A string with one ``<Element><count>`` token per distinct element,
        in order of first appearance, e.g. ``"Fe3.0O1.0"``. Counts are
        written in positional notation so that very small or very large
        totals never appear as ``1e-05``-style text, which element/number
        regexes would read as a count of 1.
    """
    element_counts = collections.defaultdict(float)
    for element, count in re.findall(r"([A-Z][a-z]*)([\d\.]+)", formula):
        element_counts[element] += float(count)
    return ''.join(
        f"{element}{np.format_float_positional(count, trim='0')}"
        for element, count in element_counts.items()
    )

# Function to extract two main elements from the composition.
# The main elements are the ones with the highest counts in the composition.
# If there are more than two elements, the two elements with the highest counts are returned in alphabetical order.
def extract_main_elements(composition):
    """Return the one or two most abundant elements of a composition.

    Parameters:
        composition: Composition string made of element symbols, each
            optionally followed by a number; a missing number counts as 1.

    Returns:
        The two elements with the highest counts, sorted alphabetically and
        joined by a single space; the lone symbol when only one element is
        present; an empty string when no element symbol is found (instead
        of raising ``IndexError``). Ties keep the order of appearance.
    """
    parsed = [
        (symbol, float(amount) if amount else 1.0)
        for symbol, amount in re.findall(r'([A-Z][a-z]*)([\d\.]*)', composition)
    ]
    if not parsed:
        return ''
    # sorted() is stable, so equal counts stay in their original order.
    ranked = sorted(parsed, key=itemgetter(1), reverse=True)
    top_two = [symbol for symbol, _ in ranked[:2]]
    return ' '.join(sorted(top_two))

# Function for checking if the composition contains all elements in a given set.
# Returns True if the composition contains all of "U", "O", "F", "P". This is used to remove certain unparsable compounds.
def contains_fission_products(composition):
    """Return True when U, O, F and P all occur as elements of ``composition``.

    Used to drop entries where an "FP" label was read as the elements F and
    P. Symbols are compared as whole tokens: a plain substring test would
    count "Fe" as F and "Pu"/"Pb" as P and wrongly flag, for example, a
    U-Pu-Fe oxide.

    Parameters:
        composition: Composition string of element symbols and numbers.

    Returns:
        True if the set of element symbols includes all of U, O, F and P.
    """
    symbols = set(re.findall(r'[A-Z][a-z]*', composition))
    return {'U', 'O', 'F', 'P'} <= symbols

# Stage 1: load the raw table and reduce the two conductivity columns to one.
# Load the CSV file
df = pd.read_csv("20230112_interpolated_data.csv")

# Select the columns to keep
df = df[['composition', 'sampleinfo', 'Temperature', "Thermal conductivity", "total thermal conductivity"]]

# Keep only rows whose Temperature lies in the closed interval [300, 1000]
df = df[(df['Temperature'] >= 300) & (df['Temperature'] <= 1000)]

# Merge conductivity columns:
#   - both present and different -> the larger of the two values
#   - otherwise -> 'total thermal conductivity' if 'Thermal conductivity'
#     is missing/empty, else 'Thermal conductivity'
# NOTE(review): the comparisons against '' only matter if a column was read
# as text; np.maximum presumably expects numeric columns - confirm dtypes.
# NOTE(review): df is a filtered selection at this point, so this column
# assignment may emit pandas' SettingWithCopyWarning - confirm.
df['Merged conductivity'] = np.where(
    (~df['Thermal conductivity'].isna()) & (df['Thermal conductivity'] != '') &
    (~df['total thermal conductivity'].isna()) & (df['total thermal conductivity'] != '') &
    (df['Thermal conductivity'] != df['total thermal conductivity']),
    np.maximum(df['Thermal conductivity'], df['total thermal conductivity']),
    np.where(df['Thermal conductivity'].isna() | (df['Thermal conductivity'] == ''),
             df['total thermal conductivity'],
             df['Thermal conductivity']
            )
)

# Drop the two source columns now that they are merged
df = df.drop(["Thermal conductivity","total thermal conductivity"], axis=1)

# Drop rows with missing 'Merged conductivity' values
df = df.dropna(subset=['Merged conductivity'])

# Remove rows with empty string and negative or zero conductivity
df = df[df['Merged conductivity'] != '']
df = df[df['Merged conductivity'].astype(float) > 0]
df = df.reset_index(drop=True)

# Stage 2: clean the 'composition' strings and drop rows that cannot be used.
# Discard rows whose composition is missing, not a string, or blank
invalid_mask = df['composition'].apply(is_invalid_composition)
df = df[~invalid_mask].reset_index(drop=True)

# Pull the sample-form category out of 'sampleinfo' (None when unavailable)
df['Category'] = df['sampleinfo'].apply(extract_category)

# Drop rows whose category is in the exclusion list; rows without a
# category are kept because None is not in the list
categories_to_remove = ["SingleCrystal", 'Powder', 'Ribbon', 'Film', "multilayer film", 'EpitaxialFilm', 'FoamedBulk', 'Aerogel', 'Coating']
mask = df['Category'].isin(categories_to_remove)
df = df[~mask]

# 'sampleinfo' and 'Category' are not needed after the filter
df = df.drop(["sampleinfo","Category"], axis=1)

# Strip all whitespace characters from the composition strings
df['composition'] = df['composition'].str.replace(r'\s', '', regex=True)

# Drop rows with disallowed characters or stray lowercase letters
df = df[~df['composition'].apply(check_special_symbols)]
df = df[~df['composition'].apply(remove_rows)]
# From here on invalid compositions are marked as NaN rather than dropped;
# the dropna on 'new_composition' below removes them in one go.
# NOTE(review): df is a filtered selection here, so the assignments below
# may emit pandas' SettingWithCopyWarning - confirm.
df['composition'] = df['composition'].apply(check_parentheses_balance)
df['composition'] = df['composition'].apply(assign_one)

# Validate element symbols and numbers against the pymatgen element list
df['composition_with_ones'] = df['composition'].apply(lambda x: check_composition_validity(x, elements_list))

# Multiply out parenthesised groups
df['new_composition'] = df['composition_with_ones'].apply(expand_brackets)

# Remove every row that was marked NaN by one of the steps above
df = df.dropna(subset=['new_composition'])

# Sum the counts of elements that occur more than once
df['new_composition'] = df['new_composition'].apply(merge_formula)

# Keep only the final cleaned composition column
df = df.drop(["composition","composition_with_ones"], axis=1)

# Discard rows whose merged conductivity is above 500
df = df[df['Merged conductivity'] <= 500]

from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import ElementProperty

# Stage 3: featurize the cleaned compositions, label them, and export.
# Convert the composition from string to pymatgen Composition object
# (the result is read from a 'composition' column in the next step)
str_to_comp = StrToComposition()
df = str_to_comp.featurize_dataframe(df, 'new_composition')

# Calculate Magpie features from the Composition object
ep = ElementProperty.from_preset(preset_name="magpie")
df = ep.featurize_dataframe(df, "composition")

# Bin 'Merged conductivity' into three classes. pd.cut uses right-closed
# intervals by default: (-inf, 5] -> '0', (5, 15] -> '1', (15, inf) -> '2'
bins = [-np.inf, 5, 15, np.inf]  # Bin edges
labels = ['0', '1', '2']  # The class labels
df['class'] = pd.cut(df['Merged conductivity'], bins=bins, labels=labels)

# Apply the function to the 'new_composition' column to create a new column 'main_elements'
df['main_elements'] = df['new_composition'].apply(extract_main_elements)

# Drop rows flagged by contains_fission_products.
# NOTE(review): this filter runs after featurization, so the featurizers
# above also process rows that are discarded here.
df = df[~df['new_composition'].apply(contains_fission_products)]

# First export: the target value and the Composition objects are removed,
# the 'new_composition' string column is kept
df = df.drop(["Merged conductivity","composition"],axis=1)
df.to_csv("training_data_with_composition.csv",index=False)

# Second export: same table without the 'new_composition' column
df = df.drop(["new_composition"],axis = 1)
df.to_csv("training_data.csv",index=False)

StrToComposition:   0%|          | 0/168996 [00:00<?, ?it/s]

ElementProperty:   0%|          | 0/168996 [00:00<?, ?it/s]