# In [1]:
import pandas as pd
import numpy as np
import pickle
from pymatgen.core import Composition
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import ElementProperty

def get_reduced_formula(composition):
    """Return the reduced formula of *composition*, or NaN if it cannot be parsed.

    Args:
        composition: Value handed to ``pymatgen.core.Composition`` (in this
            file, a formula string taken from a DataFrame column).

    Returns:
        The ``reduced_formula`` of the parsed composition, or ``np.nan`` when
        parsing or reduction raises ``ValueError``. Any other exception
        propagates to the caller.
    """
    try:
        reduced = Composition(composition).reduced_formula
    except ValueError:
        # NaN (rather than raising) lets callers discard bad rows with dropna().
        return np.nan
    return reduced

def filter_data_on_composition(df, composition):
    """Return a copy of the rows whose 'new_composition' contains *composition*.

    Args:
        df: DataFrame with a string column named 'new_composition'.
        composition: Pattern to look for. It is passed unchanged to
            ``Series.str.contains``, so pandas' defaults apply (case-sensitive,
            interpreted as a regular expression).

    Returns:
        A new DataFrame (independent copy) holding only the matching rows.
        Rows whose 'new_composition' is missing are treated as non-matching.
    """
    # na=False: a missing cell would otherwise put NaN in the mask, and
    # indexing with a mask that contains NaN raises ValueError.
    mask = df['new_composition'].str.contains(composition, na=False)
    return df[mask].copy()

def apply_formula_reduction(df):
    """Add a 'reduced_formula' column and keep one valid row per reduced formula.

    The frame is modified IN PLACE: the new column is added to *df*, rows whose
    formula could not be reduced (NaN) are removed, and rows that repeat an
    earlier reduced formula are removed (the first occurrence is kept).

    Args:
        df: DataFrame with a 'new_composition' column.

    Returns:
        The same DataFrame object that was passed in, after the modifications.
    """
    reduced = df['new_composition'].apply(get_reduced_formula)
    df['reduced_formula'] = reduced

    # Unparseable formulas come back as NaN; remove them before de-duplicating.
    df.dropna(subset=['reduced_formula'], inplace=True)
    df.drop_duplicates(subset=['reduced_formula'], inplace=True)
    return df

def filter_out_matching_formulas(df_main, df_filter, column):
    """Remove the rows of *df_main* whose *column* value is a known reduced formula.

    Args:
        df_main: DataFrame to filter.
        df_filter: DataFrame providing a 'reduced_formula' column of values
            to exclude.
        column: Name of the column in *df_main* compared against
            ``df_filter['reduced_formula']``.

    Returns:
        The rows of *df_main* whose *column* value does not appear in
        ``df_filter['reduced_formula']``; the original index labels are kept.
    """
    already_known = df_main[column].isin(df_filter['reduced_formula'])
    keep = ~already_known
    return df_main.loc[keep]

def generate_temperature_data(start, stop, step):
    """Build a one-column DataFrame of evenly spaced temperatures.

    The values are ``start, start + step, start + 2*step, ...`` up to and
    including *stop* when *stop* lies on the grid, and never past it. This
    holds for fractional and negative steps as well as positive integer ones.

    Args:
        start: First temperature.
        stop: Last admissible temperature (inclusive bound).
        step: Non-zero spacing between consecutive temperatures.

    Returns:
        DataFrame with a single 'temperature' column. It is empty when *stop*
        cannot be reached from *start* in the direction of *step*.

    Raises:
        ValueError: If *step* is zero.
    """
    if step == 0:
        raise ValueError("step must be non-zero")

    # Count the grid points instead of padding the upper bound with "+ 1":
    # the padding overshoots `stop` for fractional steps and drops `stop` for
    # negative ones. The small tolerance absorbs float round-off such as
    # (0.3 - 0.0) / 0.1 == 2.9999999999999996.
    n_points = max(int(np.floor((stop - start) / step + 1e-9)) + 1, 0)
    return pd.DataFrame({'temperature': start + step * np.arange(n_points)})

def create_cartesian_product(df, temperature_df):
    """Pair every row of *df* with every row of *temperature_df*.

    Args:
        df: Left-hand DataFrame (e.g. one row per compound).
        temperature_df: Right-hand DataFrame (e.g. one row per temperature).

    Returns:
        A new DataFrame with ``len(df) * len(temperature_df)`` rows and a fresh
        0..n-1 index. Rows are ordered by *df* row, then by *temperature_df*
        row. Column names shared by both inputs get pandas' default
        ``_x`` / ``_y`` suffixes.
    """
    # A native cross join (pandas >= 1.2) replaces the temporary `key=1`
    # column trick, which overwrote and then dropped any real column that
    # happened to be called 'key'.
    return df.merge(temperature_df, how='cross')

def transform_and_featurize(df, column):
    """Featurize the formulas in *column* with matminer's "magpie" preset.

    Steps:
      1. ``StrToComposition`` parses the formula strings in *column*; the
         result is read back from a column named "composition".
      2. ``ElementProperty`` (preset "magpie") computes its features from that
         "composition" column.
      3. The intermediate "composition" column is removed again.

    Args:
        df: DataFrame containing the formula strings.
        column: Name of the column holding the formula strings.

    Returns:
        The featurized DataFrame without the "composition" column.
    """
    formula_parser = StrToComposition()
    with_compositions = formula_parser.featurize_dataframe(df, column)

    magpie = ElementProperty.from_preset(preset_name="magpie")
    featurized = magpie.featurize_dataframe(with_compositions, "composition")

    # The Composition objects were only needed as featurizer input.
    featurized.drop(columns=["composition"], inplace=True)
    return featurized

def load_model(filename):
    """Unpickle and return the object stored in *filename*.

    Args:
        filename: Path of the pickle file to read (opened in binary mode).

    Returns:
        Whatever object the pickle file contains.

    NOTE: unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

def predict(df, model):
    """Score every row of *df* with *model* and store the result in 'predicted'.

    The columns "Pretty Formula", "Density" and "Uranium Density" are left out
    of the model input; every other column of *df* is passed to
    ``model.predict``.

    Args:
        df: DataFrame containing the three columns above plus the feature
            columns. It is modified in place (a 'predicted' column is added).
        model: Object exposing ``predict(X)`` that returns one value per row.

    Returns:
        The same DataFrame object, now carrying the 'predicted' column.
    """
    non_feature_columns = ["Pretty Formula", "Density", "Uranium Density"]
    features = df.drop(columns=non_feature_columns)
    df["predicted"] = model.predict(features)
    return df

def filter_on_prediction(df, value):
    """Return the rows of *df* whose 'predicted' entry equals *value*.

    Args:
        df: DataFrame with a 'predicted' column.
        value: Prediction value to select.

    Returns:
        The matching rows, with their original index labels.
    """
    is_selected = df['predicted'] == value
    return df.loc[is_selected]

def load_melting_point_dict(csv_file):
    """Read a melting-point table from CSV into a plain dictionary.

    Args:
        csv_file: Path of a CSV file with the columns 'Element' and
            'Melting Point'.

    Returns:
        dict mapping each 'Element' entry to its 'Melting Point' entry.
    """
    table = pd.read_csv(csv_file)
    by_element = table.set_index('Element')
    melting_points = by_element['Melting Point']
    return melting_points.to_dict()

def filter_on_melting_point(df, melting_dict, threshold):
    """Drop rows whose formula contains an element that melts below *threshold*.

    Args:
        df: DataFrame with a 'Pretty Formula' column of formula strings.
        melting_dict: Mapping from element symbol to melting point.
        threshold: Minimum acceptable melting point, in the same unit as the
            values of *melting_dict*.

    Returns:
        A new DataFrame containing only the rows in which no element has a
        melting point strictly below *threshold*. Elements missing from
        *melting_dict* are treated as passing the check.
    """
    # The same formula usually occurs once per temperature, so parse each
    # distinct formula only once.
    verdict_by_formula = {}

    def has_low_melting_element(formula):
        if formula not in verdict_by_formula:
            symbols = (str(element) for element in Composition(formula).elements)
            # Unknown elements default to threshold + 1, i.e. they never fail.
            verdict_by_formula[formula] = any(
                melting_dict.get(symbol, threshold + 1) < threshold
                for symbol in symbols
            )
        return verdict_by_formula[formula]

    # Select rows by position with a boolean mask instead of dropping index
    # labels: label-based dropping also removes unrelated rows that share a
    # label when the index is not unique.
    keep = np.fromiter(
        (not has_low_melting_element(formula) for formula in df['Pretty Formula']),
        dtype=bool,
        count=len(df),
    )
    return df.loc[keep].copy()

def main():
    """Run the full screening pipeline and write the two candidate CSV files.

    Pipeline:
      1. Load the training data, keep compositions containing 'U', and reduce
         them to one row per reduced formula.
      2. Load the prediction data and remove compounds already present in the
         training set.
      3. Combine every remaining compound with temperatures 300..1000 in
         steps of 100.
      4. Featurize the formulas, load the pickled model and predict.
      5. Keep rows predicted as 2 (presumably the target class; confirm
         against the model's label encoding) and drop compounds containing an
         element whose melting point is below 1500 (unit as given in the
         melting-point CSV).
      6. Write the ranked candidates with and without the temperature column.
    """
    # Training compounds: uranium-bearing, one row per reduced formula.
    known_compounds = apply_formula_reduction(
        filter_data_on_composition(
            pd.read_csv("data/training_data_with_composition.csv"), 'U'
        )
    )

    # Prediction pool: everything the model has not already seen in training.
    candidates = pd.read_csv("data/prediction_data/mp_uranium_compounds.csv")
    candidates = filter_out_matching_formulas(candidates, known_compounds, 'Pretty Formula')

    # One row per (compound, temperature) pair.
    temperature_grid = generate_temperature_data(300, 1000, 100)
    candidates = create_cartesian_product(candidates, temperature_grid)

    # Featurize, then rename the temperature column to 'Temperature'.
    featurized = transform_and_featurize(candidates, 'Pretty Formula')
    featurized = featurized.rename(columns={'temperature': 'Temperature'})

    # Score every (compound, temperature) row.
    scored = predict(featurized, load_model('smote_model/smote_model.pkl'))

    # Only the identifying and reporting columns are needed from here on.
    report_columns = ["Pretty Formula","Temperature","predicted", "Density", "Uranium Density"]
    shortlisted = filter_on_prediction(scored[report_columns], 2)

    melting_points = load_melting_point_dict("data/prediction_data/melting_point_metals.csv")
    survivors = filter_on_melting_point(shortlisted, melting_points, 1500)
    survivors = survivors.drop(columns=["Density"])

    # Highest uranium density first; within equal density, lowest temperature first.
    ranked = survivors.sort_values(by=['Uranium Density', 'Temperature'], ascending=[False, True])
    # Output final candidates with temperature
    ranked.to_csv("final_candidates_with_temperature.csv",index=False)

    # Output final candidates without temperature: first-ranked row per formula.
    best_per_formula = ranked.drop_duplicates(subset='Pretty Formula').drop(columns=["Temperature"])
    best_per_formula.to_csv("final_candidates_without_temperature.csv",index=False)

# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# StrToComposition:   0%|          | 0/6192 [00:00<?, ?it/s]

# ElementProperty:   0%|          | 0/6192 [00:00<?, ?it/s]