In [7]:
# 1st Cell
import polyfingerprints as pfp
import pandas as pd
import numpy as np
from wrapconfig import YAMLWrapConfig
import os
from polyfingerprints import models as pfp_models
import torch

In [14]:
# 2nd Cell: Specify the model details and the inference data source.
# Everything a user normally needs to change lives in this cell.

# Root output directory; trained models are stored in its "models" subfolder.
OUTDIR = "out"
# Name of the model, i.e. the subfolder of BASEMODEL_DIR that is loaded later on.
MODELNAME = "testmodel"
# CSV file holding the samples to run the inference on.
INFERCSV = "cloud_points_infer.csv"

BASEMODEL_DIR = os.path.join(OUTDIR, "models")

In [18]:
# 3rd Cell: Choosing model. Loading Data.
# Resolve the model directory from the base model directory and the model name,
# then load the configurations for data reading and model hyperparameters.
modeldir = os.path.abspath(os.path.join(BASEMODEL_DIR, MODELNAME))
if not os.path.exists(modeldir):
    raise ValueError("the defined model does not exist")

# NOTE(review): default_save=False presumably keeps the notebook from writing
# changes back to the model's YAML files - confirm against wrapconfig.
traininfofile = YAMLWrapConfig(
    os.path.join(modeldir, "expanded_info.yml"), default_save=False
)
hyperparameter = YAMLWrapConfig(
    os.path.join(modeldir, "hyperparameter.yml"), default_save=False
)

# Load the data file to be used for inference. If cloud_point is in the columns
# it is dropped, as it is the target value. The result has to be bound to
# infer_df itself, otherwise the column stays in the frame used by later cells.
infer_df = pd.read_csv(INFERCSV)
if "cloud_point" in infer_df.columns:
    infer_df = infer_df.drop("cloud_point", axis=1)

infer_df

In [20]:
# 4th Cell
# Update the columns in the same way as in data curation
# and expand the data to the same format as the training data.

# Samples without a pH entry get the value 7.
infer_df["pH"] = infer_df["pH"].fillna(7)

# Numeric value used for each letter-coded def_type.
DEF_TYPE_MAP = {"A": 0.1, "B": 0.2, "C": 0.5, "DSC": 0.01}

# Missing entries become "0.1"; known codes are looked up in DEF_TYPE_MAP and
# everything else is parsed as a number, accepting a decimal comma.
infer_df["def_type"] = (
    infer_df["def_type"]
    .fillna("0.1")
    .apply(lambda raw: float(DEF_TYPE_MAP.get(raw, str(raw).replace(",", "."))))
)

# Set poly_conc to polymer_concentration_mass_conc where it is NaN,
# then remove the mass concentration column.
if "polymer_concentration_mass_conc" in infer_df.columns:
    wpercent_filled = infer_df["polymer_concentration_wpercent"].fillna(
        infer_df["polymer_concentration_mass_conc"]
    )
    infer_df["polymer_concentration_wpercent"] = wpercent_filled
    infer_df = infer_df.drop(columns=["polymer_concentration_mass_conc"])

# Expand the curated inference frame with the same settings for the additive
# column postfixes as used below for the training format.
df, info = pfp.datareader.expand_data(
    infer_df,
    ignored_columns=[],
    density=1,
    additive_wp_column_postfix="_concentration_weight_percent",
    additive_conc_column_postfix="_concentration_molar",
)

# The additive columns are named "additive_<position>", where the position
# refers to the additive list of the respective data set. They are renamed so
# that every additive ends up at the position it has in the training info.
train_additives = list(traininfofile["additives"])

# Fail early and clearly for additives the training info does not list;
# otherwise the position lookup below only reports "... is not in list".
unknown_additives = [add for add in info["additives"] if add not in train_additives]
if unknown_additives:
    raise ValueError(
        "the inference data contains additives that are not part of the "
        f"training data of the model: {unknown_additives}"
    )

additives_rename = {
    f"additive_{i}": f"additive_{train_additives.index(add)}"
    for i, add in enumerate(info["additives"])
}

# Training additives that do not occur in the inference data at all.
additives_missing = [
    f"additive_{i}"
    for i, add in enumerate(train_additives)
    if add not in info["additives"]
]

df = df.rename(columns=additives_rename)

# Add the absent additives with the value 0.
for madd in additives_missing:
    if madd in df.columns:
        raise ValueError("this should not happen")
    df[madd] = 0

# Numerical and categorical columns from the training info are passed to the
# loader as additional columns; the target column cloud_point is excluded.
additional_columns = [
    *traininfofile["numerical_columns"],
    *traininfofile["categorical_columns"],
]
if "cloud_point" in additional_columns:
    additional_columns.remove("cloud_point")

# Pair every repeating unit column with its mol percent column.
unit_column_pairs = list(
    zip(info["repeating_unit_columns"], info["molpercent_repeating_unit_columns"])
)

# Fingerprint sizes are taken from the hyperparameters stored with the model.
fp_settings = hyperparameter["pfp_data"]

pfpdata = pfp.loader.df_loader(
    df=df,
    repeating_unit_columns=unit_column_pairs,
    mw_column="Mn",
    start_group_column="SMILES_start_group",
    end_group_column="SMILES_end_group",
    additional_columns=additional_columns,
    intersection_fp_size=fp_settings["intersection_fp_size"],
    enhanced_sum_fp_size=fp_settings["enhanced_sum_fp_size"],
)

Unnamed: 0,SMILES_start_group,SMILES_end_group,SMILES_repeating_unitA,molpercent_repeating_unitA,SMILES_repeating_unitB,molpercent_repeating_unitB,SMILES_repeating_unitC,molpercent_repeating_unitC,SMILES_repeating_unitD,molpercent_repeating_unitD,...,additive_23,additive_24,additive_25,additive_26,additive_27,additive_28,additive_29,additive_30,additive_32,additive_33
0,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.200,,,,,...,0,0,0,0,0,0,0,0,0,0
1,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CC1),0.075,,,...,0,0,0,0,0,0,0,0,0,0
2,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CCC1),0.075,,,...,0,0,0,0,0,0,0,0,0,0
3,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CCCC1),0.075,,,...,0,0,0,0,0,0,0,0,0,0
4,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CCCCCC1),0.075,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0
106,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0
107,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0
108,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0




Loading data: 100%|██████████| 82/82 [00:00<00:00, 1121.35it/s]
Creating Polyfingerprints: 100%|██████████| 82/82 [00:00<00:00, 83.92it/s] 


In [21]:
# 5th Cell: Load the reduction information used for training and apply it to the data.
# The archive is opened in a with-block so its file handle is closed again;
# dict() reads all stored arrays into memory before that happens.
with np.load(os.path.join(modeldir, "pfp_reduction.npz")) as reduction_archive:
    reduction_data = dict(reduction_archive)

red_fp = pfp.apply_reduction_fp_set(
    [pfpd["pfp"] for pfpd in pfpdata],
    reduction_data["mask"],
    reduction_data["reference_fp"],
)

# The reduced fingerprints replace the original ones in place. Running this
# cell a second time therefore feeds already reduced fingerprints into the
# reduction again - re-run the cell that creates pfpdata first.
for i, pfpd in enumerate(pfpdata):
    pfpd["pfp"] = red_fp[i]

2023-12-01 14:00:40,152 - polyfingerprints - INFO - mean reduction loss is 0% with the highest loss per fingerprint beeing 0%


In [22]:
# 6th Cell: Load the model.
# The model class is looked up by name in polyfingerprints.models; the name and
# the keyword arguments for loading come from the stored hyperparameters.
model_settings = hyperparameter["model"]
modelclass = getattr(pfp_models, model_settings["model"])
modelpath = os.path.join(modeldir, "best_model.ckpt")
model = modelclass.load_from_checkpoint(
    modelpath,
    **model_settings["model_hp"],
)

In [23]:
# 7th Cell: Run the inference.
x, y = pfp.loader.to_input_output_data(pfpdata)

# Switch to evaluation mode so that layers which behave differently during
# training (e.g. dropout, batch normalization) do not alter the predictions,
# and disable gradient tracking since no backward pass is needed.
model.eval()
with torch.no_grad():
    # detach() is kept so the numpy conversion never depends on autograd state.
    yhat = model(torch.from_numpy(x).float()).detach().numpy()

In [24]:
# 8th Cell: Add the cloud point to the dataframe and save it.
# Predictions are written only to the rows that are present in the expanded
# frame df. The result is saved to INFERCSV, i.e. the file the inference data
# was read from is overwritten.
predicted_rows = df.index
infer_df.loc[predicted_rows, "cloud_point"] = yhat
infer_df.to_csv(path_or_buf=INFERCSV, index=False)
infer_df

Unnamed: 0,SMILES_start_group,SMILES_end_group,SMILES_repeating_unitA,molpercent_repeating_unitA,SMILES_repeating_unitB,molpercent_repeating_unitB,SMILES_repeating_unitC,molpercent_repeating_unitC,SMILES_repeating_unitD,molpercent_repeating_unitD,...,polymer_concentration_wpercent,additive1,additive1_concentration_molar,additive1_concentration_weight_percent,additive2,additive2_concentration_molar,additive2_concentration_weight_percent,def_type,pH,cloud_point
0,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.200,,,,,...,0.01,,,,,,,0.1,7.0,41.751705
1,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CC1),0.075,,,...,0.01,,,,,,,0.1,7.0,42.215050
2,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CCC1),0.075,,,...,0.01,,,,,,,0.1,7.0,37.960171
3,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CCCC1),0.075,,,...,0.01,,,,,,,0.1,7.0,43.228756
4,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CCCCCC1),0.075,,,...,0.01,,,,,,,0.1,7.0,42.251713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,,,,,,...,0.01,[Rb+].[Cl-],1.0,,,,,0.1,7.0,17.057882
106,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,,,,,,...,0.01,[Rb+].[Cl-],2.0,,,,,0.1,7.0,11.018854
107,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,,,,,,...,0.01,[Na+].[Cl-],1.0,,,,,0.1,7.0,21.048828
108,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,,,,,,...,0.01,[Na+].[Cl-],2.0,,,,,0.1,7.0,16.749990
