In [1]:
import polyfingerprints as pfp
import pandas as pd
import numpy as np
from wrapconfig import YAMLWrapConfig
import os
import shutil
import matplotlib.pyplot as plt
from polyfingerprints import models as pfp_models
import torch

In [17]:
# --- inference input ---------------------------------------------------------
# Excel workbook holding the samples to predict.
INFERXLSX = "excel_infer_template.xlsx"
# Column the model predicts; it is dropped from the inputs and filled with predictions.
TARGET_VALUE = "numerical_attr2"

# --- trained model location --------------------------------------------------
OUTDIR = "out"
MODELNAME = "test"
BASEMODEL_DIR = os.path.join(OUTDIR, "models")

# --- inference output --------------------------------------------------------
INFERCSV = os.path.join(OUTDIR, "infer.csv")

In [10]:
# Absolute path of the directory that holds the trained model's artefacts.
modeldir = os.path.abspath(os.path.join(BASEMODEL_DIR, MODELNAME))

# Metadata written at training time, opened with default_save=False
# (presumably so that nothing is written back to disk -- confirm against wrapconfig).
traininfofile = YAMLWrapConfig(
    os.path.join(modeldir, "expanded_info.yml"),
    default_save=False,
)
hyperparameter = YAMLWrapConfig(
    os.path.join(modeldir, "hyperparameter.yml"),
    default_save=False,
)

# Untouched copy of the inference sheet; the predictions are written into it later.
inferdf = pd.read_excel(INFERXLSX)
inferdf

Unnamed: 0,SMILES_repeating_unit_1,molpercent_repeating_unit_1,SMILES_repeating_unit_A,molpercent_repeating_unit_A,SMILES_start_group,SMILES_end_group,Mn,additive_A,additive_A_weight_percent,numerical_attr,categorical_attribute
0,[CH2][C](C(=O)OC),1,[CH2][CH](C),1,[H],[H],1000,[Na+].[Cl-],0.1,10,B
1,[CH2][C](C(=O)OC),1,[CH2][CH](C),1,[H],[H],1500,[Na+].[Cl-],0.1,10,A


In [12]:
# Parse the inference workbook with the polyfingerprints reader. `info` is used below
# for the additives found in this file and for the repeating-unit column names.
df, info = pfp.datareader.excel_to_data(INFERXLSX)

# The target must not be passed to the model as an input feature.
if TARGET_VALUE in df.columns:
    df = df.drop(columns=TARGET_VALUE)

train_additives = traininfofile["additives"]

# Fail early with a readable message: looking up an additive that was not part of the
# training data via `.index()` would otherwise raise a bare "x is not in list" ValueError.
unknown_additives = [add for add in info["additives"] if add not in train_additives]
if unknown_additives:
    raise ValueError(
        f"additives {unknown_additives} found in {INFERXLSX!r} were not part of the "
        f"training data (known additives: {list(train_additives)})"
    )

# Additive columns are numbered by position; map this file's positions onto the
# positions the same additives had during training.
additives_rename = {
    f"additive_{i}": f"additive_{train_additives.index(add)}"
    for i, add in enumerate(info["additives"])
}

# Training additives that do not occur in this file at all.
additives_missing = [
    f"additive_{i}"
    for i, add in enumerate(train_additives)
    if add not in info["additives"]
]

# Non-inplace rename: all columns are renamed in one step and `df` is rebound.
df = df.rename(columns=additives_rename)
for madd in additives_missing:
    if madd in df.columns:
        raise ValueError(
            f"column {madd!r} is still present after renaming although its additive "
            "does not occur in the inference file; the additive mapping is inconsistent"
        )
    # NOTE(review): 0 is used to mean "additive absent" -- confirm that this matches
    # whatever scaling was applied to the additive columns during training.
    df[madd] = 0

display(df)

# Extra feature columns used at training time, without the target column
# (every occurrence of the target is filtered out, not just the first one).
additional_columns = [
    col
    for col in traininfofile["numerical_columns"] + traininfofile["categorical_columns"]
    if col != TARGET_VALUE
]

# Build the polyfingerprint records with the fingerprint sizes stored in the
# model's hyperparameter file.
pfpdata = pfp.loader.df_loader(
    df=df,
    repeating_unit_columns=tuple(
        zip(
            info["repeating_unit_columns"],
            info["molpercent_repeating_unit_columns"],
        )
    ),
    mw_column="Mn",
    start_group_column="SMILES_start_group",
    end_group_column="SMILES_end_group",
    additional_columns=additional_columns,
    intersection_fp_size=hyperparameter["pfp_data"]["intersection_fp_size"],
    enhanced_sum_fp_size=hyperparameter["pfp_data"]["enhanced_sum_fp_size"],
)

Unnamed: 0,SMILES_repeating_unit_1,molpercent_repeating_unit_1,SMILES_repeating_unit_A,molpercent_repeating_unit_A,SMILES_start_group,SMILES_end_group,Mn,numerical_attr,categorical_attribute,additive_1,additive_0
0,[CH2][C](C(=O)OC),1,[CH2][CH](C),1,[H],[H],1000,10,B,1.711069,1.711069
1,[CH2][C](C(=O)OC),1,[CH2][CH](C),1,[H],[H],1500,10,A,1.711069,1.711069


Loading data: 100%|██████████| 2/2 [00:00<00:00, 1199.40it/s]
Creating Polyfingerprints: 100%|██████████| 2/2 [00:00<00:00, 196.82it/s]


In [13]:
# Load the fingerprint reduction stored next to the model. `np.load` on an .npz archive
# returns a lazily-read NpzFile that keeps the file open, so the arrays are copied into
# a dict inside a `with` block and the handle is closed afterwards.
with np.load(os.path.join(modeldir, "pfp_reduction.npz")) as reduction_archive:
    reduction_data = dict(reduction_archive)

red_fp = pfp.apply_reduction_fp_set(
    [pfpd["pfp"] for pfpd in pfpdata],
    reduction_data["mask"],
    reduction_data["reference_fp"],
)

# Overwrite each record's fingerprint with its reduced counterpart.
# NOTE: this mutates `pfpdata` in place, so re-running this cell without re-running the
# loader cell first would feed already-reduced fingerprints into the reduction again.
for i, pfpd in enumerate(pfpdata):
    pfpd["pfp"] = red_fp[i]

2023-11-30 20:47:26,527 - polyfingerprints - INFO - mean reduction loss is 0% with the highest loss per fingerprint beeing 0%


In [14]:
# Re-open the hyperparameter file so this cell can be run on its own.
hyperparameter = YAMLWrapConfig(
    os.path.join(modeldir, "hyperparameter.yml"),
    default_save=False,
)
_model_cfg = hyperparameter["model"]

# The model class is looked up by name in polyfingerprints.models and restored from
# the "best_model.ckpt" checkpoint with the stored model hyperparameters as keyword arguments.
modelclass = getattr(pfp_models, _model_cfg["model"])
modelpath = os.path.join(modeldir, "best_model.ckpt")
model = modelclass.load_from_checkpoint(modelpath, **_model_cfg["model_hp"])

In [15]:
# Convert the fingerprint records into the model's input array; `y` is returned by the
# helper as well but is not used for inference.
x, y = pfp.loader.to_input_output_data(pfpdata)

# Switch to evaluation mode so that layers such as dropout / batch norm behave
# deterministically; a freshly loaded checkpoint is not guaranteed to be in eval mode.
model.eval()

# Run on whatever device the restored weights live on (falls back to CPU for a model
# without parameters) and skip autograd bookkeeping, which is not needed for prediction.
_first_param = next(model.parameters(), None)
_model_device = _first_param.device if _first_param is not None else torch.device("cpu")
with torch.no_grad():
    yhat = (
        model(torch.from_numpy(x).float().to(_model_device))
        .detach()
        .cpu()  # .numpy() only works on CPU tensors
        .numpy()
    )

In [18]:
# Write the predictions into the target column of the rows that were fed to the model
# (aligned via the index of `df`), save the table as CSV without the index, and show it.
inferdf.loc[df.index, TARGET_VALUE] = yhat
inferdf.to_csv(INFERCSV, index=False)
inferdf

Unnamed: 0,SMILES_repeating_unit_1,molpercent_repeating_unit_1,SMILES_repeating_unit_A,molpercent_repeating_unit_A,SMILES_start_group,SMILES_end_group,Mn,additive_A,additive_A_weight_percent,numerical_attr,categorical_attribute,numerical_attr2
0,[CH2][C](C(=O)OC),1,[CH2][CH](C),1,[H],[H],1000,[Na+].[Cl-],0.1,10,B,0.375614
1,[CH2][C](C(=O)OC),1,[CH2][CH](C),1,[H],[H],1500,[Na+].[Cl-],0.1,10,A,0.391472
