In [77]:
import polyfingerprints as pfp 
import pandas as pd
import numpy as np
from rdkit import Chem
from tqdm.auto import tqdm
from wrapconfig import YAMLWrapConfig
import os
import shutil
import matplotlib.pyplot as plt
from polyfingerprints import models as pfp_models

In [46]:
# Notebook-wide settings.
# NOTE(review): SEED is not referenced by any of the visible cells.
SEED = 42
# CSV file with the samples to run inference on.
RAW_CSV_PATH = "./cloud_points_infer.csv"
# BASEMODEL_DIR/MODELNAME is the directory the trained model artefacts are read from.
BASEMODEL_DIR = "models"
MODELNAME = "test4"

In [47]:
# Read the raw inference table; the file uses ";" as field separator and
# "," as decimal mark.
raw_df = pd.read_csv(
    RAW_CSV_PATH,
    sep=";",
    decimal=",",
)
raw_df

Unnamed: 0,reference,polymer_type,polymer_type_style,polymer_architecture,polymerisation_type,SMILES_start_group,SMILES_end_group,SMILES_repeating_unitA,molpercent_repeating_unitA,SMILES_repeating_unitB,...,additive2_concentration_molar,additive2_concentration_weight_percent,cloud_point,N/A,def_type,pH,identifier,comment,tacticity,rating
0,10.1016/j.fuel.2016.10.075,random,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),...,,,32.10,,A,,PNIPAM-co-AA,,,
1,10.1016/j.fuel.2016.10.075,random,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),...,,,43.00,,A,,PNIPAM-cycloprop-7.5,,,
2,10.1016/j.fuel.2016.10.075,random,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),...,,,44.00,,A,,PNIPAM-cyclobut-7.5,,,
3,10.1016/j.fuel.2016.10.075,random,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),...,,,45.50,,A,,PNIPAM-cyclopent-7.5,,,
4,10.1016/j.fuel.2016.10.075,random,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),...,,,43.00,,A,,PNIPAM-cyclohept-7.5,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,10.1021/la0106440,homo,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,...,,,21.00,,A,,,read off diagram,,
106,10.1021/la0106440,homo,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,...,,,9.20,,A,,,read off diagram,,
107,10.1021/la0106440,homo,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,...,,,20.00,,A,,,read off diagram,,
108,10.1021/la0106440,homo,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,...,,,8.16,,A,,,read off diagram,,


In [48]:
# Rows without a pH entry get the value 7.
raw_df["pH"] = raw_df["pH"].fillna(7)

# Numeric value used for each known ``def_type`` label.
DEF_TYPE_MAP = {
    "A": 0.1,
    "B": 0.2,
    "C": 0.5,
    "DSC": 0.01,
}


def _def_type_as_float(value):
    """Convert one ``def_type`` entry to a float.

    Labels present in ``DEF_TYPE_MAP`` are replaced by their mapped value;
    every other entry is parsed as a number, accepting "," as decimal mark.
    """
    if value in DEF_TYPE_MAP:
        return float(DEF_TYPE_MAP[value])
    return float(str(value).replace(",", "."))


# Missing entries fall back to "0.1" (the same value label "A" maps to)
# before everything is converted to float.
raw_df["def_type"] = raw_df["def_type"].fillna("0.1").apply(_def_type_as_float)

# Where the weight-percent concentration is NaN, use the mass concentration
# instead; the mass-concentration column is dropped afterwards.
if "polymer_concentration_mass_conc" in raw_df.columns:
    wpercent_filled = raw_df["polymer_concentration_wpercent"].fillna(
        raw_df["polymer_concentration_mass_conc"]
    )
    raw_df["polymer_concentration_wpercent"] = wpercent_filled
    raw_df = raw_df.drop(columns=["polymer_concentration_mass_conc"])

# Columns handed to ``expand_data`` as ``ignored_columns``.
ignored_columns = [
    "reference",
    "polymer_type",
    "polymer_type_style",
    "polymer_architecture",
    "polymerisation_type",
    "Mw",
    "PDI",
    "mass_characterisation_method",
    "mass_characterisation_standart",
    "N/A",
    "identifier",
    "comment",
    "tacticity",
    "rating",
]

df, info = pfp.datareader.expand_data(
    raw_df,
    ignored_columns=ignored_columns,
    density=1,
    additive_wp_column_postfix="_concentration_weight_percent",
    additive_conc_column_postfix="_concentration_molar",
)

# Persist the expansion info next to the notebook; later cells read it back
# through ``infofile``.
infofile = YAMLWrapConfig("expanded_info_infer.yml")
infofile.set_data(info)
infofile.save()
display(df)

Unnamed: 0,SMILES_start_group,SMILES_end_group,SMILES_repeating_unitA,molpercent_repeating_unitA,SMILES_repeating_unitB,molpercent_repeating_unitB,SMILES_repeating_unitC,molpercent_repeating_unitC,SMILES_repeating_unitD,molpercent_repeating_unitD,...,def_type,pH,additive_0,additive_1,additive_2,additive_3,additive_4,additive_5,additive_6,additive_7
0,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.200,,,,,...,0.1,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CC1),0.075,,,...,0.1,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CCC1),0.075,,,...,0.1,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CCCC1),0.075,,,...,0.1,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CCCCCC1),0.075,,,...,0.1,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,,,,,,...,0.1,7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
106,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,,,,,,...,0.1,7.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0
107,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,,,,,,...,0.1,7.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
108,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),1.0,,,,,,,...,0.1,7.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0


In [49]:
# Align the additive columns of the inference frame with the additive
# numbering that was stored at training time.
#
# NOTE: this cell renames columns of ``df`` and is therefore not idempotent;
# re-create ``df`` before running it a second time.
modeldir = os.path.join(BASEMODEL_DIR, MODELNAME)
traininfofile = YAMLWrapConfig(os.path.join(modeldir, "expanded_info.yml"))

train_additives = traininfofile["additives"]
infer_additives = infofile["additives"]

# An additive that is absent from the training info has no column index to
# map to; fail with an explicit message instead of a bare list.index() error.
unknown_additives = [add for add in infer_additives if add not in train_additives]
if unknown_additives:
    raise ValueError(
        f"additives not present in the training info of {modeldir!r}: {unknown_additives}"
    )

# inference column name -> training column name
# (no quote reuse inside the f-strings, so this also parses on Python < 3.12)
additives_rename = {
    f"additive_{i}": f"additive_{train_additives.index(add)}"
    for i, add in enumerate(infer_additives)
}

# Training-time additive columns that do not occur in the inference data.
additives_missing = [
    f"additive_{i}"
    for i, add in enumerate(train_additives)
    if add not in infer_additives
]

# The whole mapping is applied in one call, so index swaps are handled.
df = df.rename(columns=additives_rename)
for madd in additives_missing:
    if madd in df.columns:
        raise ValueError(
            f"column {madd!r} already exists after renaming; "
            "was this cell run twice on the same dataframe?"
        )
    # Additives that do not occur in the inference data are filled with 0.
    df[madd] = 0

df.to_csv("expanded_data_infer.csv")
df

In [74]:
# Extra feature columns are taken from the training-time info file.
additional_columns = (
    traininfofile["numerical_columns"] + traininfofile["categorical_columns"]
)
y = ["cloud_point"]

# NOTE(review): DEFAULT_PFPDATA is not read by the visible code; the sizes
# passed below come from the model's hyperparameter file.
DEFAULT_PFPDATA = {
    "intersection_fp_size": 256,
    "enhanced_sum_fp_size": 256,
}

hyperparameter = YAMLWrapConfig(os.path.join(modeldir, "hyperparameter.yml"))

# Pair every repeating-unit SMILES column with its mol-percent column.
repeating_unit_pairs = tuple(
    zip(
        infofile["repeating_unit_columns"],
        infofile["molpercent_repeating_unit_columns"],
    )
)
fp_sizes = hyperparameter["pfp_data"]

pfpdata = pfp.loader.df_loader(
    df=df,
    repeating_unit_columns=repeating_unit_pairs,
    y=y,
    mw_column="Mn",
    start_group_column="SMILES_start_group",
    end_group_column="SMILES_end_group",
    additional_columns=additional_columns,
    intersection_fp_size=fp_sizes["intersection_fp_size"],
    enhanced_sum_fp_size=fp_sizes["enhanced_sum_fp_size"],
)

Loading data: 100%|██████████| 82/82 [00:00<00:00, 911.06it/s]
Creating Polyfingerprints: 100%|██████████| 82/82 [00:00<00:00, 89.27it/s] 


In [75]:
# Load the fingerprint reduction stored with the model. np.load() on an .npz
# archive returns an NpzFile that keeps the file open, so the arrays are
# copied into a plain dict inside a ``with`` block and the handle is closed.
with np.load(os.path.join(modeldir, "pfp_reduction.npz")) as npz_file:
    reduction_data = dict(npz_file)

# Apply the stored mask / reference fingerprint to every fingerprint and
# write the result back into ``pfpdata``.
# NOTE: the entries of ``pfpdata`` are overwritten, so re-running this cell
# would feed already-reduced fingerprints into the reduction again.
fps = [pfpd["pfp"] for pfpd in pfpdata]
red_fp = pfp.apply_reduction_to_pfp_in_dataset(
    fps,
    reduction_data["mask"],
    reduction_data["reference_fp"],
)
for i, pfpd in enumerate(pfpdata):
    pfpd["pfp"] = red_fp[i]

2023-11-10 13:55:30,508 - polyfingerprints - INFO - loss for the first fingerprint is 0%


In [78]:
# Look up the model class named in the hyperparameters and build it with the
# stored model hyperparameters.
model_name = hyperparameter["model"]["model"]
if not hasattr(pfp_models, model_name):
    # Same exception type as a plain getattr(), but with enough context to
    # see which names could have been meant.
    available_names = sorted(
        name for name in dir(pfp_models) if not name.startswith("_")
    )
    raise AttributeError(
        f"module {pfp_models.__name__!r} has no attribute {model_name!r} "
        f"(model name taken from the hyperparameters); "
        f"available names: {available_names}"
    )
modelclass = getattr(pfp_models, model_name)
model = modelclass(**hyperparameter["model"]["model_hp"])

AttributeError: module 'polyfingerprints.models' has no attribute 'FCCModel'

In [68]:
# Debugging aid: show the fingerprint stored for the first entry of pfpdata.
pfpdata[0]["pfp"]

array([1., 1., 1., ..., 0., 0., 0.])

In [65]:
# Debugging aid: show the loaded reduction arrays.
reduction_data

{'mask': array([False, False, False, ..., False, False, False]),
 'reference_fp': array([1., 1., 1., ..., 0., 0., 0.])}

In [70]:
# Debugging aid: evaluating the bare function object displays its signature.
pfp.reduce_pfp_in_dataset

<function polyfingerprints.core.reduce_pfp_in_dataset(pfp_data: List[polyfingerprints._types.PfpData]) -> Tuple[List[polyfingerprints._types.PfpData], dict]>