In [17]:
import warnings

import numpy as np
import pandas as pd
import polyfingerprints as pfp
from rdkit import Chem
from rdkit.Chem import Descriptors

In [18]:
# Notebook-wide configuration.
# SEED is presumably consumed by later stochastic steps (not visible in this part) — TODO confirm.
SEED: int = 42
# Location of the raw cloud-point table, relative to the notebook's working directory.
RAW_CSV_PATH: str = "./cloud_points_data.csv"

In [19]:
# The raw export is parsed as semicolon-delimited with a decimal comma.
raw_df = pd.read_csv(
    RAW_CSV_PATH,
    sep=";",
    decimal=",",
)
raw_df

Unnamed: 0,reference,polymer_type,polymer_type_style,polymer_architecture,polymerisation_type,SMILES_start_group,SMILES_end_group,SMILES_repeating_unitA,molpercent_repeating_unitA,SMILES_repeating_unitB,...,additive2_concentration_molar,additive2_concentration_weight_percent,cloud_point,N/A,def_type,pH,identifier,comment,tacticity,rating
0,10.1016/j.fuel.2016.10.075,random,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),...,,,32.1,,A,,PNIPAM-co-AA,,,
1,10.1016/j.fuel.2016.10.075,random,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),...,,,43.0,,A,,PNIPAM-cycloprop-7.5,,,
2,10.1016/j.fuel.2016.10.075,random,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),...,,,44.0,,A,,PNIPAM-cyclobut-7.5,,,
3,10.1016/j.fuel.2016.10.075,random,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),...,,,45.5,,A,,PNIPAM-cyclopent-7.5,,,
4,10.1016/j.fuel.2016.10.075,random,,linear,FRP,[C](C)(C)(C#N),[C](C)(C)C#N,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),...,,,43.0,,A,,PNIPAM-cyclohept-7.5,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
906,10.1021/ma1005759,homo,,graft,RAFT,[C](C)(C#N)(CCC(=O)NCC(F)(F)C(F)(F)C(F)(F)C(F)...,[CH2]CCCCCCCCCCCCCCC,[CH2][C](C)(C(=O)OCCOCCOCCOCCOC),1.0,,...,,,51.7,,C,,P15,632 nm,,
907,10.1021/ma1005759,homo,,graft,RAFT,[C](C)(C#N)(CCC(=O)NCC(F)(F)C(F)(F)C(F)(F)C(F)...,[S](SCCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)...,[CH2][C](C)(C(=O)OCCOCCOCCOCCOC),1.0,,...,,,50.3,,C,,P16,632 nm,,
908,10.1021/ma1005759,homo,,graft,RAFT,[C](C)(C#N)(CCC(=O)N(CCCCCCCCCCCCCCCCCC)CCCCCC...,[S]SC,[CH2][C](C)(C(=O)OCCOCCOCCOCCOC),1.0,,...,,,48.9,,C,,P17,632 nm,,
909,10.1021/ma1005759,homo,,graft,RAFT,[C](C)(C#N)(CCC(=O)N(CCCCCCCCCCCCCCCCCC)CCCCCC...,[S](SCCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)...,[CH2][C](C)(C(=O)OCCOCCOCCOCCOC),1.0,,...,,,48.9,,C,,P18,632 nm,,


In [22]:
def check_additive(additive, weight_percent=None, concentration=None):
    """Split an additive into its fragments and assign each a concentration.

    Parameters
    ----------
    additive : str or null
        SMILES of the additive; may contain several disconnected fragments
        (e.g. a salt written as ``cation.anion``). A null value yields ``{}``.
    weight_percent : float, optional
        Amount of the additive by weight; converted to a concentration via the
        molecular weight of the whole additive.
    concentration : float, optional
        Concentration of the additive. Takes precedence over the value
        derived from ``weight_percent`` when both are given.

    Returns
    -------
    dict
        Maps the SMILES of each distinct fragment to
        ``(number of occurrences in the additive) * concentration``.

    Raises
    ------
    ValueError
        If neither ``weight_percent`` nor ``concentration`` is given, if the
        SMILES cannot be parsed, or if both amounts are given but disagree
        (beyond rtol=1e-2 / atol=1e-3).
    """
    if pd.isnull(additive):
        return {}

    # Without any amount there is nothing to scale the fragments by; fail
    # explicitly instead of hitting an unbound local further down.
    if weight_percent is None and concentration is None:
        raise ValueError(
            f"additive {additive} has neither a weight_percent nor a concentration"
        )

    mol = Chem.MolFromSmiles(additive)
    if mol is None:
        raise ValueError(f"additive {additive} is not a valid SMILES")

    conc_from_weight = None
    if weight_percent is not None:
        # NOTE(review): the factor 1000 fixes the implied units of
        # weight_percent and of the result; they are not evident from this
        # function alone — confirm against the data source.
        conc_from_weight = weight_percent / Descriptors.MolWt(mol) * 1000

    if concentration is None:
        conc = conc_from_weight
    else:
        # When both amounts are supplied they must describe the same quantity.
        if conc_from_weight is not None and not np.isclose(
            concentration, conc_from_weight, rtol=1e-2, atol=1e-3
        ):
            raise ValueError(
                f"concentration {concentration} and weight_percent {weight_percent} (conc={conc_from_weight}) do not match"
            )
        conc = concentration

    # Separate the additive into its disconnected fragments (e.g. cations and
    # anions) and count how often each distinct fragment occurs.
    fragment_counts = {}
    for frag in Chem.GetMolFrags(mol, asMols=True):
        frag_smiles = Chem.MolToSmiles(frag)
        fragment_counts[frag_smiles] = fragment_counts.get(frag_smiles, 0) + 1

    return {
        frag_smiles: count * conc for frag_smiles, count in fragment_counts.items()
    }
    
def raw_data_to_dataset(df):
    """Convert the raw cloud-point table into a flat, numeric-ready dataset.

    The input frame is not modified; all cleaning happens on a copy.

    Steps performed:
      * detect repeating-unit columns (``SMILES_repeating_unit<X>`` together
        with ``molpercent_repeating_unit<X>``, X in A..L) and additive columns
        (``additive<N>`` together with its concentration columns, N in 1..19),
      * fill missing ``pH`` with 7, missing ``def_type`` with 0.1 and missing
        start/end groups with ``[H]``,
      * map ``def_type`` labels to floats via ``DEF_TYPE_MAP``; any other value
        is parsed as a number (decimal comma accepted),
      * merge the two polymer-concentration columns into ``poly_conc``,
      * split every additive into fragments with ``check_additive`` and sum
        the per-fragment concentrations into ``additive_<i>`` columns,
      * add ``log_Mn = log10(Mn) / 6``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. Must contain ``pH``, ``def_type``, ``SMILES_start_group``,
        ``SMILES_end_group``, ``Mn``, ``cloud_point``,
        ``polymer_concentration_wpercent`` and
        ``polymer_concentration_mass_conc``.

    Returns
    -------
    (pandas.DataFrame, dict)
        The parsed dataset and an ``info`` dict with the keys ``def_types``,
        ``num_unique_ru``, ``num_unique_additives``, ``additives``,
        ``repeating_unit_columns`` and ``repeating_unit_molpercent_columns``.
        ``info["additives"][i]`` is the fragment SMILES behind the
        ``additive_<i>`` column.

    Raises
    ------
    ValueError
        If one of the required columns is missing (raised by the column
        bookkeeping below), or propagated from ``check_additive``.
    """
    # Work on a copy so the caller's frame (and anything already displayed
    # from it) stays unchanged and re-running the cell is idempotent.
    df = df.copy()

    # Bookkeeping of columns that have not been consumed yet. The .remove()
    # calls double as a schema check: they raise ValueError when an expected
    # column is absent.
    unused_columns = list(df.columns)
    ignored_columns = [
        "reference",
        "polymer_type",
        "polymer_type_style",
        "polymer_architecture",
        "polymerisation_type",
        "Mw",
        "PDI",
        "mass_characterisation_method",
        "mass_characterisation_standart",
        "N/A",
        "identifier",
        "comment",
        "tacticity",
        "rating",
    ]
    for c in ignored_columns:
        if c in unused_columns:
            unused_columns.remove(c)

    repeating_unit_columns = []
    for c in "ABCDEFGHIJKL":
        if (
            "SMILES_repeating_unit" + c in df.columns
            and "molpercent_repeating_unit" + c in df.columns
        ):
            repeating_unit_columns.append(c)

            unused_columns.remove("SMILES_repeating_unit" + c)
            unused_columns.remove("molpercent_repeating_unit" + c)

    additive_columns = []
    for c in [str(x) for x in range(1, 20)]:
        if (
            "additive" + c in df.columns
            and f"additive{c}_concentration_weight_percent" in df.columns
        ):
            unused_columns.remove("additive" + c + "_concentration_molar")
            unused_columns.remove("additive" + c)
            if f"additive{c}_concentration_weight_percent" in unused_columns:
                unused_columns.remove(f"additive{c}_concentration_weight_percent")

            additive_columns.append(c)

    # Replace missing pH values with 7.
    df["pH"] = df["pH"].fillna(7)
    unused_columns.remove("pH")

    df["def_type"] = df["def_type"].fillna("0.1")

    DEF_TYPE_MAP = {
        "A": 0.1,
        "B": 0.2,
        "C": 0.5,
        "DSC": 0.01,
    }
    # Known labels are mapped; everything else is parsed as a number,
    # accepting a decimal comma.
    df["def_type"] = df["def_type"].apply(
        lambda x: float(
            (DEF_TYPE_MAP[x] if x in DEF_TYPE_MAP else str(x).replace(",", "."))
        ),
    )

    unused_columns.remove("def_type")

    df["SMILES_start_group"] = df["SMILES_start_group"].fillna("[H]")
    unused_columns.remove("SMILES_start_group")

    df["SMILES_end_group"] = df["SMILES_end_group"].fillna("[H]")
    unused_columns.remove("SMILES_end_group")

    unused_columns.remove("Mn")
    unused_columns.remove("cloud_point")

    # wt%|mass fraction and mass concentration in g/mL are approximately the same for water
    df["poly_conc"] = df["polymer_concentration_wpercent"]
    # Fall back to polymer_concentration_mass_conc where the wt% value is NaN.
    df["poly_conc"] = df["poly_conc"].fillna(df["polymer_concentration_mass_conc"])
    unused_columns.remove("polymer_concentration_wpercent")
    unused_columns.remove("polymer_concentration_mass_conc")

    # Fragment SMILES -> number of its "additive_<i>" column. Insertion order
    # is first appearance, so the numbering is stable for a given row order.
    additive_index = {}
    out_data = []
    for rowindex, row in df.iterrows():
        rowdat = {}
        for c in repeating_unit_columns:
            rowdat["SMILES_repeating_unit" + c] = row["SMILES_repeating_unit" + c]
            rowdat["molpercent_repeating_unit" + c] = row[
                "molpercent_repeating_unit" + c
            ]

        rowdat["pH"] = row["pH"]
        rowdat["def_type"] = row["def_type"]
        rowdat["SMILES_start_group"] = row["SMILES_start_group"]
        rowdat["SMILES_end_group"] = row["SMILES_end_group"]
        rowdat["Mn"] = row["Mn"]
        rowdat["poly_conc"] = row["poly_conc"]
        rowdat["cloud_point"] = row["cloud_point"]

        additives = {}
        for c in additive_columns:
            conc = row["additive" + c + "_concentration_molar"]
            w_perc = row[f"additive{c}_concentration_weight_percent"]
            # pd.isnull also copes with None / non-float cells, where
            # np.isnan would raise TypeError.
            has_conc = not pd.isnull(conc)
            has_w_perc = not pd.isnull(w_perc)
            if not has_conc and not has_w_perc:
                if not pd.isnull(row["additive" + c]):
                    # An additive is named but carries no amount: it cannot
                    # be used, so report it and skip.
                    warnings.warn(
                        f"additive {rowindex} {row['additive' + c]} has no concentration"
                    )
                continue

            for a, aconc in check_additive(
                row["additive" + c],
                weight_percent=w_perc if has_w_perc else None,
                concentration=conc if has_conc else None,
            ).items():
                # The same fragment may come from several additives of one
                # row (e.g. a shared counter-ion); sum the contributions.
                additives[a] = additives.get(a, 0) + aconc
                if a not in additive_index:
                    additive_index[a] = len(additive_index)

        for k, v in additives.items():
            rowdat[f"additive_{additive_index[k]}"] = v
        out_data.append(rowdat)

    all_additives = list(additive_index)

    parseddf = pd.DataFrame(out_data)
    # Rows that do not contain a given fragment get concentration 0.
    for i, _ in enumerate(all_additives):
        parseddf[f"additive_{i}"] = parseddf[f"additive_{i}"].fillna(0)

    # log10(Mn) scaled by 1/6 (presumably to keep the feature roughly within
    # [0, 1] — TODO confirm the intended range).
    parseddf["log_Mn"] = np.log10(parseddf["Mn"]) / 6

    # Summary information: count the distinct repeating units, looking only
    # at slots whose mol-percent is set. The SMILES and mol-percent columns
    # were inserted pairwise, so their flattened arrays line up element-wise.
    all_ru_mp = parseddf[
        [c for c in parseddf.columns if "molpercent_repeating_unit" in c]
    ].values.flatten()
    all_ru = parseddf[
        [c for c in parseddf.columns if "SMILES_repeating_unit" in c]
    ].values.flatten()[~np.isnan(all_ru_mp)]

    info = {
        "def_types": list(df["def_type"].unique()),
        "num_unique_ru": len(set(all_ru)),
        "num_unique_additives": len(all_additives),
        "additives": all_additives,
        "repeating_unit_columns": [
            f"SMILES_repeating_unit{c}" for c in repeating_unit_columns
        ],
        "repeating_unit_molpercent_columns": [
            f"molpercent_repeating_unit{c}" for c in repeating_unit_columns
        ],
    }

    return parseddf, info

In [26]:
# Parse the raw table into the flat dataset plus its accompanying metadata dict.
parseddf, info = raw_data_to_dataset(raw_df)



In [27]:
# Show the metadata returned by raw_data_to_dataset (def types, unique counts, additive list, column names).
info

{'def_types': [0.1, 0.5, 0.05, 0.2, 0.01],
 'num_unique_ru': 87,
 'num_unique_additives': 34,
 'additives': ['[Na+]',
  '[Cl-]',
  'NCC(=O)O',
  '[N-]=[N+]=[N-]',
  '[Li+]',
  '[Cs+]',
  '[Rb+]',
  '[K+]',
  '[I-]',
  '[Br-]',
  '[F-]',
  '[OH-]',
  'O=S(=O)([O-])[O-]',
  '[NH4+]',
  'CC(=O)[O-]',
  'CCCC[N+](CCCC)(CCCC)CCCC',
  'COS(=O)(=O)[O-]',
  'CCCCOS(=O)(=O)[O-]',
  'CCCCCOS(=O)(=O)[O-]',
  'CCCCCCOS(=O)(=O)[O-]',
  'CCCCCCCCOS(=O)(=O)[O-]',
  'CCCCCCCCCOS(=O)(=O)[O-]',
  'CCCCCCCCCCOS(=O)(=O)[O-]',
  'CCCCCCCCCCCCOS(=O)(=O)[O-]',
  'O=P([O-])([O-])O',
  '[H+]',
  'O=P([O-])(O)O',
  '[15NH2][C@@H](Cc1c[15nH]c2ccccc12)C(=O)O',
  'N[C@H](Cc1c[nH]c2ccccc12)C(=O)O',
  '[O-][Cl+3]([O-])([O-])[O-]',
  'N#C[S-]',
  'OC[C@H]1O[C@@H]2O[C@H]3[C@H](O)[C@@H](O)[C@@H](O[C@H]4[C@H](O)[C@@H](O)[C@@H](O[C@H]5[C@H](O)[C@@H](O)[C@@H](O[C@H]6[C@H](O)[C@@H](O)[C@@H](O[C@H]7[C@H](O)[C@@H](O)[C@@H](O[C@H]1[C@H](O)[C@H]2O)O[C@@H]7CO)O[C@@H]6CO)O[C@@H]5CO)O[C@@H]4CO)O[C@@H]3CO',
  'COC[C@H]1O[C@@H]2O[C

In [28]:
# Show the parsed dataset returned by raw_data_to_dataset.
parseddf

Unnamed: 0,SMILES_repeating_unitA,molpercent_repeating_unitA,SMILES_repeating_unitB,molpercent_repeating_unitB,SMILES_repeating_unitC,molpercent_repeating_unitC,SMILES_repeating_unitD,molpercent_repeating_unitD,SMILES_repeating_unitE,molpercent_repeating_unitE,...,additive_25,additive_26,additive_27,additive_28,additive_29,additive_30,additive_31,additive_32,additive_33,log_Mn
0,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.200,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.676783
1,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CC1),0.075,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.678031
2,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CCC1),0.075,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.679864
3,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CCCC1),0.075,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.681060
4,[CH2][CH](C(=O)NC(C)C),0.8,[CH2][CH](C(=O)O),0.125,[CH2][CH](C(=O)NC1CCCCCC1),0.075,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.682818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
906,[CH2][C](C)(C(=O)OCCOCCOCCOCCOC),1.0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.593914
907,[CH2][C](C)(C(=O)OCCOCCOCCOCCOC),1.0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.600704
908,[CH2][C](C)(C(=O)OCCOCCOCCOCCOC),1.0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.606081
909,[CH2][C](C)(C(=O)OCCOCCOCCOCCOC),1.0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.613540
