In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import unifyer as uf

In [2]:
from importlib import reload
reload(uf);

In [3]:
df = pd.read_csv("data/LLM_unif.csv") # dataset generated by LLM with units 

# Prepare LLM Dataset

## Rename synonyms

For example: 
```
'sheet-like': 'sheet',
'nanosheets': 'sheet',
'Nanosheets': 'sheet',
```

In [4]:
df["shape_unif"] = df["shape"].apply(uf.unify_shape)

In [5]:
df["activity_unif"] = df["activity"].apply(uf.unify_activity)

In [6]:
df["polymer_unif"] = df["polymer_used_in_synthesis"].apply(uf.unify_polymer)

In [7]:
df["surfactant_unif"] = df["surfactant"].apply(uf.unify_surfactant)

## Rename dataset columns according to 'start_df.csv'

In [8]:
usefull_columns_and_rename = {
    'formula': 'formula',
    # no type
    'activity_unif': 'activity',
    # no Syngony
    'shape_unif': 'shape',
    'length_mean': 'length, nm',
    'width_mean': 'width, nm',
    'depth_mean': 'depth, nm',
    'size_mean': 'size, nm', # only in my dataset
    # no Sufrace
    # no surface
    'polymer_unif': 'pol',
    'surfactant_unif': 'surf',
    # no Mw(coat), g/mol
    'km_unif': 'Km, mM',
    'vmax_unif': 'Vmax, mM/s',
    'reaction_type': 'ReactionType',
    # no Subtype
    'c_min_unif': 'C min, mM',
    # no Vsub_min(mL)
    'c_max_unif': 'C max, mM',
    # no Vsub_max(mL)
    'concentration_of_co_substrate_unif': 'C(const), mM',  #  C(const)
    # no Vsub_const(mL)
    'concentration_of_nanoparticles_unif': 'Ccat(mg/mL)', #  C(cat)
    # no Vcat(mL)
    # no Ccat(mkM)
    'p_h': 'ph',
    'temperature': 'temp, °C',
    # no 'Vbuffer(mL)', 'Dstr', 'mX', 'mROx', 'mCD', 'volume', 'Mr, g/mol'
    'zeta_potential_unif': 'Zpotential',
    'surface_area_unif': 'SurfaceArea, m^2/g',
}

In [9]:
df = df[list(usefull_columns_and_rename.keys())].rename(columns=usefull_columns_and_rename)

## Fill length, width, depth with sizse if Nan

In [10]:
cols_to_fill = ['length, nm', 'width, nm', 'depth, nm']
for col in cols_to_fill:
    df[col] = df[col].fillna(df['size, nm'])

## Remove temperature units

In [11]:
import re
def convert_temp(temp_str):
    
    if uf.check_nan(temp_str):
        return np.nan
    temp_str = temp_str.replace("~", "").lower().replace("_", " ")
    
    match = re.match(r"(\d+(?:\.\d+)?)\s*\-\s*(\d+(?:\.\d+)?)\s*(?:°c|degrees celsius|c|degree celsius)", temp_str)
    if match:
        mn, mx = match.groups()
        return (float(mn)+float(mx))/2

    match = re.match(r"(\d+(?:\.\d+)?)\s*(?:°c|degrees celsius|c|degree celsius)", temp_str)
    if match:
        return float(match.groups()[0])

    match = re.match(r"(\d+(?:\.\d+)?)\s*±\s*(\d+(?:\.\d+)?)\s*(?:°c|degrees celsius|c|degree celsius)", temp_str)
    if match:
        return float(match.groups()[0])

    print(temp_str)
    return np.nan
df["temp, °C"] = df["temp, °C"].apply(convert_temp)

298 k
298 k
298 k
298 k
298 k
298 k
298 k
298 k
298.15 k
298.1 k
298.1 k
298.1 k
298.1 k
298.1 k
298.1 k
298.1 k
300 k
300 k
300 k
300 k
298.15 k
293 k
293 k
293 k
293 k
295 k
295 k
293 k
293 k
293 k
293 k
293 k
5 min
5 min
298 k
298 k
303.1 k
303.1 k
ambient
303 k
303 k
ambient
298 k
278-328 k
310.00 k
310.00 k
20 - -30 °c
20 - -30 °c
25 degree c
25 degree c
298 k
298 k
298 k
298 k
310 k
4 degrees centigrade
4 degrees centigrade
8 k
8 k
ambient
295 k
300 k
300 k
300 k
298 k
298 k
298 k
298 k
298 k
298 k
25 degrees c
25 degrees c
307 k
298 k
298 k
298 k
50 # einsteins/m2/s
365 millidegree
83-88 f
85-88 f
79-90 f
85-88 f
79-90 f
79-90 f
363 k
298 k
298 k
303 k
303 k
303 k
310 k
298 k
310 k
310 k
310 k
298 k
298 k
298 k
343 k
305 k
293 k
293 k
293 k
293 k
293 k
293 k
293 k
293 k
293 k
293 k
room temperature
room temperature
room temperature
room temperature
300 k
300 k
298.15 k
298.15 k
298.15 k
298 k
298 k
298 k
298 k
140 f


# Make descriptors

In [12]:
import functions.functions_a as fu

In [13]:
reload(fu);

In [14]:
res_df = pd.DataFrame()

## X

In [15]:
def createX(formula):
    if uf.check_nan(formula):
        return np.nan
    try:
        composition = fu.Composition(formula)
    except:
        print(f"ERR creating composition {formula}")
        return np.nan

    X = composition.average_electroneg
    return X

In [16]:
res_df["X"] = df["formula"].apply(createX)

ERR creating composition N-CQDs
ERR creating composition N-CQDs
ERR creating composition u-cytc550
ERR creating composition u-cytc550
ERR creating composition u-cytc550
ERR creating composition u-cytc550
ERR creating composition BSA-PtAu@CNS
ERR creating composition BSA-PtAu@CNS
ERR creating composition Cys-AuNCs
ERR creating composition CaO2/DOX
ERR creating composition CaO2/DOX@SiO2/DOX
ERR creating composition CaO2/DOX@SiO2/DOX-MnO2
ERR creating composition CaO2/DOX@SiO2/DOX-MnO2
ERR creating composition CaO2/DOX@SiO2/DOX-MnO2
ERR creating composition Fe3O4/Pβ-CD
ERR creating composition Fe3O4/Pβ-CD
ERR creating composition SeO32-
ERR creating composition (CyaSe)2
ERR creating composition 3-amino-9-ethylcarbazole
ERR creating composition o-tolidine
ERR creating composition FeIII–TMPyP
ERR creating composition AuNPs/MCA
ERR creating composition AuNPs/MCA
ERR creating composition Mn3O4/Pd@Pt
ERR creating composition Mn3O4/Pd@Pt
ERR creating composition P@Pt@P-Au
ERR creating compositi

  return sum((el.X * abs(amt) for el, amt in self.items())) / self.num_atoms
  return sum((el.X * abs(amt) for el, amt in self.items())) / self.num_atoms
  return sum((el.X * abs(amt) for el, amt in self.items())) / self.num_atoms


## IR

In [17]:
def createIR(formula):
    if uf.check_nan(formula):
        return np.nan

    try:
        composition = fu.Composition(formula)
    except:
        print(f"ERR creating composition {formula}")
        return np.nan

    lol = fu.elfromcomp(composition)
    res = []
    for i in lol.items():
        try:
            element = fu.Element(i[0])
        except:
            print(f"ERR creating element {formula} {i[0]}")
            continue

        if element.is_metal:
              rd = element.atomic_radius
              res.append(rd)
    
    res = [float(s) for s in res if s != None and not uf.check_nan(s)]
    if len(res) > 0:
        IR = np.mean(res)
        return IR

    return np.nan

In [18]:
res_df["IR"] = df["formula"].apply(createIR)

ERR creating composition N-CQDs
ERR creating composition N-CQDs
ERR creating composition u-cytc550
ERR creating composition u-cytc550
ERR creating composition u-cytc550
ERR creating composition u-cytc550
ERR creating composition BSA-PtAu@CNS
ERR creating composition BSA-PtAu@CNS
ERR creating element HRP R
ERR creating element HRP R
ERR creating composition Cys-AuNCs
ERR creating element H10Q Q
ERR creating element H10Q Q
ERR creating element H10A A
ERR creating element H10A A
ERR creating composition CaO2/DOX
ERR creating composition CaO2/DOX@SiO2/DOX
ERR creating composition CaO2/DOX@SiO2/DOX-MnO2
ERR creating composition CaO2/DOX@SiO2/DOX-MnO2
ERR creating composition CaO2/DOX@SiO2/DOX-MnO2
ERR creating element GSH G
ERR creating composition Fe3O4/Pβ-CD
ERR creating composition Fe3O4/Pβ-CD
ERR creating element HRP R
ERR creating composition SeO32-
ERR creating composition (CyaSe)2
ERR creating composition 3-amino-9-ethylcarbazole
ERR creating composition o-tolidine
ERR creating compo

  syms = sorted(sym_amt, key=lambda sym: get_el_sp(sym).X)
  syms = sorted(sym_amt, key=lambda sym: get_el_sp(sym).X)


ERR creating composition 1a
ERR creating composition 1a
ERR creating composition Fe-BDC-NH2
ERR creating composition Fe-BDC-NH2
ERR creating composition GO-Se
ERR creating composition GO-Se
ERR creating composition Pd/Fe3O4-PEI-RGO
ERR creating composition Cu3(PO4)2·3H2O
ERR creating composition Cu3(PO4)2·3H2O
ERR creating composition PLPC-OOH
ERR creating composition AD-100
ERR creating element FCG G
ERR creating composition Cu,ZnSOD
ERR creating composition CeO2/NiO
ERR creating composition CeO2/NiO
ERR creating element HRP R
ERR creating element AtGLB1 G
ERR creating element AtGLB1 L
ERR creating element AtGLB2 G
ERR creating element AtGLB2 L
ERR creating element AtGLB3 G
ERR creating element AtGLB3 L
ERR creating composition ferritin–heme
ERR creating composition ferritin–heme
ERR creating composition ferritin–heme
ERR creating composition ferritin–heme
ERR creating composition ferritin–heme
ERR creating composition ferritin–heme
ERR creating composition ferritin–heme
ERR creating 

  syms = sorted(sym_amt, key=lambda sym: get_el_sp(sym).X)


## pot2

In [19]:
def createPot2(formula):
    if uf.check_nan(formula):
        return np.nan

    try:
        intindex_comp = fu.intindex(formula)
    except:
        print(f"ERR creating intindex {formula}")
        return np.nan

    try:
        os_comp = fu.OS(intindex_comp)
    except:
        print(f"ERR creating os_comp {formula} {intindex_comp}")
        return np.nan

    redox_list = []
    for i in os_comp.items():
        
        try:
            elem = fu.Element(i[0])
        except:
            print(f"ERR creating element {formula} {i[0]}")
            return np.nan

        if elem.is_metal:
            if i[1] <= 0:
                redox_list.append(0)
            else:
                redox_list.append(fu.Redox(i[0], int(round(i[1]))))

    redox_list = [s for s in redox_list if s != None]
    if len(redox_list) > 0:
        redox = np.mean(redox_list)
        return redox
    return np.nan

In [None]:
res_df["pot2"] = df["formula"].apply(createPot2)

ERR creating intindex N-CQDs
ERR creating intindex N-CQDs
ERR creating intindex u-cytc550
ERR creating intindex u-cytc550
ERR creating intindex u-cytc550
ERR creating intindex u-cytc550
ERR creating intindex BSA-PtAu@CNS
ERR creating intindex BSA-PtAu@CNS
ERR creating os_comp HRP R1 P1 H1
ERR creating os_comp HRP R1 P1 H1
ERR creating intindex Cys-AuNCs
ERR creating os_comp H10Q Q1 H10
ERR creating os_comp H10Q Q1 H10
ERR creating os_comp H10A A1 H10
ERR creating os_comp H10A A1 H10
ERR creating intindex CaO2/DOX
ERR creating intindex CaO2/DOX@SiO2/DOX
ERR creating intindex CaO2/DOX@SiO2/DOX-MnO2
ERR creating intindex CaO2/DOX@SiO2/DOX-MnO2
ERR creating intindex CaO2/DOX@SiO2/DOX-MnO2


  syms = sorted(sym_amt, key=lambda sym: get_el_sp(sym).X)


ERR creating os_comp GSH G1 H1 S1
ERR creating intindex Fe3O4/Pβ-CD
ERR creating intindex Fe3O4/Pβ-CD
ERR creating os_comp HRP R1 P1 H1
ERR creating intindex SeO32-
ERR creating intindex (CyaSe)2
ERR creating intindex 3-amino-9-ethylcarbazole
ERR creating intindex o-tolidine
ERR creating intindex FeIII–TMPyP
ERR creating os_comp HRP R1 P1 H1
ERR creating os_comp GrdB Grd1 B1
ERR creating os_comp GrdB Grd1 B1
ERR creating os_comp GrdB Grd1 B1
ERR creating intindex AuNPs/MCA
ERR creating intindex AuNPs/MCA
ERR creating intindex Mn3O4/Pd@Pt
ERR creating intindex Mn3O4/Pd@Pt
ERR creating intindex P@Pt@P-Au
ERR creating intindex P@Pt@P-Au
ERR creating intindex P@Pt@P-Au
ERR creating os_comp Cu 1.8S Cu180 S100
ERR creating os_comp Cu 1.8S Cu180 S100
ERR creating os_comp K12[Ga4L6] L6 K12 Ga4
ERR creating intindex Hb
ERR creating intindex Hb
ERR creating intindex Hb
ERR creating intindex AuNPs
ERR creating intindex AuNPs
ERR creating os_comp C18H16N2O2Se H16 C18 Se1 N2 O2
ERR creating os_comp

  syms = sorted(sym_amt, key=lambda sym: get_el_sp(sym).X)


ERR creating intindex Au-Hg/rGO
ERR creating intindex Au-Hg/rGO
ERR creating intindex Prostaglandin H Synthase
ERR creating intindex Prostaglandin H Synthase
ERR creating intindex Cyt c
ERR creating intindex Cyt c
ERR creating intindex SYI-2074
ERR creating intindex Cu(Nc)22+
ERR creating intindex Cu(Nc)22+
ERR creating intindex Cu2-xSe
ERR creating intindex Cu2-xSe
ERR creating intindex Cu2-xSe
ERR creating intindex Cu2-xSe
ERR creating intindex BLG-Heme
ERR creating intindex BLG-T-Heme
ERR creating intindex BLG-C-Heme
ERR creating intindex BLG-K-Heme
ERR creating intindex Crystallin-Heme
ERR creating os_comp RBCm@Ru@MnO2 R1 Cm1 Mn1 B1 Ru1 O2
ERR creating os_comp RBCm@Ru@MnO2 R1 Cm1 Mn1 B1 Ru1 O2
ERR creating os_comp RBCm@Ru@MnO2 R1 Cm1 Mn1 B1 Ru1 O2
ERR creating os_comp RBCm@Ru@MnO2 R1 Cm1 Mn1 B1 Ru1 O2
ERR creating intindex CD44 MMSN/AuNPs
ERR creating intindex CD44 MMSN/AuNPs
ERR creating intindex Au-Pt/SiO2
ERR creating intindex Au-Pt/SiO2
ERR creating intindex cSelS
ERR creating 

## ph

In [159]:
res_df["ph"] = df["ph"]

## temp

In [160]:
res_df["temp"] = df["temp, °C"]

## dstr

In [174]:
def createDstr(sizes):
    l, w, d = sizes
    if uf.check_nan(l) or uf.check_nan(w) or uf.check_nan(d):
        return np.nan

    sizes_list = [l, w, d]
    sizes_list.sort(reverse=True)
    l, w, d = sizes_list
    

    big_diff_lw = l/w>6
    big_diff_ld = l/d>6

    if big_diff_lw and big_diff_ld:
        dstr = 1
    elif big_diff_lw or big_diff_ld:
        dstr = 2
    else:
        dstr = 3
    return dstr

In [186]:
res_df["dstr"] = df[["length, nm", "width, nm", "depth, nm"]].apply(createDstr, axis=1)

## C (Cmin, Cmax, lgCmin ...)

In [189]:
res_df["Cmin"] = df["C min, mM"]
res_df["Cmax"] = df["C max, mM"]
res_df["lgCmin"] = np.log10(res_df["Cmin"])
res_df["lgCmax"] = np.log10(res_df["Cmax"])

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [190]:
res_df["Cconst"] = df["C(const), mM"]
res_df["Ccat"] = df["Ccat(mg/mL)"]
res_df["lgCconst"] = np.log10(res_df["Cconst"])
res_df["lgCcat"] = np.log10(res_df["Ccat"])

## lgvolume

In [191]:
def createLgvolume(sizes):
    l, w, d = sizes
    if uf.check_nan(l) or uf.check_nan(w) or uf.check_nan(d):
        return np.nan
    return np.log10(l*w*d)

In [192]:
res_df["lgvolume"] = df[["length, nm", "width, nm", "depth, nm"]].apply(createLgvolume, axis=1)

## Km, Vmax

In [None]:
res_df["Km"] = df["Km, mM"]
res_df["Vmax"] = df["Vmax, mM/s"]

## activity

In [None]:
res_df["activity"] = df["activity"].apply(uf.map_activity)

## polymer, surfactant, reaction_type descriptors

In [44]:
history = {}
h_prop = {}
h_comp = {}

In [120]:
def get_gescriptors_by_names(names, use_history=True):
    '''
    names - List of names of chemicals
    '''

    if use_history and (names in history):
        return history[names]

    ### pubchem descriptors
    # mw = 0
    tpsa = 0
    comp = 0
    logp = 0
    for name in names:
        if name in h_prop:
            prop = h_prop[name]
        else:
            prop = fu.pcp.get_properties(
                ['MolecularWeight', 'XLogP','TPSA', 'Complexity'],
                fu.monomer(name), 
                'name'
            )
            h_prop[name] = prop
        
        if len(prop) < 1:
            # if molecue not found then set nan values
            print(f"ERR PCP not found get_properties {name}")
            # mw = np.nan
            tpsa = np.nan
            P = np.nan
            comp = np.nan
            break
        prop = prop[0]
            

        # polym = float(mcoat)*1000/float(mw) ## i don't have mcoat
        # mw += float(prop.get('MolecularWeight', np.nan))
        if len(names) == 1:
            logp = prop.get('XLogP', np.nan)
        tpsa += float(prop.get('TPSA', np.nan))
        comp = float(prop.get('Complexity', np.nan))

    # convert zeros to nan
    # mw = mw if mw else np.nan
    tpsa = tpsa if tpsa else np.nan
    logp = logp if logp else np.nan
    comp = comp if comp else np.nan

    desc_pubchem = {
        # "MolWt": mw,
        "XLogP": logp,
        "TPSA": tpsa,
        "Complexity": comp,
    }

    ### RDKit descriptors
    smiles_list = []
    for name in names:
        if name in h_comp:
            smiles_obj = h_comp[name]
        else:
            smiles_obj = fu.pcp.get_compounds(fu.monomer(name), 'name')
            h_comp[name] = smiles_obj

        if len(smiles_obj) < 1:
            print(f"ERR PCP not found get_compounds {name}")
            desc_rdkit = {
                "MolWt": np.nan,
                "PEOE_VSA7": np.nan,
                "PEOE_VSA9": np.nan,
                "VSA_EState8": np.nan,
                "Kappa2": np.nan,
                "BalabanJ": np.nan,
                "MinAbsEStateIndex": np.nan,
                "MinEStateIndex": np.nan,
                "EState_VSA6": np.nan,
                "VSA_EState4": np.nan,
                "PEOE_VSA8": np.nan,
                "MinPartialCharge": np.nan,
                "EState_VSA4": np.nan,
                "SMR_VSA7": np.nan,
                "BCUT2D_CHGLO": np.nan,
                "MaxEStateIndex": np.nan,
                "MaxPartialCharge": np.nan,
            }
            desc = {**desc_pubchem, **desc_rdkit}
            history[names] = desc
            return desc

        smiles_list.append(smiles_obj[0].isomeric_smiles)

    if len(smiles_list) < 1:
        desc_rdkit = {
                "MolWt": np.nan,
                "PEOE_VSA7": np.nan,
                "PEOE_VSA9": np.nan,
                "VSA_EState8": np.nan,
                "Kappa2": np.nan,
                "BalabanJ": np.nan,
                "MinAbsEStateIndex": np.nan,
                "MinEStateIndex": np.nan,
                "EState_VSA6": np.nan,
                "VSA_EState4": np.nan,
                "PEOE_VSA8": np.nan,
                "MinPartialCharge": np.nan,
                "EState_VSA4": np.nan,
                "SMR_VSA7": np.nan,
                "BCUT2D_CHGLO": np.nan,
                "MaxEStateIndex": np.nan,
                "MaxPartialCharge": np.nan,
            }
    else:
        mol = fu.Chem.MolFromSmiles(".".join(smiles_list))
        # print(smiles_list)
        allDescrs = fu.getMolDescriptors(mol)
        # print(allDescrs.keys())
        desc_rdkit = {
            "MolWt": allDescrs["MolWt"],
            "PEOE_VSA7": allDescrs["PEOE_VSA7"],
            "PEOE_VSA9": allDescrs["PEOE_VSA9"],
            "VSA_EState8": allDescrs["VSA_EState8"],
            "Kappa2": allDescrs["Kappa2"],
            "BalabanJ": allDescrs["BalabanJ"],
            "MinAbsEStateIndex": allDescrs["MinAbsEStateIndex"],
            "MinEStateIndex": allDescrs["MinEStateIndex"],
            "EState_VSA6": allDescrs["EState_VSA6"],
            "VSA_EState4": allDescrs["VSA_EState4"],
            "PEOE_VSA8": allDescrs["PEOE_VSA8"],
            "MinPartialCharge": allDescrs["MinPartialCharge"],
            "EState_VSA4": allDescrs["EState_VSA4"],
            "SMR_VSA7": allDescrs["SMR_VSA7"],
            "BCUT2D_CHGLO": allDescrs["BCUT2D_CHGLO"],
            "MaxEStateIndex": allDescrs["MaxEStateIndex"],
            "MaxPartialCharge": allDescrs["MaxPartialCharge"],
        }

    desc = {**desc_pubchem, **desc_rdkit}
    history[names] = desc
    return desc

In [23]:
from tqdm.notebook import tqdm
tqdm.pandas()

### Check how well this work

In [24]:
final_df = pd.read_csv("data/existing_datasets/final_df.csv")
start_df = pd.read_csv("data/existing_datasets/start_df.csv")

In [126]:
def get_row_descriptors(row, chemicals_columns = ["surface", "pol", "surf"]):
    rd = {}
    
    ### descriptors of surface, pol, surf
    chemicals = [row[col] for col in chemicals_columns]
    chemicals = [ch for ch in chemicals if ch not in ["0", "nan", "naked"] and not uf.check_nan(ch)]
    chemicals = tuple(chemicals)
    # print(chemicals)

    chemicals_descriptors = get_gescriptors_by_names(chemicals)
    # remove descriptors that aren't calculated for (surface, pol, surf)
    chemicals_descriptors = {key: value for key, value in chemicals_descriptors.items() if key not in ["MaxEStateIndex", "MaxPartialCharge", "Complexity"]}
    # print(chemicals_descriptors)
    rd.update(chemicals_descriptors)

    ### descriptors of reaction_type (ex.: TMB + H2O2)
    rd[f"MinPartialCharge.1"] = np.nan
    rd[f"MaxPartialCharge.1"] = np.nan
    rd[f"Complexity1"] = np.nan
    for i in range(2):
        rd[f"TPSA{i+1}"] = np.nan
        rd[f"MaxEStateIndex.{i+1}"] = np.nan


    if uf.check_nan(row["ReactionType"]):
        return rd
        

    chems = row["ReactionType"].replace(" + ", "+").split("+")
    chems = [ch.lstrip().rstrip() for ch in chems if ch not in ["0", "nan", ""] and not uf.check_nan(ch)]
    if len(chems) > 2:
        print("A lot of elements in reaction: ", "id", row["ReactionType"], chems)
        return rd

    for i, chem in enumerate(chems):
        desc = get_gescriptors_by_names((chem,))
        rd[f"TPSA{i+1}"] = desc["TPSA"]
        rd[f"MaxEStateIndex.{i+1}"] = desc["MaxEStateIndex"]
        if i == 0:
            rd[f"Complexity{i+1}"] = desc["Complexity"]
            rd[f"MinPartialCharge.{i+1}"] = desc["MinPartialCharge"]
            rd[f"MaxPartialCharge.{i+1}"] = desc["MaxPartialCharge"]

    return rd

In [50]:
from collections import defaultdict

In [118]:
n_of_errors_by_key

defaultdict(int,
            {'MinPartialCharge.1': 27,
             'MaxPartialCharge.1': 27,
             'Complexity1': 43,
             'TPSA1': 43,
             'MaxEStateIndex.1': 25,
             'TPSA2': 155,
             'MolWt': 56,
             'XLogP': 10,
             'TPSA': 76,
             'PEOE_VSA7': 49,
             'PEOE_VSA9': 52,
             'Kappa2': 60,
             'BalabanJ': 22,
             'MinAbsEStateIndex': 28,
             'MinEStateIndex': 18,
             'EState_VSA6': 29,
             'VSA_EState4': 29,
             'PEOE_VSA8': 39,
             'MinPartialCharge': 17,
             'SMR_VSA7': 45,
             'BCUT2D_CHGLO': 17,
             'EState_VSA4': 42,
             'VSA_EState8': 44,
             'MaxEStateIndex.2': 1})

In [121]:
n_of_good_convertions = 0
n_of_errors_by_key = defaultdict(int)
n_of_errors_by_row_id = defaultdict(int)

for i in range(1000):
    row = start_df.loc[i]
    id_ = row["#"]
    
    row_descriptors = get_row_descriptors(row)

    data = final_df[final_df["id"] == id_][[key for key in row_descriptors.keys()]]
    if data.shape[0] < 1:
        continue
        
    for key in row_descriptors.keys():
        eps = 0.01
        if "TPSA" in key:
            eps = 3

        orig = data[key].tolist()[0]
        
        # orig != 0 and descriptor is nan
        if orig > eps and uf.check_nan(row_descriptors[key]):
            if key == "XLogP":
                continue
            n_of_errors_by_key[key] += 1
            n_of_errors_by_row_id[id_] += 1
            print(id_, key, orig, row_descriptors[key])

        if abs(orig - row_descriptors[key])>eps:
            n_of_errors_by_key[key] += 1
            n_of_errors_by_row_id[id_] += 1
            print(id_, key, orig, row_descriptors[key])

    if n_of_errors_by_row_id[id_] == 0:
        n_of_good_convertions += 1

print(n_of_good_convertions)

23 MinPartialCharge.1 -0.744035145 -0.34392573610777344
23 MaxPartialCharge.1 0.211204803 0.29406472784425514
23 Complexity1 0.0 910.0
23 TPSA1 883.0 209.0
23 MaxEStateIndex.1 11.3319983 11.447015664231106
26 MinPartialCharge.1 -0.744035145 -0.34392573610777344
26 MaxPartialCharge.1 0.211204803 0.29406472784425514
26 Complexity1 0.0 910.0
26 TPSA1 883.0 209.0
26 MaxEStateIndex.1 11.3319983 11.447015664231106
91 MinPartialCharge.1 -0.744035145 -0.34392573610777344
91 MaxPartialCharge.1 0.211204803 0.29406472784425514
91 Complexity1 0.0 910.0
91 TPSA1 883.0 209.0
91 MaxEStateIndex.1 11.3319983 11.447015664231106
97 MinPartialCharge.1 -0.744035145 -0.34392573610777344
97 MaxPartialCharge.1 0.211204803 0.29406472784425514
97 Complexity1 0.0 910.0
97 TPSA1 883.0 209.0
97 MaxEStateIndex.1 11.3319983 11.447015664231106
117 MinPartialCharge.1 -0.744035145 -0.34392573610777344
117 MaxPartialCharge.1 0.211204803 0.29406472784425514
117 Complexity1 0.0 910.0
117 TPSA1 883.0 209.0
117 MaxEStateInd

### apply to my data

In [128]:
new_rows = []
for row in tqdm(df.iloc, total=len(df)):
    new_rows.append(get_row_descriptors(row, ['pol', 'surf']))
new_df = pd.DataFrame(new_rows)

  0%|          | 0/9045 [00:00<?, ?it/s]

A lot of elements in reaction:  id O2^- + O2^- + 2H+ -> H2O2 + O2 ['O2^-', 'O2^-', '2H', '-> H2O2', 'O2']
A lot of elements in reaction:  id O2^- + O2^- + 2H+ -> H2O2 + O2 ['O2^-', 'O2^-', '2H', '-> H2O2', 'O2']
A lot of elements in reaction:  id O2^- + O2^- + 2H+ -> H2O2 + O2 ['O2^-', 'O2^-', '2H', '-> H2O2', 'O2']
A lot of elements in reaction:  id O2^- + O2^- + 2H+ -> H2O2 + O2 ['O2^-', 'O2^-', '2H', '-> H2O2', 'O2']
A lot of elements in reaction:  id O2^- + O2^- + 2H+ -> H2O2 + O2 ['O2^-', 'O2^-', '2H', '-> H2O2', 'O2']
A lot of elements in reaction:  id O2^- + O2^- + 2H+ -> H2O2 + O2 ['O2^-', 'O2^-', '2H', '-> H2O2', 'O2']
A lot of elements in reaction:  id O2^- + O2^- + 2H+ -> H2O2 + O2 ['O2^-', 'O2^-', '2H', '-> H2O2', 'O2']
A lot of elements in reaction:  id Ag+ + TMB+H2O2 ['Ag', 'TMB', 'H2O2']
A lot of elements in reaction:  id H2O2 + a-naphthol + p-phenylenediamine ['H2O2', 'a-naphthol', 'p-phenylenediamine']
ERR PCP not found get_properties Mn2
ERR PCP not found get_compound

KeyboardInterrupt: 