In [177]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import functions.unifyer as uf
from tqdm.notebook import tqdm
tqdm.pandas()

In [178]:
from importlib import reload
reload(uf);

In [179]:
df = pd.read_csv("data/LLM_unif_2.csv") # dataset generated by LLM with units 

# Prepare LLM Dataset

## Rename synonyms

For example: 
```
'sheet-like': 'sheet',
'nanosheets': 'sheet',
'Nanosheets': 'sheet',
```

In [180]:
# df["shape_unif"] = df["shape"].apply(lambda v: uf.unify_synonyms(v, "shape")) shape не используется

In [181]:
df["activity_unif"] = df["activity"].apply(lambda v: uf.unify_synonyms(v, "activity"))

In [182]:
df["polymer_unif"] = df["polymer_used_in_synthesis"].apply(lambda v: uf.unify_synonyms(v, "polymer"))

In [183]:
df["surfactant_unif"] = df["surfactant"].apply(lambda v: uf.unify_synonyms(v, "surfactant"))

## Rename dataset columns according to 'start_df.csv'

In [184]:
usefull_columns_and_rename = {
    'formula': 'formula',
    # no type
    'activity_unif': 'activity',
    # no Syngony
    # 'shape_unif': 'shape', dont use shape
    'length_mean': 'length, nm',
    'width_mean': 'width, nm',
    'depth_mean': 'depth, nm',
    'size_mean': 'size, nm', # only in my dataset
    # no Sufrace
    # no surface
    'polymer_unif': 'pol',
    'surfactant_unif': 'surf',
    # no Mw(coat), g/mol
    'km_unif': 'Km, mM',
    'vmax_unif': 'Vmax, mM/s',
    'reaction_type': 'ReactionType',
    # no Subtype
    'c_min_unif': 'C min, mM',
    # no Vsub_min(mL)
    'c_max_unif': 'C max, mM',
    # no Vsub_max(mL)
    'concentration_of_co_substrate_unif': 'C(const), mM',  #  C(const)
    # no Vsub_const(mL)
    'concentration_of_nanoparticles_unif': 'Ccat(mg/mL)', #  C(cat)
    # no Vcat(mL)
    # no Ccat(mkM)
    'p_h': 'ph',
    'temperature': 'temp, °C',
    # no 'Vbuffer(mL)', 'Dstr', 'mX', 'mROx', 'mCD', 'volume', 'Mr, g/mol'
    'zeta_potential_unif': 'Zpotential',
    'surface_area_unif': 'SurfaceArea, m^2/g',
}

In [185]:
df = df[list(usefull_columns_and_rename.keys())].rename(columns=usefull_columns_and_rename)

## Remove bad formulas

In [186]:
df = df[df["formula"].apply(uf.is_valid_formula)].copy().reset_index(drop=True)
df

Unnamed: 0,formula,activity,"length, nm","width, nm","depth, nm","size, nm",pol,surf,"Km, mM","Vmax, mM/s",ReactionType,"C min, mM","C max, mM","C(const), mM",Ccat(mg/mL),ph,"temp, °C",Zpotential,"SurfaceArea, m^2/g"
0,Co@C,peroxidase,,,,,,,,,,,,,,,,,
1,Fe3O4,peroxidase,,,,,,,0.020,,H2O2+TMB,0.02,0.0200,40.00,0.50,5,55 °C,,
2,Fe3O4,peroxidase,,,,,,,0.020,,TMB+H2O2,40.00,40.0000,0.02,0.50,5,55 °C,,
3,NiO,peroxidase,,,,,,,0.010,0.001,TMB+H2O2,0.20,2.0000,5.00,0.02,3.5,50 °C,,415.0
4,NiO,peroxidase,,,,,,,0.050,0.002,H2O2+TMB,0.20,50.0000,0.60,0.02,3.5,50 °C,,415.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,Fe4[Fe(CN)6]3,peroxidase,,,,15.3,,Citric acid,0.010,0.001,H2O2+TMB,0.01,1.0000,0.50,,3.5,25 °C,,
2196,PtFe@Fe3O4,peroxidase,140.0,2.0,,,multi-monomer,Oleylamine,53.550,,H2O2+TMB,0.00,600.0000,,,4.5,37 °C,,
2197,PtFe@Fe3O4,peroxidase,140.0,2.0,,,multi-monomer,Oleylamine,0.213,,TMB+H2O2,0.00,0.8322,,,4.5,37 °C,,
2198,PtFe@Fe3O4,catalase,140.0,2.0,,,multi-monomer,Oleylamine,,,H2O2,,,,0.10,4.5,37 °C,,


## Fill length, width, depth with sizse if Nan

In [187]:
cols_to_fill = ['length, nm', 'width, nm', 'depth, nm']
for col in cols_to_fill:
    df[col] = df[col].fillna(df['size, nm'])

## Remove temperature units

In [188]:
import re
def convert_temp(temp_str):
    
    if uf.check_nan(temp_str):
        return np.nan
    temp_str = temp_str.replace("~", "").lower().replace("_", " ")
    
    match = re.match(r"(\d+(?:\.\d+)?)\s*\-\s*(\d+(?:\.\d+)?)\s*(?:°c|degrees celsius|c|degree celsius)", temp_str)
    if match:
        mn, mx = match.groups()
        return (float(mn)+float(mx))/2

    match = re.match(r"(\d+(?:\.\d+)?)\s*(?:°c|degrees celsius|c|degree celsius)", temp_str)
    if match:
        return float(match.groups()[0])

    match = re.match(r"(\d+(?:\.\d+)?)\s*±\s*(\d+(?:\.\d+)?)\s*(?:°c|degrees celsius|c|degree celsius)", temp_str)
    if match:
        return float(match.groups()[0])

    print(temp_str)
    return np.nan
df["temp, °C"] = df["temp, °C"].apply(convert_temp)

298.15 k
295 k
295 k
25 degree c
25 degree c
298 k
298 k
298 k
298 k
363 k
298 k
298 k
303 k
 25 °c
 25 °c


# Make descriptors

In [189]:
import functions.functions_a as fu

In [190]:
reload(fu);

In [191]:
res_df = pd.DataFrame()

## X

In [192]:
import re

In [193]:
def createX(formula):
    if uf.check_nan(formula):
        return np.nan
        
    if "@" in formula:
        return createX(formula.split("@")[-1])
        
    if "/" in formula or "-" in formula:
        components = re.split(r'[-/]', formula)
        summ = 0
        for component in components:
            summ += createX(component)
        if uf.check_nan(summ):
            print(f"ERR calculating mean X: '{formula}'")
        return summ/len(components)

    try:
        composition = fu.Composition(formula)
    except:
        print(f"ERR creating composition '{formula}'")
        return np.nan

    X = composition.average_electroneg
    return X

In [194]:
res_df["X"] = df["formula"].apply(createX)

ERR creating composition ''
ERR calculating mean X: 'Se-'


### Check creating X on existing dataset 

In [195]:
st = pd.read_csv("./data/existing_datasets/start_df.csv")

In [196]:
fin = pd.read_csv("./data/existing_datasets/final_df.csv")

In [197]:
fin[fin["id"].isin(st[st["formula"].apply(createX).isna()]["#"])]

ERR creating composition '(Co,Mn)3O4'
ERR creating composition 'MoS2–Pt0.74Ag0.26'
ERR creating composition 'MoS2–Pt0.74Ag0.26'
ERR creating composition '4N'
ERR calculating mean X: '4N-TiO2'
ERR creating composition '4N'
ERR calculating mean X: '4N-TiO2'
ERR creating composition '4N'
ERR calculating mean X: '4N-TiO2'
ERR creating composition '4N'
ERR calculating mean X: '4N-TiO2'
ERR creating composition 'Fe–Mn'


Unnamed: 0,Km,Vmax,id,activity,X,IR,pot2,ph,temp,dstr,...,TPSA2,TPSA,XLogP,MaxEStateIndex.1,MaxEStateIndex.2,MinPartialCharge.1,MaxPartialCharge.1,BCUT2D_CHGLO,polym,Complexity
886,0.0072,0.0002086,948,2,2.499,1.205,-0.322,4.0,15.0,3,...,40.5,0.0,0.0,5.992739,0.0,-0.398299,0.037337,0.0,0.0,0.0
1080,25.71,7.29e-05,1150,1,2.3696,0.8976,0.5012,4.0,50.0,3,...,40.5,26.02,0.61005,5.992739,6.0,-0.398299,0.037337,-1.606233,1923.960794,17.0
1081,0.386,3.22e-05,1151,1,2.3696,0.8976,0.5012,4.0,50.0,3,...,52.0,26.02,0.610021,6.0,5.992739,-0.254557,-0.254557,-1.606233,1923.960794,17.0
1086,0.45,0.000115,1156,1,2.7432,1.0962,0.3222,7.4,25.0,3,...,40.5,0.0,0.0,5.992739,6.0,-0.398299,0.037337,0.0,0.0,0.0
1087,0.75,6.8e-05,1157,1,2.8122,1.1294,0.0146,7.4,25.0,3,...,52.0,0.0,0.0,6.0,5.992739,-0.254557,-0.254557,0.0,0.0,0.0
1088,0.35,7.1e-05,1158,1,2.096,0.926,0.78,7.4,25.0,3,...,40.5,86.24,102.954578,5.992739,6.0,-0.398299,0.037337,-1.806048,0.739195,98.0
1089,0.64,8.5e-05,1159,1,2.199,1.2616,0.944,7.4,25.0,3,...,52.0,86.24,102.935331,6.0,5.992739,-0.254557,-0.254557,-1.806048,0.739195,98.0
1104,0.2,0.0001025,1175,2,2.7432,1.0962,0.3222,3.0,25.0,3,...,40.5,0.0,0.0,5.992739,0.0,-0.398299,0.037337,0.0,0.0,0.0


In [198]:
createX("CoO")

2.66

In [199]:
createX("SeO2")

3.143333333333333

## IR

In [200]:
banned_oxi = ['Fe4[Fe(CN)6]3', 'C82(OH)22', 'C62(COOH)4', 'C60[C(COOH)2]2', 'C5H12N130Te']

In [201]:
def find_closest_number(arr, target):
    # Вычисляем разницу между каждым числом в массиве и целевым числом
    differences = [abs(num - target) for num in arr]

    # Находим индекс минимальной разницы
    closest_index = differences.index(min(differences))

    # Возвращаем число с этим индексом
    return arr[closest_index]

def createIR(formula):
    global banned_oxi
    if uf.check_nan(formula) or formula in banned_oxi:
        return np.nan

    if "@" in formula:
        return createIR(formula.split("@")[-1])
        
    if "/" in formula or "-" in formula:
        components = re.split(r'[-/]', formula)
        summ = 0
        for component in components:
            summ += createIR(component)
        if uf.check_nan(summ):
            print(f"ERR calculating mean IR: '{formula}'")
        return summ/len(components)

    try:
        composition = fu.Composition(formula)
    except:
        print(f"ERR creating composition {formula}")
        return np.nan

    element_counts = fu.elfromcomp(composition)
    sum_atoms = 0
    res = []

    try:
        oxi_state = fu.OS(formula)
    except:
        banned_oxi += [formula]
        print(f"ERR creating OS {formula}")
        return np.nan

    for atom, _ in composition.items():
        oxi_state_int = int(oxi_state[str(atom)])
        try:
            if oxi_state_int in atom.ionic_radii:
                ir = atom.ionic_radii[oxi_state_int]
            else:
                ir = atom.ionic_radii[find_closest_number(list(atom.ionic_radii.keys()), oxi_state_int)]
            # print(atom, oxi_state[str(atom)], ir, element_counts[str(atom)])
        except Exception as e:
            print(e)
            print(f"ERR getting ionic radii {formula}; {atom}; {atom.ionic_radii.keys()}; {oxi_state[str(atom)]};")
            return np.nan
        sum_atoms += element_counts[str(atom)]
        res.append(element_counts[str(atom)] * ir)
        
    IR = sum(res)/sum_atoms
    return IR

In [202]:
res_df["IR"] = df["formula"].progress_apply(createIR)

  0%|          | 0/2200 [00:00<?, ?it/s]

min() iterable argument is empty
ERR getting ionic radii C6H8N2O2Se; H; dict_keys([]); 1.0;
min() iterable argument is empty
ERR getting ionic radii C8H12N2O2Se; H; dict_keys([]); 1.0;
min() iterable argument is empty
ERR getting ionic radii Cu(OH)2; H; dict_keys([]); 1.0;
min() iterable argument is empty
ERR getting ionic radii Cu(OH)2; H; dict_keys([]); 1.0;
ERR creating composition 
ERR calculating mean IR: 'Se-'
min() iterable argument is empty
ERR getting ionic radii C7H9NO3Se; H; dict_keys([]); 1.0;
min() iterable argument is empty
ERR getting ionic radii C7H9NO3Se; H; dict_keys([]); 1.0;
min() iterable argument is empty
ERR getting ionic radii NH4F; H; dict_keys([]); 1.0;
min() iterable argument is empty
ERR getting ionic radii NaC7H6NO4; H; dict_keys([]); 1.0;
min() iterable argument is empty
ERR getting ionic radii NaC6H4NO3; H; dict_keys([]); 1.0;
min() iterable argument is empty
ERR getting ionic radii NaC6H4NO3; H; dict_keys([]); 1.0;
min() iterable argument is empty
ERR ge

## pot2

In [203]:
def createPot2(formula):
    global banned_oxi
    if uf.check_nan(formula) or formula in banned_oxi:
        return np.nan

    if "@" in formula:
        return createPot2(formula.split("@")[-1])
        
    if "/" in formula or "-" in formula:
        components = re.split(r'[-/]', formula)
        summ = 0
        for component in components:
            summ += createPot2(component)
        if uf.check_nan(summ):
            print(f"ERR calculating mean pot2: '{formula}'")
        return summ/len(components)

    try:
        intindex_comp = fu.intindex(formula)
    except:
        print(f"ERR creating intindex {formula}")
        return np.nan

    # print(intindex_comp)
    try:
        os_comp = fu.OS(intindex_comp)
    except:
        print(f"ERR creating os_comp {formula} {intindex_comp}")
        return np.nan

    atom_counts = fu.elfromcomp(fu.Composition(intindex_comp))
    redox_list = []
    for atom, oxi in os_comp.items():

        try:
            elem = fu.Element(atom)
        except:
            print(f"ERR creating element {formula} {atom}")
            return np.nan


        if elem.is_metal:
            # print('is me')
            if oxi <= 0:
                redox_list.append(0)
            else:
                redox_list.append(
                    fu.Redox(
                        atom, 
                        int(round(oxi))
                    ) * atom_counts[atom]
                )
    
    return sum(redox_list)/sum(atom_counts.values())

In [204]:
res_df["pot2"] = df["formula"].progress_apply(createPot2)

  0%|          | 0/2200 [00:00<?, ?it/s]

ERR creating intindex 
ERR calculating mean pot2: 'Se-'


## ph

In [205]:
res_df["ph"] = df["ph"]

## temp

In [206]:
res_df["temp"] = df["temp, °C"]

## dstr

In [207]:
def createDstr(sizes):
    l, w, d = sizes
    if uf.check_nan(l) or uf.check_nan(w) or uf.check_nan(d):
        return np.nan

    sizes_list = [l, w, d]
    sizes_list.sort(reverse=True)
    l, w, d = sizes_list
    

    big_diff_lw = l/w>6
    big_diff_ld = l/d>6

    if big_diff_lw and big_diff_ld:
        dstr = 1
    elif big_diff_lw or big_diff_ld:
        dstr = 2
    else:
        dstr = 3
    return dstr

In [208]:
res_df["dstr"] = df[["length, nm", "width, nm", "depth, nm"]].apply(createDstr, axis=1)

## C (Cmin, Cmax, lgCmin ...)

In [209]:
res_df["Cmin"] = df["C min, mM"]
res_df["Cmax"] = df["C max, mM"]
res_df["lgCmin"] = np.log10(res_df["Cmin"])
res_df["lgCmax"] = np.log10(res_df["Cmax"])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [210]:
res_df["Cconst"] = df["C(const), mM"]
res_df["Ccat"] = df["Ccat(mg/mL)"]
res_df["lgCconst"] = np.log10(res_df["Cconst"])
res_df["lgCcat"] = np.log10(res_df["Ccat"])

  result = getattr(ufunc, method)(*inputs, **kwargs)


## lgvolume

In [211]:
def createLgvolume(sizes):
    l, w, d = sizes
    if uf.check_nan(l) or uf.check_nan(w) or uf.check_nan(d):
        return np.nan
    return np.log10(l*w*d)

In [212]:
res_df["lgvolume"] = df[["length, nm", "width, nm", "depth, nm"]].apply(createLgvolume, axis=1)

## Km, Vmax

In [213]:
res_df["Km"] = df["Km, mM"]
res_df["Vmax"] = df["Vmax, mM/s"]

## activity

In [214]:
df["activity"].value_counts()

activity
peroxidase              1797
oxidase                  168
catalase                 109
superoxide dismutase      50
multi-activity            14
phosphatase               14
esterase                  10
reductase                  9
laccase                    6
glycosidase                4
uricase                    2
dnase                      1
rnase                      1
hydrolase                  1
phosphotriesterase         1
epoxydase                  1
Name: count, dtype: int64

In [215]:
res_df["activity"] = df["activity"]

## polymer, surfactant, reaction_type descriptors

In [216]:
history = {}
h_prop = {}
h_comp = {}

In [217]:
def get_gescriptors_by_names(names, use_history=True):
    '''
    names - List of names of chemicals
    '''

    if use_history and (names in history):
        return history[names]

    ### pubchem descriptors
    # mw = 0
    tpsa = 0
    comp = 0
    logp = 0
    for name in names:
        if name in h_prop:
            prop = h_prop[name]
        else:
            prop = fu.pcp.get_properties(
                ['MolecularWeight', 'XLogP','TPSA', 'Complexity'],
                fu.monomer(name), 
                'name'
            )
            h_prop[name] = prop
        
        if len(prop) < 1:
            # if molecue not found then set nan values
            print(f"ERR PCP not found get_properties {name}")
            # mw = np.nan
            tpsa = np.nan
            P = np.nan
            comp = np.nan
            break
        prop = prop[0]
            

        # polym = float(mcoat)*1000/float(mw) ## i don't have mcoat
        # mw += float(prop.get('MolecularWeight', np.nan))
        if len(names) == 1:
            logp = prop.get('XLogP', np.nan)
        tpsa += float(prop.get('TPSA', np.nan))
        comp = float(prop.get('Complexity', np.nan))

    # convert zeros to nan
    # mw = mw if mw else np.nan
    tpsa = tpsa if tpsa else np.nan
    logp = logp if logp else np.nan
    comp = comp if comp else np.nan

    desc_pubchem = {
        # "MolWt": mw,
        "XLogP": logp,
        "TPSA": tpsa,
        "Complexity": comp,
    }

    ### RDKit descriptors
    smiles_list = []
    for name in names:
        if name in h_comp:
            smiles_obj = h_comp[name]
        else:
            smiles_obj = fu.pcp.get_compounds(fu.monomer(name), 'name')
            h_comp[name] = smiles_obj

        if len(smiles_obj) < 1:
            print(f"ERR PCP not found get_compounds {name}")
            desc_rdkit = {
                "MolWt": np.nan,
                "PEOE_VSA7": np.nan,
                "PEOE_VSA9": np.nan,
                "VSA_EState8": np.nan,
                "Kappa2": np.nan,
                "BalabanJ": np.nan,
                "MinAbsEStateIndex": np.nan,
                "MinEStateIndex": np.nan,
                "EState_VSA6": np.nan,
                "VSA_EState4": np.nan,
                "PEOE_VSA8": np.nan,
                "MinPartialCharge": np.nan,
                "EState_VSA4": np.nan,
                "SMR_VSA7": np.nan,
                "BCUT2D_CHGLO": np.nan,
                "MaxEStateIndex": np.nan,
                "MaxPartialCharge": np.nan,
            }
            desc = {**desc_pubchem, **desc_rdkit}
            history[names] = desc
            return desc

        smiles_list.append(smiles_obj[0].isomeric_smiles)

    if len(smiles_list) < 1:
        desc_rdkit = {
                "MolWt": np.nan,
                "PEOE_VSA7": np.nan,
                "PEOE_VSA9": np.nan,
                "VSA_EState8": np.nan,
                "Kappa2": np.nan,
                "BalabanJ": np.nan,
                "MinAbsEStateIndex": np.nan,
                "MinEStateIndex": np.nan,
                "EState_VSA6": np.nan,
                "VSA_EState4": np.nan,
                "PEOE_VSA8": np.nan,
                "MinPartialCharge": np.nan,
                "EState_VSA4": np.nan,
                "SMR_VSA7": np.nan,
                "BCUT2D_CHGLO": np.nan,
                "MaxEStateIndex": np.nan,
                "MaxPartialCharge": np.nan,
            }
    else:
        mol = fu.Chem.MolFromSmiles(".".join(smiles_list))
        # print(smiles_list)
        allDescrs = fu.getMolDescriptors(mol)
        # print(allDescrs.keys())
        desc_rdkit = {
            "MolWt": allDescrs["MolWt"],
            "PEOE_VSA7": allDescrs["PEOE_VSA7"],
            "PEOE_VSA9": allDescrs["PEOE_VSA9"],
            "VSA_EState8": allDescrs["VSA_EState8"],
            "Kappa2": allDescrs["Kappa2"],
            "BalabanJ": allDescrs["BalabanJ"],
            "MinAbsEStateIndex": allDescrs["MinAbsEStateIndex"],
            "MinEStateIndex": allDescrs["MinEStateIndex"],
            "EState_VSA6": allDescrs["EState_VSA6"],
            "VSA_EState4": allDescrs["VSA_EState4"],
            "PEOE_VSA8": allDescrs["PEOE_VSA8"],
            "MinPartialCharge": allDescrs["MinPartialCharge"],
            "EState_VSA4": allDescrs["EState_VSA4"],
            "SMR_VSA7": allDescrs["SMR_VSA7"],
            "BCUT2D_CHGLO": allDescrs["BCUT2D_CHGLO"],
            "MaxEStateIndex": allDescrs["MaxEStateIndex"],
            "MaxPartialCharge": allDescrs["MaxPartialCharge"],
        }

    desc = {**desc_pubchem, **desc_rdkit}
    history[names] = desc
    return desc

### Check how well this work

In [218]:
final_df = pd.read_csv("data/existing_datasets/final_df.csv")
start_df = pd.read_csv("data/existing_datasets/start_df.csv")

In [219]:
def get_row_descriptors(row, chemicals_columns = ["surface", "pol", "surf"]):
    rd = {}
    
    ### descriptors of surface, pol, surf
    chemicals = [row[col] for col in chemicals_columns]
    chemicals = [ch for ch in chemicals if ch not in ["0", "nan", "naked", ""] and not uf.check_nan(ch)]
    chemicals = tuple(chemicals)
    # print(chemicals)

    chemicals_descriptors = get_gescriptors_by_names(chemicals)
    # remove descriptors that aren't calculated for (surface, pol, surf)
    chemicals_descriptors = {key: value for key, value in chemicals_descriptors.items() if key not in ["MaxEStateIndex", "MaxPartialCharge", "Complexity"]}
    # print(chemicals_descriptors)
    rd.update(chemicals_descriptors)

    ### descriptors of reaction_type (ex.: TMB + H2O2)
    rd[f"MinPartialCharge.1"] = np.nan
    rd[f"MaxPartialCharge.1"] = np.nan
    rd[f"Complexity1"] = np.nan
    for i in range(2):
        rd[f"TPSA{i+1}"] = np.nan
        rd[f"MaxEStateIndex.{i+1}"] = np.nan


    if uf.check_nan(row["ReactionType"]):
        return rd
        

    chems = row["ReactionType"].replace(" + ", "+").split("+")
    chems = [ch.lstrip().rstrip() for ch in chems if ch not in ["0", "nan", ""] and not uf.check_nan(ch)]
    if len(chems) > 2:
        print("A lot of elements in reaction: ", "id", row["ReactionType"], chems)
        return rd

    for i, chem in enumerate(chems):
        desc = get_gescriptors_by_names((chem,))
        rd[f"TPSA{i+1}"] = desc["TPSA"]
        rd[f"MaxEStateIndex.{i+1}"] = desc["MaxEStateIndex"]
        if i == 0:
            rd[f"Complexity{i+1}"] = desc["Complexity"]
            rd[f"MinPartialCharge.{i+1}"] = desc["MinPartialCharge"]
            rd[f"MaxPartialCharge.{i+1}"] = desc["MaxPartialCharge"]

    return rd

In [220]:
from collections import defaultdict

In [122]:
n_of_good_convertions = 0
n_of_errors_by_key = defaultdict(int)
n_of_errors_by_row_id = defaultdict(int)

for i in tqdm(range(1000)):
    row = start_df.loc[i]
    id_ = row["#"]
    
    row_descriptors = get_row_descriptors(row)

    data = final_df[final_df["id"] == id_][[key for key in row_descriptors.keys()]]
    if data.shape[0] < 1:
        continue
        
    for key in row_descriptors.keys():
        eps = 0.01
        if "TPSA" in key:
            eps = 3

        orig = data[key].tolist()[0]
        
        # orig != 0 and descriptor is nan
        if orig > eps and uf.check_nan(row_descriptors[key]):
            if key == "XLogP":
                continue
            n_of_errors_by_key[key] += 1
            n_of_errors_by_row_id[id_] += 1
            print(id_, key, orig, row_descriptors[key])

        if abs(orig - row_descriptors[key])>eps:
            n_of_errors_by_key[key] += 1
            n_of_errors_by_row_id[id_] += 1
            print(id_, key, orig, row_descriptors[key])

    if n_of_errors_by_row_id[id_] == 0:
        n_of_good_convertions += 1

print(n_of_good_convertions)

  0%|          | 0/1000 [00:00<?, ?it/s]

23 MinPartialCharge.1 -0.744035145 -0.34392573610777344
23 MaxPartialCharge.1 0.211204803 0.29406472784425514
23 Complexity1 0.0 910.0
23 TPSA1 883.0 209.0
23 MaxEStateIndex.1 11.3319983 11.447015664231106
26 MinPartialCharge.1 -0.744035145 -0.34392573610777344
26 MaxPartialCharge.1 0.211204803 0.29406472784425514
26 Complexity1 0.0 910.0
26 TPSA1 883.0 209.0
26 MaxEStateIndex.1 11.3319983 11.447015664231106
91 MinPartialCharge.1 -0.744035145 -0.34392573610777344
91 MaxPartialCharge.1 0.211204803 0.29406472784425514
91 Complexity1 0.0 910.0
91 TPSA1 883.0 209.0
91 MaxEStateIndex.1 11.3319983 11.447015664231106
97 MinPartialCharge.1 -0.744035145 -0.34392573610777344
97 MaxPartialCharge.1 0.211204803 0.29406472784425514
97 Complexity1 0.0 910.0
97 TPSA1 883.0 209.0
97 MaxEStateIndex.1 11.3319983 11.447015664231106
117 MinPartialCharge.1 -0.744035145 -0.34392573610777344
117 MaxPartialCharge.1 0.211204803 0.29406472784425514
117 Complexity1 0.0 910.0
117 TPSA1 883.0 209.0
117 MaxEStateInd

### apply to my data

In [222]:
new_rows = []
for row in tqdm(df.iloc, total=len(df)):
    new_rows.append(get_row_descriptors(row, ['pol', 'surf']))
new_df = pd.DataFrame(new_rows)

  0%|          | 0/2200 [00:00<?, ?it/s]

ERR PCP not found get_properties O2•−
ERR PCP not found get_compounds O2•−
A lot of elements in reaction:  id Ag+ + TMB+H2O2 ['Ag', 'TMB', 'H2O2']
ERR PCP not found get_properties O2−
ERR PCP not found get_compounds O2−
ERR PCP not found get_properties DSPE-PEG
ERR PCP not found get_compounds DSPE-PEG
ERR PCP not found get_properties DA
ERR PCP not found get_compounds DA
ERR PCP not found get_properties multi-monomer
ERR PCP not found get_compounds multi-monomer
ERR PCP not found get_properties O2^-
ERR PCP not found get_compounds O2^-
ERR PCP not found get_properties p-NP
ERR PCP not found get_compounds p-NP
ERR PCP not found get_properties MB
ERR PCP not found get_compounds MB
ERR PCP not found get_properties Mn2
ERR PCP not found get_compounds Mn2
ERR PCP not found get_properties TMP
ERR PCP not found get_compounds TMP
ERR PCP not found get_properties heparin
ERR PCP not found get_compounds heparin
ERR PCP not found get_properties O2•-
ERR PCP not found get_compounds O2•-
ERR PCP no

In [223]:
new_df

Unnamed: 0,XLogP,TPSA,MolWt,PEOE_VSA7,PEOE_VSA9,VSA_EState8,Kappa2,BalabanJ,MinAbsEStateIndex,MinEStateIndex,...,EState_VSA4,SMR_VSA7,BCUT2D_CHGLO,MinPartialCharge.1,MaxPartialCharge.1,Complexity1,TPSA1,MaxEStateIndex.1,TPSA2,MaxEStateIndex.2
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,-0.254557,-0.254557,,40.5,6.000000,52.0,5.992739
2,,,,,,,,,,,...,,,,-0.398299,0.037337,226.0,52.0,5.992739,40.5,6.000000
3,,,,,,,,,,,...,,,,-0.398299,0.037337,226.0,52.0,5.992739,40.5,6.000000
4,,,,,,,,,,,...,,,,-0.254557,-0.254557,,40.5,6.000000,52.0,5.992739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,-1.7,132.0,192.123,0.000000,12.841643,0.000000,3.853987,4.619312,1.144213,-2.738426,...,0.000000,0.0,-2.178395,-0.254557,-0.254557,,40.5,6.000000,52.0,5.992739
2196,,,,,,,,,,,...,,,,-0.254557,-0.254557,,40.5,6.000000,52.0,5.992739
2197,,,,,,,,,,,...,,,,-0.398299,0.037337,226.0,52.0,5.992739,40.5,6.000000
2198,,,,,,,,,,,...,,,,-0.254557,-0.254557,,40.5,6.000000,,


In [224]:
res_df

Unnamed: 0,X,IR,pot2,ph,temp,dstr,Cmin,Cmax,lgCmin,lgCmax,Cconst,Ccat,lgCconst,lgCcat,lgvolume,Km,Vmax,activity
0,2.550000,0.300000,0.000000,,,,,,,,,,,,,,,peroxidase
1,2.750000,1.114286,-0.015429,5,55.0,,0.02,0.0200,-1.69897,-1.698970,40.00,0.50,1.602060,-0.30103,,0.020,,peroxidase
2,2.750000,1.114286,-0.015429,5,55.0,,40.00,40.0000,1.60206,1.602060,0.02,0.50,-1.698970,-0.30103,,0.020,,peroxidase
3,2.675000,1.000000,-0.130000,3.5,50.0,,0.20,2.0000,-0.69897,0.301030,5.00,0.02,0.698970,-1.69897,,0.010,0.001,peroxidase
4,2.675000,1.000000,-0.130000,3.5,50.0,,0.20,50.0000,-0.69897,1.698970,0.60,0.02,-0.221849,-1.69897,,0.050,0.002,peroxidase
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,2.637907,,,3.5,25.0,3.0,0.01,1.0000,-2.00000,0.000000,0.50,,-0.301030,,3.554074,0.010,0.001,peroxidase
2196,2.750000,1.114286,-0.015429,4.5,37.0,,0.00,600.0000,-inf,2.778151,,,,,,53.550,,peroxidase
2197,2.750000,1.114286,-0.015429,4.5,37.0,,0.00,0.8322,-inf,-0.079772,,,,,,0.213,,peroxidase
2198,2.750000,1.114286,-0.015429,4.5,37.0,,,,,,,0.10,,-1.00000,,,,catalase


In [225]:
final_columns = final_df.drop(columns=['id', 'cryst', 'Mcoat', 'Sufrace', 'polym', 'Complexity']).columns.tolist()
new_final_df = pd.concat([res_df, new_df], axis=1)[final_columns]
new_final_df.shape

(2200, 42)

In [226]:
new_final_df["id"] = pd.Series(np.arange(10000, 10000+len(new_final_df)))

In [227]:
new_final_df.to_csv("final_p2.2.1.csv", index=False)

In [228]:
start_columns = start_df.drop(
    columns=['#', 'type', 'Syngony', 'Sufrace', 'surface', 'Mw(coat), g/mol',
             'Subtype', 'Vsub_min(mL)', 'Vsub_max(mL)', 'Vsub_const(mL)',
             'Vcat(mL)', 'Ccat(mkM)', 'Vbuffer(mL)', 'Dstr', 'mX',
             'mROx', 'mCD', 'volume', 'Mr, g/mol', 'link', 'shape']
).columns.tolist()
new_start_df = df.rename(columns={"SurfaceArea, m^2/g": "SurfaceArea, m^2/g "})[start_columns]
new_start_df["id"] = pd.Series(np.arange(10000, 10000+len(new_final_df))+1)

In [229]:
new_start_df.to_csv("start_p2.2.1.csv", index=False)