In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import functions.unifyer as uf
from tqdm.notebook import tqdm
tqdm.pandas()

In [19]:
from importlib import reload
reload(uf);

In [20]:
df = pd.read_csv("data/LLM_unif.csv") # dataset generated by LLM with units 

# Prepare LLM Dataset

## Rename synonyms

For example: 
```
'sheet-like': 'sheet',
'nanosheets': 'sheet',
'Nanosheets': 'sheet',
```

In [21]:
df["shape_unif"] = df["shape"].apply(uf.unify_shape)

In [22]:
df["activity_unif"] = df["activity"].apply(uf.unify_activity)

In [23]:
df["polymer_unif"] = df["polymer_used_in_synthesis"].apply(uf.unify_polymer)

In [24]:
df["surfactant_unif"] = df["surfactant"].apply(uf.unify_surfactant)

## Rename dataset columns according to 'start_df.csv'

In [25]:
usefull_columns_and_rename = {
    'formula': 'formula',
    # no type
    'activity_unif': 'activity',
    # no Syngony
    'shape_unif': 'shape',
    'length_mean': 'length, nm',
    'width_mean': 'width, nm',
    'depth_mean': 'depth, nm',
    'size_mean': 'size, nm', # only in my dataset
    # no Sufrace
    # no surface
    'polymer_unif': 'pol',
    'surfactant_unif': 'surf',
    # no Mw(coat), g/mol
    'km_unif': 'Km, mM',
    'vmax_unif': 'Vmax, mM/s',
    'reaction_type': 'ReactionType',
    # no Subtype
    'c_min_unif': 'C min, mM',
    # no Vsub_min(mL)
    'c_max_unif': 'C max, mM',
    # no Vsub_max(mL)
    'concentration_of_co_substrate_unif': 'C(const), mM',  #  C(const)
    # no Vsub_const(mL)
    'concentration_of_nanoparticles_unif': 'Ccat(mg/mL)', #  C(cat)
    # no Vcat(mL)
    # no Ccat(mkM)
    'p_h': 'ph',
    'temperature': 'temp, °C',
    # no 'Vbuffer(mL)', 'Dstr', 'mX', 'mROx', 'mCD', 'volume', 'Mr, g/mol'
    'zeta_potential_unif': 'Zpotential',
    'surface_area_unif': 'SurfaceArea, m^2/g',
}

In [26]:
df = df[list(usefull_columns_and_rename.keys())].rename(columns=usefull_columns_and_rename)

## Fill length, width, depth with sizse if Nan

In [27]:
cols_to_fill = ['length, nm', 'width, nm', 'depth, nm']
for col in cols_to_fill:
    df[col] = df[col].fillna(df['size, nm'])

## Remove temperature units

In [28]:
import re
def convert_temp(temp_str):
    
    if uf.check_nan(temp_str):
        return np.nan
    temp_str = temp_str.replace("~", "").lower().replace("_", " ")
    
    match = re.match(r"(\d+(?:\.\d+)?)\s*\-\s*(\d+(?:\.\d+)?)\s*(?:°c|degrees celsius|c|degree celsius)", temp_str)
    if match:
        mn, mx = match.groups()
        return (float(mn)+float(mx))/2

    match = re.match(r"(\d+(?:\.\d+)?)\s*(?:°c|degrees celsius|c|degree celsius)", temp_str)
    if match:
        return float(match.groups()[0])

    match = re.match(r"(\d+(?:\.\d+)?)\s*±\s*(\d+(?:\.\d+)?)\s*(?:°c|degrees celsius|c|degree celsius)", temp_str)
    if match:
        return float(match.groups()[0])

    print(temp_str)
    return np.nan
df["temp, °C"] = df["temp, °C"].apply(convert_temp)

298 k
298 k
298 k
298 k
298 k
298 k
298 k
298 k
298.15 k
298.1 k
298.1 k
298.1 k
298.1 k
298.1 k
298.1 k
298.1 k
300 k
300 k
300 k
300 k
298.15 k
293 k
293 k
293 k
293 k
295 k
295 k
293 k
293 k
293 k
293 k
293 k
5 min
5 min
298 k
298 k
303.1 k
303.1 k
ambient
303 k
303 k
ambient
298 k
278-328 k
310.00 k
310.00 k
20 - -30 °c
20 - -30 °c
25 degree c
25 degree c
298 k
298 k
298 k
298 k
310 k
4 degrees centigrade
4 degrees centigrade
8 k
8 k
ambient
295 k
300 k
300 k
300 k
298 k
298 k
298 k
298 k
298 k
298 k
25 degrees c
25 degrees c
307 k
298 k
298 k
298 k
50 # einsteins/m2/s
365 millidegree
83-88 f
85-88 f
79-90 f
85-88 f
79-90 f
79-90 f
363 k
298 k
298 k
303 k
303 k
303 k
310 k
298 k
310 k
310 k
310 k
298 k
298 k
298 k
343 k
305 k
293 k
293 k
293 k
293 k
293 k
293 k
293 k
293 k
293 k
293 k
room temperature
room temperature
room temperature
room temperature
300 k
300 k
298.15 k
298.15 k
298.15 k
298 k
298 k
298 k
298 k
140 f


# Make descriptors

In [29]:
import functions.functions_a as fu

In [16]:
reload(fu);

In [17]:
res_df = pd.DataFrame()

## X

In [30]:
def createX(formula):
    if uf.check_nan(formula):
        return np.nan
    try:
        composition = fu.Composition(formula)
    except:
        print(f"ERR creating composition {formula}")
        return np.nan

    X = composition.average_electroneg
    return X

In [41]:
df["formula"].apply(createX)

ERR creating composition N-CQDs
ERR creating composition N-CQDs
ERR creating composition u-cytc550
ERR creating composition u-cytc550
ERR creating composition u-cytc550
ERR creating composition u-cytc550
ERR creating composition BSA-PtAu@CNS
ERR creating composition BSA-PtAu@CNS
ERR creating composition Cys-AuNCs
ERR creating composition CaO2/DOX
ERR creating composition CaO2/DOX@SiO2/DOX
ERR creating composition CaO2/DOX@SiO2/DOX-MnO2
ERR creating composition CaO2/DOX@SiO2/DOX-MnO2
ERR creating composition CaO2/DOX@SiO2/DOX-MnO2
ERR creating composition Fe3O4/Pβ-CD
ERR creating composition Fe3O4/Pβ-CD
ERR creating composition SeO32-
ERR creating composition (CyaSe)2
ERR creating composition 3-amino-9-ethylcarbazole
ERR creating composition o-tolidine
ERR creating composition FeIII–TMPyP
ERR creating composition AuNPs/MCA
ERR creating composition AuNPs/MCA
ERR creating composition Mn3O4/Pd@Pt
ERR creating composition Mn3O4/Pd@Pt
ERR creating composition P@Pt@P-Au
ERR creating compositi

  return sum((el.X * abs(amt) for el, amt in self.items())) / self.num_atoms
  return sum((el.X * abs(amt) for el, amt in self.items())) / self.num_atoms
  return sum((el.X * abs(amt) for el, amt in self.items())) / self.num_atoms


0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
9040   NaN
9041   NaN
9042   NaN
9043   NaN
9044   NaN
Name: formula, Length: 9045, dtype: float64

In [None]:
res_df["X"] = df["formula"].apply(createX)

In [31]:
st = pd.read_csv("./data/existing_datasets/start_df.csv")

In [32]:
fin = pd.read_csv("./data/existing_datasets/final_df.csv")

In [38]:
fin[fin["id"].isin(st[st["formula"].apply(createX).isna()]["#"])]

ERR creating composition (CeO2)0.10/CoO
ERR creating composition (CeO2)0.10/CoO
ERR creating composition CoO/CeO2
ERR creating composition CoO/CeO2
ERR creating composition Co3O4/CeO2
ERR creating composition Co3O4/CeO2
ERR creating composition Pd/CeO2
ERR creating composition Pd/CeO2
ERR creating composition Ce-Fe3O4
ERR creating composition Ce-Fe3O4
ERR creating composition Fe3O4-MnO2
ERR creating composition Fe3O4-MnO2
ERR creating composition Pt-WO2.72
ERR creating composition Pt-WO2.72
ERR creating composition Co3O4/CeO2
ERR creating composition Co3O4/CeO2
ERR creating composition Au-Cu2O
ERR creating composition Au-Cu2O
ERR creating composition Au-CeO2
ERR creating composition Au-CeO2
ERR creating composition CuO/Pt
ERR creating composition CuO/Pt
ERR creating composition Cu-CuFe2O4
ERR creating composition Cu-CuFe2O4
ERR creating composition Ru/C
ERR creating composition Ru/C
ERR creating composition Fe/CeO2
ERR creating composition Fe/CeO2
ERR creating composition TiO2/TiN
ERR 

Unnamed: 0,Km,Vmax,id,activity,X,IR,pot2,ph,temp,dstr,...,TPSA2,TPSA,XLogP,MaxEStateIndex.1,MaxEStateIndex.2,MinPartialCharge.1,MaxPartialCharge.1,BCUT2D_CHGLO,polym,Complexity
56,0.245,0.0001478,58,1,2.8696,1.1562,-0.2150,5.0,25.0,3,...,52.00,0.00,0.0,6.000000,5.992739,-0.254557,-0.254557,0.000000,0.000000,0.0
57,0.133,0.001101,59,1,2.8828,1.1208,-0.2150,5.0,25.0,3,...,40.50,0.00,0.0,5.992739,6.000000,-0.398299,0.037337,0.000000,0.000000,0.0
58,1.81,2.70E-05,60,1,2.8206,1.1762,-0.2150,3.6,25.0,1,...,40.50,0.00,0.0,5.992739,6.000000,-0.398299,0.037337,0.000000,0.000000,0.0
59,0.021,2.80E-05,61,1,2.7826,1.1878,-0.2150,3.6,25.0,1,...,52.00,0.00,0.0,6.000000,5.992739,-0.254557,-0.254557,0.000000,0.000000,0.0
60,0.36,0.0001666,62,1,2.7966,1.1608,-0.2900,6.0,30.0,1,...,40.50,20.31,0.4,5.992739,6.000000,-0.398299,0.037337,-2.048433,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1104,0.2,0.0001025,1175,2,2.7432,1.0962,0.3222,3.0,25.0,3,...,40.50,0.00,0.0,5.992739,0.000000,-0.398299,0.037337,0.000000,0.000000,0.0
1109,0.11,0.000244,1181,2,2.7452,1.1496,0.3222,4.0,25.0,1,...,40.50,0.00,0.0,5.992739,0.000000,-0.398299,0.037337,0.000000,0.000000,0.0
1110,0.356,0.64,1182,2,2.7486,1.1434,0.0784,7.4,28.0,3,...,47.40,0.00,0.0,10.532445,0.000000,-0.505047,0.377497,0.000000,0.000000,0.0
1111,0.189,0.907,1183,2,2.3700,1.1686,0.5208,7.4,35.0,3,...,50.38,158.82,-4.5,10.532445,0.000000,-0.505047,0.377497,-2.157158,1.000007,389.0


In [39]:
createX("CoO")

2.66

In [40]:
createX("SeO2")

3.143333333333333

## IR

In [None]:
banned_oxi = ['HRP', 'H10Q', 'H10A', 'GSH', 'GrdB', 'Cu 1.8S', 'K12[Ga4L6]', 'C14H24N2O2Se',
              'C30H24N2O2Se', 'C30H48N2O2Se', 'C16H24N2O2Se', 'FeMP', 'C15H21NOSe',
              'C16H21NOSe', 'C24H32N2O2Se2', 'C20H20N2O2Se2', 'K0.8Fe0.8Ti1.2O4', 'PTEBS',
              'C22H24N2O2Se2', '[Gly64]Mb', 'PS2.MA4', 'PS2.MA20', 'RBCm@Ru@MnO2', 'GNRs',
              'BSA', 'OsAPX', 'MPO', 'PANI', 'COX', 'Fe4[Fe(CN)6]3', 'POX 1B', 'C96H160O66N2Se2',
              'DNA50', 'C22H22Br2N2O10Se2', 'C22H24N2O6Se2', 'C18H22N2O6Se', 'C20H26N2O6Se', 'C20H20N2O6Se2',
              'PZ51', 'GAPDH', 'GQDs', 'C14H11NO3Se', 'C18H14OSe', 'C15H14O2Se', 'C3N4@NMF', 'C20H24N2O6Se2',
              'C34H28Fe2N2O2Se2', 'C30H36Fe2N2O2Se2', 'C38H38Fe2N2O2Se2', 'C26H28N2Se2', 'C12H6N4O8Se2',
              'P450CLA', 'C23H34N4O8SSe', 'Fe(III)Mb', 'G34C', 'A50C', 'DAB', 'GNR', 'STAP', 'C15H10O2',
              'C14H11NO2Se', 'PRDX1', 'K79A', 'K73A', 'K72AK73A', 'K72AK73AK79A', 'C13H9NOSe', 'PRX1',
              'Fe0.5Co0.5', 'TMB', 'TcGPXI', 'GFeF', 'CQDs', 'GO', 'C28H22O8', 'C29H24O8', 'C30H24O9',
              'C32H26O10', 'C19H24N3O3Se', 'C21H21N2O2Se', 'C20H31N4O', 'C16H19N2O2Se', 'C14H19N2O',
              'C27H33N4O2Se', 'C18H23N2O4Se', 'C16H21N2O2Se', 'EbMe', 'EbTe2', 'Ag2S@GO', 'GDYO', 'NADPH',
              'NADH', 'C12H14N4', 'Au140RS60', 'APX1', 'GGGTGGGAnGGGTGGG', 'GGGTGGGTGGGTGGG',
              'GGGTGGGTGGGTGGGAnGGGTGGGTGGG', 'GGGTGGGTGGGTGGGA7GGGTGGGTGGG', 'GST M208A', 'GST R15K',
              'C34H32N4O4FeCl', 'C15H10O7', 'C20H28N2O2Se2', 'PS2.M', 'Fe SAzyme', 'Zn SAzyme', 'C20H14N2O3Se',
              'HRP 5000', 'FeAC', 'Au 9.8Pd 90.2', 'CuI 4L 4', 'Ce0.5Zr0.5O2', 'CeGONRs', 'Fe3[Fe(CN)6]2', 'RNP',
              '[H2N(CH2)3]8Si8Co6O12(OH)4', 'HSA', 'MnP S168W', 'DMOP', 'CAPOA1', 'CAPOT1', 'CAPO1', 'C19H14BrN3O2Se',
              'C18H19BrN2O2Se', 'C17H17BrN2O2Se', 'C12H13O2.50Se', 'L29H Mb', 'Mn0.15Fe0.85Fe2O4', 'Mn0.5Fe0.5Fe2O4',
              'GHZ', 'CYP25', 'CuNiAl LDH', 'SAA', 'C20H29NO2SSe', 'PtCo@G', 'ATP A2', 'HRP A2', 'BiSA@Au', 'T67R Mb',
              'T67K Mb', 'BSA@AuNCs', 'PZ 51', 'TMPD', 'LPO', 'LspPOX', 'Fe(III)HRP', 'Mn(III)HRP', 'FeTBAP', 'C9H11NOSe',
              'C11H16NOSe', 'SAG', 'C8H18O2Se2', 'AF', 'W96A', 'F405L', 'F393A', 'M237V',
              'KFe(III)[Fe(CN) 6]Fe 2(III)(OH)(DMF)O(BDC) 3', 'C12H10O2Se2', 'C12H10O3Se2', 'D153A', 'N246A',
              'C13H8Cl2N2O2Se', 'C13H8Br2N2O2Se', 'C13H9Br3N2O2Se', 'C14H14O3Se', 'C18H12N2O2', 'FCG', 'AtGLB1',
              'AtGLB2', 'AtGLB3', 'TSA', 'AhpC', 'LmPP', 'V90H', 'Cu 0.3Co 2.7O 4', 'C1H0.677O0.586N0.015Na0.069',
              'C34H32ClFeN4O4', 'ESeSAr', 'C26H20N2O2Se', 'C26H20N2O2Te', 'C27H20Cl2N2O2Se', 'C28H18N2O2.5Te',
              'C26H18N2O2Te', 'BAFA', 'MbFeIII', 'C14H18N2Se2', 'C22H26N2Se2', 'C14H17NSe', 'C18H21NSe', 'ETP',
              'WT Mb', 'Fe 1.44O 0.32 (OH) 3.86', 'C34H32FeN4O4', 'ABTS', 'PdCu@HRP', 'C62(COOH)4', 'AnnAt1',
              'C60[C(COOH)2]2', 'Y103F Mb', 'VO(DAC)2', 'D138V', 'C16H18O4Se', 'C18H22O6Se', 'C20H26O8Se', 'CAT',
              'CNA', 'FePPIX', 'C26H40N2Se2', 'C16H20N2Se2', 'C18H24N2Se2', 'C20H28N2Se2', 'H39Q', 'GSeSeG',
              'C18H15N2O11Se2', 'C18H16N2O8Se', 'C18H15N2O8Se2', 'DOPA', 'C44H81O10NP', 'CYP119A1', 'BMP',
              'CYP152A1', 'CYP101A1', 'Cu 3.2(PO 4) 2@GOD', 'Cu 0.89Zn 0.11O', 'EPO', 'CPZ', 'CCP(M1)', 'W51F',
              'W191F', 'HRPC', 'Fe3O4@MOF', 'GIM', 'DMP', 'C15H10O5', 'C15H10O6', 'FeCPNGs', 'BSA@Au', 'Pd@PEDOT',
              'H404A', 'D292A', 'H496A', 'R490A', 'R360A', 'K353A', 'Au127SR51', 'Au127SR65', 'OsGrx', 'B(C6F5)3',
              'C25H14O7', '(T 3G 2) 4', 'T 2G 3', 'H48Q', 'Ce61Zr39O4(OH)4(BDC)6', 'Ce28Zr72O4(OH)4(BDC)6',
              'Ce43Zr57O4(OH)4(BDC)6', 'Ce87Zr13O4(OH)4(BDC)6', 'MMb', 'C282H432N80O80S4Fe', 'C4H10N130Te', 'C5H12N130Te',
              'CuC19H14N4O2', 'PGP', 'PMNT', 'MnTBAP', 'AuMS', 'C100H180N20O24', 'DMSN@AuPtCo', 'CYP119', 'GCDSA', 'MTP',
              'C20H10F2O5', 'Ni(C30H28N4O4S2)', 'Ni(C31H30N4O4S2)', 'AuAMP', 'C18H20N2O2Se2', 'C22H28N2O2Se2', 'C26H36N2O2Se2',
              'C26H40N2O2Se2', 'PMGO', 'CYP175A1', 'L80H', 'L80Q', 'Cu2Ag0.53Fe0.465(CN)6', 'Pd12L6', 'BM3', 'HRPR', 'GFH',
              'GPX1', 'WPTP E254G', 'BPQDs', 'Co1.5Mn1.5O4', 'Ru@G', 'PEDOT', 'ZPOA', 'HRPA', 'DMA', 'DNAzyme']

In [None]:
def createIR(formula):
    global banned_oxi
    if uf.check_nan(formula) or formula in banned_oxi:
        return np.nan

    try:
        composition = fu.Composition(formula)
    except:
        print(f"ERR creating composition {formula}")
        return np.nan

    element_counts = fu.elfromcomp(composition)
    sum_atoms = 0
    res = []

    try:
        oxi_state = fu.OS(formula)
    except:
        banned_oxi += [formula]
        print(f"ERR creating OS {formula}")
        return np.nan

    for atom, _ in composition.items():
        try:
            ir = atom.ionic_radii[int(oxi_state[str(atom)])]
            # print(atom, oxi_state[str(atom)], ir, element_counts[str(atom)])
        except:
            print(f"ERR getting ionic radii {formula}; {atom};")#{atom.ionic_radii}; {oxi_state[str(atom)]};")
            return np.nan
        sum_atoms += element_counts[str(atom)]
        res.append(element_counts[str(atom)] * ir)
        
    IR = sum(res)/sum_atoms
    return IR

In [None]:
res_df["IR"] = df["formula"].apply(createIR)

## pot2

In [None]:
def createPot2(formula):
    global banned_oxi
    if uf.check_nan(formula) or formula in banned_oxi:
        return np.nan

    try:
        intindex_comp = fu.intindex(formula)
    except:
        print(f"ERR creating intindex {formula}")
        return np.nan

    # print(intindex_comp)
    try:
        os_comp = fu.OS(intindex_comp)
    except:
        print(f"ERR creating os_comp {formula} {intindex_comp}")
        return np.nan

    atom_counts = fu.elfromcomp(fu.Composition(intindex_comp))
    redox_list = []
    for atom, oxi in os_comp.items():

        try:
            elem = fu.Element(atom)
        except:
            print(f"ERR creating element {formula} {atom}")
            return np.nan


        if elem.is_metal:
            # print('is me')
            if oxi <= 0:
                redox_list.append(0)
            else:
                redox_list.append(
                    fu.Redox(
                        atom, 
                        int(round(oxi))
                    ) * atom_counts[atom]
                )
    
    return sum(redox_list)/sum(atom_counts.values())

In [None]:
res_df["pot2"] = df["formula"].progress_apply(createPot2)

## ph

In [None]:
res_df["ph"] = df["ph"]

## temp

In [None]:
res_df["temp"] = df["temp, °C"]

## dstr

In [None]:
def createDstr(sizes):
    l, w, d = sizes
    if uf.check_nan(l) or uf.check_nan(w) or uf.check_nan(d):
        return np.nan

    sizes_list = [l, w, d]
    sizes_list.sort(reverse=True)
    l, w, d = sizes_list
    

    big_diff_lw = l/w>6
    big_diff_ld = l/d>6

    if big_diff_lw and big_diff_ld:
        dstr = 1
    elif big_diff_lw or big_diff_ld:
        dstr = 2
    else:
        dstr = 3
    return dstr

In [None]:
res_df["dstr"] = df[["length, nm", "width, nm", "depth, nm"]].apply(createDstr, axis=1)

## C (Cmin, Cmax, lgCmin ...)

In [None]:
res_df["Cmin"] = df["C min, mM"]
res_df["Cmax"] = df["C max, mM"]
res_df["lgCmin"] = np.log10(res_df["Cmin"])
res_df["lgCmax"] = np.log10(res_df["Cmax"])

In [None]:
res_df["Cconst"] = df["C(const), mM"]
res_df["Ccat"] = df["Ccat(mg/mL)"]
res_df["lgCconst"] = np.log10(res_df["Cconst"])
res_df["lgCcat"] = np.log10(res_df["Ccat"])

## lgvolume

In [None]:
def createLgvolume(sizes):
    l, w, d = sizes
    if uf.check_nan(l) or uf.check_nan(w) or uf.check_nan(d):
        return np.nan
    return np.log10(l*w*d)

In [None]:
res_df["lgvolume"] = df[["length, nm", "width, nm", "depth, nm"]].apply(createLgvolume, axis=1)

## Km, Vmax

In [None]:
res_df["Km"] = df["Km, mM"]
res_df["Vmax"] = df["Vmax, mM/s"]

## activity

In [None]:
res_df["activity"] = df["activity"].apply(uf.map_activity)

## polymer, surfactant, reaction_type descriptors

In [None]:
history = {}
h_prop = {}
h_comp = {}

In [None]:
def get_gescriptors_by_names(names, use_history=True):
    '''
    names - List of names of chemicals
    '''

    if use_history and (names in history):
        return history[names]

    ### pubchem descriptors
    # mw = 0
    tpsa = 0
    comp = 0
    logp = 0
    for name in names:
        if name in h_prop:
            prop = h_prop[name]
        else:
            prop = fu.pcp.get_properties(
                ['MolecularWeight', 'XLogP','TPSA', 'Complexity'],
                fu.monomer(name), 
                'name'
            )
            h_prop[name] = prop
        
        if len(prop) < 1:
            # if molecue not found then set nan values
            print(f"ERR PCP not found get_properties {name}")
            # mw = np.nan
            tpsa = np.nan
            P = np.nan
            comp = np.nan
            break
        prop = prop[0]
            

        # polym = float(mcoat)*1000/float(mw) ## i don't have mcoat
        # mw += float(prop.get('MolecularWeight', np.nan))
        if len(names) == 1:
            logp = prop.get('XLogP', np.nan)
        tpsa += float(prop.get('TPSA', np.nan))
        comp = float(prop.get('Complexity', np.nan))

    # convert zeros to nan
    # mw = mw if mw else np.nan
    tpsa = tpsa if tpsa else np.nan
    logp = logp if logp else np.nan
    comp = comp if comp else np.nan

    desc_pubchem = {
        # "MolWt": mw,
        "XLogP": logp,
        "TPSA": tpsa,
        "Complexity": comp,
    }

    ### RDKit descriptors
    smiles_list = []
    for name in names:
        if name in h_comp:
            smiles_obj = h_comp[name]
        else:
            smiles_obj = fu.pcp.get_compounds(fu.monomer(name), 'name')
            h_comp[name] = smiles_obj

        if len(smiles_obj) < 1:
            print(f"ERR PCP not found get_compounds {name}")
            desc_rdkit = {
                "MolWt": np.nan,
                "PEOE_VSA7": np.nan,
                "PEOE_VSA9": np.nan,
                "VSA_EState8": np.nan,
                "Kappa2": np.nan,
                "BalabanJ": np.nan,
                "MinAbsEStateIndex": np.nan,
                "MinEStateIndex": np.nan,
                "EState_VSA6": np.nan,
                "VSA_EState4": np.nan,
                "PEOE_VSA8": np.nan,
                "MinPartialCharge": np.nan,
                "EState_VSA4": np.nan,
                "SMR_VSA7": np.nan,
                "BCUT2D_CHGLO": np.nan,
                "MaxEStateIndex": np.nan,
                "MaxPartialCharge": np.nan,
            }
            desc = {**desc_pubchem, **desc_rdkit}
            history[names] = desc
            return desc

        smiles_list.append(smiles_obj[0].isomeric_smiles)

    if len(smiles_list) < 1:
        desc_rdkit = {
                "MolWt": np.nan,
                "PEOE_VSA7": np.nan,
                "PEOE_VSA9": np.nan,
                "VSA_EState8": np.nan,
                "Kappa2": np.nan,
                "BalabanJ": np.nan,
                "MinAbsEStateIndex": np.nan,
                "MinEStateIndex": np.nan,
                "EState_VSA6": np.nan,
                "VSA_EState4": np.nan,
                "PEOE_VSA8": np.nan,
                "MinPartialCharge": np.nan,
                "EState_VSA4": np.nan,
                "SMR_VSA7": np.nan,
                "BCUT2D_CHGLO": np.nan,
                "MaxEStateIndex": np.nan,
                "MaxPartialCharge": np.nan,
            }
    else:
        mol = fu.Chem.MolFromSmiles(".".join(smiles_list))
        # print(smiles_list)
        allDescrs = fu.getMolDescriptors(mol)
        # print(allDescrs.keys())
        desc_rdkit = {
            "MolWt": allDescrs["MolWt"],
            "PEOE_VSA7": allDescrs["PEOE_VSA7"],
            "PEOE_VSA9": allDescrs["PEOE_VSA9"],
            "VSA_EState8": allDescrs["VSA_EState8"],
            "Kappa2": allDescrs["Kappa2"],
            "BalabanJ": allDescrs["BalabanJ"],
            "MinAbsEStateIndex": allDescrs["MinAbsEStateIndex"],
            "MinEStateIndex": allDescrs["MinEStateIndex"],
            "EState_VSA6": allDescrs["EState_VSA6"],
            "VSA_EState4": allDescrs["VSA_EState4"],
            "PEOE_VSA8": allDescrs["PEOE_VSA8"],
            "MinPartialCharge": allDescrs["MinPartialCharge"],
            "EState_VSA4": allDescrs["EState_VSA4"],
            "SMR_VSA7": allDescrs["SMR_VSA7"],
            "BCUT2D_CHGLO": allDescrs["BCUT2D_CHGLO"],
            "MaxEStateIndex": allDescrs["MaxEStateIndex"],
            "MaxPartialCharge": allDescrs["MaxPartialCharge"],
        }

    desc = {**desc_pubchem, **desc_rdkit}
    history[names] = desc
    return desc

### Check how well this work

In [None]:
final_df = pd.read_csv("data/existing_datasets/final_df.csv")
start_df = pd.read_csv("data/existing_datasets/start_df.csv")

In [None]:
def get_row_descriptors(row, chemicals_columns = ["surface", "pol", "surf"]):
    rd = {}
    
    ### descriptors of surface, pol, surf
    chemicals = [row[col] for col in chemicals_columns]
    chemicals = [ch for ch in chemicals if ch not in ["0", "nan", "naked"] and not uf.check_nan(ch)]
    chemicals = tuple(chemicals)
    # print(chemicals)

    chemicals_descriptors = get_gescriptors_by_names(chemicals)
    # remove descriptors that aren't calculated for (surface, pol, surf)
    chemicals_descriptors = {key: value for key, value in chemicals_descriptors.items() if key not in ["MaxEStateIndex", "MaxPartialCharge", "Complexity"]}
    # print(chemicals_descriptors)
    rd.update(chemicals_descriptors)

    ### descriptors of reaction_type (ex.: TMB + H2O2)
    rd[f"MinPartialCharge.1"] = np.nan
    rd[f"MaxPartialCharge.1"] = np.nan
    rd[f"Complexity1"] = np.nan
    for i in range(2):
        rd[f"TPSA{i+1}"] = np.nan
        rd[f"MaxEStateIndex.{i+1}"] = np.nan


    if uf.check_nan(row["ReactionType"]):
        return rd
        

    chems = row["ReactionType"].replace(" + ", "+").split("+")
    chems = [ch.lstrip().rstrip() for ch in chems if ch not in ["0", "nan", ""] and not uf.check_nan(ch)]
    if len(chems) > 2:
        print("A lot of elements in reaction: ", "id", row["ReactionType"], chems)
        return rd

    for i, chem in enumerate(chems):
        desc = get_gescriptors_by_names((chem,))
        rd[f"TPSA{i+1}"] = desc["TPSA"]
        rd[f"MaxEStateIndex.{i+1}"] = desc["MaxEStateIndex"]
        if i == 0:
            rd[f"Complexity{i+1}"] = desc["Complexity"]
            rd[f"MinPartialCharge.{i+1}"] = desc["MinPartialCharge"]
            rd[f"MaxPartialCharge.{i+1}"] = desc["MaxPartialCharge"]

    return rd

In [None]:
from collections import defaultdict

In [None]:
n_of_good_convertions = 0
n_of_errors_by_key = defaultdict(int)
n_of_errors_by_row_id = defaultdict(int)

for i in tqdm(range(1000)):
    row = start_df.loc[i]
    id_ = row["#"]
    
    row_descriptors = get_row_descriptors(row)

    data = final_df[final_df["id"] == id_][[key for key in row_descriptors.keys()]]
    if data.shape[0] < 1:
        continue
        
    for key in row_descriptors.keys():
        eps = 0.01
        if "TPSA" in key:
            eps = 3

        orig = data[key].tolist()[0]
        
        # orig != 0 and descriptor is nan
        if orig > eps and uf.check_nan(row_descriptors[key]):
            if key == "XLogP":
                continue
            n_of_errors_by_key[key] += 1
            n_of_errors_by_row_id[id_] += 1
            print(id_, key, orig, row_descriptors[key])

        if abs(orig - row_descriptors[key])>eps:
            n_of_errors_by_key[key] += 1
            n_of_errors_by_row_id[id_] += 1
            print(id_, key, orig, row_descriptors[key])

    if n_of_errors_by_row_id[id_] == 0:
        n_of_good_convertions += 1

print(n_of_good_convertions)

### apply to my data

In [None]:
new_rows = []
for row in tqdm(df.iloc, total=len(df)):
    new_rows.append(get_row_descriptors(row, ['pol', 'surf']))
new_df = pd.DataFrame(new_rows)

In [None]:
new_df

In [None]:
final_columns = final_df.drop(columns=['id', 'cryst', 'Mcoat', 'Sufrace', 'polym', 'Complexity']).columns.tolist()
new_final_df = pd.concat([res_df, new_df], axis=1)[final_columns]

In [None]:
new_final_df["id"] = pd.Series(np.arange(10000, 10000+len(new_final_df))+1)

In [None]:
new_final_df.to_csv("final_p2.csv", index=False)

In [None]:
start_columns = start_df.drop(
    columns=['#', 'type', 'Syngony', 'Sufrace', 'surface', 'Mw(coat), g/mol',
             'Subtype', 'Vsub_min(mL)', 'Vsub_max(mL)', 'Vsub_const(mL)',
             'Vcat(mL)', 'Ccat(mkM)', 'Vbuffer(mL)', 'Dstr', 'mX',
             'mROx', 'mCD', 'volume', 'Mr, g/mol', 'link']
).columns.tolist()
new_start_df = df.rename(columns={"SurfaceArea, m^2/g": "SurfaceArea, m^2/g "})[start_columns]
new_start_df["id"] = pd.Series(np.arange(10000, 10000+len(new_final_df))+1)

In [None]:
new_start_df.to_csv("start_p2.csv", index=False)