In [None]:
import pandas as pd
import re
from rdkit import Chem
import polyfingerprints as pfp
import warnings

In [None]:
# always put ([*]) as the first of the side chains
def find_side_chain_to_left(psmiles: str, star_pos: int) -> str:
    """Move the ``([*])`` branch to the front of its run of adjacent side chains.

    The string is scanned from right to left. Top-level parenthesised groups
    that directly follow each other (``(...)(...)``) are collected as one run.
    A run is complete when the character in front of its left-most ``(`` is
    not ``)``. If the star lies inside the completed run, the star's group is
    rotated to the front of the run and the string is rebuilt; otherwise the
    run is discarded and the scan continues further left.

    Args:
        psmiles: PSMILES string in which the stars are written as ``[*]``.
        star_pos: Index of the ``*`` character of the star that should lead
            its run of side chains.

    Returns:
        The re-ordered string. ``psmiles`` is returned unchanged when it does
        not contain ``)([*])`` or when the star at ``star_pos`` is not inside
        any top-level parenthesised group (previously this case raised a
        ``ValueError`` from ``min()`` on an empty list).

    Raises:
        IndexError: if a ``(`` is met while no ``)`` is open (unbalanced
            parentheses) or if a top-level ``(`` is the very first character.
    """
    if ")([*])" not in psmiles:
        return psmiles

    stack = []       # reversed-string positions of the currently open ")"
    side_chain = []  # (start, end) of top-level groups, in reversed-string positions
    # Star position mirrored into the reversed string. This is one past the
    # exact mirrored index (it lands on the "[" in front of the star), which is
    # still inside the same parenthesised group, so the range test below holds.
    r_star_pos = len(psmiles) - star_pos

    for pos, char in enumerate(reversed(psmiles)):
        if char == ")":
            stack.append(pos)
        if char == "(":
            if len(stack) == 1:
                # a top-level group has just been closed
                side_chain.append((stack.pop(), pos))
                # psmiles[-pos-2] is the character left of this "(" in the
                # original string; anything but ")" ends the run of groups
                if psmiles[-pos-2] != ")":
                    for nr, side in enumerate(side_chain):
                        if r_star_pos in range(side[0], side[1]+1):
                            # rotate the run so that the star's group comes first
                            side_chain = [side_chain[nr]] + side_chain[nr+1:] + side_chain[:nr]
                            break
                    else:
                        # the star is not part of this run: forget it and keep scanning
                        side_chain = []
                        continue
                    break
            else:
                # nested group: only its matching ")" has to be dropped
                stack.pop()

    if not side_chain:
        # the star is not located in any top-level side-chain run: nothing to re-order
        return psmiles

    # Reversed-string bounds of the whole run: smallest start and largest end.
    run_start = min(min(side_chain))
    run_end = max(max(side_chain))

    # Slice every group out of the original string (negative indices undo the
    # reversal); a group that ends with the string's last character needs an
    # open-ended slice.
    ordered_side_chain_string = "".join(
        psmiles[-end-1:-start] if start != 0 else psmiles[-end-1:]
        for start, end in side_chain
    )

    # Re-attach whatever precedes and follows the run of side chains.
    head = psmiles[:-run_end-1]
    tail = psmiles[-run_start:] if run_start != 0 else ""
    return head + ordered_side_chain_string + tail
            

In [None]:
def harmonize_PSMILES(psmiles: str) -> str:
    """Bring a PSMILES string into a uniform notation.

    Two normalisations are applied:

    1. Every star is written as ``[*]``. Stars that are already enclosed are
       left alone, so a string with a single, already enclosed star is no
       longer turned into ``[[*]]``.
    2. The second star is written as a ``([*])`` branch when it needs one, and
       ``find_side_chain_to_left`` then moves that branch to the front of its
       neighbouring side chains.

    Args:
        psmiles: PSMILES string with bare (``*``) and/or enclosed (``[*]``) stars.

    Returns:
        The harmonized PSMILES string. Strings with fewer than two stars only
        get the bracket normalisation.
    """
    # Match the enclosed form first so that it is replaced by itself, then any
    # remaining bare star; this can never double-enclose a star.
    psmiles = re.sub(r"\[\*\]|\*", "[*]", psmiles)

    # Check whether the second star has to be parenthesised.
    seen_first_star = False
    for pos, symbol in enumerate(psmiles):
        if symbol != "*":
            continue
        if not seen_first_star:  # skip the first star
            seen_first_star = True
            continue

        if pos == len(psmiles) - 2:
            # The star closes the string ("...[*]"). It only needs parentheses
            # when it directly follows a side chain, i.e. a ")" precedes its "[".
            if psmiles[pos-2] == ")":
                psmiles = psmiles[:pos-1] + "([*])"
        elif psmiles[pos-2] != "(" and psmiles[pos+2] != ")":
            # The star sits inside the string with no parenthesis on either
            # side of "[*]": wrap it.
            psmiles = psmiles[:pos-1] + "([*])" + psmiles[pos+2:]

        # pos is the star's index before any wrapping; after wrapping it points
        # at the "[" of "([*])", which lies inside the same branch.
        psmiles = find_side_chain_to_left(psmiles, pos)
        break
    return psmiles

In [None]:
def parse_p_to_explicit_smiles(psmiles: str) -> str:
    """Turn a PSMILES string into an "explicit" SMILES without stars.

    The input is harmonized with ``harmonize_PSMILES`` and parsed with RDKit.
    The two atoms whose atom indices are adjacent to the two stars are
    rewritten as bracket atoms (``[CH<n>]`` with ``n = 4 - explicit valence``
    for carbon when ``n > 0``, otherwise ``[<character>]``), and every
    ``([*])`` / ``[*]`` is removed from the string.

    Args:
        psmiles: PSMILES string containing exactly two stars.

    Returns:
        The explicit SMILES string, or ``""`` (after printing a message) when
        RDKit cannot parse the harmonized string.

    Raises:
        TypeError: if the parsed molecule does not contain exactly two ``*`` atoms.
    """
    h_psmiles = harmonize_PSMILES(psmiles)

    # second create a mol object
    mol = Chem.MolFromSmiles(h_psmiles)
    if mol is None:
        print(f"{psmiles} harmonized to {h_psmiles} is not a valid PSMILES string for parsing to explicit SMILES")
        return ""

    # iterate over the atoms, recording every atom's symbol and explicit valence
    # and the atom indices of the [*] atoms
    star_index = []
    valence = []
    atomsymbols = []
    for atom in mol.GetAtoms():
        atomsymbol = atom.GetSymbol()
        # if atom.GetIsAromatic():
        #     atomsymbol = atomsymbol.lower()
        atomsymbols.append(atomsymbol)
        if atomsymbol == "*":
            star_index.append(atom.GetIdx())
        valence.append(atom.GetExplicitValence())

    if len(star_index) != 2:
        # anything but exactly two stars (e.g. ladder polymers) is not supported
        raise TypeError (f"{__name__} cannot handle (ladder) polymers with {len(star_index)} * in the PSMILES string!")
    
    # Atoms taken as the chain-connecting atoms: the one after the first star and
    # the one before the second star, by atom index.
    # NOTE(review): the original author already flags that this is unreliable
    # because the atom order does not have to follow the bonding - confirm.
    c_index = [star_index[0]+1, star_index[1]-1] # cannot do that atom symbols are not in order
    
    # hydrogen count for a carbon: 4 minus its explicit valence (the valence was
    # read from the molecule that still contains the [*] atoms)
    def valence_change(val):
        return 4 - val
    h_num = []
    for sy, val in zip(atomsymbols, valence):  # number of H atoms assuming C atoms
        if sy == "C":
            h_num.append(valence_change(val))
        else:
            h_num.append(0)  # non-carbon atoms get no explicit hydrogens

    # replace C atoms with the explicit hydrogen count and catch the special case of parenthesised like e.g.([*]) and parenthesise the dangling part of the smiles string to the end after that C atom like [*]CC([*])(C)C(=O)OC -> [CH2][C](C)(C(=O)OC) (attention to the bracket pair ending with the last symbol ")")

    atomsymbols = [_atom if _atom != "*" else r"\*" for _atom in atomsymbols] # escape the * for the regex

    # Map every atom index to the string position of its symbol: the i-th match of
    # "any atom symbol" in the upper-cased string is paired with the i-th atom.
    # NOTE(review): the search runs on h_psmiles.upper() while the symbols keep
    # their original case, so a symbol that is not all upper case would not match
    # literally and could shift the mapping - verify for such elements.
    atom_positions = [m.start() for m in re.finditer(r"("+'|'.join(atomsymbols)+")", h_psmiles.upper())]
    atom_string_map = {atom_symbol_nr:string_pos for atom_symbol_nr, string_pos in zip(range(len(atomsymbols)), atom_positions)}
    c_str_indexes = [atom_string_map[_c_index] for _c_index in c_index]
    c_str_h_num = {atom_string_map[_c_index]:h_num[_c_index] for _c_index in c_index} # string position -> hydrogen count

    # rebuild the string character by character, swapping the two connecting
    # atoms for their bracketed form
    smiles_reconstruction = ""
    for idx, character in enumerate(h_psmiles):
        if idx in c_str_indexes:
            smiles_reconstruction += f"[CH{c_str_h_num[idx]}]" if c_str_h_num[idx] > 0 else f"[{character}]"
        else:
            smiles_reconstruction += character

    # print a notice if the star is not directly behind the monomer-connecting C atom
    if ")[*]" in smiles_reconstruction:
        print(f"{psmiles} has a star not directly behind the monomer-connecting C atom!")
    
    # drop the stars: first the parenthesised branches, then the bare ones
    smiles_reconstruction = smiles_reconstruction.replace("([*])", "")
    smiles_reconstruction = smiles_reconstruction.replace("[*]", "")

    return smiles_reconstruction

# for test_PSMILE in (["[*]CC[*](C(=O)OC1C[C@H]2CC[C@]1(C)C2(C)C)", 
#                       "CCCCC(COC(=O)C(*)C*)CC", "*CC(c1c(Cl)cccc1)*", "[*]CC(C)([*])(C(=O)OC)", "[*]OC(CC)CC(=O)*", "*C(=O)CC(CCC)O*", "[*]CC([*])(C)(C(=O)OC)", "*OC(CCC(c1ccccc1))CC(=O)O*", "[*]CC([*])(C)(C(=O)OC)", "*CC([*])(C)(C#N)", "[*]CC([*])(c1ccccc1)","[*]CC[*](C)(C(=O)Oc1ccccc1)", "[*]C1=CC=C([*])(N1)"
#     ]):
#     print(test_PSMILE + " parsing...")
#     parsed_to_explicit = parse_p_to_explicit_smiles(test_PSMILE)
#     print(pfp.test_polymer_smiles(parsed_to_explicit), parsed_to_explicit)

In [None]:
# Lookup table of malformed or unusual PSMILES strings found in the data (keys)
# and the spelling proposed as correct for each of them (values):
illegal_PSMILES = {
    "[*]CC[*](C)(C(=O)OC)": "[*]CC([*])(C)(C(=O)OC)",
    "* C=C*": "*CC*", # ethylene
    "*C#C" : "[*]C=C[*]", # acetylene
    "*OC(CCC(c1ccccc1))CC(=O)O*": "[*]OC(CCC(c1ccccc1))CC(=O)C[*]", # unclear how to treat this one: the authors of https://pubs.acs.org/doi/10.1021/bm010018h do not make the structure very clear and probably have a mistake in it, but it should be some polyhydroxy propionate
    "*C=CC(=O)OC#N*": "[*]CC([*])(C(=O)OCCCC)(C#N)", # cyanoacrylate (superglue); the butyl cyanoacrylate was also added because it has a specific Tg of 74°C
    "*CC(=C)C#N*": "[*]CC([*])(C)(C#N)", # methacrylonitrile
    "CC(c1cc(Cl)cccc1)": "[*]CC([*])(c1cc(Cl)ccc1)",
    "[*]CC[*](c1ccccc1)":"[*]CC([*])(c1ccccc1)",
    "*CC(c1c(Cl)ccccc1)*":"*CC(c1c(Cl)cccc1)*",
    "*COCCCCCC(C)=O*":"[*]CCCCCC(=O)O[*]", # poly(e-caprolactone). The dispersity in the source seems to be calculated in a wrong way: Mn is 530 to 630,000 and Mw is 10,000 to 200,000, yet the dispersity is stated as 1.08-1.53; it is unclear how that was computed, since these bounds would give 18.868 to 0.317 (which obviously makes no sense)
    "*[NH2+]1C=CC=C1*":"[*]C1=CC=C([*])(N1)", # pyrrole
    "*ClC=C*":"[*]CC([*])(Cl)", # vinyl chloride
    "*CCF*":"[*]CC[*](F)", # vinyl fluoride
    "*[H]OC(CC)CC(O)=O*": "[*]OC(CC)CC([*])(=O)", # (3HV) 3-hydroxyvalerate
    "*CC(=C)C(=O)OCC1CO1*": "[*]CC([*])(C)(C(=O)OCC1OC1)", # glycidyl methacrylate
    "*CC(c1cc(Cl)cccc1)*": "*CC(c1cc(Cl)ccc1)*",
    "*CC(c1ccc(Cl)ccc1)*": "*CC(c1ccc(Cl)cc1)*",
    "*CC(c1ccc(CN)ccc1)*": "*CC(c1ccc(CN)cc1)*",
    "*CC(C)(c1cccccc1)*": "*CC(C)(c1ccccc1)*",
    "*CC(c1ccc(C)ccc1)*": "*CC(c1ccc(C)cc1)*",
    "CCC(C)C*C(c1ccc(OC)ccc1)*": "*C(C(C)CC)C(*)(c1ccc(OC)cc1)",
    "*CC(c1cccccc1)*": "*CC(c1ccccc1)*",
    
    # For larger sets containing samples of the following type, the PSMILES can be canonicalized automatically with the canonicalize package
    #     (the longest chain is found by replacing the * with a long C chain, canonicalizing, and then re-replacing)
    "CCOC(=O)C(C*)*": "*CC(*)(C(=O)OCC)",
    "CCCCOC(=O)C(C*)*":"*CC(*)(C(=O)OCCCC)",
    "CCCCC(COC(=O)C(C*)*)CC":"*CC(*)(C(=O)OCC(CC)CCCC)",
    "CCCCCCCCCOC(=O)C(C*)*":"*CC(*)(C(=O)OCCCCCCCCC)",
    "CCCCCCCCOC(=O)C(C*)*":"*CC(*)(C(=O)OCCCCCCCC)",
    "CCCCOC(=O)C(C*)(C)*":"*CC(*)(C)(C(=O)OCCCC)",
    "CCCCCCCCCCOC(=O)C(C*)(C)*":"*CC(*)(C)(C(=O)OCCCCCCCCCC)",
    "CCOC(=O)C(C*)(C)*":"*CC(*)(C)(C(=O)OCC)",
    "CCCCCCOC(=O)C(C*)(C)*":"*CC(*)(C)(C(=O)OCCCCCC)",
    "OCCOC(=O)C(C*)(C)*":"*CC(*)(C)(C(=O)OCCO)",
    "*C(C(=O)O)(C*)C":"*CC(*)(C)(C(=O)O)",
    "CCCCCCCCCCCCCCCCCCOC(=O)C(C*)(C)*":"*CC(*)(C)(C(=O)OCCCCCCCCCCCCCCCCCC)",
    "CCCOC(=O)C(C*)(C)*":"*CC(*)(C)(C(=O)OCCC)",
    }

# for k, v in illegal_PSMILES.items():
#     display(Chem.MolFromSmiles(v))

In [None]:

def convert_explicit_SMILES_to_PSMILES(explicit_SMILES: str, chain_pos: str = "monomer") -> str:
    """Convert an explicit SMILES (bracket atoms mark the open ends) into a PSMILES.

    Every bracket expression that does not contain a star counts as an
    "explicit". The first two explicits are replaced by their plain atom
    symbol (brackets and ``H``/``H<digit>`` removed) together with a star:

    * an explicit located at the very start or very end of the string gets a
      bare star (``[*]X`` for the first explicit, ``X[*]`` for the second),
    * any other explicit gets the star as a branch (``X([*])``).

    Args:
        explicit_SMILES: SMILES string with bracketed connection atoms.
        chain_pos: ``"monomer"`` (two explicits expected) or ``"end"`` (one
            explicit expected). It only controls which warning is issued;
            other values issue no count warning.

    Returns:
        The PSMILES string. When no explicit is found, a warning is issued and
        the input is returned unchanged.
    """
    all_explicits = re.findall(r"\[[^\]\*]*\]", explicit_SMILES)
    if len(all_explicits) == 0:
        warnings.warn(f"No explicits found in {explicit_SMILES}")
        return explicit_SMILES

    # plain atom symbol of every explicit: strip the brackets, then the hydrogens
    replacements = [
        re.sub("H[0-9]?", "", radical.replace("[", "").replace("]", ""))
        for radical in all_explicits
    ]
    PSMILES = explicit_SMILES

    # if the replacement is not at the end or beginning it needs brackets
    end_q = [(PSMILES.find(expl) == 0 or PSMILES.rfind(expl) == (len(PSMILES)-len(expl))) for expl in all_explicits]

    # Only the first two explicits become stars; later bracket atoms (for
    # example stereo centres) stay as they are.
    candidates = list(zip(end_q, all_explicits, replacements))[:2]
    for index, (edge, expl, repl) in enumerate(candidates):
        if not edge:
            repl = repl + "([*])"
        elif index == 0:
            repl = "[*]" + repl
        else:
            repl = repl + "[*]"
        PSMILES = PSMILES.replace(expl, repl, 1)

    if chain_pos == "monomer":
        if len(all_explicits) != 2:
            warnings.warn("\n" + explicit_SMILES + ": " + "Too many OR little explicits found. Is there a radical in the monomer, or is this an end group?" +
                          " Is " + "\n" + PSMILES + " the right replacement?")
    elif chain_pos == "end":
        if len(all_explicits) != 1:
            warnings.warn("\n" + explicit_SMILES + ": " + "None or too many explicits found. Is there a radical in the end group, or is it a monomer?" +
                          " Is " + "\n" + PSMILES + " the right replacement?")

    # This return had been commented out together with the test snippet below,
    # so every successful conversion returned None.
    return PSMILES
# for test_exp_SMILES in ["[CH2][C](C)(C(=O)OC1C[C@H]2CC[C@]1(C)C2(C)C)", "[O]CCC[C](=O)","[O]CCC[C]", "[C]CCC[C]", "C([CH])CSC[CH2]"]:
#     print(test_exp_SMILES + " converting...")
#     print(convert_explicit_SMILES_to_PSMILES(test_exp_SMILES))

In [None]:
# Input CSV files that the following cells read with pd.read_csv; the names carry
# no directory part, so they are resolved against the current working directory.
# Presumably the homopolymer and the copolymer data set, judging by the names.
homo_p = r"new_final_homo.csv"
copo_p = r"copo_final.csv"

In [None]:
# Round trip PSMILES -> explicit SMILES -> PSMILES so that every entry ends up in
# the same notation; the result is stored in an extra "<column>_reparsed" column.
for dt_p in (homo_p, copo_p):
    dt = pd.read_csv(dt_p)
    psmiles_columns = [column for column in dt.columns if "psmiles" in column.lower()]

    for psmiles_column in psmiles_columns:
        reparsed_name = psmiles_column + "_reparsed"
        dt[reparsed_name] = dt[psmiles_column].apply(
            # known-bad spellings are swapped for their corrected form first
            lambda y: harmonize_PSMILES(
                convert_explicit_SMILES_to_PSMILES(
                    parse_p_to_explicit_smiles(illegal_PSMILES.get(y, y))
                )
            )
        )
    dt.to_csv("reparsed_PSMILES_" + dt_p, index=False)

In [None]:
# Build the polyfingerprint input of both data sets, keyed by the source file name.
all_FPs = {}
for dt_p in (homo_p, copo_p):
    dt = pd.read_csv(dt_p)
    psmiles_columns = [column for column in dt.columns if "psmiles" in column.lower()]
    # a table without any mol-percent column gets one filled with 1
    if not any("molpercent" in column.lower() for column in dt.columns):
        dt["molpercent_rep_u1"] = 1
    molfrac_columns = [column for column in dt.columns if "molpercent" in column.lower()]
    print(psmiles_columns, molfrac_columns)

    # one explicit-SMILES column per PSMILES column (corrected spelling where known)
    exp_column_list = []
    for psmiles_column in psmiles_columns:
        explicit_name = psmiles_column + "_to_explicit_SMILES"
        dt[explicit_name] = dt[psmiles_column].apply(
            lambda y: parse_p_to_explicit_smiles(illegal_PSMILES.get(y, y))
        )
        exp_column_list.append(explicit_name)

    # pair every explicit-SMILES column with a mol-percent column, in column order
    mono_molfrac_list_tuples = list(zip(exp_column_list, molfrac_columns))
    pfpdata = pfp.loader.df_loader(
        df=dt,
        repeating_unit_columns=mono_molfrac_list_tuples,
        mw_column="Mn",
        additional_columns=["PDI"],
    )
    all_FPs[dt_p] = pfpdata

# out of the loop so reduction can take place over the combined dataset
len_pfp_sets = {key: len(values) for key, values in all_FPs.items()}

# flatten the per-file lists into one list, keeping the file order
combined_pfps = [entry for pfp_list in all_FPs.values() for entry in pfp_list]
# reduce the fingerprints over the combined data
reduced_combined_pfps, mask = pfp.reduce_pfp_in_dataset(combined_pfps)

# output file names per input file
new_concise_csv_names = dict(zip([homo_p, copo_p], ["homo-p_view_data.csv", "co-p_view_data.csv"]))
new_train_data_names = dict(zip([homo_p, copo_p], ["homo-p_train_data.pckl", "co-p_train_data.pckl"]))

# split the reduced fingerprints back into the data sets: each file takes the
# next len_pfp_sets[file] entries from the front of the remaining list
for dt_p in (homo_p, copo_p):
    dt = pd.read_csv("reparsed_PSMILES_" + dt_p)
    set_size = len_pfp_sets[dt_p]
    respective_pfp_set = reduced_combined_pfps[:set_size]
    reduced_combined_pfps = reduced_combined_pfps[set_size:]

    dt = dt.reset_index(drop=True)
    dt["pfp"] = [pfpdat["pfp"] for pfpdat in respective_pfp_set]
    dt.to_pickle(new_train_data_names[dt_p])
    dt.to_csv(new_concise_csv_names[dt_p], index=False)

In [None]:
# Quick check: load the pickled copolymer training table (same file name as the
# one written by the export cell) and show it.
testdf = pd.read_pickle("co-p_train_data.pckl")
# The re-indexed frame is not assigned back; as the last expression of the cell
# it only serves as the displayed output.
testdf.reset_index(drop=True)