In [None]:
import polyfingerprints as pfp
import pandas as pd
import re
from rdkit import Chem

In [None]:
# always put ([*]) as the first of the side chains
def find_side_chain_to_left(psmiles: str, star_pos: int) -> str:
    """Move the ``([*])`` branch to the front of the run of branches it sits in.

    The string is scanned from right to left. Directly adjacent branches
    (``(...)(...)(...)`` with nothing in between) that are not nested inside
    another branch form a run. When the run containing the star is found it is
    re-emitted as: the star branch, then the branches that were on its left
    (nearest first), then the branches that were on its right (rightmost first).
    Everything outside the run is kept as is.

    Args:
        psmiles: PSMILES string whose stars are already written as ``[*]``.
        star_pos: Index of the ``*`` in ``psmiles`` that should lead its run.

    Returns:
        The reordered string. ``psmiles`` is returned unchanged when it has no
        ``)([*])`` pattern, when the star at ``star_pos`` is not inside any
        branch run (e.g. the pattern belongs to the other star), or when a
        ``(`` has no matching ``)``.
    """
    if ")([*])" not in psmiles:
        return psmiles

    n = len(psmiles)
    unmatched = []    # reversed positions of ")" still waiting for their "("
    side_chain = []   # (start, end) reversed positions of the branches in the current run
    star_found = False
    # Reversed index of the character just before ``star_pos`` (the "[" of "[*]"),
    # which lies inside the same branch as the star itself.
    r_star_pos = n - star_pos

    for pos, symbol in enumerate(reversed(psmiles)):
        if symbol == ")":
            unmatched.append(pos)
        elif symbol == "(":
            if not unmatched:
                # Unbalanced parentheses: nothing sensible to reorder.
                return psmiles
            closer = unmatched.pop()
            if unmatched:
                # Still inside an enclosing branch -> nested, not part of a run.
                continue
            side_chain.append((closer, pos))

            left = n - pos - 2  # forward index of the character left of this "("
            if left >= 0 and psmiles[left] == ")":
                # Another branch is directly adjacent on the left: keep collecting.
                continue

            # The run ends here; rotate it so the star branch comes first.
            for nr, (start, end) in enumerate(side_chain):
                if start <= r_star_pos <= end:
                    side_chain = [side_chain[nr]] + side_chain[nr + 1:] + side_chain[:nr]
                    star_found = True
                    break
            if star_found:
                break
            # The star is not in this run: forget it and look further left.
            side_chain = []

    if not star_found:
        # Previously this fell through to min() on an empty list and raised.
        return psmiles

    run_start = min(start for start, _ in side_chain)
    run_end = max(end for _, end in side_chain)

    # Convert reversed (start, end) pairs back to forward slices of ``psmiles``.
    ordered_side_chain_string = "".join(
        psmiles[n - end - 1:n - start] for start, end in side_chain
    )
    return psmiles[:n - run_end - 1] + ordered_side_chain_string + psmiles[n - run_start:]
            

In [None]:
def harmonize_PSMILES(psmiles: str) -> str:
    """Bring a PSMILES string into the notation the parser below expects.

    Two normalisations are applied:

    1. Every bare ``*`` is rewritten as ``[*]``; stars that already sit next
       to a square bracket are left alone, so nothing gets double enclosed.
    2. The second star is turned into its own branch ``([*])`` when it needs
       one, and that branch is moved to the front of its branch run via
       ``find_side_chain_to_left``.

    Args:
        psmiles: Polymer SMILES with the connection points marked by ``*``.

    Returns:
        The harmonized string. Strings with fewer than two stars only get
        step 1.
    """
    # Enclose stars that have neither a "[" before nor a "]" after them.
    psmiles = re.sub(r"(?<!\[)\*(?!\])", "[*]", psmiles)

    first_star = psmiles.find("*")
    if first_star == -1:
        return psmiles
    pos = psmiles.find("*", first_star + 1)
    if pos == -1:
        return psmiles

    star_idx = pos
    if pos == len(psmiles) - 2:
        # The star closes the string. It only needs its own branch when a
        # branch ")" directly precedes it, e.g. "...C(C)[*]" -> "...C(C)([*])".
        if psmiles[pos - 2] == ")":
            psmiles = psmiles[:pos - 1] + "([*])"
            star_idx = pos + 1
    elif psmiles[pos - 2] != "(" and psmiles[pos + 2] != ")":
        # Star in the middle of the string that neither opens nor closes a
        # branch: give it a branch of its own.
        psmiles = psmiles[:pos - 1] + "([*])" + psmiles[pos + 2:]
        star_idx = pos + 1

    # star_idx follows the star through the one-character shift caused by the
    # inserted "(" so the helper receives its real position.
    return find_side_chain_to_left(psmiles, star_idx)


In [None]:
def parse_p_to_explicit_smiles(psmiles: str) -> str:
    """Turn a two-star PSMILES into a SMILES with open valences at the connection points.

    The string is harmonized, parsed with RDKit, and the atom bonded to each
    ``[*]`` is rewritten as a bracket atom: carbon becomes ``[CHn]`` with
    ``n = 4 - explicit valence`` (the bond to the star is counted, so one
    valence stays open once the star is removed), any other element becomes
    ``[X]``. Finally both stars are stripped, e.g.
    ``[*]CC([*])(C)(C(=O)OC)`` -> ``[CH2][C](C)(C(=O)OC)``.

    Args:
        psmiles: Polymer SMILES with exactly two ``*`` connection points.

    Returns:
        The explicit SMILES, or ``""`` (after printing a message) when the
        harmonized string cannot be parsed by RDKit or cannot be mapped
        atom-by-atom onto its own text.

    Raises:
        TypeError: If the molecule does not contain exactly two ``*`` atoms,
            or if a ``*`` has no neighbouring atom.
    """
    h_psmiles = harmonize_PSMILES(psmiles)

    mol = Chem.MolFromSmiles(h_psmiles)
    if mol is None:
        print(f"{psmiles} harmonized to {h_psmiles} is not a valid PSMILES string for parsing to explicit SMILES")
        return ""

    atomsymbols = [atom.GetSymbol() for atom in mol.GetAtoms()]
    star_index = [idx for idx, symbol in enumerate(atomsymbols) if symbol == "*"]

    if len(star_index) != 2:
        # raise a type error
        raise TypeError (f"{__name__} cannot handle (ladder) polymers with {len(star_index)} * in the PSMILES string!")

    # Take the connecting atoms from the bond graph. Atom indices follow the
    # order of appearance in the string, so "index +/- 1" is not the bonded
    # atom as soon as a branch or a star written as "([*])" sits in between.
    connecting_atoms = []
    for _star in star_index:
        neighbors = mol.GetAtomWithIdx(_star).GetNeighbors()
        if len(neighbors) == 0:
            raise TypeError(f"{__name__} cannot handle a * without a neighbouring atom in {h_psmiles}!")
        connecting_atoms.append(neighbors[0])

    # Locate every atom symbol in the string. Matching is case sensitive with
    # the longest symbols first, so "Cl"/"Br" are found as one token; the
    # lower-case spelling is added for elements SMILES may write as aromatic.
    aromatic_capable = {"B", "C", "N", "O", "P", "S", "Se", "As", "Te"}
    tokens = set(atomsymbols)
    tokens.update(symbol.lower() for symbol in atomsymbols if symbol in aromatic_capable)
    pattern = "|".join(re.escape(token) for token in sorted(tokens, key=lambda t: (-len(t), t)))
    atom_spans = [m.span() for m in re.finditer(pattern, h_psmiles)]

    if len(atom_spans) != len(atomsymbols):
        # Without a one-to-one match the string positions cannot be trusted.
        print(f"{psmiles} harmonized to {h_psmiles} could not be mapped atom by atom onto its string")
        return ""

    # string start -> (string end, replacement text) for the connecting atoms
    replacements = {}
    for atom in connecting_atoms:
        start, end = atom_spans[atom.GetIdx()]
        if h_psmiles.rfind("[", 0, start) > h_psmiles.rfind("]", 0, start):
            # Already a bracket atom: its hydrogen count is explicit, and
            # wrapping it again would produce nested brackets.
            continue
        # number of H atoms, only defined for C atoms (4 - current valence)
        h_count = 4 - atom.GetExplicitValence() if atom.GetSymbol() == "C" else 0
        token = h_psmiles[start:end]
        replacements[start] = (end, f"[CH{h_count}]" if h_count > 0 else f"[{token}]")

    pieces = []
    cursor = 0
    for start in sorted(replacements):
        end, text = replacements[start]
        pieces.extend((h_psmiles[cursor:start], text))
        cursor = end
    pieces.append(h_psmiles[cursor:])
    smiles_reconstruction = "".join(pieces)

    # throw warning if the star is not directly behind the monomer-connecting C atom
    if ")[*]" in smiles_reconstruction:
        print(f"{psmiles} has a star not directly behind the monomer-connecting C atom!")

    # Drop the stars; the branch form has to go first so no empty "()" is left behind.
    smiles_reconstruction = smiles_reconstruction.replace("([*])", "")
    smiles_reconstruction = smiles_reconstruction.replace("[*]", "")

    return smiles_reconstruction

# Smoke test: parse each example PSMILES and print the polyfingerprints check
# next to the parsed string. The loop variable keeps the name ``test_PSMILE``
# because the atom-pair comparison cell further down reads it after the loop.
for test_PSMILE in [
    "CCCCC(COC(=O)C(*)C*)CC",
    # further candidates, currently disabled:
    # "*CC(c1c(Cl)cccc1)*", "[*]CC(C)([*])(C(=O)OC)", "[*]OC(CC)CC(=O)*",
    # "*C(=O)CC(CCC)O*", "[*]CC([*])(C)(C(=O)OC)", "*OC(CCC(c1ccccc1))CC(=O)O*", "[*]CC([*])(C)(C(=O)OC)", "*CC([*])(C)(C#N)", "[*]CC([*])(c1ccccc1)","[*]CC[*](C)(C(=O)Oc1ccccc1)", "[*]C1=CC=C([*])(N1)"
]:
    print(f"{test_PSMILE} parsing...")
    parsed_to_explicit = parse_p_to_explicit_smiles(test_PSMILE)
    print(pfp.test_polymer_smiles(parsed_to_explicit), parsed_to_explicit)


In [None]:
# Render a handful of PSMILES variants (star written inline, as a branch, and
# doubled repeat units capped with S) to compare how RDKit draws them.
for _smiles in (
    "[*]CC[*](C)(C(=O)OC)",
    "[*]CC([*])(C)(C(=O)OC)",
    "S" + "CCOC(=O)C(C*)*" * 2 + "S",
    "CCOC(=O)C(CCCOC(=O)C(CCCOC(=O)C(C*)))*",
    "S" + "CCOC(=O)C(*)C(*)" * 2 + "S",
):
    display(Chem.MolFromSmiles(_smiles))

In [None]:
'''
I need to make a dict of weird SMILES strings and their proposed "right" spelling:
- all strings in which a * is not in brackets although it is not at the end of the string
'''
# Lookup table: PSMILES exactly as it appears in the datasets -> rewritten PSMILES.
# The fingerprint pipeline below replaces a dataset value by its mapped value
# (when the value is a key here) before handing it to parse_p_to_explicit_smiles.
illegal_PSMILES = {
    # --- disabled entries, kept for reference ---
    # "[*]CC[*](C)(C(=O)OC)": "[*]CC([*])(C)(C(=O)OC)", # C with 5 Bonds
    # "* C=C*": "*C=C*",
    # "*OC(CCC(c1ccccc1))CC(=O)O*": "[*]CC[*](c1cc(Cl)ccc1)",
    # "*C=CC(=O)OC#N*": "[*]CC[*](C(=O)OC)(C#N)", # without double bond and terminal nitrile group instead of inward positioned
    # "*CC(=C)C#N*": "[*]CC([*])(C)(C#N)" ,
    # "CC(c1cc(Cl)cccc1)": "[*]CC([*])(c1cc(Cl)ccc1)",
    # "[*]CC[*](c1ccccc1)":"[*]CC([*])(c1ccccc1)",
    # "*CC(c1c(Cl)ccccc1)*":"*CC(c1c(Cl)cccc1)*",
    # 
    # "*COCCCCCC(C)=O*":"[*]COCCCCCC(=O)C[*]",
    # "*[NH2+]1C=CC=C1*":"[*]C1=CC=C([*])(N1)",
    # "*ClC=C*":"[*]C(Cl)C=C[*]",
    # "*CCF*":"[*]CC[*](F)",
    # 
    # "*[H]OC(CC)CC(O)=O*": "[*]OC(CC)CC(=O)*",
    # "*CC(=C)C(=O)OCC1CO1*": "*CC(=C)C(=O)OCCC([*])(O)",
    # "*CC(c1cc(Cl)cccc1)*": "*CC(c1cc(Cl)ccc1)*",
    # "*CC(c1ccc(Cl)ccc1)*": "*CC(c1ccc(Cl)cc1)*",
    # "*CC(c1ccc(CN)ccc1)*": "*CC(c1ccc(CN)cc1)*",
    # "*CC(C)(c1cccccc1)*": "*CC(C)(c1ccccc1)*",
    # "*CC(c1ccc(C)ccc1)*": "*CC(c1ccc(C)cc1)*",
    # "CCC(C)C*C(c1ccc(OC)ccc1)*": "CCC(C)C*C(c1ccc(OC)cc1)*",
    # "*CC(c1cccccc1)*": "*CC(c1ccccc1)*",
    
    # --- active entries: star moved to the front, side group collected in one trailing branch ---
    "CCOC(=O)C(C*)*": "*CC(*)(C(=O)OCC)",
    "CCCCOC(=O)C(C*)*":"*CC(*)(C(=O)OCCCC)",
    # ToDo: clear the following
    # --- active entries: "(C*)...*" rewritten so that the string ends in "C*" ---
    "CCCCC(COC(=O)C(C*)*)CC":"CCCCC(CC)COC(=O)C(*)C*",
    "CCCCCCCCCOC(=O)C(C*)*":"CCCCCCCCCOC(=O)C(*)C*",
    "CCCCCCCCOC(=O)C(C*)*":"CCCCCCCCOC(=O)C(*)C*",
    "CCCCOC(=O)C(C*)(C)*":"CCCCOC(=O)C(*)(C)C*",
    "CCCCCCCCCCOC(=O)C(C*)(C)*":"CCCCCCCCCCOC(=O)C(*)(C)C*",
    "CCOC(=O)C(C*)(C)*":"CCOC(=O)C(*)(C)C*",
    "CCCCCCOC(=O)C(C*)(C)*":"CCCCCCOC(=O)C(*)(C)C*",
    "OCCOC(=O)C(C*)(C)*":"OCCOC(=O)C(*)(C)C*",
    "*C(C(=O)O)(C*)C":"*C(C(=O)O)(C)C*",
    # NOTE(review): the next two entries map a string to itself, so looking them
    # up changes nothing. Presumably they still need the "(*)(C)C*" rewrite used
    # by the entries above - confirm before relying on them.
    "CCCCCCCCCCCCCCCCCCOC(=O)C(C*)(C)*":"CCCCCCCCCCCCCCCCCCOC(=O)C(C*)(C)*",
    "CCCOC(=O)C(C*)(C)*":"CCCOC(=O)C(C*)(C)*",
    # "":"",
    }


# Chem.MolFromSmiles("S"+"*OC(CC)CC(O)=C*"*1+"S")
# for v in illegal_PSMILES.values():
#     display(Chem.MolFromSmiles(v))

In [None]:
# PSMILES strings whose second star is written as a trailing "*" after a
# "(C*)" branch.
illsm = [
    "CCOC(=O)C(C*)*",
    "CCCCOC(=O)C(C*)*",
    "CCCCC(COC(=O)C(C*)*)CC",
    "CCCCCCCCCOC(=O)C(C*)*",
    "CCCCCCCCOC(=O)C(C*)*",
    "CCCCOC(=O)C(C*)(C)*",
    "CCCCCCCCCCOC(=O)C(C*)(C)*",
    "CCOC(=O)C(C*)(C)*",
    "CCCCCCOC(=O)C(C*)(C)*",
    "OCCOC(=O)C(C*)(C)*",
    "*C(C(=O)O)(C*)C",
    "CCCCCCCCCCCCCCCCCCOC(=O)C(C*)(C)*",
    "CCCOC(=O)C(C*)(C)*",
]
# Preview of the plain textual rewrite; only entries containing the literal
# "(C*)*" are changed by it, the others are shown as they are.
[entry.replace("(C*)*", "(*)C*") for entry in illsm]

In [None]:
# Location of the input tables. ``downloads_p`` is prepended verbatim to the
# file names, so a non-empty value has to end with a path separator.
downloads_p = r""
homo_p = f"{downloads_p}new_final_homo.csv"
copo_p = f"{downloads_p}copo_final.csv"

In [None]:
# Pipeline: for both datasets, convert every PSMILES column to explicit SMILES,
# build polyfingerprints, reduce all fingerprints together and write each
# dataset back out with its explicit SMILES columns and a "pfp" column.
all_FPs = {}
explicit_frames = {}  # dataset path -> frame including the explicit SMILES columns
for dt_p in [homo_p, copo_p]:
    dt = pd.read_csv(dt_p)
    psmiles_columns = [column for column in dt.columns if "psmiles" in column.lower()]
    molfrac_columns = [column for column in dt.columns if "molpercent" in column.lower()]
    print(psmiles_columns, molfrac_columns)
    exp_column_list = []
    for psmiles_column in psmiles_columns:
        exp_column = psmiles_column + "to_explicit_SMILES"
        # Known-problematic strings are swapped for their rewritten form first.
        dt[exp_column] = dt[psmiles_column].apply(
            lambda y: parse_p_to_explicit_smiles(illegal_PSMILES.get(y, y)))
        exp_column_list.append(exp_column)

    # Pair each explicit SMILES column with a mol-fraction column by position.
    mono_molfrac_list_tuples = list(zip(exp_column_list, molfrac_columns))
    print(mono_molfrac_list_tuples)

    # Keep a copy taken before the loader sees the frame, so the file written
    # below contains the explicit SMILES columns. Re-reading the CSV, as was
    # done before, silently dropped them from the "*_explicit_SMILES.csv" output.
    explicit_frames[dt_p] = dt.copy()

    pfpdata = pfp.loader.df_loader(df=dt,
                         repeating_unit_columns=mono_molfrac_list_tuples,
                         mw_column="Mn",
                         additional_columns=["PDI"]
                         )
    all_FPs[dt_p] = pfpdata

len_pfp_sets = {key: len(values) for key, values in all_FPs.items()}

# Flatten in dataset order; the loop variable deliberately avoids the name of
# the ``pfp`` module.
combined_pfps = [single_pfp for pfp_list in all_FPs.values() for single_pfp in pfp_list]
reduced_combined_pfps, mask = pfp.reduce_pfp_in_dataset(combined_pfps)

# Split the reduced fingerprints back into the datasets. An offset is used so
# ``reduced_combined_pfps`` stays intact for later inspection.
offset = 0
for dt_p in [homo_p, copo_p]:
    dt = explicit_frames[dt_p]
    respective_pfp_set = reduced_combined_pfps[offset:offset + len_pfp_sets[dt_p]]
    offset += len_pfp_sets[dt_p]

    dt["pfp"] = [pfpdat["pfp"] for pfpdat in respective_pfp_set]
    dt.to_csv(dt_p.replace(".csv", "_explicit_SMILES.csv"))


In [None]:
# Peek at the second entry of ``respective_pfp_set``, i.e. of the slice left
# over from the last iteration of the split loop in the previous cell.
respective_pfp_set[1]

In [None]:
# updating Polyfingerprints library with fingerprint generators for atom pairs
# Compares the legacy atom-pair fingerprint call with the generator-based one
# on the last PSMILES of the smoke-test loop (``test_PSMILE``).
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem.rdFingerprintGenerator import GetAtomPairGenerator

old_ap = [
    count
    for count in Pairs.GetHashedAtomPairFingerprint(
        Chem.MolFromSmiles(parse_p_to_explicit_smiles(test_PSMILE))
    )
]

ap_fp_gen = GetAtomPairGenerator()
new_ap = [
    count
    for count in ap_fp_gen.GetFingerprint(
        Chem.MolFromSmiles(parse_p_to_explicit_smiles(test_PSMILE))
    )
]

# Vector lengths first, then how often the values 1, 2 and 3 occur in each,
# and finally every value above 3.
print(len(old_ap), len(new_ap), "lengths")
for _value in (1, 2, 3):
    print(old_ap.count(_value), new_ap.count(_value), f"{_value}s")
print([count for count in old_ap if count > 3], [count for count in new_ap if count > 3], ">others")