In [None]:
# Explanatory notebook: converts the explicit (radical) repeating units and end groups of this database to the PSMILES notation of the Ramprasad Group
import os
import pandas as pd
import re
import warnings

In [None]:
# Input table and output directory used by the notebook.
RAW_CSV_PATH = "./cloud_points_data.csv"
OUTDIR = "out"
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(OUTDIR, exist_ok=True)
# The CSV is read with ";" as the field separator and "," as the decimal mark.
raw_df = pd.read_csv(RAW_CSV_PATH, sep=";", decimal=",")
raw_df

In [None]:
# Transform each bracketed "[atoms and numbers]" part of a monomer string into its PSMILES form, e.g.:
# [CH2][CH](C(=O)NC1CCCCCC1) -> [*]CC[*](C(=O)NC1CCCCCC1)
# To do so, every "[content]" part is searched for, and then
#     the first is replaced with "[*]" + content (without "H" and the digit following it)
#     the second is replaced with content (without "H" and the digit following it) + "[*]"
def convert_explicit_SMILES_to_PSMILES(explicit_SMILES: str, chain_pos: str = "monomer"):
    all_radicals = re.findall(r"\[[^\]]*\]", explicit_SMILES)
    replacements = [radical.replace("[", "").replace("]", "") for radical in all_radicals]
    replacements = [re.sub("H[0-9]?", "", radical) for radical in replacements]
    match chain_pos:
        case "monomer":
            if len(all_radicals) != 2:
                warnings.warn(explicit_SMILES + ": " + "Too many OR little explicits found. Is there a radical in the polymer, or is this a end group?" + "\n" +
                              "are " + str(replacements[:2]) + " the right replacements?")
            replacements[0] = "[*]" + replacements[0]
            replacements[1] = replacements[1] + "[*]"
        case "end":
            if len(all_radicals) != 1:
                warnings.warn(explicit_SMILES + ": " + "None or too many explicits found. Is there a radical in the polymer, or is it a monomer?" + "\n" +
                              "is " + str(replacements[0]) + " the right replacement?")
            replacements[0] = "[*]" + replacements[0]
    PSMILES = explicit_SMILES
    for i in range(len(all_radicals)):
        PSMILES = PSMILES.replace(all_radicals[i], replacements[i])
    return PSMILES
# Quick check on a single repeating unit; displays "[*]CC[*](C(=O)NC1CCCCCC1)".
convert_explicit_SMILES_to_PSMILES("[CH2][CH](C(=O)NC1CCCCCC1)", chain_pos="monomer")

In [None]:
# Convert every entry of the repeating-unit-A column, treating each one as a monomer.
raw_df["SMILES_repeating_unitA"].apply(convert_explicit_SMILES_to_PSMILES, chain_pos="monomer")