In [None]:
# Explanatory notebook: converts the SMILES in this database, which mark repeating units and end groups with explicit (radical) atoms, into the PSMILES notation of the Ramprasad Group
import pandas as pd
import re
import warnings

In [None]:
# Path of the raw input table, relative to the notebook.
RAW_CSV_PATH = "./glass_transition_temperature_data.csv"

# Parse the file with ";" as field separator and "," as decimal mark.
raw_df = pd.read_csv(
    RAW_CSV_PATH,
    sep=";",
    decimal=",",
)
raw_df

In [None]:
# Transform every bracketed "[atoms and numbers]" part of a monomer string into its PSMILES form, e.g.:
# [CH2][CH](C(=O)NC1CCCCCC1) -> [*]CC[*](C(=O)NC1CCCCCC1)
# To do so, all "[content]" parts are searched for; then
#   - the first is replaced with "[*]" + content without the H and the number following it
#   - the second is replaced with content without the H and the number following it + "[*]"
def convert_explicit_SMILES_to_PSMILES(explicit_SMILES: str, chain_pos: str = "monomer") -> str:
    """Convert a SMILES string with bracketed (radical) atoms into a PSMILES string.

    Every bracket expression that contains no ``*`` (e.g. ``[CH2]``, ``[C]``) is
    treated as an "explicit". Its brackets and explicit hydrogens (``H`` plus an
    optional single digit) are stripped and a ``[*]`` attachment point is added:

    * ``chain_pos="monomer"``: ``[*]`` is placed in front of the first explicit
      and behind the second one.
    * ``chain_pos="end"``: ``[*]`` is placed in front of the first explicit.

    Example: ``[CH2][CH](C(=O)NC1CCCCCC1)`` -> ``[*]CC[*](C(=O)NC1CCCCCC1)``

    Args:
        explicit_SMILES: SMILES string to convert.
        chain_pos: ``"monomer"`` or ``"end"``. Any other value leaves the
            string unchanged.

    Returns:
        The converted string, or the input unchanged if it contains no explicits.

    Warns:
        UserWarning: if the number of explicits differs from the expected one
            (2 for ``"monomer"``, 1 for ``"end"``). The explicits that are
            available (at most the first two / the first one) are still
            converted and the candidate result is part of the message.

    Note:
        Charged bracket atoms such as ``[N+]`` or ``[O-]`` match the same
        pattern, so they count as explicits and can trigger the warning.
    """
    all_explicits = re.findall(r"\[[^\]\*]*\]", explicit_SMILES)
    if not all_explicits:
        return explicit_SMILES

    # Bare atom symbols: drop the brackets, then the explicit hydrogens.
    replacements = [
        re.sub("H[0-9]?", "", radical.replace("[", "").replace("]", ""))
        for radical in all_explicits
    ]

    PSMILES = explicit_SMILES
    if chain_pos == "monomer":
        PSMILES = PSMILES.replace(all_explicits[0], "[*]" + replacements[0], 1)
        # Only touch the second explicit if there is one; indexing it
        # unconditionally raised IndexError for a lone explicit before the
        # warning below could be emitted.
        if len(all_explicits) > 1:
            PSMILES = PSMILES.replace(all_explicits[1], replacements[1] + "[*]", 1)
        if len(all_explicits) != 2:
            warnings.warn("\n" + explicit_SMILES + ": " + "Too many OR little explicits found. Is there a radical in the monomer, or is this an end group?" +
                          " Is " + "\n" + PSMILES + " the right replacement?")
    elif chain_pos == "end":
        PSMILES = PSMILES.replace(all_explicits[0], "[*]" + replacements[0], 1)
        if len(all_explicits) != 1:
            warnings.warn("\n" + explicit_SMILES + ": " + "None or too many explicits found. Is there a radical in the end group, or is it a monomer?" +
                          " Is " + "\n" + PSMILES + " the right replacement?")

    return PSMILES
# Demo call: this input contains four bracketed atoms ([CH2], [C], [N+], [O-]), so the
# "monomer" branch converts the first two and emits its explicit-count warning.
convert_explicit_SMILES_to_PSMILES("[CH2][C](C)(C(=O)OCC[N+](C)(C)CCCS(=O)(=O)[O-]","monomer")

In [None]:
# Collect the SMILES columns and convert them to PSMILES. End-group columns are
# converted with chain_pos="end" (one [*]), repeating-unit columns with
# chain_pos="monomer" (two [*]).
# end_group_headers = [header for header in raw_df.columns if "end" and "group" in header]
end_group_headers = ["SMILES_start", "SMILES_end"]
# monomer_headers = [header for header in raw_df.columns if "SMILES_repeating" in header]
monomer_headers = [header for header in raw_df.columns if "SMILES_rep" in header]
print(end_group_headers, monomer_headers)

# NOTE: the columns are overwritten in place, so running this cell a second time
# feeds already-converted strings back into the converter. Reload raw_df first.
# Non-string cells (e.g. NaN for missing entries) are passed through untouched;
# isinstance also accepts str subclasses, which `type(x) == str` rejected.
for end_group in end_group_headers:
    raw_df[end_group] = raw_df[end_group].apply(
        lambda x: convert_explicit_SMILES_to_PSMILES(x, "end") if isinstance(x, str) else x
    )
for monomer in monomer_headers:
    raw_df[monomer] = raw_df[monomer].apply(
        lambda x: convert_explicit_SMILES_to_PSMILES(x, "monomer") if isinstance(x, str) else x
    )
raw_df

In [None]:
# Write the converted table to disk, using the same ";" separator and "," decimal
# mark as the input and omitting the index column.
# Alternative target kept for reference:
# raw_df.to_csv("./PSMILES_converted_cloud_point_data.csv", sep=";", decimal=",", index=False)
raw_df.to_csv(
    "./PSMILES_converted_tg_temp_data.csv",
    sep=";",
    decimal=",",
    index=False,
)

