In [1]:
# Explanatory notebook to convert this database with specific (radical) repetition units and end groups to the PSMILES of the Ramprasad-Group
import os
import pandas as pd
import re
import warnings

In [2]:
# RAW_CSV_PATH = os.path.join("../" + "cloud_point" + "./cloud_points_data.csv")
RAW_CSV_PATH = "./glass_transition_temperature_data.csv"

raw_df = pd.read_csv(RAW_CSV_PATH, sep=";", decimal=",")
raw_df

Unnamed: 0,reference,identifier,polymer_type,polymer_type_style,polymerization_type,SMILES_start,SMILES_end,SMILES_rep_u1,molpercent_rep_u1,SMILES_rep_u2,...,add2,add2_percent,Glass_temperature,2nd_Glass_temperature,tg_def_type,Decomposition temp,td_def_type,Unnamed: 29,Comment,rating
0,10.3390/polym14030361,TSU-075-A,Homo,,RAFT,[C](C)(C)(C#N),[S]C(=S)c1ccccc1,[CH2][C](C)(C(=O)OC),1.00,,...,,,90.9,,C,,,,,
1,10.3390/polym14030361,TSU-075-AB,Co,,RAFT,[C](C)(C)(C#N),[S]C(=S)c1ccccc1,[CH2][C](C)(C(=O)OC),0.50,[CH2][C](C)(C(=O)OCC),...,,,72.2,,C,,,,,
2,10.3390/polym14030361,TSU-075-AC,Co,,RAFT,[C](C)(C)(C#N),[S]C(=S)c1ccccc1,[CH2][C](C)(C(=O)OC),0.50,[CH2][C](C)(C(=O)OCCO),...,,,106.2,,C,,,,,
3,10.3390/polym14030361,TSU-075-AD,Co,,RAFT,[C](C)(C)(C#N),[S]C(=S)c1ccccc1,[CH2][C](C)(C(=O)OC),0.50,[CH2][C](C)(C(=O)OCCC),...,,,72.4,,C,,,,,
4,10.3390/polym14030361,TSU-075-AE,Co,,RAFT,[C](C)(C)(C#N),[S]C(=S)c1ccccc1,[CH2][C](C)(C(=O)OC),0.50,[CH2][C](C)(C(=O)OC(C)C),...,,,76.4,,C,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440,10.1039/C9PY00875F,P5 - LLa-EtGly,Co,random,ROP,c1ccccc1CO,[H],[C](=O)C(C)OC(=O)C(C)[O],0.89,[C](=O)COC(=O)C(CC)[O],...,,,52.0,,C,,,,,
441,10.1039/C9PY00875F,P6 - LLa-EtGly,Co,random,ROP,c1ccccc1CO,[H],[C](=O)C(C)OC(=O)C(C)[O],0.78,[C](=O)COC(=O)C(CC)[O],...,,,48.0,,C,,,,,
442,10.1039/C9PY00875F,P7 - DLa-EtGly,Co,random,ROP,c1ccccc1CO,[H],[C](=O)C(C)OC(=O)C(C)[O],0.96,[C](=O)COC(=O)C(CC)[O],...,,,53.0,,C,,,,,
443,10.1039/C9PY00875F,P8 - DLa-EtGly,Co,random,ROP,c1ccccc1CO,[H],[C](=O)C(C)OC(=O)C(C)[O],0.91,[C](=O)COC(=O)C(CC)[O],...,,,51.0,,C,,,,,


In [3]:
# transform every monomers  "[atoms and numbers]" string-part to "*atoms and numbers" e.g.:
# [CH2][CH](C(=O)NC1CCCCCC1) -> [*]CC[*](C(=O)NC1CCCCCC1)
# therefore all "[content]" must be searched then
    # the first must be replaced with "[*]content-H and following numbers"
    # the second must be replaced with "content-h and following numbers[*]"
def convert_explicit_SMILES_to_PSMILES(explicit_SMILES: str, chain_pos: str = "monomer"):
    all_explicits = re.findall(r"\[[^\]\*]*\]", explicit_SMILES)
    replacements = [radical.replace("[", "").replace("]", "") for radical in all_explicits]
    replacements = [re.sub("H[0-9]?", "", radical) for radical in replacements]
    if len(all_explicits) == 0:
        return explicit_SMILES
    PSMILES = explicit_SMILES
    match chain_pos:
        case "monomer":
            replacements[0] = "[*]" + replacements[0]
            replacements[1] = replacements[1] + "[*]"
            PSMILES = PSMILES.replace(all_explicits[0], replacements[0], 1)
            PSMILES = PSMILES.replace(all_explicits[1], replacements[1], 1)
            if len(all_explicits) != 2:
                warnings.warn("\n" + explicit_SMILES + ": " + "Too many OR little explicits found. Is there a radical in the monomer, or is this an end group?" +
                              " Is " + "\n" + PSMILES + " the right replacement?")
        case "end":
            replacements[0] = "[*]" + replacements[0]
            PSMILES = PSMILES.replace(all_explicits[0], replacements[0], 1)
            if len(all_explicits) != 1:
                warnings.warn("\n" + explicit_SMILES + ": " + "None or too many explicits found. Is there a radical in the end group, or is it a monomer?" +
                              " Is " + "\n" + PSMILES + " the right replacement?")

    return PSMILES
convert_explicit_SMILES_to_PSMILES("[CH2][C](C)(C(=O)OCC[N+](C)(C)CCCS(=O)(=O)[O-]","monomer")

[CH2][C](C)(C(=O)OCC[N+](C)(C)CCCS(=O)(=O)[O-]: Too many OR little explicits found. Is there a radical in the monomer, or is this an end group? Is 
[*]CC[*](C)(C(=O)OCC[N+](C)(C)CCCS(=O)(=O)[O-] the right replacement?


'[*]CC[*](C)(C(=O)OCC[N+](C)(C)CCCS(=O)(=O)[O-]'

In [6]:
# get all headers with SMILES and convert them to PSMILES, also split between SMILES + repeating and none
# end_group_headers = [header for header in raw_df.columns if "end" and "group" in header]
end_group_headers = ["SMILES_start", "SMILES_end"]
# monomer_headers = [header for header in raw_df.columns if "SMILES_repeating" in header]
monomer_headers = [header for header in raw_df.columns if "SMILES_rep" in header]
print(end_group_headers, monomer_headers)
for end_group in end_group_headers:
    raw_df[end_group] = raw_df[end_group].apply(lambda x: convert_explicit_SMILES_to_PSMILES(x, "end") if type(x) == str else x)
for monomer in monomer_headers:
    raw_df[monomer] = raw_df[monomer].apply(lambda x: convert_explicit_SMILES_to_PSMILES(x, "monomer") if type(x) == str else x)
raw_df

['SMILES_start', 'SMILES_end'] ['SMILES_rep_u1', 'SMILES_rep_u2', 'SMILES_rep_u3']


[CH2][C](C)(C(=O)OC1C[C@H]2CC[C@]1(C)C2(C)C): Too many OR little explicits found. Is there a radical in the monomer, or is this an end group? Is 
[*]CC[*](C)(C(=O)OC1C[C@H]2CC[C@]1(C)C2(C)C) the right replacement?
[CH2][CH](C(=O)OC1C[C@H]2CC[C@]1(C)C2(C)C): Too many OR little explicits found. Is there a radical in the monomer, or is this an end group? Is 
[*]CC[*](C(=O)OC1C[C@H]2CC[C@]1(C)C2(C)C) the right replacement?
[CH2][CH](c1ccc([Cl])cc1): Too many OR little explicits found. Is there a radical in the monomer, or is this an end group? Is 
[*]CC[*](c1ccc([Cl])cc1) the right replacement?


Unnamed: 0,reference,identifier,polymer_type,polymer_type_style,polymerization_type,SMILES_start,SMILES_end,SMILES_rep_u1,molpercent_rep_u1,SMILES_rep_u2,...,add2,add2_percent,Glass_temperature,2nd_Glass_temperature,tg_def_type,Decomposition temp,td_def_type,Unnamed: 29,Comment,rating
0,10.3390/polym14030361,TSU-075-A,Homo,,RAFT,[*]C(C)(C)(C#N),[*]SC(=S)c1ccccc1,[*]CC[*](C)(C(=O)OC),1.00,,...,,,90.9,,C,,,,,
1,10.3390/polym14030361,TSU-075-AB,Co,,RAFT,[*]C(C)(C)(C#N),[*]SC(=S)c1ccccc1,[*]CC[*](C)(C(=O)OC),0.50,[*]CC[*](C)(C(=O)OCC),...,,,72.2,,C,,,,,
2,10.3390/polym14030361,TSU-075-AC,Co,,RAFT,[*]C(C)(C)(C#N),[*]SC(=S)c1ccccc1,[*]CC[*](C)(C(=O)OC),0.50,[*]CC[*](C)(C(=O)OCCO),...,,,106.2,,C,,,,,
3,10.3390/polym14030361,TSU-075-AD,Co,,RAFT,[*]C(C)(C)(C#N),[*]SC(=S)c1ccccc1,[*]CC[*](C)(C(=O)OC),0.50,[*]CC[*](C)(C(=O)OCCC),...,,,72.4,,C,,,,,
4,10.3390/polym14030361,TSU-075-AE,Co,,RAFT,[*]C(C)(C)(C#N),[*]SC(=S)c1ccccc1,[*]CC[*](C)(C(=O)OC),0.50,[*]CC[*](C)(C(=O)OC(C)C),...,,,76.4,,C,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440,10.1039/C9PY00875F,P5 - LLa-EtGly,Co,random,ROP,c1ccccc1CO,[*],[*]C(=O)C(C)OC(=O)C(C)O[*],0.89,[*]C(=O)COC(=O)C(CC)O[*],...,,,52.0,,C,,,,,
441,10.1039/C9PY00875F,P6 - LLa-EtGly,Co,random,ROP,c1ccccc1CO,[*],[*]C(=O)C(C)OC(=O)C(C)O[*],0.78,[*]C(=O)COC(=O)C(CC)O[*],...,,,48.0,,C,,,,,
442,10.1039/C9PY00875F,P7 - DLa-EtGly,Co,random,ROP,c1ccccc1CO,[*],[*]C(=O)C(C)OC(=O)C(C)O[*],0.96,[*]C(=O)COC(=O)C(CC)O[*],...,,,53.0,,C,,,,,
443,10.1039/C9PY00875F,P8 - DLa-EtGly,Co,random,ROP,c1ccccc1CO,[*],[*]C(=O)C(C)OC(=O)C(C)O[*],0.91,[*]C(=O)COC(=O)C(CC)O[*],...,,,51.0,,C,,,,,


In [7]:
# export converted table
# pd.DataFrame.to_csv(raw_df, "./PSMILES_converted_cloud_point_data.csv", sep=";", decimal=",", index=False)
pd.DataFrame.to_csv(raw_df, "./PSMILES_converted_tg_temp_data.csv", sep=";", decimal=",", index=False)

