In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Runs on mdu kernel 

import datasets
import os
import yaml
import pandas as pd
from tqdm import tqdm
from datasets import concatenate_datasets
from PIL import Image, ImageDraw, ImageOps
from rdkit import Chem 

from docmarker.text_generation import DescriptionGenerator
from docmarker.image_text_merging import ImageTextMerger
from mol_depict_cdk.cxsmiles_tokenizer import CXSMILESTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### Convert abbreviation list

In [27]:
abbreviations = pd.read_csv(os.getcwd() + "/../../data/abbreviations_2.csv")
max_abbreviation_index = 150

# Collect, for the rows whose index label is at most `max_abbreviation_index`,
# every abbreviation whose SMILES contains at least one "*" character.
# The number of "*" characters is stored as the entry's valence.
valences = []
smiles_list = []
abbreviations_list = []
for row_index, entry in abbreviations.iterrows():
    # Rows past the cut-off are ignored without being inspected.
    if row_index <= max_abbreviation_index:
        attachment_count = entry["smiles"].count("*")
        # Entries without any "*" are left out of the mapping.
        if attachment_count != 0:
            valences.append(attachment_count)
            smiles_list.append(entry["smiles"])
            abbreviations_list.append(entry["abbreviation"])

#### SMILES

In [28]:
# Write the abbreviation SMILES next to their valences as a two-column CSV
# ("value", "valence"), without the dataframe index.
d = pd.DataFrame({"value": smiles_list, "valence": valences})
d.to_csv(
    os.getcwd() + "/../../data/text_mappings/s_component_abbreviation_smiles.csv",
    index=False,
)

#### Names

In [29]:
# Write the abbreviation labels next to their valences as a two-column CSV
# ("value", "valence"), without the dataframe index.
d = pd.DataFrame({"value": abbreviations_list, "valence": valences})
d.to_csv(
    os.getcwd() + "/../../data/text_mappings/s_component_abbreviation_name.csv",
    index=False,
)

In [38]:
# Note for ingmar (now fixed)
# Print every abbreviation whose SMILES does not contain exactly one or two
# "*" characters, together with its position in the `abbreviations` dataframe.
# The count is recomputed per row: the `valences` list is built from a filtered
# subset of the rows, so zipping it with the full dataframe columns would pair
# each abbreviation with the valence of a different row (and stop early).
for i, (abb, smi) in enumerate(zip(abbreviations["abbreviation"], abbreviations["smiles"])):
    v = smi.count("*")
    if v not in (1, 2):
        print(i, abb, smi)

1680 COO-t-Bu C(=O)OC(C)(C)C
1943 CO2-t-Bu C(=O)OC(C)(C)C
2032 (CH2)4COOH CCCCC(=O)O
2162 NHCO2Bn NC(=O)OCC1=CC=CC=C1
2207 COOC4H9(n) C(=O)OCCCC
2366 CH2CCl3 CC(Cl)(Cl)Cl


### Convert functional group list

In [23]:
def count_terminal_carbons(molecule):
    """Count the carbon atoms of `molecule` that have exactly one neighbor.

    Args:
        molecule: object exposing `GetAtoms()`; each atom must expose
            `GetAtomicNum()` and `GetNeighbors()`.

    Returns:
        int: number of atoms with atomic number 6 and a single neighbor.
    """
    return sum(
        1
        for candidate in molecule.GetAtoms()
        # Atomic number is checked first so neighbors are only queried for carbons.
        if candidate.GetAtomicNum() == 6 and len(candidate.GetNeighbors()) == 1
    )

In [30]:
functional_groups = pd.read_csv(os.getcwd() + "/../../data/functional_groups.csv")
max_functional_group_index = 250

# For the rows whose index label is at most `max_functional_group_index`,
# record the SMILES, a display name (underscores replaced by spaces) and a
# valence defined as the number of terminal carbons in the parsed molecule.
valences = []
substructures = []
smiles_list = []
for row_index, entry in functional_groups.iterrows():
    if row_index <= max_functional_group_index:
        # NOTE(review): Chem.MolFromSmiles presumably yields None for an
        # unparsable SMILES, which would make count_terminal_carbons raise --
        # confirm the input file only holds valid SMILES.
        parsed = Chem.MolFromSmiles(entry["SMILES"])
        valences.append(count_terminal_carbons(parsed))

        substructures.append(entry["Substructure"].replace("_", " "))
        smiles_list.append(entry["SMILES"])

#### SMILES

In [31]:
# Write the functional-group SMILES next to their valences as a two-column CSV
# ("value", "valence"), without the dataframe index.
d = pd.DataFrame({"value": smiles_list, "valence": valences})
d.to_csv(
    os.getcwd() + "/../../data/text_mappings/s_component_functional_group_smiles.csv",
    index=False,
)

#### Names

In [32]:
# Write the functional-group names next to their valences as a two-column CSV
# ("value", "valence"), without the dataframe index.
d = pd.DataFrame({"value": substructures, "valence": valences})
d.to_csv(
    os.getcwd() + "/../../data/text_mappings/s_component_functional_group_name.csv",
    index=False,
)

### Create atom list

In [5]:
def get_hydrogen_variants(atom_symbol, max_valence):
    """Enumerate textual hydrogen variants of an atom and their remaining valences.

    For every hydrogen count from 0 up to `max_valence - 1`:
      * 0 hydrogens -> the bare symbol (e.g. "C");
      * 1 hydrogen  -> both "H<symbol>" and "<symbol>H";
      * n hydrogens -> both "H<n><symbol>" and "<symbol>H<n>".
    The fully saturated form (hydrogen count equal to `max_valence`) is not
    generated.

    Args:
        atom_symbol: element symbol used to build the labels.
        max_valence: valence of the bare atom; values below 1 give empty lists.

    Returns:
        tuple[list, list]: the labels and, aligned with them, the remaining
        valence `max_valence - hydrogen_count` of each label.
    """
    labels = []
    remaining_valences = []
    for h_count in range(max_valence):
        if h_count == 0:
            forms = [atom_symbol]
        else:
            # A single hydrogen is written without a count ("H", not "H1").
            hydrogens = "H" if h_count == 1 else f"H{h_count}"
            forms = [f"{hydrogens}{atom_symbol}", f"{atom_symbol}{hydrogens}"]
        labels.extend(forms)
        remaining_valences.extend([max_valence - h_count] * len(forms))
    return labels, remaining_valences

#### SMILES

In [6]:
periodic_table = Chem.GetPeriodicTable()
max_atomic_number = 50

# Build the atom vocabulary for atomic numbers 1..max_atomic_number: each
# element contributes its symbol plus its hydrogen variants, paired with the
# valence left over after the hydrogens.
atoms = []
valences = []
for atomic_number in range(1, max_atomic_number + 1):
    # The default valence is looked up once per element and reused below.
    max_valence = periodic_table.GetDefaultValence(atomic_number)
    # Elements whose default valence is below 1 are skipped.
    if max_valence < 1:
        continue

    # Add atom symbol and hydrogen variants
    atom_symbol = periodic_table.GetElementSymbol(atomic_number)
    atom_variants, atom_variants_valences = get_hydrogen_variants(atom_symbol, max_valence)
    atoms.extend(atom_variants)
    valences.extend(atom_variants_valences)

d = pd.DataFrame({
    "value": atoms,
    "valence": valences
})
d.to_csv(os.getcwd() + "/../../data/text_mappings/s_component_atom_smiles.csv", index=False)

#### Names

In [37]:
periodic_table = Chem.GetPeriodicTable()
max_atomic_number = 50

# Build the element-name vocabulary for atomic numbers 1..max_atomic_number,
# pairing each name with the largest entry of the element's valence list.
# NOTE(review): this cell uses max(GetValenceList(...)) whereas the atom SMILES
# cell uses GetDefaultValence(...) -- confirm the difference is intended.
atoms = []
valences = []
for atomic_number in range(1, max_atomic_number + 1):
    max_valence = max(periodic_table.GetValenceList(atomic_number))
    # Only elements with a maximum valence of at least 1 are kept.
    if max_valence >= 1:
        # Add atom name
        atoms.append(periodic_table.GetElementName(atomic_number))
        valences.append(max_valence)

d = pd.DataFrame({"value": atoms, "valence": valences})
d.to_csv(
    os.getcwd() + "/../../data/text_mappings/s_component_atom_name.csv",
    index=False,
)