In [1]:
import pandas as pd
import numpy as np
import yaml
import os
import sys
sys.path.append('src/')
import utils

# Load the new dataframe

In [2]:
with open('config/config.yaml') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

In [3]:
df = pd.read_csv('Data/training_data_bccdc.csv')

In [4]:
df.head()

Unnamed: 0,Compound,Workflow,Associated Target Peak,MS Order,Precursor m/z,Product m/z,m/z,Height Threshold,Area Threshold,Collision Energy,...,Target Ratio,Window Type,PeakPolarity,Adduct,Charge State,Retention Time,Retention Time Window,Integration Strategy,PTC Confirmed RT,SMILES
0,Ketamine,TargetPeak,,ms1,238.09932,238.09932,238.09932,5000,5000,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,3.99,CNC1(CCCCC1=O)C2=CC=CC=C2Cl
1,Ketamine,Fragment,1.0,ms2,238.09932,125.01541,125.01541,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,3.99,CNC1(CCCCC1=O)C2=CC=CC=C2Cl
2,Ketamine,Fragment,1.0,ms2,238.09932,179.06221,179.06221,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,3.99,CNC1(CCCCC1=O)C2=CC=CC=C2Cl
3,Ketamine,Fragment,1.0,ms2,238.09932,207.05737,207.05737,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,3.99,CNC1(CCCCC1=O)C2=CC=CC=C2Cl
4,Ketamine,Fragment,1.0,ms2,238.09932,220.08827,220.08827,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,3.99,CNC1(CCCCC1=O)C2=CC=CC=C2Cl


In [5]:
df['SMILES'].isnull().sum()

5441

In [6]:
df.shape

(10053, 25)

In [7]:
5441/10053

0.541231473192082

There are 5,441 missing entries accounting for 54% of the missing data

# Investigate the Samples we can perform the ML on

In [10]:
df = df[df['PTC Confirmed RT'].notnull()]

In [11]:
df.shape

(866, 25)

In [12]:
df['SMILES'].isnull().sum()

286

In [13]:
286/866

0.3302540415704388

In [14]:
unique_df = df.drop_duplicates(subset=['Compound'])

In [15]:
unique_df.shape

(170, 25)

In [16]:
unique_df['SMILES'].isnull().sum()

56

In [17]:
56/170

0.32941176470588235

In [18]:
missing_smiles_df = unique_df[unique_df['SMILES'].isnull()]

In [19]:
missing_smiles_df.head()

Unnamed: 0,Compound,Workflow,Associated Target Peak,MS Order,Precursor m/z,Product m/z,m/z,Height Threshold,Area Threshold,Collision Energy,...,Target Ratio,Window Type,PeakPolarity,Adduct,Charge State,Retention Time,Retention Time Window,Integration Strategy,PTC Confirmed RT,SMILES
68,10hydroxycarbazepine,TargetPeak,,ms1,255.1128,255.1128,255.1128,5000,5000,0,...,,eAbsolute,Positive,M+H,1.0,4.71,30.0,Individual,4.9,
370,6Monoacetylmorphine,TargetPeak,,ms1,328.15433,328.15433,328.15433,5000,5000,0,...,,eAbsolute,Positive,M+H,1.0,3.26,30.0,Individual,3.58,
395,7Aminoclonazepam,TargetPeak,,ms1,286.07417,286.07417,286.07417,5000,5000,0,...,,eAbsolute,Positive,M+H,1.0,4.06,30.0,Individual,4.28,
402,7Aminoflunitrazepam,TargetPeak,,ms1,284.11937,284.11937,284.11937,5000,5000,0,...,,eAbsolute,Positive,M+H,1.0,4.54,30.0,Individual,4.69,
410,9Hydroxyrisperidone,TargetPeak,,ms1,427.214,427.214,427.214,5000,5000,0,...,,eAbsolute,Positive,M+H,1.0,4.46,30.0,Individual,4.63,


In [20]:
missing_smiles_df.to_csv('Data/check.csv', index=False)

In [22]:
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def get_smiles_with_retry(compound_name):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound_name}/property/CanonicalSMILES/TXT"
    session = requests.Session()
    retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    try:
        response = session.get(url)
        response.raise_for_status()
        smiles = response.text.strip()
        if smiles:
            return smiles
        else:
            print(f"No SMILES data found for compound: {compound_name}")
            return None
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred for {compound_name}: {http_err}")
    except Exception as err:
        print(f"An error occurred for {compound_name}: {err}")
    return None

# Experimenting with smiles2vec

In [36]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

class Smiles2Vec:
    def __init__(self):
        self.vocab = [
            '(', ')', '-', '.', '1', '2', '3', '4', '5', '6', '7', '8', '9', '=', '@', 'B', 'C', 'F', 'H', 'I', 'N', 
            'O', 'P', 'S', '[', ']', 'a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'l', 'n', 'o', 'p', 'r', 's', 't', 'u'
        ]
        self.tokenizer = Tokenizer(char_level=True)
        self.tokenizer.fit_on_texts(self.vocab)
        self.index_to_char = {index: char for char, index in self.tokenizer.word_index.items()}
    
    def sentence_to_vec(self, sentence):
        # Convert a sentence into a list of character indices
        return self.tokenizer.texts_to_sequences([sentence])[0]

def encode_smiles(smiles_list):
    """
    Encode a list of SMILES strings into feature vectors using a custom Smiles2Vec class.

    Args:
    smiles_list (list of str): List of SMILES strings.

    Returns:
    pd.DataFrame: DataFrame with SMILES encoded into vectors with meaningful feature names.
    """
    # Initialize the Smiles2Vec encoder
    s2v = Smiles2Vec()

    # Convert the list of SMILES strings into feature vectors
    smiles_features = [s2v.sentence_to_vec(smiles) for smiles in smiles_list]

    # Pad sequences to ensure uniform length
    maxlen = max(len(smiles) for smiles in smiles_list)  # Adjust maxlen based on the longest SMILES string
    smiles_features_padded = pad_sequences(smiles_features, maxlen=maxlen, padding='post')
    
    # Create feature names based on the SMILES vocabulary and the compound it is encoding
    feature_names = []
    for i, smiles in enumerate(smiles_list):
        compound_name = f'Compound_{i+1}'
        for j in range(maxlen):
            feature_names.append(f'{compound_name}_Character_{j+1}')
    
    # Convert to DataFrame
    df_features = pd.DataFrame(smiles_features_padded, columns=feature_names[:smiles_features_padded.shape[1]])
    
    return df_features

# Example usage with a DataFrame
if __name__ == "__main__":
    # Sample data
    data = {
        'SMILES': [
            'CCO',
            'CCN(CC)CC',
            'CC(C)C(=O)O',
            'CC(C)O',
            'CC(=O)O',
            # Add more SMILES strings as needed
        ]
    }
    
    df = pd.DataFrame(data)
    
    # Encode the SMILES column
    smiles_encoded_df = encode_smiles(df['SMILES'].tolist())
    
    # Combine the original SMILES column with the encoded features
    df_combined = pd.concat([df, smiles_encoded_df], axis=1)

    print(df_combined)


        SMILES  Compound_1_Character_1  Compound_1_Character_2  \
0          CCO                       2                       2   
1    CCN(CC)CC                       2                       2   
2  CC(C)C(=O)O                       2                       2   
3       CC(C)O                       2                       2   
4      CC(=O)O                       2                       2   

   Compound_1_Character_3  Compound_1_Character_4  Compound_1_Character_5  \
0                       6                       0                       0   
1                       5                       9                       2   
2                       9                       2                      10   
3                       9                       2                      10   
4                       9                      22                       6   

   Compound_1_Character_6  Compound_1_Character_7  Compound_1_Character_8  \
0                       0                       0              

# Using a linkage

In [8]:
linked_df = pd.read_excel('Data/Modeling Data.xlsx')

In [9]:
linked_df.head()

Unnamed: 0,LAB,Compound,RT,DrugClass,InChIKey,InChIKeyShort,SMILES,logD,logP,nO,nC
0,Aarhus,1B-LSD,5.55,Indolalkylamines,SVRFNPSJPIDUBC-DYESRHJHSA-N,SVRFNPSJPIDUBC,C(CCC)(=O)N1C=C2C[C@H]3N(C[C@@H](C=C3C=3C=CC=C...,-0.706483,3.8197,2,24
1,Aarhus,1V-LSD,5.69,Indolalkylamines,GIIBVGJWUZNECE-XMSQKQJNSA-N,GIIBVGJWUZNECE,C(C)N(C(=O)[C@H]1CN([C@@H]2CC3=CN(C4=CC=CC(C2=...,-0.261915,4.2098,2,25
2,Aarhus,1cP-LSD,5.38,Indolalkylamines,RAFUPYYDHPFASC-DYESRHJHSA-N,RAFUPYYDHPFASC,CN1[C@](C2=C[C@@H](C(N(CC)CC)=O)C1)([H])CC3=CN...,-1.071751,3.4296,2,24
3,Aarhus,1p-LSD,5.21,Indolalkylamines,JSMQOVGXBIDBIE-OXQOHEQNSA-N,JSMQOVGXBIDBIE,O=C([C@H](C=C12)CN(C)[C@]2([H])CC3=CN(C(CC)=O)...,-1.151055,3.4296,2,23
4,Aarhus,2-CMC,2.67,Cathinones,UHVGPEBZWPKWNT-UHFFFAOYSA-N,UHVGPEBZWPKWNT,CNC(C(=O)c1ccccc1Cl)C,-1.020239,2.1306,1,10


We will need to clean the compound the same way

In [10]:
linked_df = linked_df[['Compound', 'SMILES']]

In [12]:
linked_df.head()

Unnamed: 0,Compound,SMILES
0,1B-LSD,C(CCC)(=O)N1C=C2C[C@H]3N(C[C@@H](C=C3C=3C=CC=C...
1,1V-LSD,C(C)N(C(=O)[C@H]1CN([C@@H]2CC3=CN(C4=CC=CC(C2=...
2,1cP-LSD,CN1[C@](C2=C[C@@H](C(N(CC)CC)=O)C1)([H])CC3=CN...
3,1p-LSD,O=C([C@H](C=C12)CN(C)[C@]2([H])CC3=CN(C(CC)=O)...
4,2-CMC,CNC(C(=O)c1ccccc1Cl)C


In [15]:
linked_df = utils.clean_data(linked_df)

In [16]:
linked_df.head()

Unnamed: 0,Compound,SMILES
0,1BLSD,C(CCC)(=O)N1C=C2C[C@H]3N(C[C@@H](C=C3C=3C=CC=C...
1,1VLSD,C(C)N(C(=O)[C@H]1CN([C@@H]2CC3=CN(C4=CC=CC(C2=...
2,1cPLSD,CN1[C@](C2=C[C@@H](C(N(CC)CC)=O)C1)([H])CC3=CN...
3,1pLSD,O=C([C@H](C=C12)CN(C)[C@]2([H])CC3=CN(C(CC)=O)...
4,2CMC,CNC(C(=O)c1ccccc1Cl)C


In [30]:
merged_df = pd.merge(df, linked_df, on=['Compound','SMILES'], how='left')

In [31]:
merged_df.head()

Unnamed: 0,Compound,Workflow,Associated Target Peak,MS Order,Precursor m/z,Product m/z,m/z,Height Threshold,Area Threshold,Collision Energy,...,Target Ratio,Window Type,PeakPolarity,Adduct,Charge State,Retention Time,Retention Time Window,Integration Strategy,PTC Confirmed RT,SMILES
0,Ketamine,TargetPeak,,ms1,238.09932,238.09932,238.09932,5000,5000,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,3.99,CNC1(CCCCC1=O)C2=CC=CC=C2Cl
1,Ketamine,Fragment,1.0,ms2,238.09932,125.01541,125.01541,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,3.99,CNC1(CCCCC1=O)C2=CC=CC=C2Cl
2,Ketamine,Fragment,1.0,ms2,238.09932,179.06221,179.06221,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,3.99,CNC1(CCCCC1=O)C2=CC=CC=C2Cl
3,Ketamine,Fragment,1.0,ms2,238.09932,207.05737,207.05737,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,3.99,CNC1(CCCCC1=O)C2=CC=CC=C2Cl
4,Ketamine,Fragment,1.0,ms2,238.09932,220.08827,220.08827,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,3.99,CNC1(CCCCC1=O)C2=CC=CC=C2Cl


In [32]:
merged_df['SMILES'].isnull().sum()

5441

In [33]:
merged_df.shape

(10152, 25)

In [34]:
linked_df['Compound']

0              1BLSD
1              1VLSD
2             1cPLSD
3              1pLSD
4               2CMC
            ...     
4765       Pyrazolam
4766    Pyrovalerone
4767         Ro54864
4768          STS135
4769       Tofisopam
Name: Compound, Length: 4770, dtype: object

In [35]:
df['Compound']

0        Ketamine
1        Ketamine
2        Ketamine
3        Ketamine
4        Ketamine
           ...   
10048         NaN
10049         NaN
10050         NaN
10051         NaN
10052         NaN
Name: Compound, Length: 10053, dtype: object

In [36]:
df['Compound'].isnull().sum()

1742

In [37]:
missing_compounds = df[df['Compound'].isnull()]

In [38]:
missing_compounds.head()

Unnamed: 0,Compound,Workflow,Associated Target Peak,MS Order,Precursor m/z,Product m/z,m/z,Height Threshold,Area Threshold,Collision Energy,...,Target Ratio,Window Type,PeakPolarity,Adduct,Charge State,Retention Time,Retention Time Window,Integration Strategy,PTC Confirmed RT,SMILES
8311,,,,,,,,,,,...,,,,,,,,,,
8312,,Compound Formula,Cas Number,Category,Compound Type,Internal Standard Concentration,ISTD Protein Name,ISTD Compound Name,Ionization Field,Compound Group,...,,,,,,,,,,
8313,,C13H16ClNO,,,eTargetCompound,0,,,ESI,,...,,,,,,,,,,
8314,,C16H21NO3,,,eTargetCompound,0,,,ESI,,...,,,,,,,,,,
8315,,C11H16N2O,,,eTargetCompound,0,,,ESI,,...,,,,,,,,,,


Note that if you review the data this is genuinely missing - need to ask about it

In [39]:
df.dropna(subset=['Compound'], inplace=True)

In [41]:
df.shape

(8311, 25)

In [42]:
df['SMILES'].isnull().sum()

3699

In [43]:
3699/8311

0.44507279509084346

In [45]:
len(linked_df['Compound'].unique())

2434

In [46]:
len(df['Compound'].unique())

1739

In [47]:
no_smiles_df = df[df['SMILES'].isnull()]

In [48]:
no_smiles_df.head()

Unnamed: 0,Compound,Workflow,Associated Target Peak,MS Order,Precursor m/z,Product m/z,m/z,Height Threshold,Area Threshold,Collision Energy,...,Target Ratio,Window Type,PeakPolarity,Adduct,Charge State,Retention Time,Retention Time Window,Integration Strategy,PTC Confirmed RT,SMILES
6,34Methylene dioxy pyrovalerone,TargetPeak,,ms1,276.15942,276.15942,276.15942,5000,5000,0,...,,eAbsolute,Positive,M+H,1.0,4.35,30.0,Individual,,
7,34Methylene dioxy pyrovalerone,Fragment,1.0,ms2,276.15942,126.12778,126.12778,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,4.35,30.0,Individual,,
8,34Methylene dioxy pyrovalerone,Fragment,1.0,ms2,276.15942,135.04404,135.04404,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,4.35,30.0,Individual,,
9,34Methylene dioxy pyrovalerone,Fragment,1.0,ms2,276.15942,175.07532,175.07532,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,4.35,30.0,Individual,,
10,34Methylene dioxy pyrovalerone,Fragment,1.0,ms2,276.15942,205.08583,205.08583,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,4.35,30.0,Individual,,


In [49]:
len(no_smiles_df['Compound'].unique())

841

In [50]:
import pubchempy as pcp

# Original chemical name
compound_name = '3,4-Methylenedioxy-N-methylamphetamine'

# Search compound by name
compound = pcp.get_compounds(compound_name, 'name')

if compound:
    smiles = compound[0].canonical_smiles
    print(f"SMILES for {compound_name}: {smiles}")
else:
    print(f"Compound {compound_name} not found.")

SMILES for 3,4-Methylenedioxy-N-methylamphetamine: CC(CC1=CC2=C(C=C1)OCO2)NC


In [58]:
other_df = pd.read_csv('libraries/Training_Database_with_fbs_rt.csv', skiprows=5)

In [59]:
other_df.head()

Unnamed: 0,Compound Name,Workflow,Associated Target Peak,MS Order,Precursor m/z,Product m/z,m/z,Height Threshold,Area Threshold,Collision Energy,...,Target Ratio,Window Type,PeakPolarity,Adduct,Charge State,Retention Time,Retention Time Window,Integration Strategy,Compound,PTC Confirmed RT
0,(-)-Ketamine,TargetPeak,,ms1,238.09932,238.09932,238.09932,5000,5000,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,Ketamine,3.99
1,(-)-Ketamine,Fragment,1.0,ms2,238.09932,125.01541,125.01541,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,Ketamine,3.99
2,(-)-Ketamine,Fragment,1.0,ms2,238.09932,179.06221,179.06221,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,Ketamine,3.99
3,(-)-Ketamine,Fragment,1.0,ms2,238.09932,207.05737,207.05737,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,Ketamine,3.99
4,(-)-Ketamine,Fragment,1.0,ms2,238.09932,220.08827,220.08827,5000,0,0,...,,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,Ketamine,3.99


In [78]:
import pickle

In [124]:
def compound_name_to_smiles(compound):
    """
    Parameters
    ----------
        compound : String
            Compound Name
    Returns
    -------
        smiles : String
            SMILEs in str format
    """
    def name_to_smiles(name):
        try:
            compound = pcp.get_compounds(name, 'name')
            if compound:
                return compound[0].canonical_smiles
            else:
                return None
        except Exception as e:
            return None

    if isinstance(compound, pd.Series):
        return compound.apply(name_to_smiles)
    elif isinstance(compound, str):
        return name_to_smiles(compound)
    else:
        raise ValueError("Input must be a string or a pandas Series.")

def add_smiles_to_data(data, compound_col = 'Compound'):
    """
    Parameters
    ----------
        data : pd.DataFrame
            Dataframe with a compound column (default: `Column`)
    Returns
    -------
        data : pd.DataFrame
            The same dataframe with a  `SMILES` column         
    """
    # Extract unique compound names
    unique_compounds = data[compound_col].unique()
    
    if os.path.exists(os.path.join('libraries', 'smiles_dict.pkl')):
        # If the pickle file exists, load the dictionary from the file
        with open(os.path.join('libraries', 'smiles_dict.pkl'), 'rb') as f:
            smiles_dict = pickle.load(f)
    else:
        # Apply the SMILES conversion function to unique compound names
        smiles_dict = {compound: compound_name_to_smiles(compound) for compound in unique_compounds}
    
        # Save the smiles_dict to a pickle file
        with open(os.path.join('libraries', 'smiles_dict.pkl'), 'wb') as f:
            pickle.dump(smiles_dict, f)
    # Map the SMILES strings back to the original DataFrame
    data['SMILES'] = data[compound_col].map(smiles_dict)
    
    return data

In [125]:
other_df = add_smiles_to_data(other_df, 'Compound')


ValueError: Input must be a string or a pandas Series.

In [109]:
other_df['SMILES'].isnull().sum()

0

In [110]:
other_df['Compound'].value_counts()

Compound
Ketamine                12
Metanephrine - H2O       6
Methiocarb               6
Methenolone              6
Methedrone               6
                        ..
Nor-delta9-THC-COOH      1
Nitrazepam-d5            1
7-Aminoclonazepam-d4     1
Ibuprofen                1
Benzylone                1
Name: count, Length: 1739, dtype: int64

In [111]:
other_df[['Compound', 'SMILES']].to_csv('Data/check.csv', index=False)

In [112]:
trial_df = pd.read_csv('Data/check.csv')

In [113]:
trial_df.head()

Unnamed: 0,Compound,SMILES
0,Ketamine,"{'K': '[K]', 'i': 'II', 'n': 'N#N'}"
1,Ketamine,"{'K': '[K]', 'i': 'II', 'n': 'N#N'}"
2,Ketamine,"{'K': '[K]', 'i': 'II', 'n': 'N#N'}"
3,Ketamine,"{'K': '[K]', 'i': 'II', 'n': 'N#N'}"
4,Ketamine,"{'K': '[K]', 'i': 'II', 'n': 'N#N'}"


In [105]:
other_df.head()

Unnamed: 0,Compound Name,Workflow,Associated Target Peak,MS Order,Precursor m/z,Product m/z,m/z,Height Threshold,Area Threshold,Collision Energy,...,Window Type,PeakPolarity,Adduct,Charge State,Retention Time,Retention Time Window,Integration Strategy,Compound,PTC Confirmed RT,SMILES
0,(-)-Ketamine,TargetPeak,,ms1,238.09932,238.09932,238.09932,5000,5000,0,...,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,Ketamine,3.99,"{'K': '[K]', 'i': 'II', 'n': 'N#N'}"
1,(-)-Ketamine,Fragment,1.0,ms2,238.09932,125.01541,125.01541,5000,0,0,...,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,Ketamine,3.99,"{'K': '[K]', 'i': 'II', 'n': 'N#N'}"
2,(-)-Ketamine,Fragment,1.0,ms2,238.09932,179.06221,179.06221,5000,0,0,...,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,Ketamine,3.99,"{'K': '[K]', 'i': 'II', 'n': 'N#N'}"
3,(-)-Ketamine,Fragment,1.0,ms2,238.09932,207.05737,207.05737,5000,0,0,...,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,Ketamine,3.99,"{'K': '[K]', 'i': 'II', 'n': 'N#N'}"
4,(-)-Ketamine,Fragment,1.0,ms2,238.09932,220.08827,220.08827,5000,0,0,...,eAbsolute,Positive,M+H,1.0,3.77,30.0,Individual,Ketamine,3.99,"{'K': '[K]', 'i': 'II', 'n': 'N#N'}"


In [88]:
trial_df['SMILES'].isnull().sum()

0

In [74]:
trial_df.shape

(1739, 2)

In [75]:
399/1739

0.22944220816561242

Looks like we have 22% missingness

In [76]:
import re

def format_for_pubchem(name):
    # Remove special characters and spaces
    formatted_name = re.sub(r'[^\w\s]', '', name)
    # Replace numbers at the beginning with words
    formatted_name = re.sub(r'\b(\d+)\b', lambda x: num2words(int(x.group(0))), formatted_name)
    # Capitalize each word and join with spaces
    formatted_name = ' '.join(word.capitalize() for word in formatted_name.split())
    return formatted_name