## Merge the curated PubChem and Inclusive Datasets

In [1]:
import numpy as np
import pandas as pd

In [2]:
inclusiveDB = pd.read_csv("uaa_data/inclusive_db.csv", sep=";")
pubchemDB = pd.read_csv("uaa_data/ncaa_pubchem.csv")

In [3]:
inclusiveDB.columns

Index(['ID', 'ncAA abbreviation(s) used in the publication',
       'ncAA name, as mentioned in the publication', 'ncAA IUPAC name',
       'ncAA SMILES notation', 'ncAA chemical formula', 'ncAA Structure',
       'ncAA link for Pubchem, if available',
       'Application, if provided in the publication',
       'Canonical amino acid most similar to the ncAA',
       'aaRS ID (abbr. organism, abbr. natural substrate, RS, mutations if any)',
       'aaRS origin organism full name',
       'Amino acid sequence of (mutated) aaRS',
       'tRNA ID (abbr. organism, tRNA, natural AA transported, anticodon)',
       'tRNA organism', 'tRNA sequence', 'Codon suppression',
       'Tested in (protein)', 'Tested in (protein position)',
       'Tested in (organism/in vitro)', 'Comment (if applicable)',
       'Publication', 'DOI', 'Publication year'],
      dtype='object')

In [4]:
pubchemDB.columns

Index(['QueryName', 'CID', 'Name', 'MolecularFormula', 'MolecularWeight',
       'IUPACName', 'CanonicalSMILES', 'InChIKey', 'PubChemURL'],
      dtype='object')

In [5]:
## Merge on IUPACName
merged_inclusive_pubchem = inclusiveDB.merge(pubchemDB, left_on = 'ncAA IUPAC name', right_on = 'IUPACName')
merged_inclusive_pubchem

Unnamed: 0,ID,ncAA abbreviation(s) used in the publication,"ncAA name, as mentioned in the publication",ncAA IUPAC name,ncAA SMILES notation,ncAA chemical formula,ncAA Structure,"ncAA link for Pubchem, if available","Application, if provided in the publication",Canonical amino acid most similar to the ncAA,...,Publication year,QueryName,CID,Name,MolecularFormula,MolecularWeight,IUPACName,CanonicalSMILES,InChIKey,PubChemURL
0,1,not available,O-methyl-l-tyrosine,(2S)-2-amino-3-(4-methoxyphenyl)propanoic acid,COC1=CC=C(C=C1)CC(C(=O)O)N,C10H13NO3,,https://pubchem.ncbi.nlm.nih.gov/compound/2723935,not specified,Tyrosine,...,2001,O-Methyl-L-tyrosine,2723935,(2S)-2-amino-3-(4-methoxyphenyl)propanoic acid,C10H13NO3,195.21,(2S)-2-amino-3-(4-methoxyphenyl)propanoic acid,,GEYBMYRBIABFTA-VIFPVBQESA-N,https://pubchem.ncbi.nlm.nih.gov/compound/2723935
1,1,not available,O-methyl-l-tyrosine,(2S)-2-amino-3-(4-methoxyphenyl)propanoic acid,COC1=CC=C(C=C1)CC(C(=O)O)N,C10H13NO3,,https://pubchem.ncbi.nlm.nih.gov/compound/2723935,not specified,Tyrosine,...,2001,O-methyl-L-tyrosine,2723935,(2S)-2-amino-3-(4-methoxyphenyl)propanoic acid,C10H13NO3,195.21,(2S)-2-amino-3-(4-methoxyphenyl)propanoic acid,,GEYBMYRBIABFTA-VIFPVBQESA-N,https://pubchem.ncbi.nlm.nih.gov/compound/2723935
2,3,pAzPhe,p-azido-l-phenylalanine,(2S)-2-amino-3-(4-azidophenyl)propanoic acid,C1=CC(=CC=C1CC(C(=O)O)N)N=[N+]=[N-],C9H10N4O2,,https://pubchem.ncbi.nlm.nih.gov/compound/3080772,crosslinking,Phenylalanine,...,2002,p-Azido-L-phenylalanine,3080772,(2S)-2-amino-3-(4-azidophenyl)propanoic acid,C9H10N4O2,206.20,(2S)-2-amino-3-(4-azidophenyl)propanoic acid,,NEMHIKRLROONTL-QMMMGPOBSA-N,https://pubchem.ncbi.nlm.nih.gov/compound/3080772
3,4,pAzPhe,p-azido-l-phenylalanine,(2S)-2-amino-3-(4-azidophenyl)propanoic acid,C1=CC(=CC=C1CC(C(=O)O)N)N=[N+]=[N-],C9H10N4O2,,https://pubchem.ncbi.nlm.nih.gov/compound/3080772,crosslinking,Phenylalanine,...,2002,p-Azido-L-phenylalanine,3080772,(2S)-2-amino-3-(4-azidophenyl)propanoic acid,C9H10N4O2,206.20,(2S)-2-amino-3-(4-azidophenyl)propanoic acid,,NEMHIKRLROONTL-QMMMGPOBSA-N,https://pubchem.ncbi.nlm.nih.gov/compound/3080772
4,5,pAzPhe,p-azido-l-phenylalanine,(2S)-2-amino-3-(4-azidophenyl)propanoic acid,C1=CC(=CC=C1CC(C(=O)O)N)N=[N+]=[N-],C9H10N4O2,,https://pubchem.ncbi.nlm.nih.gov/compound/3080772,crosslinking,Phenylalanine,...,2002,p-Azido-L-phenylalanine,3080772,(2S)-2-amino-3-(4-azidophenyl)propanoic acid,C9H10N4O2,206.20,(2S)-2-amino-3-(4-azidophenyl)propanoic acid,,NEMHIKRLROONTL-QMMMGPOBSA-N,https://pubchem.ncbi.nlm.nih.gov/compound/3080772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856,2285,pBzF,p-benzoylphenylalanine,(2S)-2-amino-3-(4-benzoylphenyl)propanoic acid,C1=CC=C(C=C1)C(=O)C2=CC=C(C=C2)CC(C(=O)O)N,C16H15NO3,,https://pubchem.ncbi.nlm.nih.gov/compound/7020128,labelling,Phenylalanine,...,2023,4-Benzoyl-L-phenylalanine,7020128,(2S)-2-amino-3-(4-benzoylphenyl)propanoic acid,C16H15NO3,269.29,(2S)-2-amino-3-(4-benzoylphenyl)propanoic acid,,TVIDEEHSOPHZBR-AWEZNQCLSA-N,https://pubchem.ncbi.nlm.nih.gov/compound/7020128
857,2295,AcK,N-ε-acetyl-lysine,(2S)-6-acetamido-2-aminohexanoic acid,CC(=O)NCCCCC(C(=O)O)N,C8H16N2O3,,https://pubchem.ncbi.nlm.nih.gov/compound/92832,post translational modification mimic,Lysine,...,2023,N-epsilon-acetyl-L-lysine,92832,(2S)-6-acetamido-2-aminohexanoic acid,C8H16N2O3,188.22,(2S)-6-acetamido-2-aminohexanoic acid,,DTERQYGMUDWYAZ-ZETCQYMHSA-N,https://pubchem.ncbi.nlm.nih.gov/compound/92832
858,2391,pAzF,p-azido-L-phenylalanine,(2S)-2-amino-3-(4-azidophenyl)propanoic acid,C1=CC(=CC=C1CC(C(=O)O)N)N=[N+]=[N-],C9H10N4O2,,https://pubchem.ncbi.nlm.nih.gov/compound/3080772,photo-crosslinking,Phenylalanine,...,2023,p-Azido-L-phenylalanine,3080772,(2S)-2-amino-3-(4-azidophenyl)propanoic acid,C9H10N4O2,206.20,(2S)-2-amino-3-(4-azidophenyl)propanoic acid,,NEMHIKRLROONTL-QMMMGPOBSA-N,https://pubchem.ncbi.nlm.nih.gov/compound/3080772
859,2413,pBpA,para-benzoyl-phenylalanine,(2S)-2-amino-3-(4-benzoylphenyl)propanoic acid,C1=CC=C(C=C1)C(=O)C2=CC=C(C=C2)CC(C(=O)O)N,C16H15NO3,,https://pubchem.ncbi.nlm.nih.gov/compound/7020128,not specified,Phenylalanine,...,2023,p-Benzoyl-L-phenylalanine,7020128,(2S)-2-amino-3-(4-benzoylphenyl)propanoic acid,C16H15NO3,269.29,(2S)-2-amino-3-(4-benzoylphenyl)propanoic acid,,TVIDEEHSOPHZBR-AWEZNQCLSA-N,https://pubchem.ncbi.nlm.nih.gov/compound/7020128


In [6]:
## How about with SwissSideChain?
import os
import pandas as pd

SSC_SMI_DIR = "/Users/sanjitrao/Downloads/L_SMI"


def fix_scc_smiles(s):
    s = s.replace("[NH3]", "[NH3+]")
    s = s.replace("[Cl](=O)=O", "ClS(=O)=O")  # example fix; adjust as needed
    return s

def load_ssc_smiles(smidir: str):
    rows = []
    for fname in os.listdir(smidir):
        if fname.endswith(".smi"):
            path = os.path.join(smidir, fname)
            with open(path, "r") as f:
                line = f.read().strip()

                if not line:
                    continue

                # typical format: "<smiles> <code>"
                parts = line.split()
                smiles = parts[0]
                code = parts[-1]  # 3-letter / extended code ("YCM", "WFP", etc.)

                rows.append((code, smiles))

    data =  pd.DataFrame(rows, columns=["ssc_code", "ssc_smiles"])
    data['ssc_code'] = data['ssc_code'].str.replace('.pdb', '', regex=False)


    data['ssc_smiles'] = data['ssc_smiles'].apply(fix_scc_smiles)
    return data


SSC = load_ssc_smiles(SSC_SMI_DIR)
SSC.head()


Unnamed: 0,ssc_code,ssc_smiles
0,ALN,O=[C](=O)[C@H](Cc1cccc2c1cccc2)[NH3+]
1,OCY,OCCSC[C@@H]([C](=O)=O)[NH3+]
2,0AF,[NH3+][C@H]([C](=O)=O)Cc1c[nH]c2c1cccc2O
3,26P,OC(=O)[C@H](CCC[C](=[C](=O)=O)=O)[NH3+]
4,BHD,O=[C](=O)[C@H]([C@@H]([C](=O)=O)[NH3+])O


In [7]:
merged_inclusive_pubchem_ssc = merged_inclusive_pubchem.merge(SSC, left_on = 'ncAA SMILES notation', right_on = 'ssc_smiles')
merged_inclusive_pubchem_ssc

Unnamed: 0,ID,ncAA abbreviation(s) used in the publication,"ncAA name, as mentioned in the publication",ncAA IUPAC name,ncAA SMILES notation,ncAA chemical formula,ncAA Structure,"ncAA link for Pubchem, if available","Application, if provided in the publication",Canonical amino acid most similar to the ncAA,...,CID,Name,MolecularFormula,MolecularWeight,IUPACName,CanonicalSMILES,InChIKey,PubChemURL,ssc_code,ssc_smiles
