In [1]:
import pandas as pd
import seaborn as sns
import re
from collections import defaultdict

In [2]:
df = pd.read_excel('atomic masses.xlsx')
df

Unnamed: 0,Element,Symbol,Atomic Number,Atomic Mass†
0,Actinium,Ac,89,-227.00
1,Aluminum,Al,13,26.98
2,Americium,Am,95,-243.00
3,Antimony,Sb,51,121.80
4,Argon,Ar,18,39.95
...,...,...,...,...
104,Xenon,Xe,54,131.30
105,Ytterbium,Yb,70,173.00
106,Yttrium,Y,39,88.91
107,Zinc,Zn,30,65.41


In [3]:
mass_df = df[df.columns[[1, 3]]]
mass_df

Unnamed: 0,Symbol,Atomic Mass†
0,Ac,-227.00
1,Al,26.98
2,Am,-243.00
3,Sb,121.80
4,Ar,39.95
...,...,...
104,Xe,131.30
105,Yb,173.00
106,Y,88.91
107,Zn,65.41


In [4]:
sym_to_mass = {}

for i in range(mass_df.shape[0]):
    sym = mass_df.iloc[i, 0]
    mass = mass_df.iloc[i, 1]
    sym_to_mass[sym] = mass

In [5]:
s = ['C1OC1', 'C1CNC1', 'OO', 'N=[N+]=[N-]', 'N1=CC=CO1']
data, formula = 'O1N=C(C=C1)C(C1)=CC(=CC=1N=[N+]=[N-])CC' , 'C11H10N4O'
# data, formula = 'C1N(CC1)CC1=CC(=CC=C1)Br', 'C10H12BrN'
# data, formula = 'O1C(C1)C1=CC(=CC=C1)C(OO)=O', 'C9H8O4'

In [6]:
def find_part(data, s):
    """Return a smile part that was found.

    @param1 : data -> input data SMILE
    @param2 : s -> 
    """
    count = 0
    for i in range(len(s)):
        if s[i] in data:
            count += 1
            print(f"{s[i]} found in {data}")
    
    return count


In [7]:
def rule_six(data, hefg):
    """Return a integer.

    @param1 : data -> input data SMILE
    @param2 : hefg -> 
    """
    count = 0
    for i in range(len(data)):
        if data[i] == 'C':
            count += 1
    
    x = count / hefg

    if x < 6:
        return 'RED'
    
    return 'GREEN'

In [8]:
def countOfAtoms(formula: str) -> str:
    dt = defaultdict(int)
    stack = [1]
    digits = ""
    lowers = ""
    for element in formula[::-1]:
        if element.isdigit():
            digits = element + digits
        elif element.islower():
            lowers = element + lowers
        elif element == ")":
            stack.append(stack[-1]*int(digits or 1))
            digits = ""
        elif element == "(":
            stack.pop()
        #if element is an uppercase letter
        else:
            element = element + lowers
            dt[element] = dt[element]+stack[-1]*(int(digits or 1))
            digits = ""
            lowers = ""
    result = []
    for key, value in sorted(dt.items()):
        if value == 1:
            value = ""
        result.append(key)
        result.append(str(value))
    
    atoms = []
    for i in result:
        if i == '':
            atoms.append('1')
        else:
            atoms.append(i)
    return atoms

In [9]:
def molecule_weight(sym_to_mass, atoms):
    mass = 0
    for i in range(len(atoms)):
        if atoms[i] in sym_to_mass:
            m = sym_to_mass[atoms[i]]
            a = int(atoms[i + 1])
            mass =  mass + a*m

    return mass 
            

In [10]:
def oxy_params(atoms):
    for i in range(len(atoms)):
        if atoms[i] == 'C':
            x = int(atoms[i + 1])
        elif atoms[i] == 'H':
            y = int(atoms[i + 1])
        elif atoms[i] == 'O':
            z = int(atoms[i + 1])
    
    return x, y, z

In [11]:
def oxygen_balance(sym_to_mass, formula):
    atoms = countOfAtoms(formula)  
    mw = molecule_weight(sym_to_mass, atoms)
    x, y, z = oxy_params(atoms)
    oxy_bal = round(-1600 * (x + y/2 -z) / mw)
    return oxy_bal

In [12]:
hefg = find_part(data, s)
rule = rule_six(data, hefg)
oxy = oxygen_balance(sym_to_mass, formula)

N=[N+]=[N-] found in O1N=C(C=C1)C(C1)=CC(=CC=1N=[N+]=[N-])CC


In [13]:
hefg, rule, oxy

(1, 'GREEN', -112)

In [14]:
# 1. what if no hefg found then rule six will give division by 0 error.

In [15]:
mass_df

Unnamed: 0,Symbol,Atomic Mass†
0,Ac,-227.00
1,Al,26.98
2,Am,-243.00
3,Sb,121.80
4,Ar,39.95
...,...,...
104,Xe,131.30
105,Yb,173.00
106,Y,88.91
107,Zn,65.41


In [17]:
smile = 'O1N=C(C=C1)C(C1)=CC(=CC=1N=[N+]=[N-])CC'

In [56]:
def calculate_formula(smile, mass_df):
    n_smile = re.sub('[\[\]\(\)\+\=\-]',  '', smile)
    s = ''.join([i for i in n_smile if not i.isdigit()])
    formula = {}
    for i in s:
        if mass_df['Symbol'].str.contains(i).any():
            if i in formula:
                formula[i] += 1
            else:
                formula[i] = 1
    
    return formula

In [57]:
f = calculate_formula(smile, mass_df)
f

{'O': 1, 'N': 4, 'C': 11}

In [25]:
if mass_df['Symbol'].str.contains('O').any():
    print('present')

present


In [32]:
import re

In [49]:
# ('\ |\?|\.|\!|\/|\;|\:', '', line)
n_smile = re.sub('[\[\]\(\)\+\=\-]',  '', smile)
n_smile

'O1NCCC1CC1CCCC1NNNCC'

In [35]:
smile

'O1N=C(C=C1)C(C1)=CC(=CC=1N=[N+]=[N-])CC'

In [50]:
s = ''.join([i for i in n_smile if not i.isdigit()])
s

'ONCCCCCCCCCNNNCC'

In [58]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

my_smiles_string = 'C1=CC(=C(C=C1C(CN)O)O)O'
my_mol = Chem.MolFromSmiles(my_smiles_string)
print(rdMolDescriptors.CalcMolFormula(my_mol))

ModuleNotFoundError: No module named 'rdkit'