In [None]:
import pandas as pd
import itertools
from rdkit import Chem
from rdkit.Chem import Draw
import re
from rdkit.Chem.Draw import IPythonConsole


import sys


# Display frames in full: no truncation of rows, columns or cell width.
pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)
df = pd.read_excel('data/second_order.xlsx', sheet_name='Sheet1')

df.index = df['No']
groups = df['Group']
omitted = 0  # running count of groups excluded from the expansion

# Split each group string into its query and an optional bracketed range string.
df = groups.str.extract(r'(?P<query>\S*)\s(?P<range_str>\[.*\])*')
# Explicit copy: the column assignment below must write to an independent frame
# rather than to a dropna() result (which raises SettingWithCopyWarning).
reduced_df = df.dropna().copy()
print(f"Number of groups with permutations: {len(reduced_df.index)}")
reduced_df.loc[:, 0] = reduced_df['query'].str.rstrip()

# Ranges containing ';' are not expanded; count them as omitted and filter them out.
has_semicolon = reduced_df['range_str'].str.contains(';')
n_semicolon = len(reduced_df[has_semicolon].index)
print(f"Number of groups with semi-colon permutations: {n_semicolon}")
omitted += n_semicolon
no_semi = reduced_df[~has_semicolon]

def map_ranges(df):
  """Parse each row's bracketed range string and attach its value permutations.

  Args:
    df: DataFrame with a 'range_str' column and a 'query' column.

  Returns:
    A new DataFrame, indexed like ``df``, with columns 'query', 'var' (list
    obtained by splitting the variable text on ','), 'range_start',
    'range_stop' (the captured single digits) and 'perms' (the result of
    ``gen_number_perms`` for the row). ``df`` itself is not modified.
  """
  # NOTE(review): the two dots between the digits are unescaped, so they match
  # any two characters, not only a literal "..".
  df_sub = df['range_str'].str.extract(
      r'\[(?P<var>[\W,\d\S]*)\sin\s(?P<range_start>\d)..(?P<range_stop>\d)\]')
  df_sub['var'] = df_sub['var'].str.split(',')
  df_sub['perms'] = pd.Series(df_sub.apply(gen_number_perms, axis=1))
  # DataFrame.insert works in place and returns None, so its result is not bound.
  df_sub.insert(0, 'query', df['query'])
  return df_sub

def gen_number_perms(series):
  """Enumerate every assignment of range values to the row's variables.

  ``series`` must provide 'var' (a sized collection) and 'range_start' /
  'range_stop' (values accepted by ``int``). Returns a list of tuples, each of
  length ``len(series['var'])``, drawn from the inclusive range start..stop.
  """
  arity = len(series['var'])
  values = range(int(series['range_start']), int(series['range_stop']) + 1)
  return list(itertools.product(values, repeat=arity))

def gen_query_perms(series):
  """Build one dash-terminated query string per permutation of variable values.

  Args:
    series: row/mapping with 'query' (an iterable of fragment strings) and
      'perms' (an iterable of tuples). Each tuple supplies, in order, one value
      for every fragment that contains one of the letters m, n, p or k.

  Returns:
    list[str]: one string per permutation. Fragments without a placeholder
    letter are appended followed by "-". For a placeholder fragment, value 0
    appends the fragment minus its last two characters plus "-"; value 1
    replaces each placeholder letter with "-"; any other value replaces each
    placeholder letter with the value followed by "-".
  """
  fragments = series['query']
  queries = []
  for p in series['perms']:
    i = 0  # position of the next unused value in p
    q = ""
    # The loop variable has to be `f`, the name the body reads; binding a
    # different name left `f` undefined (or a stale notebook global).
    for f in fragments:
      if re.search(r'[mnpk]', f):
        if p[i] == 0:
          q += f[:-2] + "-"
        elif p[i] == 1:
          q += re.sub(r'[mnpk]', "-", f)
        else:
          q += re.sub(r'[mnpk]', str(p[i]) + "-", f)
        i += 1
      else:
        q += f + "-"
    queries.append(q)
  return queries

def gen_query_perms_numeric(series):
  """Build one query string per permutation, substituting numeric multiplicities.

  Args:
    series: pandas Series whose first 11 positions hold the query fragments
      and separators, and whose 'perms' entry is an iterable of tuples. Each
      tuple supplies, in order, one value for every fragment containing one of
      the letters m, n, p or k.

  Returns:
    list[str]: one concatenated string per permutation. For a placeholder
    fragment, value 0 appends the fragment minus its last two characters,
    value 1 removes the placeholder letters, and any other value replaces each
    placeholder letter with that value. Other fragments are appended as-is.
  """
  # Only string cells are fragments: this skips the None/NaN padding of shorter
  # queries and a 'perms' cell that falls inside the first 11 positions.
  # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
  # The per-row Excel read that used to sit here was never used and is gone.
  fragments = [v for _, v in series.iloc[:11].items() if isinstance(v, str)]

  queries = []
  for p in series['perms']:
    i = 0  # position of the next unused value in p
    q = ""
    for v in fragments:
      if re.search(r'[mnpk]', v):
        if p[i] == 0:
          q += v[:-2]
        elif p[i] == 1:
          q += re.sub(r'[mnpk]', "", v)
        else:
          q += re.sub(r'[mnpk]', str(p[i]), v)
        i += 1
      else:
        q += v
    queries.append(q)
  return queries

def recombine():
  """Placeholder with no implementation yet; does nothing and returns None."""
  return None



# Parse the range strings of the semicolon-free groups and attach their permutations.
df = map_ranges(no_semi)

def replace_vars(series):
  """Placeholder with no implementation yet; ignores ``series`` and returns None."""
  return None



# Set aside groups whose query contains parentheses; they are not expanded below.
has_parens = df['query'].str.contains(r'\(')
parens = df[has_parens]
print('Number of groups with parentheses: ', len(parens))
omitted += len(parens)
df = df[~has_parens]

# Drop groups with no '-' or '=' in the query. The count is taken BEFORE df is
# filtered; counting afterwards always added 0 to `omitted`.
has_separator = df['query'].str.contains(r'[\-=]', regex=True)
n_no_separator = len(df[~has_separator])
print('Number of groups without dashes: ', n_no_separator)
omitted += n_no_separator
df = df[has_separator]

# Split each query on '-', '=' or '|' into one column per piece, then expand
# every row into its list of concrete query strings.
exploded_query = df['query'].str.split(r'(?P<sep>[\-=|])', regex=True, expand=True)
df = pd.concat([exploded_query, df['perms']], axis=1)
df = df.apply(gen_query_perms_numeric, axis=1)
df = df.fillna('')

# print(df)

# Load Sheet2 and keep its 'Group' column as a Series indexed by the 'No'
# column; safe_replace looks fragment names up in this Series.
dataset = pd.read_excel('data/second_order.xlsx', sheet_name='Sheet2')
first_order_groups = dataset['Group']
first_order_groups.index = dataset['No']

# Fallback spellings: maps a fragment as written in a query to the name that
# safe_replace then looks up in first_order_groups.
extra_replacements = {
    'C=CH': 'CH=C',
    'C=CH2': 'CH2=C',
    'O': '-O-',
    'CH-SH': 'CHSH',
    'CH2-SH': 'CH2SH',
    'CH-O-': 'CH-O',
    'CH-S-': 'CHS',
    'CH2-S-': 'CH2S',
    'CH=CH2': 'CH2=CH',
    '>Ncyc': 'N (cyclic)',
    'CH-CN': 'CHCN',
    'CH-CO-': 'CHCO',
    'N-CH3': 'CH3N',
    'N-CH2': 'CH2N',
    'CH2-NO2': 'CH2NO2',
    'CH-NO2': 'CHNO2',
    'COO': 'COO except as above',
    'CH2-O-': 'CH2O',
    'CH2-CN': 'CH2CN',
    'CH2-CO-': 'CH2CO',
    'N-CH': 'CH-N',
}


def safe_replace(rpl):
  """Translate a fragment name into the index label of its first-order group.

  Tries, in order, ``rpl``, ``rpl + "-"`` and ``"-" + rpl`` against the values
  of ``first_order_groups`` and returns the index label of the first hit as a
  string. Failing that, a fragment listed in ``extra_replacements`` is looked
  up under its mapped name. Anything else is returned unchanged.
  """
  def _first_label(name):
    # Index label (as str) of the first group equal to `name`, or None.
    matches = first_order_groups.loc[first_order_groups == name].index
    return str(matches[0]) if len(matches) != 0 else None

  # Spellings are produced lazily so later ones are only built when needed.
  for spell in (lambda s: s, lambda s: s + "-", lambda s: "-" + s):
    label = _first_label(spell(rpl))
    if label is not None:
      return label

  if rpl in extra_replacements:
    mapped = extra_replacements[rpl]
    return str(first_order_groups.loc[first_order_groups == mapped].index[0])
  return rpl
  


# Rewrite every expanded query in place as "(a,b,...)": split it on '|' and pass
# each piece through safe_replace. `.items()` replaces `.iteritems()`, which was
# removed in pandas 2.0.
for i, item in df.items():
  for j, v in enumerate(item):
    frags = [safe_replace(f) for f in re.split(r'[|]', v)]
    item[j] = "(" + ','.join(frags) + ")"



# Elements that disqualify a combination when they appear in it.
forbidden = [
    'CH3=CH', 'CH2=CH2', 'C=CH3', 'CH2=CH3',
    'CH3=CH2', 'CH=CH3', 'CH3=CH', 'CH3=CH2',
    'CH3=CH3', 'CH3=C', 'C=CH3', 'CH3=CH',
]

results = []
for group in list(df.index):
  combos = df[group]
  # Filter the stored list in place: keep a "(a,b,...)" combination only when
  # none of its comma-separated elements is forbidden.
  combos[:] = [
      combo for combo in combos
      if not any(el in forbidden for el in combo[1:-1].split(','))
  ]

df


# df.append(df.iloc[:,:10].agg(''.join), ignore_index=True)



# print('Total groups omitted: ', omitted)



In [None]:
import re
# Display the groups with parentheses that the first cell set aside.
parens


In [None]:
# Three capture groups: a literal "(CHn=C)", then "cyc" (optionally wrapped in
# parentheses), then everything after the following "-".
regex1 = r'^(\(CHn=C\))\(?(?:\(?(cyc)\)?)-(.*)$'
# dropna() discards the rows where a group did not match.
r1 = parens['query'].str.extract(regex1).dropna()
# Order matters: column 0 absorbs column 1 before column 1 is overwritten.
r1[0] = r1[0] + r1[1]
r1[1] = r1[2]
# drop() returns a new frame for display only; r1 itself still has column 2.
r1.drop(columns=2)

In [None]:
# "CHm(<side1>)" followed by "CHn" or "|COOH", with an optional trailing "(<side2>)".
regex2 = r'^(?P<main1>CHm)\((?P<side1>.*)\)(?P<main2>CHn|\|COOH)(?:\((?P<side2>.*?)\))?$'
r2 = parens['query'].str.extract(regex2).dropna(how='all')
# Insert constant separator columns. Each position refers to the frame as it
# stands at that call, so the final column order is:
# main1, sep1, side1, sep, main2, sep2, side2.
# (The former `r2['main2'] = r2['main2']` self-assignment was a no-op and is gone.)
r2.insert(1, 'sep1', '-')
r2.insert(4, 'sep2', '-')
r2.insert(3, 'sep', '|')
r2

In [None]:
import pandas as pd
# Start again from the raw second-order sheet, indexed by its 'No' column
# (the column itself is kept), and reset the omitted-group counter.
df = pd.read_excel('data/second_order.xlsx', sheet_name='Sheet1')
df = df.set_index(df['No'])

groups, omitted = df['Group'], 0



In [None]:
# First two columns of the groups whose name contains "aC".
# NOTE(review): this is the same pattern as the `aromatics` cell; given the name
# `alternations`, a different pattern was presumably intended — confirm.
alternations = df[df['Group'].str.contains(r'aC')].iloc[:,:2]
alternations

In [None]:
# Display frames in full: no truncation of rows, columns or cell width.
pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)
# First two columns of the groups whose name contains any of the letters m, n, p, k.
var_mult = df.loc[df['Group'].str.contains(r'[mnpk]')].iloc[:, :2]
var_mult

In [None]:
# First two columns of the groups whose name contains "aC".
aromatics = df.loc[df['Group'].str.contains(r'aC')].iloc[:, :2]
aromatics

In [None]:
# First two columns of the groups whose name contains AROMRING or PYRIDINE.
# The alternation is written without a capturing group: str.contains only
# tests for a match, and a capture group makes pandas emit a
# "pattern ... has match groups" UserWarning.
aroring = df[df['Group'].str.contains(r'AROMRING|PYRIDINE')].iloc[:,:2]
aroring

In [None]:
# First two columns of the groups whose name contains "cyc".
cyc = df.loc[df['Group'].str.contains(r'cyc')].iloc[:, :2]
cyc

In [1]:
import pandas as pd
# Reload the second-order sheet, index it by 'No' and reset the omitted counter.
df = pd.read_excel('data/second_order.xlsx', sheet_name='Sheet1')

df.index = df['No']
groups = df['Group']
omitted = 0
# NOTE(review): the star import hides which names this cell relies on
# (presumably filter_regex, and `other` used in the next cell — confirm) and
# sits mid-cell instead of with the other imports.
from gnn.preprocessing import *
# NOTE(review): the result is named `parens` although the pattern is 'cyc' —
# confirm the intended name.
parens = filter_regex(df, r'cyc')
len(parens)

37

In [2]:
# NOTE(review): `other` is not defined in this notebook; presumably it comes
# from the `gnn.preprocessing` star import in the previous cell — confirm.
other(df)

  return df[df['Group'].str.contains(regex)].iloc[:,:2]


Unnamed: 0_level_0,No,Group,nodes,edges
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
221,221,CH3|CH|CH3,"(1,1,3)","(1,3),(2,3)"
230,230,CH|CHO or C|CHO,"(3,38),(4,38)","(1,2)"
231,231,CH3CO|CH2,"(33,2)","(1,2)"
232,232,CH3CO|CH or CH3CO|C,"(33,3),(33,4)","(1,2)"
233,233,CH|COOH or C|COOH,"(3,31),(4,31)","(1,2)"
234,234,CH3|COOCH or CH3|COOC,"(40,3),(40,4)","(1,2)"
235,235,CO|O|CO,"(52,184,52)","(1,2),(2,3)"
236,236,CH|OH,"(3,29)","(1,2)"
237,237,C|OH,"(4,29)","(1,2)"
239,239,NC|CHOH or NC|COH,"(69,29),(69,29)","(1,2)"
