In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import json
import requests
import time
from tqdm import tqdm
import plotly.graph_objects as go

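To classify the chemicals found in Amla, we used the ClassyFire API. The helper below follows the `get_entity` call of the [pyclassyfire client](https://github.com/JamesJeffryes/pyclassyfire/blob/master/pyclassyfire/client.py) and looks up an existing classification by InChIKey.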

In [2]:
# Base address of the ClassyFire web service queried by get_entity().
url = "http://classyfire.wishartlab.com"

def get_entity(inchikey, return_format="json", timeout=30):
    """Fetch the ClassyFire classification of a previously queried structure.

    :param inchikey: An InChIKey for a previously calculated chemical
        structure. A leading ``InChIKey=`` prefix is stripped.
    :type inchikey: str
    :param return_format: desired return format. valid types are json, csv or sdf
    :type return_format: str
    :param timeout: seconds to wait for the server before giving up; passed
        straight to ``requests.get`` so a stalled server cannot hang the notebook
    :type timeout: float
    :return: the decoded JSON document when ``return_format`` is ``"json"``,
        otherwise the raw response text
    :rtype: dict or str
    :raises requests.HTTPError: if the server answers with an error status
    """

    inchikey = inchikey.replace('InChIKey=', '')
    r = requests.get('%s/entities/%s.%s' % (url, inchikey, return_format),
                     headers={"Content-Type": "application/%s" % return_format},
                     timeout=timeout)
    r.raise_for_status()
    # Only JSON responses can be decoded; csv/sdf payloads are returned verbatim
    # (previously they were fed to json.loads and raised a decode error).
    if return_format == "json":
        return json.loads(r.text)
    return r.text

In [3]:
# Load the chemical list; the `inchikey` column identifies each compound.
df = pd.read_csv('data/Gooseberry_chemical_list.csv')

LEVELS = ['kingdom', 'superclass', 'class']  # taxonomy levels kept from each result
BATCH_SIZE = 10     # number of requests sent between pauses
PAUSE_SECONDS = 5   # pause length; presumably keeps us under the server's rate limit

# Skip missing InChIKeys: get_entity() calls str.replace on its argument,
# which would fail on a NaN.
inchikeys = df.inchikey.dropna().unique()

records = []
for n_done, chem in enumerate(tqdm(inchikeys), start=1):
    res = get_entity(chem)
    # .get() so a result lacking a level becomes NaN below instead of a KeyError.
    records.append([chem] + [res.get(level) for level in LEVELS])
    if n_done % BATCH_SIZE == 0:
        time.sleep(PAUSE_SECONDS)

cf = pd.DataFrame(records, columns=['inchikey'] + LEVELS)
for level in LEVELS:
    # Keep only the 'name' field of each level; empty/None levels become NaN.
    cf[level] = cf[level].apply(lambda x: x['name'] if x else np.nan)
#cf.to_csv('data/classyfire_results.csv', index=False)
cf.head(1)

100%|██████████| 11/11 [00:07<00:00,  1.38it/s]


Unnamed: 0,inchikey,kingdom,superclass,class
0,JQQBXPCJFAKSPG-SVYIMCMUSA-N,Organic compounds,Phenylpropanoids and polyketides,Tannins


In [4]:
# Number of distinct kingdoms, superclasses and classes among the results.
tuple(cf[level].nunique() for level in ('kingdom', 'superclass', 'class'))

(1, 4, 6)

In [5]:
# Distinct superclass names, in order of first appearance.
cf['superclass'].unique()

array(['Phenylpropanoids and polyketides', 'Benzenoids',
       'Organic acids and derivatives', 'Organoheterocyclic compounds'],
      dtype=object)

In [6]:
# Count how many compounds fall into each class (one row per class).
df = (
    cf.groupby('class')
      .size()
      .reset_index(name='counts')
)

# Donut chart of the class counts, coloured with the Prism palette.
# NOTE(review): `pull` is positional and hard-coded for six slices (it offsets
# the 4th and 6th) — revisit if the set of classes changes.
pie = go.Pie(
    labels=df['class'],
    values=df['counts'],
    textinfo='value+percent',
    insidetextorientation='radial',
    pull=[0., 0., 0., 0.2, 0., 0.2],
    hole=.3,
    marker=dict(colors=px.colors.qualitative.Prism),
)

# Compact 400x400 canvas with no margins; pass showlegend=False to hide the legend.
layout = go.Layout(
    margin=dict(l=0, r=0, b=0, t=0),
    font=dict(size=11),
    width=400,
    height=400,
    legend=dict(x=0.5, y=1.4),
)

fig = go.Figure(data=[pie], layout=layout)
fig.show()
#fig.write_image('figures/chemical_class.svg')