In [140]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from skimpy import skim
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [141]:
# Search ChEMBL for targets matching "Neuraminidase" and tabulate the hits.
target = new_client.target
target_query = target.search('Neuraminidase')

# The query yields one record per target, so the plain DataFrame constructor
# builds the same table as `DataFrame.from_dict` with its default orient.
targets = pd.DataFrame(target_query)

In [142]:
# Accumulator for the per-target activity DataFrames (one entry per ChEMBL target).
dataframes = []

In [143]:
# Download every activity record for each target found by the search and keep
# one DataFrame per target in `dataframes`.

# The activity resource is the same for every target, so look it up once
# instead of on each iteration.
activity = new_client.activity

# Start from an empty list so that re-running this cell does not append a
# second copy of every target's activities.
dataframes = []

for target_chembl_id in targets.target_chembl_id:
    activity_query = activity.filter(target_chembl_id=target_chembl_id)

    df = pd.DataFrame.from_dict(activity_query)
    # NOTE(review): the `target` column holds the ChEMBL target ID (the
    # original code read `target_chembl_id` into a variable called
    # `target_name`). The stored value is kept unchanged here; if a
    # human-readable name was intended, confirm and switch to the search
    # result's name column.
    df['target'] = target_chembl_id

    dataframes.append(df)

In [144]:
# Sanity check: how many per-target frames are held in `dataframes`.
len(dataframes)

39

In [145]:
# Stack all per-target activity frames into one table with a fresh 0..n-1 index.
# The whole list is concatenated rather than a hard-coded `[:40]` slice, so no
# target is silently dropped if the search ever returns more than 40 hits.
df = pd.concat(dataframes, ignore_index=True)
df.head()

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value,target
0,,,72419,[],CHEMBL808321,inhibition of Influenza A Sialidase,B,,,BAO_0000192,...,Neuraminidase,211044,,,Ki,M,UO_0000065,,4e-06,CHEMBL2051
1,,,72420,[],CHEMBL808321,inhibition of Influenza A Sialidase,B,,,BAO_0000192,...,Neuraminidase,211044,,,Ki,M,UO_0000065,,1e-09,CHEMBL2051
2,,,89474,[],CHEMBL808321,inhibition of Influenza A Sialidase,B,,,BAO_0000192,...,Neuraminidase,211044,,,Ki,M,UO_0000065,,4e-08,CHEMBL2051
3,,,105745,[],CHEMBL882330,inhibitory concentration required to inhibit n...,B,,,BAO_0000190,...,Neuraminidase,211044,,,IC50,uM,UO_0000065,,40.0,CHEMBL2051
4,,,106883,[],CHEMBL882330,inhibitory concentration required to inhibit n...,B,,,BAO_0000190,...,Neuraminidase,211044,,,IC50,uM,UO_0000065,,0.1,CHEMBL2051


In [146]:
# Display skimpy's summary of the combined activity table.
skim(df)

In [147]:
# Remove the listed columns from the combined activity table.
# NOTE: this cell is not re-runnable on its own output; a second run raises
# KeyError because the columns are already gone.
df = df.drop(
    columns=[
        "upper_value",
        "toid",
        "text_value",
        "standard_upper_value",
        "standard_text_value",
        "data_validity_description",
        "data_validity_comment",
        "molecule_pref_name",
        "assay_variant_mutation",
        "assay_variant_accession",
        "activity_comment",
        "activity_properties",
        "action_type",
        "ligand_efficiency",
    ]
)

In [148]:
# Inspect the remaining columns, their non-null counts and dtypes after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7294 entries, 0 to 7293
Data columns (total 33 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   activity_id                7294 non-null   int64 
 1   assay_chembl_id            7294 non-null   object
 2   assay_description          7294 non-null   object
 3   assay_type                 7294 non-null   object
 4   bao_endpoint               7294 non-null   object
 5   bao_format                 7294 non-null   object
 6   bao_label                  7294 non-null   object
 7   canonical_smiles           7294 non-null   object
 8   document_chembl_id         7294 non-null   object
 9   document_journal           6123 non-null   object
 10  document_year              7294 non-null   int64 
 11  molecule_chembl_id         7294 non-null   object
 12  parent_molecule_chembl_id  7294 non-null   object
 13  pchembl_value              4450 non-null   object
 14  potentia

In [149]:
# Show the rows that have no missing value in any column.
# NOTE(review): the result is only displayed, not assigned, so `df` itself is
# unchanged and still contains the rows with missing values. If those rows
# were meant to be removed, assign the result back (ideally restricted to the
# columns that matter via `subset=`) — confirm the intent.
df.dropna()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,bao_endpoint,bao_format,bao_label,canonical_smiles,document_chembl_id,document_journal,...,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,type,units,uo_units,value,target
0,72419,CHEMBL808321,inhibition of Influenza A Sialidase,B,BAO_0000192,BAO_0000357,single protein format,CC(=O)NC1C(O)C=C(C(=O)O)OC1[C@H](O)[C@H](O)CO,CHEMBL1127812,J Med Chem,...,4000.0,CHEMBL2051,Influenza A virus (A/Puerto Rico/8/1934(H1N1)),Neuraminidase,211044,Ki,M,UO_0000065,0.000004,CHEMBL2051
1,72420,CHEMBL808321,inhibition of Influenza A Sialidase,B,BAO_0000192,BAO_0000357,single protein format,CC(=O)NC1C(NC(=N)N)C=C(C(=O)O)OC1C(O)[C@H](O)CO,CHEMBL1127812,J Med Chem,...,1.0,CHEMBL2051,Influenza A virus (A/Puerto Rico/8/1934(H1N1)),Neuraminidase,211044,Ki,M,UO_0000065,1E-9,CHEMBL2051
2,89474,CHEMBL808321,inhibition of Influenza A Sialidase,B,BAO_0000192,BAO_0000357,single protein format,CC(=O)NC1C(N)C=C(C(=O)O)OC1[C@H](O)[C@H](O)CO,CHEMBL1127812,J Med Chem,...,40.0,CHEMBL2051,Influenza A virus (A/Puerto Rico/8/1934(H1N1)),Neuraminidase,211044,Ki,M,UO_0000065,4E-8,CHEMBL2051
3,105745,CHEMBL882330,inhibitory concentration required to inhibit n...,B,BAO_0000190,BAO_0000357,single protein format,CC(=O)N[C@@H]([C@H](O)[C@H](O)CO)[C@H]1OC(O)(C...,CHEMBL1133646,J Med Chem,...,40000.0,CHEMBL2051,Influenza A virus (A/Puerto Rico/8/1934(H1N1)),Neuraminidase,211044,IC50,uM,UO_0000065,40.0,CHEMBL2051
4,106883,CHEMBL882330,inhibitory concentration required to inhibit n...,B,BAO_0000190,BAO_0000357,single protein format,CCCCC(NC(C)=O)[C@@H]1CC(C(=O)O)C[C@H]1N=C(N)N,CHEMBL1133646,J Med Chem,...,100.0,CHEMBL2051,Influenza A virus (A/Puerto Rico/8/1934(H1N1)),Neuraminidase,211044,IC50,uM,UO_0000065,0.1,CHEMBL2051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6406,12100238,CHEMBL2176198,Inhibition of human recombinant Myc-His10-tagg...,B,BAO_0000190,BAO_0000219,cell-based format,COc1c(F)c(C(=O)N[C@@H](CC(=O)O)c2ccccc2C)nn1-c...,CHEMBL2169867,J Med Chem,...,30.0,CHEMBL6115,Homo sapiens,Lysosomal protective protein,9606,IC50,uM,UO_0000065,0.03,CHEMBL6115
6407,12100239,CHEMBL2176198,Inhibition of human recombinant Myc-His10-tagg...,B,BAO_0000190,BAO_0000219,cell-based format,Cc1ccccc1[C@H](CC(=O)O)NC(=O)c1cc(OCC2CC2)n(-c...,CHEMBL2169867,J Med Chem,...,15.0,CHEMBL6115,Homo sapiens,Lysosomal protective protein,9606,IC50,uM,UO_0000065,0.015,CHEMBL6115
6408,12100240,CHEMBL2176198,Inhibition of human recombinant Myc-His10-tagg...,B,BAO_0000190,BAO_0000219,cell-based format,CCOc1cc(C(=O)N[C@@H](CC(=O)O)c2ccccc2C)nn1-c1c...,CHEMBL2169867,J Med Chem,...,15.0,CHEMBL6115,Homo sapiens,Lysosomal protective protein,9606,IC50,uM,UO_0000065,0.015,CHEMBL6115
6409,12100241,CHEMBL2176198,Inhibition of human recombinant Myc-His10-tagg...,B,BAO_0000190,BAO_0000219,cell-based format,COc1cc(C(=O)N[C@@H](CC(=O)O)c2ccccc2C)nn1-c1cc...,CHEMBL2169867,J Med Chem,...,26.0,CHEMBL6115,Homo sapiens,Lysosomal protective protein,9606,IC50,uM,UO_0000065,0.026,CHEMBL6115


In [150]:
def smiles_to_mol(smiles):
    """Convert a SMILES string into an RDKit fingerprint.

    Despite its name, this function returns the fingerprint computed by
    ``Chem.RDKFingerprint``, not the molecule object itself.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule. Non-string input (for example
        ``None`` or NaN coming from a missing table cell) is treated as
        unparseable.

    Returns
    -------
    The result of ``Chem.RDKFingerprint`` for the parsed molecule, or
    ``None`` when the input is not a string or RDKit cannot parse it.
    """
    # RDKit's parser only accepts strings; a missing value would make it raise
    # instead of returning None, which would abort a whole `.apply` over a column.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals an unparseable SMILES by returning None.
        return None
    return Chem.RDKFingerprint(mol)

In [151]:
# Compute one fingerprint per row from its canonical SMILES; entries are None
# wherever `smiles_to_mol` returns None.
df['fingerprint'] = df['canonical_smiles'].map(smiles_to_mol)

In [157]:
# Re-inspect the table after adding the `fingerprint` column.
# NOTE(review): the execution count jumps from 151 to 157 here, so cells were
# run out of order or deleted; verify with Restart Kernel -> Run All.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7294 entries, 0 to 7293
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   activity_id                7294 non-null   int64 
 1   assay_chembl_id            7294 non-null   object
 2   assay_description          7294 non-null   object
 3   assay_type                 7294 non-null   object
 4   bao_endpoint               7294 non-null   object
 5   bao_format                 7294 non-null   object
 6   bao_label                  7294 non-null   object
 7   canonical_smiles           7294 non-null   object
 8   document_chembl_id         7294 non-null   object
 9   document_journal           6123 non-null   object
 10  document_year              7294 non-null   int64 
 11  molecule_chembl_id         7294 non-null   object
 12  parent_molecule_chembl_id  7294 non-null   object
 13  pchembl_value              4450 non-null   object
 14  potentia