# USP Inhibition

#### Imports

In [1]:
from os import path as op
import inspect
import pandas as pd
from smdt.descriptors import descriptors

#### Data

In [2]:
# Locate the dataset folder: a 'small-molecule-design-toolkit' directory is
# expected one level above the current working directory.
# The earlier inspect.getfile(inspect.currentframe()) approach only landed on
# that location when the kernel reports a pseudo-filename for the cell (which
# abspath then anchors at the working directory); on kernels that compile
# cells from a real temporary file it resolves somewhere else entirely.
path = op.join(op.abspath(op.pardir),
               'small-molecule-design-toolkit', 'datasets', 'usp inhibition')

# The column pandas labels 'Unnamed: 0' is used as the row index of both frames.
df_x = pd.read_csv(op.join(path, 'SMILES.csv'), index_col='Unnamed: 0')
df_y = pd.read_csv(op.join(path, 'Activity.csv'), index_col='Unnamed: 0')

In [3]:
# Preview the first ten rows of the SMILES input frame.
df_x.head(10)

Unnamed: 0,SMILES
0,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl
1,C1=CC(=C(C(=C1)O)O)C(=O)O
2,C1C(C(C(OC1O)CO)O)O
3,C1=CC(=C(C=C1C(=O)O)O)O
4,C1=CC(=C(C(=C1)O)N)C(=O)O
5,C(C(C(=O)O)N)S(=O)O
6,C(CC(=O)O)CN
7,C1=CC(=CC=C1CC(=O)O)O
8,C(CC(=O)O)C(=O)CN
9,C(CO)O


In [4]:
# Preview the first ten rows of the activity-score frame.
df_y.head(10)

Unnamed: 0,Activity_Score
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0


In [5]:
# Summary statistics of the activity scores.
df_y.describe()

Unnamed: 0,Activity_Score
count,389492.0
mean,0.782091
std,3.934012
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,100.0


In [6]:
# Number of rows whose Activity_Score is missing.
int(df_y['Activity_Score'].isnull().sum())

66

#### Descriptor Calculations

In [7]:
# Row labels removed from both frames before descriptors are calculated.
# NOTE(review): the name suggests these molecules stall the descriptor
# calculation — confirm, and record how this list was produced.
bottleneck_rows = [
    430, 434, 524, 655, 660, 667, 674, 677, 679, 680, 690, 691, 696, 697, 701,
    702, 703, 705, 708, 709, 718, 720, 727, 730, 733, 734, 735, 738, 739, 743,
    745, 750, 755, 760, 764, 771, 779, 785, 826, 859, 939, 994, 995, 997, 1007,
    1026, 1031, 1033, 1038, 1043, 1050, 1052, 1053, 1054, 1058, 1059, 1069,
    1076, 1086, 1088, 1096, 1097, 1098, 1100, 1107, 1108, 1116, 1117, 1121,
    1128, 1168, 1183, 1188, 1200, 1201, 1202, 1203, 1206, 1212, 1213, 1224,
    1230, 1231, 1236, 1281, 1283, 1309, 1327, 1334, 1346, 1367, 1375, 1381,
    1405, 1414, 1441, 1451, 1462, 1477, 1491, 1501,
]

# Rebind instead of dropping in place, and tolerate labels that are already
# gone: re-running this cell is then a no-op instead of raising KeyError.
# The same labels are removed from both frames so they stay row-aligned.
df_x = df_x.drop(bottleneck_rows, errors='ignore')
df_y = df_y.drop(bottleneck_rows, errors='ignore')

In [8]:
# Run descriptors.Calculate_AllDescriptors on the first 100 remaining rows only;
# the first 100 activity rows are kept as the matching target.
# NOTE(review): 100 looks like a sample size chosen to keep the run short — confirm.
molecular_descriptors = descriptors.Calculate_AllDescriptors(df_x.iloc[:100])
target_activity = df_y.iloc[:100]

In [9]:
# Show only the first ten rows instead of dumping the whole descriptor table.
molecular_descriptors.head(10)

Unnamed: 0,AW,Arto,BertzCT,DZ,GMTI,Geto,Hato,Ipc,J,MZM1,...,ATSp7,ATSp8,ATSv1,ATSv2,ATSv3,ATSv4,ATSv5,ATSv6,ATSv7,ATSv8
0,3.077,2.000,2.579,31.333,2.911,1.791,1.592,2.830,3.267,6.306,...,0.000,0.000,2.381,2.664,2.608,2.196,1.642,1.122,0.000,0.000
1,2.655,2.000,2.466,26.000,2.696,1.801,1.610,2.440,3.274,5.194,...,0.000,0.000,2.307,2.512,2.386,1.809,0.935,0.000,0.000,0.000
2,2.655,2.000,2.108,27.000,2.693,1.801,1.610,2.446,2.512,5.194,...,0.000,0.000,2.205,2.386,2.291,1.768,0.832,0.000,0.000,0.000
3,2.764,2.000,2.466,26.000,2.717,1.801,1.610,2.434,3.152,5.194,...,0.000,0.000,2.307,2.512,2.338,1.720,1.118,0.421,0.000,0.000
4,2.655,2.000,2.471,25.500,2.696,1.801,1.610,2.440,3.274,5.194,...,0.000,0.000,2.325,2.542,2.443,1.868,0.935,0.000,0.000,0.000
5,2.667,1.778,2.133,22.500,2.394,1.558,1.385,1.782,3.632,5.583,...,0.000,0.000,1.934,1.953,1.723,1.348,0.717,0.000,0.000,0.000
6,2.476,1.714,1.797,16.500,2.114,1.575,1.448,1.463,2.822,3.861,...,0.000,0.000,1.744,1.606,1.313,1.000,0.537,0.000,0.000,0.000
7,3.018,2.000,2.399,25.000,2.770,1.849,1.692,2.456,2.736,4.583,...,0.346,0.000,2.355,2.510,2.307,1.881,1.517,0.930,0.421,0.000
8,2.889,1.778,2.084,21.500,2.447,1.608,1.459,1.895,3.276,4.972,...,0.000,0.000,1.978,1.946,1.720,1.442,1.176,0.537,0.000,0.000
9,1.667,1.500,0.778,10.000,1.279,1.414,1.333,0.836,1.975,2.500,...,0.000,0.000,1.106,0.705,0.233,0.000,0.000,0.000,0.000,0.000


#### Feature Selection

In [10]:
from smdt import feature_selection

# Tree-based feature selection over the descriptor table against the activity
# target. The call returns a pair: the selected feature names and a reduced frame.
# NOTE(review): the trailing 100 is passed positionally — presumably the number
# of features to keep; confirm against smdt.feature_selection.
selection_result = feature_selection.tree_based_feature_selection(
    molecular_descriptors, target_activity, 100)
selected_features, df_x_new = selection_result

In [11]:
# Names returned by the feature-selection step.
selected_features

['slogPVSA3',
 'Smax36',
 'Sito',
 'CATS_NL8',
 'CATS_DD0',
 'MATSe5',
 'MATSv5',
 'MATSm5',
 'GATSm5',
 'MATSv2',
 'CATS_DA5',
 'Smin15',
 'QHmin',
 'GATSp5',
 'GATSp6',
 'GATSv4',
 'Arto',
 'CATS_DN1',
 'Rpc',
 'bcute3',
 'CATS_DD6',
 'PEOEVSA7',
 'Chi3c',
 'J',
 'ATSm6',
 'Chi1',
 'PEOEVSA8',
 'Smin34',
 'CATS_PN5',
 'QOmax',
 'QNss',
 'Smin17',
 'Smin8',
 'knotpv',
 'GATSe5',
 'CATS_PL4',
 'Chi4pc',
 'MATSm6',
 'QCss',
 'Chiv6',
 'ZM1',
 'PEOEVSA9',
 'ATSe3',
 'ATSe8',
 'Smax20',
 'GATSv1',
 'Smax33',
 'GATSv5',
 'dchi0',
 'PEOEVSA2',
 'S37',
 'PEOEVSA0',
 'ATSe7',
 'CATS_DP6',
 'Chi5',
 'QHmax',
 'S16',
 'MTPSA',
 'S53',
 'LDI',
 'bcutm9',
 'Chi4',
 'EstateVSA5',
 'QCmin',
 'GATSp2',
 'CATS_LL6',
 'CATS_AA0',
 'mChi1',
 'GATSe1',
 'Smax52',
 'Smin12',
 'CATS_AA4',
 'Tnc',
 'Chiv0',
 'Mac',
 'CATS_DA3',
 'Smax53',
 'CATS_DP0',
 'noxy',
 'EstateVSA4',
 'MATSm3',
 'PEOEVSA6',
 'PC2',
 'kappam3',
 'slogPVSA4',
 'ATSv7',
 'EstateVSA2',
 'bcutm7',
 'S29',
 'ATSv5',
 'CATS_DL4',
 'CATS_DL6',
 'bcute9',
 'Platt',
 'Weight',
 'naccr',
 'W',
 'CATS_LL2',
 'Smax6',
 'bcutp1']

In [12]:
# Show only the first ten rows of the feature-selected frame instead of dumping all of it.
df_x_new.head(10)

Unnamed: 0,slogPVSA3,Smax36,Sito,CATS_NL8,CATS_DD0,MATSe5,MATSv5,MATSm5,GATSm5,MATSv2,...,CATS_DL4,CATS_DL6,bcute9,Platt,Weight,naccr,W,CATS_LL2,Smax6,bcutp1
0,20.229,0.000,3.289,0.000,0.000,-0.494,-0.285,-0.119,0.755,0.189,...,0.000,0.000,1.845,36,199.529,4,240.0,0.200,0.000,3.816
1,0.000,0.000,2.812,0.000,0.500,0.375,0.375,0.375,0.982,0.021,...,0.250,0.000,1.839,30,148.073,3,146.0,0.200,0.000,3.806
2,0.000,0.000,2.812,0.000,0.400,0.650,0.650,0.650,0.458,-0.242,...,0.000,0.000,1.687,30,152.061,5,146.0,0.000,0.000,3.593
3,0.000,0.000,2.812,0.000,0.500,0.100,0.100,0.100,1.179,0.021,...,0.250,0.000,1.839,30,148.073,3,152.0,0.200,0.000,3.806
4,0.000,0.000,2.812,0.000,0.500,0.644,0.572,0.638,1.170,0.080,...,0.250,0.000,1.851,30,146.081,3,146.0,0.200,0.000,3.816
5,0.000,0.000,1.732,0.000,0.500,1.120,1.035,0.002,0.000,-0.016,...,0.000,0.000,1.543,20,146.103,3,96.0,0.000,0.000,3.572
6,0.000,0.000,1.380,0.000,0.667,0.444,0.778,0.477,0.550,0.091,...,0.000,0.000,1.552,12,94.049,2,52.0,0.000,0.000,3.423
7,0.000,0.000,2.937,0.000,0.667,-0.771,-0.771,-0.771,1.910,0.146,...,0.250,0.000,1.855,28,144.085,2,166.0,0.286,0.000,3.812
8,0.000,0.000,1.857,0.000,0.667,0.225,0.133,0.217,0.953,-0.108,...,0.250,0.000,1.581,18,122.059,3,104.0,0.000,0.000,3.478
9,0.000,0.000,0.602,0.000,0.667,0.000,0.000,0.000,0.000,-1.000,...,0.000,0.000,1.329,4,56.020,2,10.0,0.000,0.000,3.154


#### Models

In [13]:
from smdt import models

In [38]:
# Call models.build_linear on the descriptor table and activity target, then
# display the value it returns.
# NOTE(review): this passes the full molecular_descriptors frame, not the
# feature-selected df_x_new — confirm that is intended.
# NOTE(review): the meaning of the positional 50 is defined in smdt.models and
# is not visible here — confirm.
linear_model_metric = models.build_linear(
    molecular_descriptors,
    target_activity,
    50
)
linear_model_metric

-3.7999635675548524

In [37]:
# Call models.build_random_forest on the descriptor table and activity target,
# then display the value it returns.
# NOTE(review): the meaning of the positional 50 is defined in smdt.models and
# is not visible here — confirm.
random_forest_metric = models.build_random_forest(
    molecular_descriptors,
    target_activity,
    50
)
random_forest_metric

-0.37946785643466197

In [36]:
# Call models.build_lasso on the descriptor table and activity target, then
# display the value it returns.
# NOTE(review): the meaning of the positional 50 is defined in smdt.models and
# is not visible here — confirm.
lasso_metric = models.build_lasso(
    molecular_descriptors,
    target_activity,
    50
)
lasso_metric

-0.098552963079481526

In [30]:
# Call models.build_ridge on the descriptor table and activity target, then
# display the value it returns.
# NOTE(review): the meaning of the positional 50 is defined in smdt.models and
# is not visible here — confirm.
ridge_metric = models.build_ridge(
    molecular_descriptors,
    target_activity,
    50
)
ridge_metric

-0.78518879445461331

In [27]:
# Call models.build_elastic_net on the descriptor table and activity target,
# then display the value it returns.
# NOTE(review): the meaning of the positional 50 is defined in smdt.models and
# is not visible here — confirm.
elastic_net_metric = models.build_elastic_net(
    molecular_descriptors,
    target_activity,
    50
)
elastic_net_metric

-0.15846551928292721

In [25]:
# Call models.build_linear_SVR on the descriptor table and activity target,
# then display the value it returns.
# NOTE(review): the meaning of the positional 50 is defined in smdt.models and
# is not visible here — confirm.
linear_SVR_metric = models.build_linear_SVR(
    molecular_descriptors,
    target_activity,
    50
)
linear_SVR_metric

-0.06138140393046667

# Sweetness

#### Data

In [52]:
# Locate the dataset folder: a 'small-molecule-design-toolkit' directory is
# expected one level above the current working directory.
# The earlier inspect.getfile(inspect.currentframe()) approach only landed on
# that location when the kernel reports a pseudo-filename for the cell; on
# kernels that compile cells from a real temporary file it resolves elsewhere.
path = op.join(op.abspath(op.pardir),
               'small-molecule-design-toolkit', 'datasets', 'sweetness')

# Read the combined table once under its own name, then split it into two
# single-column frames (previously df_x held first the raw table and then
# the trimmed one).
# NOTE: df_x / df_y are rebound here, replacing the USP-inhibition frames
# that were loaded under the same names earlier in the notebook.
sweet_raw = pd.read_csv(op.join(path, 'sweet.csv'))
df_y = sweet_raw[['Sweetness']]
df_x = sweet_raw[['SMILES']]

In [53]:
# Show only the first ten SMILES rows instead of dumping the whole frame.
df_x.head(10)

Unnamed: 0,SMILES
0,O(C(=O)[C@@H](NC(=O)[C@@H](N)CC(=O)O)Cc1ccccc1)C
1,O1[C@H](O[C@@H]([C@H](O)[C@@H](O)CO)[C@H](O)CO...
2,[Ca].S(=O)(=O)([O-])NC1CCCCC1
3,OC(C(O)C(O)C=O)C(O)CO.OC(C(O)CO)C(O)C(=O)CO
4,O(C(=O)C(NC(=O)C(N)CC(=O)O)Cc1ccccc1)C
5,[Ca+2].S1(=O)(=O)N=C([O-])c2c1cccc2.S1(=O)(=O)...
6,S(=O)(=O)([O-])NC1CCCCC1.[Na+]
7,S1(=O)(=O)N=C([O-])c2c1cccc2.[Na+]
8,[K+].S(=O)(=O)([O-])NC1CCCCC1
9,S1(=O)(=O)N=C([O-])c2c1cccc2.[Na+].O.O


In [54]:
# Show only the first ten sweetness rows instead of dumping the whole frame.
df_y.head(10)

Unnamed: 0,Sweetness
0,140 - 180
1,04
2,30-50
3,
4,140 - 180
5,
6,
7,300
8,50
9,


In [55]:
# Summary statistics of the sweetness target frame.
df_y.describe()

Unnamed: 0,Sweetness
count,214
unique,50
top,75
freq,74


In [57]:
# Number of rows whose Sweetness value is missing.
int(df_y['Sweetness'].isnull().sum())

7983