# USP Inhibition

#### Imports

In [1]:
from os import path as op
import inspect
import pandas as pd
from smdt.descriptors import descriptors
from functools import reduce

#### Data

In [2]:
# Locate the dataset folder relative to the parent of the kernel's working
# directory.  The earlier lookup went through
# inspect.getfile(inspect.currentframe()), which only resolves to the working
# directory while IPython reports pseudo filenames such as
# "<ipython-input-2-...>" for cells; kernels that report a temp-file path for
# the cell send that lookup to the temp directory instead.
path = op.join(op.dirname(op.abspath(op.curdir)),
               'small-molecule-design-toolkit', 'datasets', 'usp inhibition')

# Both CSV files keep their row labels in the 'Unnamed: 0' column; use it as
# the index so the two frames share the same labels.
df_x = pd.read_csv(op.join(path, 'SMILES.csv'), index_col='Unnamed: 0')
df_y = pd.read_csv(op.join(path, 'Activity.csv'), index_col='Unnamed: 0')

In [3]:
# Preview the first 10 rows of the SMILES table loaded from SMILES.csv.
df_x.head(10)

Unnamed: 0,SMILES
0,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl
1,C1=CC(=C(C(=C1)O)O)C(=O)O
2,C1C(C(C(OC1O)CO)O)O
3,C1=CC(=C(C=C1C(=O)O)O)O
4,C1=CC(=C(C(=C1)O)N)C(=O)O
5,C(C(C(=O)O)N)S(=O)O
6,C(CC(=O)O)CN
7,C1=CC(=CC=C1CC(=O)O)O
8,C(CC(=O)O)C(=O)CN
9,C(CO)O


In [4]:
# Preview the first 10 rows of the activity table loaded from Activity.csv.
df_y.head(10)

Unnamed: 0,Activity_Score
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0


In [5]:
# Summary statistics of the activity table (count excludes missing values).
df_y.describe()

Unnamed: 0,Activity_Score
count,389492.0
mean,0.782091
std,3.934012
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,100.0


In [6]:
# Number of rows whose Activity_Score is missing.
int(df_y['Activity_Score'].isnull().sum())

66

#### Descriptor Calculations

In [7]:
# Row labels excluded from the USP data set before descriptor calculation.
# NOTE(review): presumably rows on which the descriptor calculation fails or
# stalls -- the notebook does not say how the list was derived; confirm.
bottleneck_rows = [
    430, 434, 524, 655, 660, 667, 674, 677, 679, 680, 690, 691, 696, 697, 701,
    702, 703, 705, 708, 709, 718, 720, 727, 730, 733, 734, 735, 738, 739, 743,
    745, 750, 755, 760, 764, 771, 779, 785, 826, 859, 939, 994, 995, 997, 1007,
    1026, 1031, 1033, 1038, 1043, 1050, 1052, 1053, 1054, 1058, 1059, 1069,
    1076, 1086, 1088, 1096, 1097, 1098, 1100, 1107, 1108, 1116, 1117, 1121,
    1128, 1168, 1183, 1188, 1200, 1201, 1202, 1203, 1206, 1212, 1213, 1224,
    1230, 1231, 1236, 1281, 1283, 1309, 1327, 1334, 1346, 1367, 1375, 1381,
    1405, 1414, 1441, 1451, 1462, 1477, 1491, 1501,
]

# Drop the same labels from inputs and targets so both frames stay aligned.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the labels were already removed by the first run.
df_x.drop(bottleneck_rows, inplace=True, errors='ignore')
df_y.drop(bottleneck_rows, inplace=True, errors='ignore')

In [8]:
# Only the first 100 remaining molecules are used: descriptors are computed
# for them and the matching first 100 activity rows become the targets.
target_activity = df_y.head(100)
molecular_descriptors = descriptors.Calculate_AllDescriptors(df_x.head(100))

In [9]:
# Display the descriptor table computed for the 100-molecule subset.
molecular_descriptors

Unnamed: 0,AW,Arto,BertzCT,DZ,GMTI,Geto,Hato,Ipc,J,MZM1,...,ATSp7,ATSp8,ATSv1,ATSv2,ATSv3,ATSv4,ATSv5,ATSv6,ATSv7,ATSv8
0,3.077,2.000,2.579,31.333,2.911,1.791,1.592,2.830,3.267,6.306,...,0.000,0.000,2.381,2.664,2.608,2.196,1.642,1.122,0.000,0.000
1,2.655,2.000,2.466,26.000,2.696,1.801,1.610,2.440,3.274,5.194,...,0.000,0.000,2.307,2.512,2.386,1.809,0.935,0.000,0.000,0.000
2,2.655,2.000,2.108,27.000,2.693,1.801,1.610,2.446,2.512,5.194,...,0.000,0.000,2.205,2.386,2.291,1.768,0.832,0.000,0.000,0.000
3,2.764,2.000,2.466,26.000,2.717,1.801,1.610,2.434,3.152,5.194,...,0.000,0.000,2.307,2.512,2.338,1.720,1.118,0.421,0.000,0.000
4,2.655,2.000,2.471,25.500,2.696,1.801,1.610,2.440,3.274,5.194,...,0.000,0.000,2.325,2.542,2.443,1.868,0.935,0.000,0.000,0.000
5,2.667,1.778,2.133,22.500,2.394,1.558,1.385,1.782,3.632,5.583,...,0.000,0.000,1.934,1.953,1.723,1.348,0.717,0.000,0.000,0.000
6,2.476,1.714,1.797,16.500,2.114,1.575,1.448,1.463,2.822,3.861,...,0.000,0.000,1.744,1.606,1.313,1.000,0.537,0.000,0.000,0.000
7,3.018,2.000,2.399,25.000,2.770,1.849,1.692,2.456,2.736,4.583,...,0.346,0.000,2.355,2.510,2.307,1.881,1.517,0.930,0.421,0.000
8,2.889,1.778,2.084,21.500,2.447,1.608,1.459,1.895,3.276,4.972,...,0.000,0.000,1.978,1.946,1.720,1.442,1.176,0.537,0.000,0.000
9,1.667,1.500,0.778,10.000,1.279,1.414,1.333,0.836,1.975,2.500,...,0.000,0.000,1.106,0.705,0.233,0.000,0.000,0.000,0.000,0.000


#### Feature Selection

In [10]:
# Tree-based feature selection from smdt, run on the descriptor table against
# the activity targets.  It returns the selected feature names and the
# descriptor table restricted to those columns.
# NOTE(review): the third argument (100) is presumably the number of features
# to keep -- confirm against smdt.feature_selection.
from smdt import feature_selection
selected_features, df_x_new = feature_selection.tree_based_feature_selection(molecular_descriptors,target_activity,100)

In [11]:
# Display the names of the selected descriptor columns.
selected_features

['CATS_AN3',
 'J',
 'GMTI',
 'Mac',
 'MATSv6',
 'Smin33',
 'Smax33',
 'MRVSA4',
 'GATSm4',
 'EstateVSA2',
 'CATS_AP6',
 'CATS_NL3',
 'MATSm5',
 'MATSm3',
 'QHmax',
 'slogPVSA5',
 'Smin23',
 'Chi6ch',
 'Tac',
 'CATS_LL5',
 'QHss',
 'Smin36',
 'MATSe5',
 'S12',
 'PEOEVSA12',
 'QHmin',
 'nnitro',
 'GATSp3',
 'MATSe2',
 'MATSv2',
 'GATSp4',
 'S37',
 'bcutm1',
 'slogPVSA2',
 'GATSm1',
 'BertzCT',
 'GATSe5',
 'kappam2',
 'Smin16',
 'bcutp1',
 'bcutm4',
 'Chi4',
 'S21',
 'nhyd',
 'MATSe7',
 'GATSp5',
 'CATS_LL3',
 'Pol',
 'CATS_LL6',
 'CATS_DP6',
 'CATS_AA2',
 'CATS_PN5',
 'Smax36',
 'MATSe4',
 'Rpc',
 'GATSe6',
 'CATS_AN2',
 'Smin46',
 'Smax9',
 'Smin1',
 'Smin10',
 'Smin3',
 'Smin55',
 'Smin45',
 'Smin56',
 'Smin44',
 'Smin11',
 'Smin43',
 'Smax8',
 'Smax76',
 'Smax78',
 'Smax77',
 'Smin13',
 'Smin47',
 'Smax75',
 'Smax74',
 'Smax73',
 'Smin48',
 'Smin49',
 'Smin5',
 'Smax72',
 'Smax71',
 'Smax70',
 'Smin12',
 'Smin42',
 'Smin14',
 'Smin15',
 'Smin30',
 'Smin29',
 'Smin28',
 'Smin51',
 'Smi

In [12]:
# Display the descriptor table reduced to the selected features.
df_x_new

Unnamed: 0,CATS_AN3,J,GMTI,Mac,MATSv6,Smin33,Smax33,MRVSA4,GATSm4,EstateVSA2,...,Smin51,Smin27,Smin26,Smin25,Smin24,Smin52,Smin31,Smin32,Smin22,Smin34
0,0.000,3.267,2.911,0.142,0.765,0.000,0.000,0.000,1.297,0.000,...,0.000,0.0,0,0,0.000,0.000,0.000,0,0,10.227
1,0.250,3.274,2.696,0.218,0.000,8.419,8.937,0.000,1.375,0.000,...,0.000,0.0,0,0,0.000,0.000,0.000,0,0,10.303
2,0.000,2.512,2.693,0.164,0.000,8.590,9.109,31.023,1.100,0.000,...,0.000,0.0,0,0,0.000,0.000,0.000,0,0,0.000
3,0.000,3.152,2.717,0.211,1.750,8.407,8.836,0.000,1.719,0.000,...,0.000,0.0,0,0,0.000,0.000,0.000,0,0,10.272
4,0.000,3.274,2.696,0.194,0.000,8.499,8.961,0.000,1.579,0.000,...,0.000,0.0,0,0,0.000,0.000,0.000,0,0,10.383
5,0.000,3.632,2.394,0.198,0.000,8.060,8.101,6.042,1.198,0.000,...,-2.122,0.0,0,0,0.000,0.000,0.000,0,0,9.864
6,0.000,2.822,2.114,0.138,0.000,7.989,7.989,12.842,1.658,12.966,...,0.000,0.0,0,0,0.000,0.000,0.000,0,0,9.703
7,0.000,2.736,2.770,0.150,-1.000,8.374,8.843,6.421,1.432,5.563,...,0.000,0.0,0,0,0.000,0.000,0.000,0,0,10.191
8,0.000,3.276,2.447,0.151,0.619,8.065,8.065,12.842,1.377,0.000,...,0.000,0.0,0,0,0.000,0.000,0.000,0,0,9.827
9,0.000,1.975,1.279,0.158,0.000,7.625,7.625,0.000,0.000,0.000,...,0.000,0.0,0,0,0.000,0.000,0.000,0,0,0.000


#### Models

In [13]:
from smdt import models

In [14]:
# Linear model from smdt, called with the full descriptor table (not the
# reduced df_x_new) and the activity targets; the result is shown as its metric.
# NOTE(review): the meaning of the third argument (50) and of the returned
# metric is defined in smdt.models -- confirm there.
linear_model_metric = models.build_linear(molecular_descriptors,target_activity,50)
linear_model_metric

-23.273805165710339

In [15]:
# Random forest from smdt, called with the full descriptor table and the
# activity targets; the result is shown as its metric.
# NOTE(review): meaning of the third argument (50) and of the metric -- confirm in smdt.models.
random_forest_metric = models.build_random_forest(molecular_descriptors,target_activity,50)
random_forest_metric

-0.20173705466322955

In [16]:
# Lasso model from smdt, called with the full descriptor table and the
# activity targets; the result is shown as its metric.
# NOTE(review): meaning of the third argument (50) and of the metric -- confirm in smdt.models.
lasso_metric = models.build_lasso(molecular_descriptors,target_activity,50)
lasso_metric

-0.02209330872245303

In [17]:
# Ridge model from smdt, called with the full descriptor table and the
# activity targets; the result is shown as its metric.
# NOTE(review): meaning of the third argument (50) and of the metric -- confirm in smdt.models.
ridge_metric = models.build_ridge(molecular_descriptors,target_activity,50)
ridge_metric

-0.82966671233234091

In [18]:
# Elastic-net model from smdt, called with the full descriptor table and the
# activity targets; the result is shown as its metric.
# NOTE(review): meaning of the third argument (50) and of the metric -- confirm in smdt.models.
elastic_net_metric = models.build_elastic_net(molecular_descriptors,target_activity,50)
elastic_net_metric

-0.052762899813601917

In [19]:
# Linear SVR from smdt, called with the full descriptor table and the
# activity targets; the result is shown as its metric.
# NOTE(review): meaning of the third argument (50) and of the metric -- confirm in smdt.models.
linear_SVR_metric = models.build_linear_SVR(molecular_descriptors,target_activity,50)
linear_SVR_metric

-0.058719697959271587

# Sweetness

#### Data

In [20]:
# Locate the dataset folder relative to the parent of the kernel's working
# directory.  The earlier lookup went through
# inspect.getfile(inspect.currentframe()), which only resolves to the working
# directory while IPython reports pseudo filenames such as
# "<ipython-input-20-...>" for cells; kernels that report a temp-file path for
# the cell send that lookup to the temp directory instead.
path = op.join(op.dirname(op.abspath(op.curdir)),
               'small-molecule-design-toolkit', 'datasets', 'sweetness')

# Keep only the two columns that are used and discard rows missing either one.
sweet_table = pd.read_csv(op.join(path, 'sweet.csv'))[['SMILES', 'Sweetness']].dropna()

# Split into targets (df_y) and inputs (df_x); both keep the original row labels.
df_y = pd.DataFrame(sweet_table['Sweetness'])
df_x = pd.DataFrame(sweet_table['SMILES'])

# Row labels excluded before descriptor calculation; dropped from both frames
# so inputs and targets stay aligned.
bottleneck_rows = [2,7,8,18,53,102,105,106,107,108,136,160,8141,8159,8169]
df_x.drop(bottleneck_rows, inplace=True)
df_y.drop(bottleneck_rows, inplace=True)

# Disabled diagnostic, kept for reference: it runs the descriptor calculation
# one row at a time and prints each row label first.
# NOTE(review): presumably this is how bottleneck_rows was identified -- confirm.
#
# new = pd.DataFrame()
# for i in list(df_x.index)[170:]:
#     print(i)
#     data = pd.DataFrame(df_x.loc[i])
#     data.columns=['SMILES']
#     data.reset_index(inplace=True)
#     new = descriptors.Calculate_AllDescriptors(data)

def _parse_sweetness(raw):
    """Convert one raw sweetness entry into a single float.

    The raw column holds text: spaces are ignored, a comma is treated as the
    decimal separator, and a '-' separates several values that are averaged
    (e.g. '100-200' -> 150.0).  An entry that is empty after cleaning counts
    as 0.0.  Values that are already numeric are returned unchanged (as
    float), so re-running the cell on an already converted column is safe.

    Raises ValueError when a piece between '-' separators is not a valid
    float (this includes entries with a leading or trailing '-').
    """
    if isinstance(raw, (int, float)):
        return float(raw)
    cleaned = raw.replace(' ', '').replace(',', '.')
    if cleaned == '':
        return 0.0
    values = [float(part) for part in cleaned.split('-')]
    return sum(values) / len(values)


# One parsed value per row, in row order; Sweet_list is kept as a notebook
# global alongside the converted column.
Sweet_list = [_parse_sweetness(raw) for raw in df_y['Sweetness']]

df_y['Sweetness'] = Sweet_list

In [21]:
# Preview the first 10 remaining SMILES rows (row labels have gaps where rows were dropped).
df_x.head(10)

Unnamed: 0,SMILES
0,O(C(=O)[C@@H](NC(=O)[C@@H](N)CC(=O)O)Cc1ccccc1)C
1,O1[C@H](O[C@@H]([C@H](O)[C@@H](O)CO)[C@H](O)CO...
4,O(C(=O)C(NC(=O)C(N)CC(=O)O)Cc1ccccc1)C
13,O1[C@@H](OC(C(O)C(O)CO)C(O)CO)[C@H](O)[C@@H](O...
15,S(=O)(=O)([O-])NC1CCCCC1
16,S1(=O)(=O)N=C([O-])c2c1cccc2
17,O([C@@]12C[C@@]3(C([C@@]4(C(CC3)[C@](CCC4)(C)C...
20,ClC1C(O)C(O)C(OC2(OC(C(O)C2O)CCl)CCl)OC1CO
21,S1(=O)(=O)NC(=O)c2c1cccc2
25,O(C(=O)[C@H](NC(=O)[C@H](N)CC(=O)O)Cc1ccccc1)C


In [22]:
# Preview the first 10 parsed sweetness values.
df_y.head(10)

Unnamed: 0,Sweetness
0,160.0
1,0.4
4,160.0
13,0.4
15,30.0
16,300.0
17,250.0
20,600.0
21,300.0
25,160.0


In [23]:
# Summary statistics of the parsed sweetness values.
df_y.describe()

Unnamed: 0,Sweetness
count,184.0
mean,302.081467
std,1347.487578
min,0.0
25%,0.175
50%,0.75
75%,0.75
max,8000.0


In [24]:
# Number of rows whose Sweetness value is missing.
int(df_y['Sweetness'].isnull().sum())

0

#### Descriptor Calculations

In [25]:
# Compute descriptors for every remaining sweetness molecule; the targets are
# the parsed sweetness values.
# NOTE(review): df_y keeps its gapped row labels (0, 1, 4, 13, ...) while the
# df_x_new output further down is labelled 0, 1, 2, ... -- confirm that smdt
# pairs inputs and targets by position rather than by label.
molecular_descriptors = descriptors.Calculate_AllDescriptors(df_x)
target_activity = df_y

In [None]:
# Display the sweetness descriptor table (this cell has no recorded execution count or output).
molecular_descriptors

#### Feature Selection

In [27]:
# Tree-based feature selection from smdt, run on the sweetness descriptor
# table against the sweetness targets.  It returns the selected feature names
# and the descriptor table restricted to those columns.
# NOTE(review): the third argument (100) is presumably the number of features
# to keep -- confirm against smdt.feature_selection.
from smdt import feature_selection
selected_features, df_x_new = feature_selection.tree_based_feature_selection(molecular_descriptors,target_activity,100)

In [28]:
# Display the names of the selected descriptor columns.
selected_features

['kappam3',
 'kappa3',
 'S35',
 'CATS_NL7',
 'CATS_AL7',
 'S24',
 'CATS_NL9',
 'CATS_PL5',
 'LabuteASA',
 'MATSm8',
 'MATSm7',
 'MATSe8',
 'MATSp8',
 'GATSe8',
 'MATSp7',
 'MATSe7',
 'GATSv7',
 'MATSv8',
 'S7',
 'Chiv4pc',
 'CATS_AL3',
 'slogPVSA0',
 'CATS_AL8',
 'CATS_LL9',
 'bcutp15',
 'Chiv0',
 'Smin6',
 'EstateVSA7',
 'CATS_AP3',
 'CATS_AP4',
 'Ipc',
 'Tigdi',
 'Sito',
 'CATS_PP4',
 'J',
 'QCss',
 'EstateVSA3',
 'GATSm2',
 'Smin35',
 'GATSe6',
 'CATS_AA8',
 'ATSm7',
 'MATSp2',
 'MATSm2',
 'CATS_DP4',
 'CATS_AL2',
 'CATS_LL0',
 'Qmax',
 'MRVSA9',
 'Smin18',
 'Smax11',
 'slogPVSA3',
 'MRVSA4',
 'slogPVSA5',
 'CATS_LL8',
 'PC2',
 'MRVSA6',
 'Smin34',
 'petitjeant',
 'PEOEVSA7',
 'VSAEstate9',
 'CATS_DD7',
 'Smin11',
 'CATS_AA3',
 'Smin15',
 'nsb',
 'CATS_AN7',
 'CATS_AA5',
 'CATS_PL2',
 'CATS_AN1',
 'QOmax',
 'S16',
 'S9',
 'S11',
 'Smax10',
 'noxy',
 'bcutp6',
 'CATS_DA0',
 'ATSv4',
 'AW',
 'MATSp1',
 'Smin12',
 'slogPVSA1',
 'Smax6',
 'CATS_LL4',
 'GATSv5',
 'EstateVSA2',
 'CATS_DL9

In [29]:
# Display the descriptor table reduced to the selected features.
df_x_new

Unnamed: 0,kappam3,kappa3,S35,CATS_NL7,CATS_AL7,S24,CATS_NL9,CATS_PL5,LabuteASA,MATSm8,...,MATSe2,CATS_LL3,CATS_AL5,Smin23,GATSp4,MATSm5,ATSm3,CATS_AL4,bcutm12,Chi4pc
0,5.616,7.200,34.024,0.067,0.176,2.420,0.133,0.125,121.633,0.650,...,0.205,0.139,0.147,2.420,1.109,0.311,3.492,0.088,1.409,2.765
1,4.776,4.989,0.000,0.000,0.000,0.000,0.000,0.000,131.123,-0.077,...,-0.336,0.000,0.000,0.000,0.986,0.088,3.998,0.000,1.439,4.987
2,5.616,7.200,34.024,0.067,0.176,2.420,0.133,0.125,121.633,0.650,...,0.205,0.139,0.147,2.420,1.109,0.311,3.492,0.088,1.409,2.765
3,4.776,4.989,0.000,0.000,0.000,0.000,0.000,0.000,131.123,-0.077,...,-0.336,0.000,0.000,0.000,0.986,0.088,3.998,0.000,1.439,4.987
4,3.691,3.787,20.466,0.000,0.000,2.075,0.000,0.000,66.872,0.000,...,0.805,0.133,0.400,2.075,1.197,-0.311,2.752,0.400,0.860,1.003
5,0.939,1.331,22.152,0.000,0.000,0.000,0.000,0.000,69.241,0.000,...,0.262,0.133,0.150,0.000,0.878,-0.110,3.328,0.400,0.813,2.802
6,6.914,7.351,14.082,0.056,0.200,0.000,0.000,0.000,324.091,-0.159,...,-0.026,0.222,0.124,0.000,0.798,0.123,5.014,0.071,1.829,16.629
7,3.584,3.383,0.000,0.000,0.130,0.000,0.000,0.000,146.964,0.048,...,-0.304,0.000,0.217,0.000,1.079,-0.052,4.352,0.174,1.427,6.103
8,0.939,1.331,33.254,0.000,0.000,1.892,0.000,0.000,69.263,0.000,...,0.262,0.133,0.200,1.892,0.878,-0.110,3.328,0.400,0.915,2.802
9,5.616,7.200,34.024,0.067,0.176,2.420,0.133,0.125,121.633,0.650,...,0.205,0.139,0.147,2.420,1.109,0.311,3.492,0.088,1.409,2.765


#### Models

In [30]:
from smdt import models

In [40]:
# Linear model from smdt, called with the full sweetness descriptor table (not
# the reduced df_x_new) and the sweetness targets; the result is shown as its metric.
# NOTE(review): the meaning of the third argument (10) and of the returned
# metric is defined in smdt.models -- confirm there.
linear_model_metric = models.build_linear(molecular_descriptors,target_activity,10)
linear_model_metric

-63517.789787756032

In [41]:
# Random forest from smdt, called with the full sweetness descriptor table and
# the sweetness targets; the result is shown as its metric.
# NOTE(review): meaning of the third argument (10) and of the metric -- confirm in smdt.models.
random_forest_metric = models.build_random_forest(molecular_descriptors,target_activity,10)
random_forest_metric

-2.6302408411748974

In [42]:
# Lasso model from smdt, called with the full sweetness descriptor table and
# the sweetness targets; the result is shown as its metric.
# NOTE(review): meaning of the third argument (10) and of the metric -- confirm in smdt.models.
lasso_metric = models.build_lasso(molecular_descriptors,target_activity,10)
lasso_metric

-18713.350920324818

In [43]:
# Ridge model from smdt, called with the full sweetness descriptor table and
# the sweetness targets; the result is shown as its metric.
# NOTE(review): meaning of the third argument (10) and of the metric -- confirm in smdt.models.
ridge_metric = models.build_ridge(molecular_descriptors,target_activity,10)
ridge_metric

-773004.25159976515

In [45]:
# Elastic-net model from smdt, called with the full sweetness descriptor table
# and the sweetness targets; the result is shown as its metric.
# NOTE(review): meaning of the third argument (10) and of the metric -- confirm in smdt.models.
elastic_net_metric = models.build_elastic_net(molecular_descriptors,target_activity,10)
elastic_net_metric

-1953559.8743412122