In [5]:
import pandas as pd
import cirpy as cpy
import numpy as np

# One-off lookup of a single compound name; the result is not stored.
cpy.resolve('1-AZOXYPROPANE', 'smiles')

# Resolve the 'name' value of every row to a SMILES string, in row order.
# NOTE(review): df is not defined in the visible cells - confirm where it is loaded.
smiles = [cpy.resolve(row['name'], 'smiles') for _, row in df.iterrows()]

from rdkit import rdBase
from rdkit import Chem
from rdkit.Chem.rdmolfiles import SmilesWriter
from rdkit.Chem.rdmolfiles import SDWriter
from rdkit.Chem.Fingerprints import FingerprintMols

def fingerprint(input_df):
    '''Add RDKit molecules and their fingerprints to a dataframe.

    Each value of the ``SMILES`` column is parsed into an RDKit Mol object and
    a fingerprint is computed from that Mol with
    ``FingerprintMols.FingerprintMol``. The two resulting lists are stored as
    the new columns ``Mol`` and ``Fingerprint``.

    Rows whose SMILES cannot be parsed get the string ``'None'`` in both
    columns.

    Args:
        input_df: dataframe with a ``SMILES`` column. It is modified in place.

    Returns:
        The same dataframe object, with the two columns added.
    '''

    mol_list = []
    fp_list = []

    for smiles in input_df['SMILES']:
        mol = Chem.rdmolfiles.MolFromSmiles(smiles)
        if not mol:
            # Placeholder keeps both lists aligned with the dataframe rows.
            mol_list.append('None')
            fp_list.append('None')
            continue
        mol_list.append(mol)
        # Append the fingerprint, not the Mol, so the Fingerprint column
        # holds fingerprints rather than a second copy of the Mol column.
        fp_list.append(FingerprintMols.FingerprintMol(mol))

    input_df['Mol'] = mol_list
    input_df['Fingerprint'] = fp_list

    return input_df

# Parse one hard-coded SMILES string; testmol is not used by the visible cells.
testmol = Chem.rdmolfiles.MolFromSmiles('Cc1cc(C)c(N)cc1C')

# Write the molecules stored in neg['Mol'] to an SDF file.
# NOTE(review): neg is not defined in the visible cells - presumably a
# dataframe returned by fingerprint(); confirm.
mols = [mol for mol in neg['Mol']]
writer = SDWriter('../../big datasets/neg.sdf')
try:
    for mol in mols:
        # fingerprint() stores the string 'None' for SMILES that failed to
        # parse; only real Mol objects can be written.
        if isinstance(mol, Chem.Mol):
            writer.write(mol)
finally:
    # Close the file even when a write raises, so the handle is not leaked.
    writer.close()

from natsort import natsorted, index_natsorted, order_by_index

In [87]:
# Load the drug data table; the following cells split it by row position into two classes.
drugall = pd.read_csv('../../big datasets/drug_data_to_use.csv')

In [88]:
# Rows from position 1017 onward form the set that is labelled class 0.
# .copy() makes drug_neg independent of drugall, so inserting the Class
# column in place never operates on a slice of the source frame.
drug_neg = drugall.iloc[1017:, :].copy()

In [89]:
# One 0 label per row of drug_neg, inserted as the second column ('Class').
classtox = [0] * len(drug_neg)

drug_neg.insert(loc=1, column='Class', value=classtox)

In [90]:
# Preview drug_neg to check that the Class column now sits at position 1.
drug_neg.head()

Unnamed: 0.2,Unnamed: 0,Class,Unnamed: 0.1,nAcid,ALogP,ALogp2,AMR,nN,nO,nS,...,JGI7,JGI8,JGI9,JGI10,VE1_D,VR1_D,WTPT-1,WTPT-2,WTPT-3,XLogP
1017,0,0,0,0.0,2.4441,5.973625,31.2668,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.1082116,17.712,10.46599,1.744332,9.237437,2.811
1018,1,0,0,0.0,2.3992,5.756161,30.7638,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.5511150000000004e-17,15.31854,10.523646,1.753941,9.301424,2.576
1019,2,0,0,0.0,1.9555,3.82398,25.7097,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.06081016,12.37595,8.696802,1.73936,6.95522,2.158
1020,3,0,0,0.0,1.5407,2.373756,20.6602,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.110223e-16,3000000.0,6.732051,1.683013,4.488034,1.816
1021,4,0,0,0.0,2.2057,4.865112,29.6691,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.220446e-16,3000004.0,10.675851,1.779308,7.109354,2.379


In [92]:
# The first 1017 rows form the set that is labelled class 1.
# .copy() makes drug_pos independent of drugall, so inserting the Class
# column in place never operates on a slice of the source frame.
drug_pos = drugall.iloc[:1017, :].copy()


drug_pos.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,nAcid,ALogP,ALogp2,AMR,nN,nO,nS,nP,...,JGI7,JGI8,JGI9,JGI10,VE1_D,VR1_D,WTPT-1,WTPT-2,WTPT-3,XLogP
0,0,65471,0.0,0.2842,0.08077,86.9785,0.0,4.0,0.0,0.0,...,0.021016,0.016358,0.012,0.012346,0.009428,230.520837,47.495419,2.065018,11.316324,2.665
1,1,13684,0.0,-0.6801,0.462536,94.5871,2.0,2.0,0.0,0.0,...,0.011177,0.018664,0.00776,0.01,0.163765,250.398458,47.828913,2.079518,13.788202,0.939
2,2,7909,0.0,1.046,1.094116,87.4012,0.0,4.0,0.0,0.0,...,0.019206,0.008902,0.010859,0.01374,0.124083,1645.934186,43.058016,2.050382,11.246419,2.072
3,3,3936,0.0,-0.3414,0.116554,64.9254,0.0,7.0,0.0,0.0,...,0.020991,0.008724,0.013333,0.0,0.169516,179.576794,36.989542,1.946818,18.859169,1.052
4,4,7060,0.0,1.6169,2.614366,70.6318,1.0,4.0,0.0,0.0,...,0.024706,0.013454,0.009896,0.012346,0.1471,143.314817,36.802051,1.93695,13.302624,1.45


In [93]:
# One 1 label per row of drug_pos, inserted as the second column ('Class').
classdrug = [1] * len(drug_pos)

drug_pos.insert(loc=1, column='Class', value=classdrug)

In [94]:
# Preview drug_pos to check that the Class column now sits at position 1.
drug_pos.head()

Unnamed: 0.2,Unnamed: 0,Class,Unnamed: 0.1,nAcid,ALogP,ALogp2,AMR,nN,nO,nS,...,JGI7,JGI8,JGI9,JGI10,VE1_D,VR1_D,WTPT-1,WTPT-2,WTPT-3,XLogP
0,0,1,65471,0.0,0.2842,0.08077,86.9785,0.0,4.0,0.0,...,0.021016,0.016358,0.012,0.012346,0.009428,230.520837,47.495419,2.065018,11.316324,2.665
1,1,1,13684,0.0,-0.6801,0.462536,94.5871,2.0,2.0,0.0,...,0.011177,0.018664,0.00776,0.01,0.163765,250.398458,47.828913,2.079518,13.788202,0.939
2,2,1,7909,0.0,1.046,1.094116,87.4012,0.0,4.0,0.0,...,0.019206,0.008902,0.010859,0.01374,0.124083,1645.934186,43.058016,2.050382,11.246419,2.072
3,3,1,3936,0.0,-0.3414,0.116554,64.9254,0.0,7.0,0.0,...,0.020991,0.008724,0.013333,0.0,0.169516,179.576794,36.989542,1.946818,18.859169,1.052
4,4,1,7060,0.0,1.6169,2.614366,70.6318,1.0,4.0,0.0,...,0.024706,0.013454,0.009896,0.012346,0.1471,143.314817,36.802051,1.93695,13.302624,1.45


In [95]:
# Stack the labelled class-1 rows on top of the labelled class-0 rows.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
drugall = pd.concat([drug_pos, drug_neg])

In [96]:
# Save the labelled table. The index is written as well (no index=False),
# so reading this file back with read_csv yields an extra unnamed column.
drugall.to_csv('../../big datasets/drugall_with_classes.csv')

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [None]:
class FakeSampler(BaseSampler):
    """Pass-through sampler whose resampling step returns X and y unchanged.

    NOTE(review): BaseSampler is not imported in the visible cells -
    presumably imblearn.base.BaseSampler; confirm before running this cell.
    """

    # Marker consumed by the base class; what 'bypass' means is not shown here.
    _sampling_type = 'bypass'

    def _fit_resample(self, X, y):
        # No resampling: hand the inputs straight back.
        return X, y


# Demo: plot a dataset untouched and after three over-samplers on a 2x2 grid.
# NOTE(review): this cell has no execution count and uses names that are not
# defined or imported in the visible cells (plt, create_dataset,
# make_pipeline, LinearSVC, plot_resampling, Counter, RandomOverSampler,
# ADASYN) - confirm where they come from before running it.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 15))
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
# First panel: the pass-through sampler, so the data is shown as generated.
sampler = FakeSampler()
clf = make_pipeline(sampler, LinearSVC())
plot_resampling(X, y, sampler, ax1)
ax1.set_title('Original data - y={}'.format(Counter(y)))

# Remaining panels: one over-sampler per axis, each with a fixed random_state.
ax_arr = (ax2, ax3, ax4)
for ax, sampler in zip(ax_arr, (RandomOverSampler(random_state=0),
                                SMOTE(random_state=0),
                                ADASYN(random_state=0))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_resampling(X, y, sampler, ax)
    # Title each panel with the sampler's class name.
    ax.set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

In [97]:
# Features are every column from position 2 onward, i.e. everything after the
# first column and the inserted 'Class' column; labels are the 'Class' column.
# NOTE(review): the info() output further down lists 'Unnamed: 0.1' as the
# first retained column; it looks like a leftover index/ID column rather than
# a descriptor - confirm whether it should be used as a feature.
drug_pos_features = drug_pos.iloc[:,2:]
# NOTE(review): not the last expression in the cell, so this preview is never displayed.
drug_pos_features.head()
drug_pos_class = drug_pos['Class']

drug_neg_features = drug_neg.iloc[:,2:]
drug_neg_class = drug_neg['Class']

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [19]:
def encode(series):
    """One-hot encode a series.

    The values are converted to strings first, so the resulting indicator
    columns are named by the string form of each distinct value.
    """
    as_text = series.astype(str)
    indicators = pd.get_dummies(as_text)
    return indicators

In [115]:
# Hold out 20% of the class-0 rows for testing; the fixed random_state makes the split repeatable.
x_train_neg, x_test_neg, y_train_neg, y_test_neg = train_test_split(drug_neg_features, drug_neg_class, test_size=0.2, random_state=12)

In [116]:
# Hold out 20% of the class-1 rows for testing, using the same random_state as the class-0 split.
x_train, x_test, y_train, y_test = train_test_split(drug_pos_features, drug_pos_class, test_size=0.2, random_state=12)

In [117]:
# Combine the per-class splits into overall train/test sets (class-1 rows
# first, then class-0 rows). pd.concat is used because DataFrame.append and
# Series.append were removed in pandas 2.0.
# NOTE: re-running this cell without re-running the class-1 split first adds
# the class-0 rows a second time.
x_train = pd.concat([x_train, x_train_neg])
x_test = pd.concat([x_test, x_test_neg])
y_train = pd.concat([y_train, y_train_neg])
y_test = pd.concat([y_test, y_test_neg])
# Attach the labels to the features by index so both can be filtered together.
all_train = x_train.join(y_train)
all_test = x_test.join(y_test)

In [76]:
# Features and labels must have matching lengths in both splits. The two
# checks are combined into one expression so neither result is discarded
# (only a cell's last expression is displayed).
(len(y_test) == len(x_test)) and (len(x_train) == len(y_train))

True

y_train = encode(y_train)  # one-hot encode the class data
y_test = encode(y_test)

# Treat +/-inf as missing and zero-fill every missing value. replace() and
# fillna() return new frames, so the results are assigned back; calling them
# without assignment leaves x_test / x_train unchanged.
# No float64 cast is applied here.
# NOTE(review): the frames appear to still contain object columns at this
# point (see the info() output below), which a float64 cast may reject.
x_test = x_test.replace([np.inf, -np.inf], np.nan).fillna(0)
x_train = x_train.replace([np.inf, -np.inf], np.nan).fillna(0)


In [43]:
# Check which container type x_train currently is.
type(x_train)

pandas.core.frame.DataFrame

In [101]:
# Summarise all_train (row count, column count, dtypes) to see which dtypes need handling.
all_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043 entries, 372 to 1172
Columns: 297 entries, Unnamed: 0.1 to Class
dtypes: float64(278), int64(2), object(17)
memory usage: 2.4+ MB


# Keep only the columns that are neither object nor int64, then cast what is
# left to float64. The cast is assigned back, because astype() returns a new
# frame and does nothing if its result is dropped.
x_train = x_train.select_dtypes(exclude=['object', 'int64']).astype('float64')

x_test = x_test.select_dtypes(exclude=['object', 'int64']).astype('float64')



x_train.info(), x_test.info()

In [119]:
# Drop the object (text) columns from the combined frames.
# No float64 cast is applied: the numeric columns keep their dtypes, so the
# integer 'Class' label stays an integer.
all_train = all_train.select_dtypes(exclude=['object'])

all_test = all_test.select_dtypes(exclude=['object'])

all_train.head(), all_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043 entries, 372 to 1172
Columns: 280 entries, Unnamed: 0.1 to Class
dtypes: float64(278), int64(2)
memory usage: 2.3 MB


(     Unnamed: 0.1  nAcid   ALogP    ALogp2       AMR   nN   nO   nS   nP   nF  \
 372         55003    0.0 -2.0083  4.033269   83.5689  2.0  5.0  0.0  0.0  1.0   
 360         52834    0.0 -1.5431  2.381158   98.3173  3.0  4.0  1.0  0.0  0.0   
 799          9091    0.0  0.1530  0.023409   98.4411  2.0  3.0  0.0  0.0  0.0   
 263         82999    0.0 -3.0456  9.275679  102.8853  5.0  1.0  0.0  0.0  0.0   
 425         76285    0.0 -0.5105  0.260610   93.8320  2.0  3.0  0.0  0.0  0.0   
 
      ...      JGI8      JGI9     JGI10     VE1_D       VR1_D     WTPT-1  \
 372  ...  0.011339  0.009392  0.005648  0.004290  252.735571  49.913289   
 360  ...  0.013213  0.009712  0.008328  0.158715  300.918715  52.817771   
 799  ...  0.008273  0.009643  0.005663  0.066221  368.184630  49.516759   
 263  ...  0.005603  0.008608  0.005493  0.083425  344.368465  55.858018   
 425  ...  0.019596  0.013438  0.013555  0.230550  200.220486  49.422468   
 
        WTPT-2     WTPT-3  XLogP  Class  
 372  

In [120]:
# Drop the columns that hold no usable value in any row, counting +/-inf as
# missing. The mask is computed explicitly instead of through the
# 'mode.use_inf_as_null' option, which current pandas releases do not provide.
# Only the column selection changes; the remaining cell values are untouched.
train_all_missing = all_train.replace([np.inf, -np.inf], np.nan).isna().all(axis=0)
all_train = all_train.loc[:, ~train_all_missing]
test_all_missing = all_test.replace([np.inf, -np.inf], np.nan).isna().all(axis=0)
all_test = all_test.loc[:, ~test_all_missing]

In [121]:
# Keep only the rows in which every value is finite (rows with any NaN or +/-inf are removed).
all_train = all_train[np.isfinite(all_train).all(1)]
all_test = all_test[np.isfinite(all_test).all(1)]

In [134]:
# Train and test must have the same number of columns; this compares counts only, not names or order.
all_train.shape[1] == all_test.shape[1]

True

In [126]:
# Preview the cleaned training frame.
all_train.head()

Unnamed: 0,Unnamed: 0.1,nAcid,ALogP,ALogp2,AMR,nN,nO,nS,nP,nF,...,JGI8,JGI9,JGI10,VE1_D,VR1_D,WTPT-1,WTPT-2,WTPT-3,XLogP,Class
372,55003,0.0,-2.0083,4.033269,83.5689,2.0,5.0,0.0,0.0,1.0,...,0.011339,0.009392,0.005648,0.00429,252.735571,49.913289,1.996532,21.705622,-0.684,1
360,52834,0.0,-1.5431,2.381158,98.3173,3.0,4.0,1.0,0.0,0.0,...,0.013213,0.009712,0.008328,0.158715,300.918715,52.817771,2.031453,22.609031,1.002,1
799,9091,0.0,0.153,0.023409,98.4411,2.0,3.0,0.0,0.0,0.0,...,0.008273,0.009643,0.005663,0.066221,368.18463,49.516759,2.063198,14.479043,0.926,1
263,82999,0.0,-3.0456,9.275679,102.8853,5.0,1.0,0.0,0.0,0.0,...,0.005603,0.008608,0.005493,0.083425,344.368465,55.858018,2.068815,18.823231,0.074,1
425,76285,0.0,-0.5105,0.26061,93.832,2.0,3.0,0.0,0.0,0.0,...,0.019596,0.013438,0.013555,0.23055,200.220486,49.422468,2.059269,15.418395,0.506,1


In [127]:
# Split the cleaned frames back into labels ('Class') and features (all other columns).
# NOTE(review): the features still include 'Unnamed: 0.1', which looks like a
# leftover index/ID column in the previews above - confirm it belongs in the model input.
y_train = all_train['Class']
x_train = all_train.drop(['Class'], axis=1)

x_test = all_test.drop(['Class'], axis=1)
y_test = all_test['Class']

In [128]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only and apply that same fitted
# transform to the test features. A second scaler fitted on the test set would
# use test-set statistics and put the two sets on different scales.
xscaler = StandardScaler().fit(x_train)
x_train = xscaler.transform(x_train)
x_test = xscaler.transform(x_test)
# Alias kept so any code that still refers to testscaler gets the training-fitted scaler.
testscaler = xscaler

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  return self.partial_fit(X, y)
  """


In [129]:
# Oversample the training set with SMOTE (fixed random_state for repeatability).
# Only the training data is passed in; the test set is not resampled.
sm = SMOTE(random_state=2)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

In [130]:
# Number of training feature rows before resampling.
len(x_train)

990

In [131]:
# Number of training labels before resampling.
len(y_train)

990

In [132]:
# Number of training feature rows returned by SMOTE.
len(x_train_res)

1626

In [133]:
# Number of training labels returned by SMOTE.
len(y_train_res)

1626

In [135]:
# Print the before/after-resampling lengths on one line for comparison.
print(len(x_train),len(y_train),len(x_train_res),len(y_train_res))

990 990 1626 1626


In [139]:
# Wrap the resampled training data and the test data in DataFrames so they can
# be written with to_csv.
x_train_res = pd.DataFrame(x_train_res)
y_train_res = pd.DataFrame(y_train_res)
x_test = pd.DataFrame(x_test)
# The labels go into y_test; assigning them to x_test would replace the test
# features with the labels.
y_test = pd.DataFrame(y_test)

In [140]:
# Write the resampled training set and the test set to CSV files under drugml/.
# The index is written as the first column of each file (no index=False).
x_train_res.to_csv('../../big datasets/drugml/x_train_res.csv')
y_train_res.to_csv('../../big datasets/drugml/y_train_res.csv')
x_test.to_csv('../../big datasets/drugml/x_test.csv')
y_test.to_csv('../../big datasets/drugml/y_test.csv')

  after removing the cwd from sys.path.
