In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier

In [None]:
!pip install rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, PandasTools, rdMolDescriptors

Collecting rdkit
  Downloading rdkit-2024.9.4-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.4-cp311-cp311-manylinux_2_28_x86_64.whl (34.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.2/34.2 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.4


In [25]:
hiv = pd.read_csv('/content/HIV (1).csv')
hiv

Unnamed: 0,smiles,activity,HIV_active
0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,CI,0
1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,CI,0
2,CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21,CI,0
3,Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1,CI,0
4,O=S(=O)(O)CCS(=O)(=O)O,CI,0
...,...,...,...
41122,CCC1CCC2c3c([nH]c4ccc(C)cc34)C3C(=O)N(N(C)C)C(...,CI,0
41123,Cc1ccc2[nH]c3c(c2c1)C1CCC(C(C)(C)C)CC1C1C(=O)N...,CI,0
41124,Cc1ccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)C...,CI,0
41125,Cc1cccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)...,CI,0


In [52]:
descriptor_names = list(rdMolDescriptors.Properties.GetAvailableProperties())
get_descriptors = rdMolDescriptors.Properties(descriptor_names)

def smi_to_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
      return dict(zip(descriptor_names, get_descriptors.ComputeProperties(mol)))
    else:
      return {name: None for name in descriptor_names}

hiv_descriptors= hiv['smiles'].apply(smi_to_descriptors).apply(pd.Series)
hiv = pd.concat([hiv, hiv_descriptors], axis=1)

[12:34:20] Explicit valence for atom # 3 Al, 6, is greater than permitted
[12:34:23] Explicit valence for atom # 5 B, 5, is greater than permitted
[12:34:55] Explicit valence for atom # 16 Al, 9, is greater than permitted
[12:35:11] Explicit valence for atom # 4 Al, 9, is greater than permitted
[12:35:50] Explicit valence for atom # 12 Al, 7, is greater than permitted
[12:35:50] Explicit valence for atom # 13 Al, 7, is greater than permitted
[12:36:05] Explicit valence for atom # 6 Ge, 5, is greater than permitted


In [53]:
print(hiv.head())
hiv.shape

                                              smiles activity  HIV_active  \
0  CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...       CI           0   
1  C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...       CI           0   
2                   CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21       CI           0   
3    Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1       CI           0   
4                             O=S(=O)(O)CCS(=O)(=O)O       CI           0   

      exactmw      amw  lipinskiHBA  lipinskiHBD  NumRotatableBonds  NumHBD  \
0  319.096508  319.888          4.0          0.0                4.0     0.0   
1  563.127808  564.140          4.0          0.0                6.0     0.0   
2  291.071785  291.375          2.0          0.0                0.0     0.0   
3  370.029328  370.408          8.0          6.0                4.0     4.0   
4  189.960580  190.198          6.0          2.0                3.0     2.0   

   NumHBA  ...      chi0n      chi1n     chi2n     chi3n     c

(41127, 46)

In [54]:
hiv_cleaned = hiv.dropna()
hiv_cleaned.to_csv('hiv_updated.csv', index=False)
model = pd.read_csv('/content/hiv_updated.csv')
model.shape

(41120, 46)

In [80]:
X = model.drop(columns=['smiles', 'activity','HIV_active'])
y = model['HIV_active']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=35)

In [56]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(32896, 43)
(32896,)
(8224, 43)
(8224,)


In [89]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=35)
X_train_new = selector.fit_transform(X_train, y_train)

RF_clf = RandomForestClassifier()
RF_clf.fit(X_train_new, y_train)

X_test_new = selector.transform(X_test)

In [90]:
print(X_train_new.shape)
print(y_train.shape)
print(X_test_new.shape)
print(y_test.shape)

(32896, 35)
(32896,)
(8224, 35)
(8224,)


In [91]:
RF_pred = RF_clf.predict(X_test_new)
print('Accuracy score_RF =', accuracy_score(RF_pred, y_test))
print('f1-score_RF =', f1_score(y_test, RF_pred, average=None))
print('RF =',confusion_matrix(RF_pred, y_test))

Accuracy score_RF = 0.967534046692607
f1-score_RF = [0.98339036 0.28418231]
RF = [[7904  251]
 [  16   53]]
