In [6]:
pip install rdkit

Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _dirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/the-toxicity-prediction-challenge-ii/sample_submission.csv
/kaggle/input/the-toxicity-prediction-challenge-ii/test_II.csv
/kaggle/input/the-toxicity-prediction-challenge-ii/train_II.csv


In [8]:
def descriptors(smiles, assay_id):
    """Build the feature vector for one (compound, assay) pair.

    Parameters
    ----------
    smiles : str
        SMILES string of the compound.
    assay_id
        Assay identifier; passed through unchanged as the third feature.

    Returns
    -------
    list
        ``[molecular_weight, logp, assay_id, tpsa]`` followed by the 1024
        bits of a radius-2 Morgan fingerprint.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with a readable message instead of inside a descriptor call.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

    molecular_weight = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    tpsa = Descriptors.TPSA(mol)
    # Molar refractivity (MolMR) is deliberately not computed: it is not part
    # of the returned feature vector.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    return [molecular_weight, logp, assay_id, tpsa] + list(fp)

In [9]:
# Load the competition tables.
train = pd.read_csv("/kaggle/input/the-toxicity-prediction-challenge-ii/train_II.csv")
test = pd.read_csv("/kaggle/input/the-toxicity-prediction-challenge-ii/test_II.csv")

# The identifier column packs "<SMILES>;<assay id>" into one string. Split on
# the LAST ';' only, so the result always has exactly two parts even if the
# compound part were to contain a ';' itself.
train[['Smiles', 'Assay ID']] = train["Id"].str.rsplit(";", n=1, expand=True)
test[['Smiles', 'Assay ID']] = test['x'].str.rsplit(';', n=1, expand=True)

train = train.drop("Id", axis=1)
test = test.drop("x", axis=1)

# Convert SMILES to RDKit mol objects (None where RDKit cannot parse the string).
train['Mol'] = train['Smiles'].apply(Chem.MolFromSmiles)
test['Mol'] = test['Smiles'].apply(Chem.MolFromSmiles)

# Drop rows with any missing value (this includes unparseable molecules) and
# renumber the rows, so that frames built positionally from `train` later on
# share the same 0..n-1 index as `train` and `labels`.
# NOTE(review): this also removes rows from `test`; confirm that the
# submission format tolerates missing test rows.
train = train.dropna(how='any', axis=0).reset_index(drop=True)
test = test.dropna(how='any', axis=0).reset_index(drop=True)

# Shift the labels down by one (presumably so class ids start at 0 - confirm
# against the raw data), then separate them from the feature columns.
labels = train['Expected'] - 1
train = train.drop("Expected", axis=1)

[00:34:38] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:34:40] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:34:41] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:34:42] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:34:44] Explicit valence for atom # 1 Si, 8, is greater than permitted
[00:34:45] Explicit valence for atom # 1 Si, 8, is greater than permitted


In [10]:
from sklearn.feature_selection import VarianceThreshold
import pandas as pd

# Compute one descriptor row per training compound.
data = [
    descriptors(smiles, assay_id)
    for smiles, assay_id in zip(train['Smiles'], train['Assay ID'])
]
fingerprint_columns = ['fp_' + str(bit) for bit in range(1024)]
df_descriptors = pd.DataFrame(
    data,
    columns=['molecular_weight', 'logp', 'assay_id', 'tpsa'] + fingerprint_columns,
)

# Fit a variance filter and collect the names of the columns it keeps.
threshold = 0.1
selector = VarianceThreshold(threshold=threshold).fit(df_descriptors)
selected_features = df_descriptors.columns[selector.get_support()]

# Show which features survived the filter.
print('Selected features:')
print(selected_features)

Selected features:
Index(['molecular_weight', 'logp', 'assay_id', 'tpsa', 'fp_1', 'fp_33',
       'fp_36', 'fp_64', 'fp_80', 'fp_114', 'fp_128', 'fp_147', 'fp_175',
       'fp_216', 'fp_283', 'fp_294', 'fp_314', 'fp_322', 'fp_356', 'fp_378',
       'fp_389', 'fp_392', 'fp_428', 'fp_561', 'fp_650', 'fp_656', 'fp_659',
       'fp_695', 'fp_698', 'fp_726', 'fp_794', 'fp_807', 'fp_816', 'fp_831',
       'fp_841', 'fp_849', 'fp_875', 'fp_881', 'fp_887', 'fp_893', 'fp_904',
       'fp_926', 'fp_935', 'fp_1019'],
      dtype='object')
