In [1]:
pip install rdkit

Collecting rdkit
  Downloading rdkit-2022.9.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.2/29.2 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.5
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
# Import the required libraries and list the available input files

import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier


import os
# Print the full path of every file found anywhere below the input directory.
for input_path in (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

/kaggle/input/the-toxicity-prediction-challenge-ii/sample_submission.csv
/kaggle/input/the-toxicity-prediction-challenge-ii/test_II.csv
/kaggle/input/the-toxicity-prediction-challenge-ii/train_II.csv


In [3]:
def descriptors(smiles, assay_id):
    """Build the feature vector for one (molecule, assay) pair.

    Parameters
    ----------
    smiles : str
        SMILES string of the compound.
    assay_id
        Assay identifier; returned unchanged as the third feature.

    Returns
    -------
    list
        ``[MolWt, MolLogP, assay_id, TPSA]`` followed by the values of 40
        selected bits of a 1024-bit, radius-2 Morgan fingerprint, in the
        order given by ``selected_bits``.

    Raises
    ------
    ValueError
        If RDKit cannot turn ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; stop here
        # with a clear message instead of failing inside a descriptor call.
        raise ValueError("RDKit could not parse SMILES: %r" % (smiles,))

    molecular_weight = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    tpsa = Descriptors.TPSA(mol)

    # Only a fixed subset of the fingerprint bits is kept as features.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    selected_bits = (
        1, 33, 36, 64, 80, 114, 128, 147, 175, 216, 283, 294, 314, 322,
        356, 378, 389, 392, 428, 561, 650, 656, 659, 695, 698, 726, 794,
        807, 816, 831, 841, 849, 875, 881, 887, 893, 904, 926, 935, 1019,
    )
    fp_list = [fp[i] for i in selected_bits]

    return [molecular_weight, logp, assay_id, tpsa] + fp_list


In [4]:
# Load the raw competition files
train = pd.read_csv("/kaggle/input/the-toxicity-prediction-challenge-ii/train_II.csv")
test = pd.read_csv("/kaggle/input/the-toxicity-prediction-challenge-ii/test_II.csv")

# Remember the original test size so we can verify below that no row is lost
n_test_rows = len(test)

# The identifier column ("Id" in train, "x" in test) is split on ';' into the
# SMILES string and the assay ID
train[['Smiles', 'Assay ID']] = train["Id"].str.split(";", expand=True)
test[['Smiles', 'Assay ID']] = test['x'].str.split(';', expand=True)

train = train.drop("Id", axis=1)
test = test.drop("x", axis=1)

# Convert SMILES to RDKit Mol objects; strings RDKit cannot parse become None
train['Mol'] = train['Smiles'].apply(Chem.MolFromSmiles)
test['Mol'] = test['Smiles'].apply(Chem.MolFromSmiles)

# Drop every row that has a missing value (this includes unparseable molecules)
train = train.dropna(how='any', axis=0)
test = test.dropna(how='any', axis=0)

# The submission cell pairs the predictions with every row of test_II.csv, so
# a test row dropped here would only surface as a length mismatch after the
# whole ensemble has been trained. Fail now, with an explicit message.
if len(test) != n_test_rows:
    raise ValueError(
        "%d test row(s) were dropped (unparseable SMILES or missing values); "
        "the submission needs one prediction for every row of test_II.csv"
        % (n_test_rows - len(test))
    )

# Shift the labels down by one for training; the shift is undone before the
# submission is written
train['Expected'] = train['Expected'] - 1
labels = train['Expected']
train = train.drop("Expected", axis=1)

[01:12:18] Explicit valence for atom # 1 Si, 8, is greater than permitted
[01:12:20] Explicit valence for atom # 1 Si, 8, is greater than permitted
[01:12:21] Explicit valence for atom # 1 Si, 8, is greater than permitted
[01:12:22] Explicit valence for atom # 1 Si, 8, is greater than permitted
[01:12:23] Explicit valence for atom # 1 Si, 8, is greater than permitted
[01:12:24] Explicit valence for atom # 1 Si, 8, is greater than permitted


In [5]:
# Compute the descriptor and fingerprint features for every training row and
# store them as columns: four scalar columns followed by one 'fp_<bit>' column
# per selected fingerprint bit ('Assay ID' is overwritten with the value that
# descriptors() passes back).
train[
    ['Molecular Weight', 'logp', 'Assay ID', 'tpsa']
    + ['fp_' + str(bit) for bit in (
        1, 33, 36, 64, 80, 114, 128, 147, 175, 216, 283, 294, 314, 322,
        356, 378, 389, 392, 428, 561, 650, 656, 659, 695, 698, 726, 794,
        807, 816, 831, 841, 849, 875, 881, 887, 893, 904, 926, 935, 1019,
    )]
] = train.apply(
    lambda record: pd.Series(descriptors(record['Smiles'], record['Assay ID'])),
    axis=1,
)

In [6]:
# Build the same feature columns for the test rows, in the same order as for
# the training data: four scalar columns followed by one 'fp_<bit>' column per
# selected fingerprint bit.
test[
    ['Molecular Weight', 'logp', 'Assay ID', 'tpsa']
    + ['fp_' + str(bit) for bit in (
        1, 33, 36, 64, 80, 114, 128, 147, 175, 216, 283, 294, 314, 322,
        356, 378, 389, 392, 428, 561, 650, 656, 659, 695, 698, 726, 794,
        807, 816, 831, 841, 849, 875, 881, 887, 893, 904, 926, 935, 1019,
    )]
] = test.apply(
    lambda record: pd.Series(descriptors(record['Smiles'], record['Assay ID'])),
    axis=1,
)

In [7]:
# The SMILES text and the Mol objects are not model inputs; remove both
# columns from each frame before fitting.
train = train.drop(columns=["Smiles", "Mol"])
test = test.drop(columns=["Smiles", "Mol"])

In [8]:
# Fill missing feature values with the per-column mean. The means are learned
# from the training frame only and then applied to both frames.
imputer = SimpleImputer(strategy='mean').fit(train)
train = pd.DataFrame(imputer.transform(train), columns=train.columns)
test = pd.DataFrame(imputer.transform(test), columns=test.columns)

In [9]:
# Base learners: one random forest, one LightGBM model and two XGBoost models
# that differ only in their number of trees. All of them use random_state=0.
rf_model1 = RandomForestClassifier(random_state=0, n_estimators=350)
lgb_model = LGBMClassifier(
    scale_pos_weight=0.6, random_state=0, max_depth=8, n_estimators=800
)
xgb_model1 = XGBClassifier(
    scale_pos_weight=0.6, random_state=0, max_depth=8, n_estimators=800
)
xgb_model3 = XGBClassifier(
    scale_pos_weight=0.6, random_state=0, max_depth=8, n_estimators=500
)

# Combine the four base learners in a hard-voting ensemble
ensemble_clf = VotingClassifier(
    voting='hard',
    estimators=[
        ('lgb', lgb_model),
        ('xgb1', xgb_model1),
        ('rf1', rf_model1),
        ('xgb3', xgb_model3),
    ],
)


In [10]:
# Train the ensemble on the training features and labels, then predict a
# label for every row of the test features (fit() returns the fitted
# estimator, so the two calls can be chained).
predictions = ensemble_clf.fit(X=train, y=labels).predict(X=test)

In [11]:
# Undo the "- 1" shift that was applied to the training labels so that the
# predictions use the label values expected in the submission file.
# NOTE: this rebinds `predictions`, so running this cell again without
# re-running the prediction cell shifts the labels a second time.
predictions = 1 + predictions

In [12]:
# Re-read the test file to recover the original identifier column, pair each
# identifier with its prediction, and write the result as the submission CSV.
test_data = pd.read_csv(
    "/kaggle/input/the-toxicity-prediction-challenge-ii/test_II.csv"
)
output = pd.DataFrame(
    data={
        'Id': test_data['x'],
        'Predicted': predictions,
    }
)
output.to_csv(path_or_buf='submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
