In [2]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.3


In [4]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [5]:
import pandas as pd
import numpy as np
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor,HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [6]:
# Directory holding the competition CSV files; edit this single value to run from another location.
DATA_DIR='/content'

train=pd.read_csv(f'{DATA_DIR}/train.csv')
test=pd.read_csv(f'{DATA_DIR}/test.csv')
ss=pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
# Keep the test ids aside: the 'id' column is dropped from `test` during preprocessing,
# but the ids are needed again when the submission frame is assembled.
ID=test['id'].copy()


In [7]:
# Preview the first five rows of the raw training data
train.head(n=5)


Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.37041,,,
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.37886,,,
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.35547,,,


In [10]:
# Inspect data: frame dimensions and the training column names
for label, value in (
    ("Train shape:", train.shape),
    ("Test shape:", test.shape),
    ("Train columns:", train.columns.tolist()),
):
    print(label, value)


Train shape: (7973, 7)
Test shape: (3, 2)
Train columns: ['id', 'SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg']


In [12]:
# Per-column count of missing entries in the training frame
print("Missing values in train:\n", train.isnull().sum())

Missing values in train:
 id            0
SMILES        0
Tg         7462
FFV         943
Tc         7236
Density    7360
Rg         7359
dtype: int64


Feature Extraction


In [13]:
# Names of all available RDKit descriptors (first element of every descList entry)
descriptor_names = [entry[0] for entry in Descriptors.descList]

# Function to compute all descriptors
def compute_all_descriptors(smile):
    """Compute every descriptor in ``Descriptors.descList`` for one SMILES string.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    dict or None
        Maps each descriptor name to its value (``None`` when that single
        descriptor raised), plus a ``'SMILES'`` key holding the input string.
        ``None`` is returned when the input is not a string (e.g. a missing
        value) or when RDKit cannot parse it.
    """
    # A missing value (None / NaN) is not a SMILES string; treat it like unparseable input
    # instead of handing it to RDKit.
    if not isinstance(smile, str):
        return None

    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        return None

    descriptor_values = {}
    for name, func in Descriptors.descList:
        try:
            descriptor_values[name] = func(mol)
        except Exception:
            # One failing descriptor must not discard the molecule. Catching Exception
            # (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
            descriptor_values[name] = None

    descriptor_values['SMILES'] = smile
    return descriptor_values

# Apply to all SMILES. Each distinct SMILES string is featurised once, so a repeated
# string neither costs a second descriptor pass nor shows up twice in the descriptor
# frame (a duplicated key there would multiply rows when merging on 'SMILES').
data_t = [compute_all_descriptors(smi) for smi in train['SMILES'].drop_duplicates()]
data_ts = [compute_all_descriptors(smi) for smi in test['SMILES'].drop_duplicates()]
# compute_all_descriptors returns None for SMILES it cannot handle; leave those out
data_t = [d for d in data_t if d is not None]
data_ts = [d for d in data_ts if d is not None]

# SMILES column first, followed by the descriptors in descList order.
# Passing the column list explicitly keeps both frames well-formed (same columns,
# zero rows) even if not a single SMILES could be parsed.
cols_t = ['SMILES'] + list(dict.fromkeys(name for name, _ in Descriptors.descList))
cols_ts = list(cols_t)
train_df = pd.DataFrame(data_t, columns=cols_t)
test_df = pd.DataFrame(data_ts, columns=cols_ts)

In [14]:
# Merge the original frames with train_df / test_df, which hold the new descriptor features.
# The descriptor frames are reduced to one row per SMILES first: a repeated key on the
# right-hand side would multiply the matching rows of the left frame in a left join.
# validate='many_to_one' makes pandas check that the right-hand keys really are unique.
train=train.merge(train_df.drop_duplicates(subset='SMILES'),on='SMILES',how='left',validate='many_to_one')
test=test.merge(test_df.drop_duplicates(subset='SMILES'),on='SMILES',how='left',validate='many_to_one')

Preprocessing

In [15]:
# One model is trained per target variable, so build one (SMILES, target) frame per target.
# Rows are dropped only after this separation, and only where that particular target is
# missing: dropping rows with any missing target up front would discard the rows that
# are labelled for just some of the targets.
t_1=train[['SMILES','Tg']].dropna()
t_2=train[['SMILES','FFV']].dropna()
t_3=train[['SMILES','Tc']].dropna()
t_4=train[['SMILES','Density']].dropna()
t_5=train[['SMILES','Rg']].dropna()


In [16]:
# Descriptors (features) that contain missing values across the dataset; removed from both frames.
nan_descriptors=['BCUT2D_MWLOW','BCUT2D_MWHI','BCUT2D_CHGHI','BCUT2D_CHGLO',
                 'BCUT2D_LOGPHI','BCUT2D_LOGPLOW','BCUT2D_MRLOW','BCUT2D_MRHI',
                 'MinAbsPartialCharge','MaxPartialCharge','MinPartialCharge','MaxAbsPartialCharge']

# test keeps only feature columns; train keeps SMILES (merge key for the per-target frames)
# but loses the id and every target column.
test=test.drop(columns=['id',*nan_descriptors,'SMILES'])
train=train.drop(columns=['id',*nan_descriptors,'Tg','FFV','Tc','Density','Rg'])


In [17]:
# Attach the descriptor features to each per-target frame (left join on SMILES)
tg,ffv,tc,density,rg=(
    target_frame.merge(train,on='SMILES',how='left')
    for target_frame in (t_1,t_2,t_3,t_4,t_5)
)

In [18]:
# (rows, columns) of every per-target training frame
tuple(frame.shape for frame in (tg,ffv,tc,density,rg))


((511, 207), (7030, 207), (737, 207), (613, 207), (614, 207))

Model

In [21]:
# Let's define a reusable function to train and evaluate our machine learning model.

def model(train_d,test_d,model,target,submission=False,model_params=None):
    """Fit ``model`` on one target and either score it or predict the test set.

    Parameters
    ----------
    train_d : pandas.DataFrame
        Training frame with a 'SMILES' column, the ``target`` column and the
        feature columns.
    test_d : pandas.DataFrame
        Test features; only used when ``submission`` is truthy.
    model : callable
        Regressor class (not an instance), e.g. ``CatBoostRegressor``. It is
        instantiated with ``model_params``.
    target : str
        Name of the target column in ``train_d``.
    submission : bool, default False
        False: hold out 20% of the rows and return the validation MAE.
        True: fit on all rows and return the predictions for ``test_d``.
    model_params : dict, optional
        Keyword arguments for the regressor constructor. Defaults to
        ``{'nan_mode': 'Min', 'verbose': False}`` (CatBoost options), so pass
        an explicit dict when using a regressor that does not accept them.

    Returns
    -------
    float or array-like
        Mean absolute error on the hold-out split, or the test predictions.
    """
    if model_params is None:
        # nan_mode: how CatBoost handles NaN ('Min', 'Max', or 'Forbidden')
        model_params={'nan_mode':'Min','verbose':False}

    X=train_d.drop([target,'SMILES'],axis=1)
    y=train_d[target].copy()
    Model=model(**model_params)

    if submission:
        Model.fit(X,y)
        # Present the test features with exactly the columns (and order) used for training
        return Model.predict(test_d.reindex(columns=X.columns))

    # We divide the data into training and validation sets for model evaluation
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)
    Model.fit(X_train,y_train)
    y_pred=Model.predict(X_test)
    # We assess our model performance using the MAE metric (y_true first, then y_pred)
    return mean_absolute_error(y_test,y_pred)

Model Evaluation

In [22]:
# Hold-out MAE for the Tg model
model(train_d=tg,test_d=test,model=CatBoostRegressor,target='Tg',submission=False)


48.80142141226609

In [23]:
# Hold-out MAE for the FFV model
model(train_d=ffv,test_d=test,model=CatBoostRegressor,target='FFV',submission=False)

0.006809310888318689

In [24]:
# Hold-out MAE for the Tc model
model(train_d=tc,test_d=test,model=CatBoostRegressor,target='Tc',submission=False)

0.029719767075631592

In [25]:
# Hold-out MAE for the Density model
model(train_d=density,test_d=test,model=CatBoostRegressor,target='Density',submission=False)

0.03478487090587355

In [26]:
# Hold-out MAE for the Rg model
model(train_d=rg,test_d=test,model=CatBoostRegressor,target='Rg',submission=False)

1.8260872927969645

Final Model For Submission

In [27]:
# Finally, each target's model is refit on all of its rows and used to predict the test set;
# the predictions are collected next to the test ids for the submission file.
target_frames={'Tg':tg,'FFV':ffv,'Tc':tc,'Density':density,'Rg':rg}

sub={'id':ID,
     **{target_name:model(target_frame,test,CatBoostRegressor,target_name,submission=True)
        for target_name,target_frame in target_frames.items()}}

In [28]:
# Turn the id / prediction mapping into a table with one column per key
submission=pd.DataFrame(data=sub)


In [29]:
# Display the assembled predictions for a visual check
submission


Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,148.094227,0.373955,0.196392,1.154528,21.991347
1,1422188626,187.241686,0.379458,0.226463,1.086217,20.694287
2,2032016830,114.090183,0.353189,0.254764,1.139422,20.818989


In [30]:
# Write the predictions to disk without the DataFrame index column
submission.to_csv(path_or_buf='submission.csv',index=False)