In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [3]:
df1 = pd.read_csv('train.csv')
df2 = pd.read_csv('test.csv')

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7973 entries, 0 to 7972
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       7973 non-null   int64  
 1   SMILES   7973 non-null   object 
 2   Tg       511 non-null    float64
 3   FFV      7030 non-null   float64
 4   Tc       737 non-null    float64
 5   Density  613 non-null    float64
 6   Rg       614 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 436.2+ KB


In [5]:
df2

Unnamed: 0,id,SMILES
0,1109053969,*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...
1,1422188626,*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...
2,2032016830,*c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6...


In [6]:
null_counts = df1.isnull().sum()

In [7]:
null_counts

id            0
SMILES        0
Tg         7462
FFV         943
Tc         7236
Density    7360
Rg         7359
dtype: int64

In [17]:
df1 = df1.fillna(0)

In [9]:
from sklearn.model_selection import train_test_split
X = df1[['id','SMILES']]
y = df1[['Tg','FFV','Tc','Density','Rg']]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [10]:
from rdkit import Chem
from rdkit.Chem import Descriptors

df1['Mol'] = df1['SMILES'].apply(lambda x: Chem.MolFromSmiles(x) if isinstance(x, str) else None)
df1['NumAtoms'] = df1['Mol'].apply(lambda mol: mol.GetNumAtoms() if mol else 0)
df1['MolWt'] = df1['Mol'].apply(lambda mol: Descriptors.MolWt(mol) if mol else 0)
df1['LogP'] = df1['Mol'].apply(lambda mol: Descriptors.MolLogP(mol) if mol else 0)
df1['NumHAcceptors'] = df1['Mol'].apply(lambda mol: Descriptors.NumHAcceptors(mol) if mol else 0)
df1['NumHDonors'] = df1['Mol'].apply(lambda mol: Descriptors.NumHDonors(mol) if mol else 0)
df1['TPSA'] = df1['Mol'].apply(lambda mol: Descriptors.TPSA(mol) if mol else 0)
df1

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg,Mol,NumAtoms,MolWt,LogP,NumHAcceptors,NumHDonors,TPSA
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,0.000000,0.374645,0.205667,0.0,0.0,<rdkit.Chem.rdchem.Mol object at 0x000002679D6...,19,232.323,3.98170,2,0,26.30
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,0.000000,0.370410,0.000000,0.0,0.0,<rdkit.Chem.rdchem.Mol object at 0x000002679D6...,47,598.919,12.35960,2,2,24.06
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,0.000000,0.378860,0.000000,0.0,0.0,<rdkit.Chem.rdchem.Mol object at 0x000002679D6...,75,1003.207,14.21700,9,0,122.27
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,0.000000,0.387324,0.000000,0.0,0.0,<rdkit.Chem.rdchem.Mol object at 0x000002679D6...,44,542.726,11.00768,2,2,24.06
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,0.000000,0.355470,0.000000,0.0,0.0,<rdkit.Chem.rdchem.Mol object at 0x000002679D6...,72,965.154,11.84500,14,0,182.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7968,2146592435,*Oc1cc(CCCCCCCC)cc(OC(=O)c2cccc(C(*)=O)c2)c1,0.000000,0.367498,0.000000,0.0,0.0,<rdkit.Chem.rdchem.Mol object at 0x00000267A07...,28,352.430,5.34140,4,0,52.60
7969,2146810552,*C(=O)OCCN(CCOC(=O)c1ccc2c(c1)C(=O)N(c1cccc(N3...,0.000000,0.353280,0.000000,0.0,0.0,<rdkit.Chem.rdchem.Mol object at 0x00000267A07...,58,750.680,6.13530,13,0,198.46
7970,2147191531,*c1cc(C(=O)NCCCCCCCC)cc(N2C(=O)c3ccc(-c4ccc5c(...,0.000000,0.369411,0.000000,0.0,0.0,<rdkit.Chem.rdchem.Mol object at 0x00000267A07...,41,521.573,4.47940,5,1,103.86
7971,2147435020,*C=C(*)c1ccccc1C,261.662355,0.000000,0.000000,0.0,0.0,<rdkit.Chem.rdchem.Mol object at 0x00000267A07...,11,116.163,2.39202,0,0,0.00


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd

# Feature and Target Selection
X = df1[['NumAtoms', 'MolWt', 'LogP', 'NumHAcceptors', 'NumHDonors', 'TPSA']]
y = df1[['Tg', 'FFV', 'Tc', 'Density', 'Rg']]  # Multi-target regression
columns = y.columns.tolist()

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define Regression Models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Support Vector Regressor": SVR()
}

# Train and Evaluate Each Model
for name, model in models.items():
    print(f"\n🔹 {name}")
    
    multi_model = MultiOutputRegressor(model)
    multi_model.fit(X_train, y_train)
    y_pred = multi_model.predict(X_test)

    y_pred_df = pd.DataFrame(y_pred, columns=columns).reset_index(drop=True)
    y_test_df = y_test.reset_index(drop=True)

    # Step 1: Calculate MAE, MSE, R²
    maes = {}
    for col in columns:
        mse = mean_squared_error(y_test_df[col], y_pred_df[col])
        mae = mean_absolute_error(y_test_df[col], y_pred_df[col])
        r2 = r2_score(y_test_df[col], y_pred_df[col])
        maes[col] = mae
        print(f"  🧪 {col}: MSE = {mse:.4f}, MAE = {mae:.4f}, R² = {r2:.4f}")

    # Step 2: Compute wMAE
    n_k = {col: y_test_df[col].notnull().sum() for col in columns}
    range_k = {
        col: y_test_df[col].max() - y_test_df[col].min() if y_test_df[col].max() != y_test_df[col].min() else 1e-6
        for col in columns
    }
    raw_weights = {
        col: 1 / (np.sqrt(n_k[col]) * range_k[col])
        for col in columns
    }
    sum_weights = sum(raw_weights.values())
    weights = {col: raw_weights[col] / sum_weights for col in columns}
    wmae = sum(weights[col] * maes[col] for col in columns)

    print(f"  📊 Weighted MAE (wMAE): {wmae:.6f}")



🔹 Linear Regression
  🧪 Tg: MSE = 973.7148, MAE = 11.5216, R² = -0.0040
  🧪 FFV: MSE = 0.0129, MAE = 0.0725, R² = 0.0685
  🧪 Tc: MSE = 0.0058, MAE = 0.0439, R² = 0.1125
  🧪 Density: MSE = 0.0631, MAE = 0.1397, R² = 0.0870
  🧪 Rg: MSE = 19.5281, MAE = 2.3542, R² = 0.0824
  📊 Weighted MAE (wMAE): 0.088516

🔹 Ridge Regression
  🧪 Tg: MSE = 973.7134, MAE = 11.5216, R² = -0.0040
  🧪 FFV: MSE = 0.0129, MAE = 0.0725, R² = 0.0685
  🧪 Tc: MSE = 0.0058, MAE = 0.0439, R² = 0.1125
  🧪 Density: MSE = 0.0631, MAE = 0.1397, R² = 0.0870
  🧪 Rg: MSE = 19.5281, MAE = 2.3541, R² = 0.0824
  📊 Weighted MAE (wMAE): 0.088515

🔹 Lasso Regression
  🧪 Tg: MSE = 969.7410, MAE = 11.4083, R² = 0.0001
  🧪 FFV: MSE = 0.0132, MAE = 0.0731, R² = 0.0427
  🧪 Tc: MSE = 0.0063, MAE = 0.0423, R² = 0.0318
  🧪 Density: MSE = 0.0664, MAE = 0.1387, R² = 0.0386
  🧪 Rg: MSE = 20.4198, MAE = 2.3359, R² = 0.0405
  📊 Weighted MAE (wMAE): 0.087760

🔹 Decision Tree
  🧪 Tg: MSE = 1953.3843, MAE = 11.0684, R² = -1.0141
  🧪 FFV: MSE = 

In [12]:
 df2

Unnamed: 0,id,SMILES
0,1109053969,*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...
1,1422188626,*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...
2,2032016830,*c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6...


In [14]:
df3 = pd.read_csv('sample_submission.csv')
df3


Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,0,0,0,0,0
1,1422188626,0,0,0,0,0
2,2032016830,0,0,0,0,0


In [18]:
final_model = RandomForestRegressor()
final_model.fit(X,y)

In [20]:
df2['Mol'] = df2['SMILES'].apply(Chem.MolFromSmiles)
df2['NumAtoms'] = df2['Mol'].apply(lambda mol: mol.GetNumAtoms() if mol else 0)
df2['MolWt'] = df2['Mol'].apply(lambda mol: Descriptors.MolWt(mol) if mol else 0)
df2['LogP'] = df2['Mol'].apply(lambda mol: Descriptors.MolLogP(mol) if mol else 0)
df2['NumHAcceptors'] = df2['Mol'].apply(lambda mol: Descriptors.NumHAcceptors(mol) if mol else 0)
df2['NumHDonors'] = df2['Mol'].apply(lambda mol: Descriptors.NumHDonors(mol) if mol else 0)
df2['TPSA'] = df2['Mol'].apply(lambda mol: Descriptors.TPSA(mol) if mol else 0)

X_submission = df2[['NumAtoms', 'MolWt', 'LogP', 'NumHAcceptors', 'NumHDonors', 'TPSA']]
submission_preds = final_model.predict(X_submission)

submission_df = pd.DataFrame(submission_preds, columns=['Tg', 'FFV', 'Tc', 'Density', 'Rg'])
submission_df.insert(0, 'id', df2['id'])
submission_df.to_csv("submission.csv", index=False)
