In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/melting-point/sample_submission.csv
/kaggle/input/melting-point/train.csv
/kaggle/input/melting-point/test.csv


In [2]:
pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5
Note: you may need to restart the kernel to use updated packages.


In [3]:
#Importation des fichiers csv 
folder_path = "/kaggle/input/melting-point"
train_csv_path = '/kaggle/input/melting-point/train.csv'
train_csv = pd.read_csv(train_csv_path)
#print(train_csv.head())

test_csv_path = '/kaggle/input/melting-point/test.csv'
test_csv = pd.read_csv(test_csv_path)
#print(test_csv.head())

In [4]:
#Chargement du Dataset complet
#Full dataset loading
train_data_set = train_csv

y = train_data_set['Tm']
list_columns = train_data_set.columns.tolist()
feature_names = list_columns[3:]

# print the list of columns in the dataset to find the name of the prediction target
#train_data_set.describe()

test_data_set = test_csv

In [5]:
#Retraitement des données sous la forme de 0 et 1 - Train
X_df = pd.DataFrame(train_data_set[feature_names])

#Créer les colonnes binaires
binary_feats = {f"has_{col}": (X_df[col] > 0).astype(int) for col in feature_names}

#Ratios 
ratio_feats = {f"{col}_ratio": X_df[col] / (X_df[feature_names].sum(axis=1) + 1e-6) for col in feature_names}

#Totaux globaux
extra_feats = {
    "total_groups": X_df[feature_names].sum(axis=1),
    "nb_groups_non_zero": (X_df[feature_names] > 0).sum(axis=1),
}

# 5. Fusionner toutes les nouvelles colonnes d’un coup
X_df = pd.concat([X_df, pd.DataFrame({**binary_feats, **ratio_feats, **extra_feats})], axis=1)

print("Shape après feature engineering:", X_df.shape)

Shape après feature engineering: (2662, 1274)


In [6]:
#Traitement des SMILES
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem

X_smiles = pd.DataFrame(train_data_set["SMILES"])

def compute_chem_features(df, smiles_col="smiles", add_fingerprint=True, fp_nbits=2048):
    """
    Prend un DataFrame avec une colonne de SMILES et renvoie un DataFrame de descripteurs chimiques.
    - df : DataFrame d'entrée (ne modifie pas df).
    - smiles_col : nom de la colonne SMILES.
    - add_fingerprint : bool, calcule fingerprint Morgan (binaire) si True.
    - fp_nbits : taille du vecteur fingerprint.
    """
    records = []
    invalid_idx = []
    
    # SMARTS motifs utiles (exemples)
    smarts = {
        "has_OH": "[OX2H]",              # alcool phénol etc. (H-bond donors)
        "has_carbonyl": "[CX3]=O",      # carbonyle
        "has_nitro": "[NX3](=O)=O",     # nitro
        "has_amine": "[NX3;!$(N-*=O)]", # amine primaire/secondaire/tertiaire (élimine nitro)
        "has_ester": "[$([CX3](=O)O)]",  # ester
        "has_thiol": "[SX2H]",           # thiol
    }
    smarts_compiled = {k: Chem.MolFromSmarts(v) for k, v in smarts.items()}
    
    for idx, smi in df[smiles_col].items():
        mol = None
        try:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                # try sanitize false then sanitize
                mol = Chem.MolFromSmiles(smi, sanitize=False)
                if mol is not None:
                    Chem.SanitizeMol(mol)
        except Exception as e:
            mol = None
        
        if mol is None:
            invalid_idx.append(idx)
            # fill NaNs for invalid
            records.append({"index": idx, "valid_smiles": False})
            continue
        
        r = {"index": idx, "valid_smiles": True}
        
        # Atom counts (heavy atoms, heteroatoms, and halogens)
        atom_counts = {}
        for atom in mol.GetAtoms():
            sym = atom.GetSymbol()
            atom_counts[sym] = atom_counts.get(sym, 0) + 1
        # common atoms
        for at in ["C","H","O","N","F","Cl","Br","I","S","P"]:
            r[f"atom_count_{at}"] = atom_counts.get(at, 0)
        r["heavy_atom_count"] = rdMolDescriptors.CalcNumHeavyAtoms(mol)
        
        # Basic descriptors
        r["MolWt"] = Descriptors.ExactMolWt(mol)           # masse exacte
        r["MolLogP"] = Descriptors.MolLogP(mol)            # octanol-water partition
        r["MolMR"] = Descriptors.MolMR(mol)                # molar refractivity (approx)
        r["TPSA"] = rdMolDescriptors.CalcTPSA(mol)         # topological polar surface area
        r["NumHDonors"] = rdMolDescriptors.CalcNumHBD(mol)
        r["NumHAcceptors"] = rdMolDescriptors.CalcNumHBA(mol)
        r["NumRotatableBonds"] = rdMolDescriptors.CalcNumRotatableBonds(mol)
        r["NumRings"] = rdMolDescriptors.CalcNumRings(mol)
        r["NumAromaticRings"] = rdMolDescriptors.CalcNumAromaticRings(mol)
        r["FractionCSP3"] = Descriptors.FractionCSP3(mol)
        r["NumValenceElectrons"] = sum([a.GetTotalValence() for a in mol.GetAtoms()])
        r["FormalCharge"] = Chem.GetFormalCharge(mol)
        
        # ring/aromaticity / complexity
        r["BertzCT"] = Descriptors.BertzCT(mol)             # complexité topologique
        # optionally: r["BalabanJ"] = Descriptors.J(mol)
        
        # SMARTS motif counts/presence
        for name, patt in smarts_compiled.items():
            try:
                matches = mol.GetSubstructMatches(patt)
                r[f"{name}_count"] = len(matches)
                r[f"{name}_present"] = int(len(matches) > 0)
            except Exception:
                r[f"{name}_count"] = 0
                r[f"{name}_present"] = 0
        
        # hydrogen bonding index (simple)
        r["hbond_index"] = r["NumHDonors"] * r["NumHAcceptors"]
        
        # aromatic proportion
        r["aromatic_atom_fraction"] = sum(1 for a in mol.GetAtoms() if a.GetIsAromatic()) / max(1, mol.GetNumAtoms())
        
        # Morgan fingerprint (ECFP) as bit vector (optional)
        if add_fingerprint:
            try:
                fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=fp_nbits)
                arr = np.zeros((1,), dtype=np.int8)
                # convert to numpy array
                onbits = list(fp.GetOnBits())
                # We'll create sparse representation: store as list of 0/1 ints in columns fp_0..fp_nbits-1
                for i in range(fp_nbits):
                    r[f"fp_{i}"] = 1 if i in onbits else 0
            except Exception:
                # fallback: no fp
                for i in range(fp_nbits):
                    r[f"fp_{i}"] = 0
        
        records.append(r)
    
    feats = pd.DataFrame(records).set_index("index")
    # Convert types and fill NaN for invalid SMILES
    feats = feats.sort_index()
    feats = feats.fillna(np.nan)
    
    # Optionally drop fp columns if too heavy
    return feats, invalid_idx

# Utilisation :
feats_df, invalids = compute_chem_features(X_smiles, smiles_col="SMILES", add_fingerprint=False)
print("Invalid SMILES indices:", invalids)
X_df = pd.concat([X_df, pd.DataFrame({**feats_df})], axis=1)
print("Shape après feature engineering chimique:", X_df.shape)
print(X_df.head())

Invalid SMILES indices: []
Shape après feature engineering chimique: (2662, 1313)
   Group 1  Group 2  Group 3  Group 4  Group 5  Group 6  Group 7  Group 8  \
0        0        0        0        0        0        0        0        0   
1        0        0        0        0        0        0        0        0   
2        2        1        0        0        0        0        0        0   
3        1        0        0        0        0        0        0        0   
4        2        3        0        0        0        0        0        0   

   Group 9  Group 10  ...  has_nitro_count  has_nitro_present  \
0        0         0  ...                0                  0   
1        0         0  ...                0                  0   
2        0         0  ...                0                  0   
3        0         0  ...                0                  0   
4        0         0  ...                0                  0   

   has_amine_count  has_amine_present  has_ester_count  has_este

In [7]:
#Retraitement du fichier Test
X_test_df = pd.DataFrame(test_data_set[feature_names])
X_test_smiles = pd.DataFrame(test_data_set["SMILES"])
#Créer les colonnes binaires
binary_feats_test = {f"has_{col}": (X_test_df[col] > 0).astype(int) for col in feature_names}
ratio_feats_test = {f"{col}_ratio": X_test_df[col] / (X_test_df[feature_names].sum(axis=1) + 1e-6) for col in feature_names}
extra_feats_test = {
    "total_groups": X_test_df[feature_names].sum(axis=1),
    "nb_groups_non_zero": (X_test_df[feature_names] > 0).sum(axis=1),
}
X_test_df = pd.concat([X_test_df, pd.DataFrame({**binary_feats_test, **ratio_feats_test, **extra_feats_test})], axis=1)
print("Shape après feature engineering:", X_test_df.shape)
feats_df, invalids = compute_chem_features(X_test_smiles, smiles_col="SMILES", add_fingerprint=False)
X_test_df = pd.concat([X_test_df, pd.DataFrame({**feats_df})], axis=1)
print("Shape après feature engineering chimique:", X_test_df.shape)


Shape après feature engineering: (666, 1274)
Shape après feature engineering chimique: (666, 1313)


In [8]:
X = X_df
#print(X.head())
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

#Split Train / Valid
X_train, X_valid, y_train, y_valid = train_test_split( X, y, test_size=0.2, random_state=42 )


**Import et Selection du modèle**

In [9]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [10]:
final_model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.09586251212066181,
    depth=10,
    l2_leaf_reg=3,
    bagging_temperature=2.031454661334689,
    random_state=42,
    verbose=100
)
final_model.fit(X, y)

0:	learn: 81.5585164	total: 121ms	remaining: 4m 1s
100:	learn: 38.1900204	total: 4.65s	remaining: 1m 27s
200:	learn: 30.7594094	total: 9.13s	remaining: 1m 21s
300:	learn: 26.2343310	total: 13.6s	remaining: 1m 16s
400:	learn: 23.2801404	total: 18.2s	remaining: 1m 12s
500:	learn: 21.0224093	total: 22.6s	remaining: 1m 7s
600:	learn: 19.6460961	total: 27.2s	remaining: 1m 3s
700:	learn: 18.3210771	total: 31.7s	remaining: 58.8s
800:	learn: 17.3820730	total: 36.2s	remaining: 54.2s
900:	learn: 16.5397768	total: 40.7s	remaining: 49.6s
1000:	learn: 15.8597373	total: 45.1s	remaining: 45s
1100:	learn: 15.3554245	total: 49.5s	remaining: 40.4s
1200:	learn: 14.9202185	total: 53.9s	remaining: 35.9s
1300:	learn: 14.5408799	total: 58.4s	remaining: 31.4s
1400:	learn: 14.2562819	total: 1m 2s	remaining: 26.8s
1500:	learn: 14.0073850	total: 1m 7s	remaining: 22.4s
1600:	learn: 13.7868620	total: 1m 11s	remaining: 17.9s
1700:	learn: 13.5687804	total: 1m 16s	remaining: 13.4s
1800:	learn: 13.4186347	total: 1m 20

<catboost.core.CatBoostRegressor at 0x7e5b7c97c110>

In [11]:
#Test de modèle sur données de test
X_test = X_test_df
predictions = final_model.predict(X_test)
print(predictions)

[346.31673283 309.15749654 190.74839606 205.78850612 225.39331487
 340.90703128 235.27645535 327.16318769 274.53202639 266.25534385
 261.18264517 308.78895496 304.04042552 251.67223028 251.80049215
 584.0600492  281.80492259 295.93137342 374.32102192 164.65114301
 147.04649199 141.77548025 386.3425024  205.80866339 301.44608972
 164.80747933 614.88959714 492.76872411 408.15983888 323.36901125
 363.52488431 308.38222747 273.69901221 190.20666885 324.44524181
 260.08917431 351.44898026 281.38651738 224.73118633 342.20381165
 208.26608745 288.60997774 271.86779891 141.70765983 189.29661731
 287.83184717 347.49586114 242.01671021 290.90598881 346.61831662
 328.40872682 335.51394005 117.53050501 305.32726401 293.08958494
 284.44241672 224.28316266 258.12397422 298.32618467 240.95100053
 348.09676789 320.90196973 197.34109108 291.67429247 375.13289834
 210.3064556  284.16516488 316.6717853  285.4662977  307.56169455
 313.53520799 370.10251051 181.41963481 195.2063419  175.95938471
 301.64556

In [12]:
#Export au format csv
id = test_data_set['id']
Tm = predictions
dict = {"id": id, "Tm": Tm}
submission_df = pd.DataFrame(dict)

# Sauvegarder le DataFrame en fichier CSV
output_csv_path = "submission.csv"
submission_df.to_csv(output_csv_path, index=False, sep=',')

print(f"Fichier CSV généré : {output_csv_path}")
print(submission_df)

Fichier CSV généré : submission.csv
       id          Tm
0    1022  346.316733
1    1146  309.157497
2      79  190.748396
3    2279  205.788506
4    1342  225.393315
..    ...         ...
661  2663  282.279142
662   624  282.463590
663  2655  149.903674
664  2089  248.493281
665  1065  279.908626

[666 rows x 2 columns]


In [13]:
from IPython.display import FileLink
FileLink("submission.csv")