In [159]:
import numpy as np
import pandas as pd
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore', category=UserWarning)

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV, KFold, learning_curve
from sklearn.metrics import mean_absolute_error, make_scorer

tqdm.tqdm.pandas()

--- 1. Chargement et Préparation des Données ---

In [160]:
print("\n--- Étape 1: Chargement et Préparation des Données ---")
datas = pd.read_csv('datasets/train.csv')

properties = datas.columns.tolist()[2:]
specialized_datasets = {}
for prop in properties:
    prop_df = datas[["SMILES", prop]].dropna()
    specialized_datasets[prop] = prop_df
    print(f"Dataset pour {prop}: {len(prop_df)} échantillons")

test_df = pd.read_csv('datasets/test.csv')
test_df = test_df.dropna(subset=["SMILES"])


--- Étape 1: Chargement et Préparation des Données ---
Dataset pour Tg: 511 échantillons
Dataset pour FFV: 7030 échantillons
Dataset pour Tc: 737 échantillons
Dataset pour Density: 613 échantillons
Dataset pour Rg: 614 échantillons


--- 2. Ingénierie des Caractéristiques avec RDKit et Morgan ---

In [161]:
print("\n--- Étape 2: Ingénierie des Caractéristiques avec RDKit ---")
desc_names = list(Descriptors.CalcMolDescriptors(Chem.MolFromSmiles('C')).keys())
feature_names = desc_names + [f'morgan_fp_{i}' for i in range(1024)]

# Initialiser le générateur de Morgan Fingerprints
morgan_generator = GetMorganGenerator(radius=2, fpSize=1024, includeChirality=True)

# Preprocessing the SMILES data
def preprocess_smile(smile: str) -> list:
    """
    Preprocess a SMILES string and return a DataFrame with the processed data.
    
    Parameters:
    smile (str): The SMILES string to preprocess.
    
    Returns:
    list: A list containing the processed SMILES data.
    """
    mol = Chem.MolFromSmiles(smile.replace('*', '[H]'))  # Replace * by [H] to obtain usable descriptors/fingerprints
    if mol is None:
        return None
    
    # Descripteurs physico-chimiques : 200
    descriptors = Descriptors.CalcMolDescriptors(mol)
    # Morgan Fingerprints : 1024
    fp = morgan_generator.GetFingerprint(mol)
    fp_list = list(map(int, fp.ToBitString()))

    features = list(descriptors.values()) + [int(x) for x in fp_list]
    if any([pd.isna(f) or np.isinf(f) for f in features]):
        return None
    return features

prepared_data = {}
for prop, df in specialized_datasets.items():
    print(f"Generation of features for {prop}...")
    features_list = df["SMILES"].progress_apply(preprocess_smile).tolist()

    valid_indices = [i for i, features in enumerate(features_list) if features is not None]
    features_list = [features_list[i] for i in valid_indices]
    valid_df = df.iloc[valid_indices]
    if len(features_list) == 0:
        print(f"No valid descriptor for {prop}.")
        continue

    X = pd.DataFrame(features_list, columns=feature_names, index=valid_df.index)
    y = valid_df[prop]

    prepared_data[prop] = {"input": X, "output": y}

    print(f"Dataset for {prop}: {len(X)} samples")


--- Étape 2: Ingénierie des Caractéristiques avec RDKit ---
Generation of features for Tg...


 57%|█████▋    | 293/511 [00:06<00:04, 51.30it/s][01:41:15] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 22 23 24 25 26 27 28 29
 89%|████████▊ | 453/511 [00:10<00:01, 47.27it/s][01:41:18] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 31 32 33 34 35 36 37 38
100%|██████████| 511/511 [00:11<00:00, 45.95it/s]


Dataset for Tg: 505 samples
Generation of features for FFV...


 32%|███▏      | 2262/7030 [00:55<01:38, 48.37it/s][01:42:15] Explicit valence for atom # 1 H, 2, is greater than permitted
 33%|███▎      | 2335/7030 [00:57<01:42, 45.73it/s][01:42:16] Explicit valence for atom # 1 H, 2, is greater than permitted
 35%|███▍      | 2435/7030 [00:59<01:54, 40.23it/s][01:42:19] Explicit valence for atom # 1 H, 2, is greater than permitted
 70%|███████   | 4930/7030 [02:02<00:47, 44.10it/s][01:43:21] Explicit valence for atom # 1 H, 2, is greater than permitted
 80%|████████  | 5644/7030 [02:19<00:27, 50.18it/s][01:43:39] Explicit valence for atom # 1 H, 2, is greater than permitted
100%|██████████| 7030/7030 [02:54<00:00, 40.31it/s]


Dataset for FFV: 7005 samples
Generation of features for Tc...


100%|██████████| 737/737 [00:10<00:00, 68.28it/s]


Dataset for Tc: 737 samples
Generation of features for Density...


100%|██████████| 613/613 [00:08<00:00, 68.71it/s]


Dataset for Density: 613 samples
Generation of features for Rg...


100%|██████████| 614/614 [00:08<00:00, 69.39it/s]


Dataset for Rg: 614 samples


In [162]:
scaler_params = {}
for elt in tqdm.tqdm(prepared_data):
    scaler_params[elt] = []
    for col in tqdm.tqdm(prepared_data[elt]['input']):
        scaler = StandardScaler()
        prepared_data[elt]['input'][col] = scaler.fit_transform(prepared_data[elt]['input'][[col]])
        scaler_params[elt].append(scaler)

    X_train, X_val, y_train, y_val = train_test_split(prepared_data[elt]['input'], prepared_data[elt]['output'], test_size=0.2, random_state=42)
    prepared_data[elt]['input_train'] = X_train
    prepared_data[elt]['output_train'] = y_train
    prepared_data[elt]['input_val'] = X_val
    prepared_data[elt]['output_val'] = y_val

  0%|          | 0/5 [00:00<?, ?it/s]

1241it [00:03, 388.15it/s]
 18%|█▊        | 1241/7005 [00:03<00:15, 379.71it/s]
1241it [00:02, 415.63it/s]6<00:10,  3.35s/it]
1241it [00:02, 416.10it/s]9<00:06,  3.20s/it]
1241it [00:03, 408.44it/s]2<00:03,  3.14s/it]
100%|██████████| 5/5 [00:15<00:00,  3.16s/it]


--- 3. Implémentation des fonctions d'évaluation wMAE et de visualisation ---

In [163]:
# Reweighting factor
def reweighting_factor(column_name: str, k: int) -> float:
    """
    Calculate the reweighting factor for a given column in a DataFrame.
    
    Parameters:
    column_name (str): The name of the column to calculate the reweighting factor for.
    k (int): The total number of tasks.
    
    Returns:
    float: The reweighting factor.
    """
    ri = datas[column_name].max() - datas[column_name].min() + 1e-6
    denominator = sum([(1 / len(datas[data].dropna()))**.5 for data in datas.head() if data not in ['id', 'SMILES']])
    return (k * (1 / len(datas[column_name].dropna()))**.5) / (ri * denominator)

weights = {
    'Tg' : reweighting_factor('Tg', 5),
    'FFV' : reweighting_factor('FFV', 5),
    'Tc' : reweighting_factor('Tc', 5),
    'Density' : reweighting_factor('Density', 5),
    'Rg' : reweighting_factor('Rg', 5)
}

# Single Weighted Mean Absolute Error
def single_prop_wMAE(y_true, y_pred, weight):
    """
    Calcule le MAE pondéré pour une seule propriété.
    
    Paramètres :
    y_true (array-like) : Valeurs réelles pour une seule propriété.
    y_pred (array-like) : Valeurs prédites pour une seule propriété.
    weight (float) : Le poids spécifique de cette propriété.
    
    Retourne :
    float : Le MAE pondéré.
    """
    error = np.abs(y_true - y_pred)
    weighted_error = weight * error
    return np.mean(weighted_error)

# Weighted Mean Absolute Error
def wMAE(y_true: list, y_pred: list) -> float:
    """
    Calculate the weighted Mean Absolute Error.
    
    Parameters:
    y_true (array-like): True values.
    y_pred (array-like): Predicted values.
    
    Returns:
    float: The weighted Mean Absolute Error.
    """
    errors = 0
    for i in tqdm.tqdm(range(len(y_true)), desc='Calculating wMAE'):
        error = 0
        for x in ['Tg', 'FFV', 'Tc', 'Density', 'Rg']:
            error += weights[x] * abs((y_pred[i][x] or 0) - (y_true[i][x] or 0))
        errors += error
    return errors / len(y_true)

--- 4. Optimisation des modèles avec GridSearchCV et génération des courbes d'apprentissage ---

In [164]:
print("Configuration des modèles et hyperparamètres...")
models_params = {
    "RandomForest": (
        RandomForestRegressor(random_state=42, n_jobs=-1),
        {
            "n_estimators": [100, 300],
            "max_depth": [10, 20, None],
            "min_samples_leaf": [1, 2],
            "max_features": ["sqrt", 1.0]
        }
    ),
    "LightGBM": (
        LGBMRegressor(random_state=42, n_jobs=-1),
        {
            "n_estimators": [200, 500],
            "learning_rate": [0.01, 0.05],
            "num_leaves": [31, 50],
            "max_depth": [-1, 10],
            "subsample": [0.7, 1.0],
            "colsample_bytree": [0.7, 1.0]
        }
    ),
    "Ridge": (
        Ridge(),
        {
            "alpha": [0.1, 1.0, 10.0]
        }
    ),
    "SVR": (
        SVR(),
        {
            "C": [1, 10],
            "kernel": ["rbf"],
            "gamma": ["scale"]
        }
    ),
    "KNN": (
        KNeighborsRegressor(n_jobs=-1),
        {
            "n_neighbors": [3, 5, 10],
            "weights": ["uniform", "distance"]
        }
    ),
    "MLP": (
        MLPRegressor(random_state=42, max_iter=500, early_stopping=True),
        {
            "hidden_layer_sizes": [(50,), (100,), (50, 25), (64, 32)],
            "activation": ["relu"],
            "solver": ["adam"],
            "alpha": [0.001, 0.01],
            "learning_rate_init": [0.001]
        }
    )
}

Configuration des modèles et hyperparamètres...


In [165]:
print("\n--- Étape 4: Optimisation des modèles et courbes d'apprentissage ---")
best_estimators_overall = {}
oof_predictions = {}

for prop in tqdm.tqdm(properties, desc="Optimizing Models"):
    print(f"\n===== Optimizing for property: {prop} =====")
    input = prepared_data[prop]['input_train']
    output = prepared_data[prop]['output_train']
    best_model_for_prop = None
    best_score_for_prop = float("inf")
    best_model_name_for_prop = ""

    wMAE_scorer = make_scorer(
        single_prop_wMAE, 
        greater_is_better=False, 
        weight=weights[prop]
    )

    for model_name, (model, params) in models_params.items():
        print(f"Optimizing {model_name} for {prop}...")
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=params,
            scoring=wMAE_scorer,
            cv=KFold(n_splits=5),
            n_jobs=-1,
            verbose=0
        )
        grid_search.fit(input, output)

        best_model_found = grid_search.best_estimator_
        best_score_found = -grid_search.best_score_
        print(f"Best score for {model_name}: {best_score_found:.4f}")
        print(f"Best params for {model_name} on {prop}: {grid_search.best_params_}")

        
        if best_score_found < best_score_for_prop:
            best_score_for_prop = best_score_found
            best_estimators_overall[prop] = best_model_found
            best_model_name_for_prop = model_name
            print(f"*** NOUVEAU MEILLEUR MODÈLE GLOBAL pour {prop}: {model_name} ***")
    print(f"\n>>> Résultat final pour {prop}: Le meilleur modèle est {best_model_name_for_prop} avec une MAE de {best_score_for_prop:.4f} <<<")


--- Étape 4: Optimisation des modèles et courbes d'apprentissage ---


Optimizing Models:   0%|          | 0/5 [00:00<?, ?it/s]


===== Optimizing for property: Tg =====
Optimizing RandomForest for Tg...
Best score for RandomForest: 0.1071
Best params for RandomForest on Tg: {'max_depth': 20, 'max_features': 1.0, 'min_samples_leaf': 1, 'n_estimators': 300}
*** NOUVEAU MEILLEUR MODÈLE GLOBAL pour Tg: RandomForest ***
Optimizing LightGBM for Tg...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004657 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10300
[LightGBM] [Info] Number of data points in the train set: 404, number of used features: 290
[LightGBM] [Info] Start training from score 97.036870
Best score for LightGBM: 0.1058
Best params for LightGBM on Tg: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500, 'num_leaves': 31, 'subsample': 0.7}
*** NOUVEAU MEILLEUR MODÈLE GLOBAL pour Tg: LightGBM ***
Optimizing Ridge for Tg...
Best score for Ridge: 0.1443
Best params for Ridge on Tg: {'alpha

Optimizing Models:  20%|██        | 1/5 [04:42<18:49, 282.39s/it]

Best score for MLP: 0.1402
Best params for MLP on Tg: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (64, 32), 'learning_rate_init': 0.001, 'solver': 'adam'}

>>> Résultat final pour Tg: Le meilleur modèle est LightGBM avec une MAE de 0.1058 <<<

===== Optimizing for property: FFV =====
Optimizing RandomForest for FFV...
Best score for RandomForest: 0.0046
Best params for RandomForest on FFV: {'max_depth': None, 'max_features': 1.0, 'min_samples_leaf': 1, 'n_estimators': 300}
*** NOUVEAU MEILLEUR MODÈLE GLOBAL pour FFV: RandomForest ***
Optimizing LightGBM for FFV...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26358
[LightGBM] [Info] Number of data points in the train set: 5604, number of used features: 1096
[LightGBM] [Info] Start training from score 0.367481
Best score for LightGBM: 0.0041
Best params for LightGBM on FFV: {'c

Optimizing Models:  40%|████      | 2/5 [1:25:58<2:29:14, 2984.84s/it]

Best score for MLP: 0.0052
Best params for MLP on FFV: {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (50, 25), 'learning_rate_init': 0.001, 'solver': 'adam'}

>>> Résultat final pour FFV: Le meilleur modèle est LightGBM avec une MAE de 0.0041 <<<

===== Optimizing for property: Tc =====
Optimizing RandomForest for Tc...
Best score for RandomForest: 0.0622
Best params for RandomForest on Tc: {'max_depth': None, 'max_features': 1.0, 'min_samples_leaf': 2, 'n_estimators': 300}
*** NOUVEAU MEILLEUR MODÈLE GLOBAL pour Tc: RandomForest ***
Optimizing LightGBM for Tc...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006001 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12707
[LightGBM] [Info] Number of data points in the train set: 589, number of used features: 262
[LightGBM] [Info] Start training from score 0.256983
Best score for LightGBM: 0.0624
Best params for LightGBM on Tc: {'colsampl

Optimizing Models:  60%|██████    | 3/5 [1:31:19<58:56, 1768.40s/it]  

Best score for MLP: 0.6338
Best params for MLP on Tc: {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (50, 25), 'learning_rate_init': 0.001, 'solver': 'adam'}

>>> Résultat final pour Tc: Le meilleur modèle est RandomForest avec une MAE de 0.0622 <<<

===== Optimizing for property: Density =====
Optimizing RandomForest for Density...
Best score for RandomForest: 0.0372
Best params for RandomForest on Density: {'max_depth': None, 'max_features': 1.0, 'min_samples_leaf': 1, 'n_estimators': 300}
*** NOUVEAU MEILLEUR MODÈLE GLOBAL pour Density: RandomForest ***
Optimizing LightGBM for Density...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005367 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10802
[LightGBM] [Info] Number of data points in the train set: 490, number of used features: 254
[LightGBM] [Info] Start training from score 0.985889
Best score for LightGBM: 0.0367
Best params for

Optimizing Models:  80%|████████  | 4/5 [1:35:51<19:37, 1177.68s/it]

Best score for MLP: 0.3178
Best params for MLP on Density: {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (64, 32), 'learning_rate_init': 0.001, 'solver': 'adam'}

>>> Résultat final pour Density: Le meilleur modèle est LightGBM avec une MAE de 0.0367 <<<

===== Optimizing for property: Rg =====
Optimizing RandomForest for Rg...
Best score for RandomForest: 0.0932
Best params for RandomForest on Rg: {'max_depth': 10, 'max_features': 1.0, 'min_samples_leaf': 1, 'n_estimators': 300}
*** NOUVEAU MEILLEUR MODÈLE GLOBAL pour Rg: RandomForest ***
Optimizing LightGBM for Rg...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005556 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10856
[LightGBM] [Info] Number of data points in the train set: 491, number of used features: 254
[LightGBM] [Info] Start training from score 16.347523
Best score for LightGBM: 0.0939
Best params for LightGBM on Rg: {'c

Optimizing Models: 100%|██████████| 5/5 [1:40:36<00:00, 1207.25s/it]

Best score for MLP: 0.1977
Best params for MLP on Rg: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (64, 32), 'learning_rate_init': 0.001, 'solver': 'adam'}

>>> Résultat final pour Rg: Le meilleur modèle est RandomForest avec une MAE de 0.0932 <<<





In [166]:
# Génération des prédictions Out-of-Fold pour le calcul du wMAE global
print("\nGenerating OOF predictions for global wMAE calculation...")
for prop in properties:
    X = prepared_data[prop]["input_val"]
    y = prepared_data[prop]["output_val"]
    model = best_estimators_overall[prop]
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds_prop = np.zeros(len(X))
    
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        oof_preds_prop[val_idx] = model.predict(X_val)
        
    oof_predictions[prop] = oof_preds_prop


Generating OOF predictions for global wMAE calculation...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000629 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2368
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 136
[LightGBM] [Info] Start training from score 93.048913
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000676 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2405
[LightGBM] [Info] Number of data points in the train set: 81, number of used features: 136
[LightGBM] [Info] Start training from score 86.063146
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000706 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2413
[LightGBM] [Info] Number of data points in the train set: 81, n

--- 5. Évaluation finale avec la métrique wMAE et génération du fichier de soumission ---

In [167]:
print("\n--- Étape 5: Évaluation finale et génération du fichier de soumission ---")

def calculate_global_wmae(oof_preds: dict, true_values: dict, weights: dict):
    properties = list(oof_preds.keys())
    K = len(properties)
    
    total_weighted_error = 0
    
    for prop in properties:
        preds = oof_preds[prop]
        trues = true_values[prop]
        
        mae = mean_absolute_error(trues, preds)
        total_weighted_error += weights[prop] * mae
        
    return total_weighted_error / K

# Préparer les vraies valeurs pour le calcul du wMAE global
true_values_for_wmae = {prop: prepared_data[prop]["output_val"].values for prop in properties}

# Calcul du score wMAE global
final_wmae_score = calculate_global_wmae(oof_predictions, true_values_for_wmae, weights)

print(f"\n=========================================")
print(f"Global Estimated wMAE Score (from OOF predictions): {final_wmae_score:.4f}")
print(f"=========================================")


--- Étape 5: Évaluation finale et génération du fichier de soumission ---

Global Estimated wMAE Score (from OOF predictions): 0.0734


Ré-entraîner les meilleurs modèles sur toutes leurs données respectives pour la prédiction finale

In [168]:
print("\nRe-training final models on all available data for test set prediction...")
for prop in properties:
    X = prepared_data[prop]["input"]
    y = prepared_data[prop]["output"]
    best_estimators_overall[prop].fit(X, y)
print("Final models are ready.")


Re-training final models on all available data for test set prediction...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006597 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12801
[LightGBM] [Info] Number of data points in the train set: 505, number of used features: 336
[LightGBM] [Info] Start training from score 95.027517
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037681 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26824
[LightGBM] [Info] Number of data points in the train set: 7005, number of used features: 1147
[LightGBM] [Info] Start training from score 0.367172
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006278 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13466
[LightGBM] [Info] Number of data points i

Génération des caractéristiques pour le jeu de test

In [169]:
print("\nGenerating features for the test set...")
test_features_list = test_df["SMILES"].progress_apply(preprocess_smile).tolist()

# Filtrer les entrées None si des SMILES invalides ont été rencontrés
valid_test_indices = [i for i, features in enumerate(test_features_list) if features is not None]
valid_test_df = test_df.iloc[valid_test_indices]
test_features_list = [test_features_list[i] for i in valid_test_indices]

X_test = pd.DataFrame(test_features_list, columns=feature_names, index=valid_test_df.index)

# for i in tqdm.tqdm(range(len(X_test)), desc="Standardizing test features"):
#     for col in X_test.columns:
#         X_test[col][i] = scaler_params[col].transform(X_test[[col]])[i]

# Créer le DataFrame de soumission
submission_df = pd.DataFrame(valid_test_df["id"])

# Faire les prédictions avec chaque modèle expert
print("\nMaking predictions on the test set...")
for prop, model in best_estimators_overall.items():
    # Assurez-vous que l'ordre des index est conservé pour la fusion
    prop_predictions = pd.Series(model.predict(X_test), index=valid_test_df.index)
    print(prop_predictions.head())
    submission_df = submission_df.merge(prop_predictions.rename(prop), left_on="id", right_index=True, how="left")

# # Remplir les NaN pour les SMILES invalides qui n'ont pas été traités
# for prop in properties:
#     if prop not in submission_df.columns:
#         submission_df[prop] = np.nan # Ou une valeur par défaut si spécifié par la compétition
#     submission_df[prop] = submission_df[prop].fillna(0) # Remplir les NaN avec 0 pour les SMILES invalides ou manquants

# submission_df.to_csv("submission.csv", index=False)
# print("\nFichier de soumission 'submission.csv' créé avec succès !")
# print(submission_df.head())


Generating features for the test set...


100%|██████████| 3/3 [00:00<00:00, 17.39it/s]


Making predictions on the test set...
0     91.097856
1    150.834429
2     90.919352
dtype: float64





0    0.359745
1    0.379493
2    0.372142
dtype: float64
0    0.254496
1    0.265201
2    0.315651
dtype: float64
0    1.192751
1    1.155154
2    1.104512
dtype: float64
0    19.468094
1    17.211344
2    17.540523
dtype: float64
