In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the pre-processed training and test tables from disk, in that order.
train_df, test_df = map(
    pd.read_csv,
    (
        '../data/processed/train_processed.csv',
        '../data/processed/test_processed.csv',
    ),
)

print("Data loaded")

Data loaded


In [3]:
# Prepare data for modeling
print("2. Preparing Data for Modeling")

# Feature columns are taken from test_df (excluding 'id' and 'SMILES'), so only
# columns that are present in the test table are selected from train_df.
features = [col for col in test_df.columns if col not in ['id', 'SMILES']]
X = train_df[features].copy()
y = train_df['Tm'].copy()

# Treat +/-inf as missing so they are skipped when the column means are taken.
X = X.replace([np.inf, -np.inf], np.nan)

# Mean-impute each column. A column with no finite values has a NaN mean and
# would be left unfilled by the first fillna, so fall back to a constant 0 for
# any value that is still missing afterwards.
# NOTE(review): the means are computed over every row of X, i.e. before any
# train/validation split done later, so held-out rows influence the imputed
# values. Consider moving imputation inside the modeling pipeline.
X = X.fillna(X.mean()).fillna(0)

print(f"Using {X.shape[1]} features for modeling")

2. Preparing Data for Modeling
Using 322 features for modeling


In [5]:
# Announce the stage, then create the two shared objects used for evaluation:
# an (initially empty) registry of per-model scores and a seeded, shuffled
# 5-fold splitter.
print("3. Setting up Cross Validation and model evaluation")
results = dict()
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

def evaluate_model(model_name, model):
    """Cross-validate ``model`` and record its mean absolute error.

    The estimator is wrapped in a ``StandardScaler`` -> model pipeline that is
    refit on the training rows of every fold produced by the module-level
    ``kf`` splitter over the module-level ``X`` / ``y``. Predictions and
    targets are passed through ``np.exp`` before scoring.
    NOTE(review): this assumes ``y`` is stored on a natural-log scale -- confirm
    against the preprocessing step that produced the 'Tm' column.

    Side effects: prints one line per fold plus a summary line, and stores
    ``{'mean_mae': ..., 'std_mae': ...}`` in the module-level ``results`` dict
    under ``model_name`` (overwriting any previous entry with that name).

    Args:
        model_name: Key under which the summary is stored in ``results`` and
            the label used in the printed summary.
        model: Unfitted estimator used as the final pipeline step. The same
            instance is refit on each fold.

    Returns:
        list: The MAE of each fold, in fold order.
    """
    mae_scores = []

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # fit() re-estimates both the scaler and the model from this fold's
        # training rows only, so scaling statistics never see the held-out rows.
        pipeline.fit(X_train, y_train)
        y_pred_log = pipeline.predict(X_val)

        # Score after exponentiating both sides, not on the fitted scale.
        y_pred_original = np.exp(y_pred_log)
        y_val_original = np.exp(y_val)

        mae = mean_absolute_error(y_val_original, y_pred_original)
        mae_scores.append(mae)
        print(f"Fold {fold+1} MAE: {mae:.5}")

    mean_mae = np.mean(mae_scores)
    std_mae = np.std(mae_scores)

    results[model_name] = {'mean_mae': mean_mae, 'std_mae': std_mae}
    print(f"\n{model_name} - Average MAE: {mean_mae:.5f} (+/- {std_mae:.5f})\n")

    # The original docstring promised the scores but nothing was returned.
    return mae_scores

3. Setting up Cross Validation and model evaluation


In [6]:
print("4. Running Models")

# Each entry: (banner printed before the run, key stored in `results`,
# zero-argument factory that builds the estimator just before it is evaluated).
model_specs = [
    # Model 1: Ridge Regression
    ("Evaluating Ridge", 'Ridge',
     lambda: Ridge(alpha=1.0, random_state=42)),
    # Model 2: Random Forest
    ("Evaluating RandomForestRegressor", 'RandomForest',
     lambda: RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
    # Model 3: LightGBM
    ("Evaluating LightGBM", 'LightGBM',
     lambda: lgb.LGBMRegressor(random_state=42, n_jobs=-1)),
    # Model 4: XGBoost
    ("Evaluating XGBoost", 'XGBoost',
     lambda: xgb.XGBRegressor(random_state=42, n_jobs=-1)),
]

# Evaluate the candidates one after another, in the order listed above.
for banner, result_key, build_model in model_specs:
    print(banner)
    evaluate_model(result_key, build_model())

print("--- Final Results Summary ---")
# One row per model, best (lowest) mean MAE first.
summary_by_model = pd.DataFrame(results).transpose()
results_df = summary_by_model.sort_values(by='mean_mae')
print(results_df)

4. Running Models
Evaluating Ridge
Fold 1 MAE: 35.86
Fold 2 MAE: 32.662
Fold 3 MAE: 32.669
Fold 4 MAE: 35.291
Fold 5 MAE: 36.343

Ridge - Average MAE: 34.56499 (+/- 1.58619)

Evaluating RandomForestRegressor
Fold 1 MAE: 31.295
Fold 2 MAE: 30.422
Fold 3 MAE: 29.607
Fold 4 MAE: 31.748
Fold 5 MAE: 30.4

RandomForest - Average MAE: 30.69453 (+/- 0.75008)

Evaluating LightGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004319 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18871
[LightGBM] [Info] Number of data points in the train set: 2129, number of used features: 247
[LightGBM] [Info] Start training from score 5.584540
Fold 1 MAE: 28.995
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003702 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18886
[LightGBM] [Info] Number of data points in the train set: 2129, num