<a href="https://colab.research.google.com/github/Creatur3245/.isort.cfg/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line in os.walk order.
input_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Notebook-style execution for Kaggle-compatible EDA + Baseline + XGBoost (with fallbacks)
# This will run in the visible python environment and display outputs for the user.
# It attempts to load data from /kaggle/input; if not found, it creates a synthetic dataset
# with similar size and structure so the rest of the pipeline can be demonstrated.
# Plots use matplotlib (no seaborn) as required by environment rules.
# The final outputs (models, summary CSV) will be saved to /mnt/data and links provided.

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
import joblib
import warnings
warnings.filterwarnings("ignore")

print("Listing files under /kaggle/input (if any):")
# Nothing is printed below the header when the directory is missing or empty.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# Read the three competition CSVs; if any of them is missing, fall back to a
# seeded synthetic regression problem so the rest of the cell can still run.
try:
    train_df, test_df, sample_submission_df = map(
        pd.read_csv,
        (
            '/kaggle/input/melting-point/train.csv',
            '/kaggle/input/melting-point/test.csv',
            '/kaggle/input/melting-point/sample_submission.csv',
        ),
    )
    print("\nCompetition data loaded successfully.")
except FileNotFoundError:
    print("\nCompetition data not found in /kaggle/input. Generating a synthetic dataset for demonstration.")
    rng = np.random.default_rng(42)
    n_total, n_features = 3328, 50
    feat_names = [f"feat_{i}" for i in range(n_features)]
    n_test_rows = int(n_total*0.2)

    # NOTE: the order of the rng draws below determines the generated values;
    # keep it (features, families, coefficients, noise, test features, test families).
    X = rng.normal(size=(n_total, n_features))
    families = rng.integers(0, 10, size=n_total)  # 10 families
    coef = rng.normal(scale=2.0, size=n_features)
    noise = rng.normal(scale=5.0, size=n_total)
    # Target: linear signal + a per-family offset + Gaussian noise.
    Tm = (X @ coef) + (families * 3.5) + noise
    train_df = pd.DataFrame(X, columns=feat_names).assign(family=families, Tm=Tm)

    # Unlabelled synthetic test set (20% of the train size) and a zero-filled submission.
    X_test_synth = rng.normal(size=(n_test_rows, n_features))
    test_df = pd.DataFrame(X_test_synth, columns=feat_names)
    test_df["family"] = rng.integers(0, 10, size=n_test_rows)
    sample_submission_df = pd.DataFrame({'id': range(len(test_df)), 'Tm': np.zeros(len(test_df))})
    print("Synthetic dataset created.")

# Combine train and test features into one frame for EDA / feature engineering.
# errors='ignore' is required: the synthetic test set built in the fallback
# branch has no 'id' column, so an unconditional drop raised KeyError there.
df = pd.concat(
    [
        train_df.drop(columns=['Tm']),
        test_df.drop(columns=['id'], errors='ignore'),
    ],
    ignore_index=True,
)
# Target aligned with df's rows: known for the train rows, NaN for the test rows.
df['Tm'] = pd.concat([train_df['Tm'], pd.Series([np.nan]*len(test_df))], ignore_index=True)


# Preview the first rows of each frame. `display` is not imported in this cell;
# presumably it is the notebook's built-in rich renderer.
for heading, frame in (
    ("\nTrain Dataframe head:", train_df),
    ("\nTest Dataframe head:", test_df),
    ("\nSample Submission head:", sample_submission_df),
):
    print(heading)
    display(frame.head())

# The result of info() is itself passed to print(), exactly as before.
for heading, frame in (
    ("\nBasic info (Train):", train_df),
    ("\nBasic info (Test):", test_df),
):
    print(heading)
    print(frame.info())


# Quick EDA numbers (row/column counts are reused by the run summary later)
(n_rows_train, n_cols_train), (n_rows_test, n_cols_test) = train_df.shape, test_df.shape
print(f"\nTrain Rows: {n_rows_train}, Train Columns: {n_cols_train}")
print(f"Test Rows: {n_rows_test}, Test Columns: {n_cols_test}")


print("\nMissing values per column (Train - top 10):")
missing_per_column = train_df.isna().sum()
print(missing_per_column.sort_values(ascending=False).head(10))

# Histogram of the training target, drawn through the object-oriented
# matplotlib API on a single 6x4 inch figure.
fig_tm, ax_tm = plt.subplots(figsize=(6,4))
ax_tm.hist(train_df["Tm"].values, bins=40)
ax_tm.set_title("Distribution of Tm (melting point) — Madame Ms Strange says: 'Observe the spread'")
ax_tm.set_xlabel("Tm")
ax_tm.set_ylabel("Count")
fig_tm.tight_layout()
plt.show()

# Correlation with target (top 12 - using train_df)
# Only numeric columns can be correlated; calling DataFrame.corr() on a frame
# that still holds text columns raises on current pandas, so filter first.
numeric_train = train_df.select_dtypes(include="number")
corrs = numeric_train.corr()["Tm"].abs().sort_values(ascending=False)
print("\nTop correlations with Tm (Train):")
display(corrs.head(12))

# Scatter of the most correlated feature vs Tm (using train_df).
# 'Tm' correlates perfectly with itself, so drop it before taking the top entry.
top_feat = corrs.drop('Tm', errors='ignore').index[0]
plt.figure(figsize=(6,4))
plt.scatter(train_df[top_feat], train_df["Tm"], s=6)
plt.title(f"Scatter: {top_feat} vs Tm (Train)")
plt.xlabel(top_feat)
plt.ylabel("Tm")
plt.tight_layout()
plt.show()

# PCA 2D projection of features (using train_df).
# Features are every numeric column except the target, the id and the family
# label. The numeric filter keeps text columns out of the scaler and the models,
# matching the dtype filter used by the tuning cell.
feature_cols = [
    c for c in train_df.columns
    if c not in ("Tm", "family", "id") and pd.api.types.is_numeric_dtype(train_df[c])
]
X_eda = train_df[feature_cols].values
scaler_eda = StandardScaler()
Xs_eda = scaler_eda.fit_transform(X_eda)

pca_eda = PCA(n_components=2)
proj_eda = pca_eda.fit_transform(Xs_eda)

plt.figure(figsize=(6,4))
# Colour points by family when the column exists, otherwise use one colour.
plt.scatter(proj_eda[:,0], proj_eda[:,1], c=train_df.get("family", pd.Series([0]*len(train_df))), s=8)
plt.title("PCA (2 components) of features — families colored (Train)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.show()

# Build the NumPy design matrices; the test matrix uses the same feature
# columns (and column order) as the training matrix.
X_train, X_test = (frame[feature_cols].to_numpy() for frame in (train_df, test_df))
y_train = train_df["Tm"].to_numpy()

n_fit_rows, n_predict_rows = X_train.shape[0], X_test.shape[0]
print(f"\nTrain size for modeling: {n_fit_rows}, Test size for prediction: {n_predict_rows}")

# Baseline linear model: standardise the features, then ordinary least squares.
lr = make_pipeline(StandardScaler(), LinearRegression()).fit(X_train, y_train)
# pred_lr_train holds predictions for the unlabelled test set (despite its name);
# pred_lr_train_eval holds in-sample predictions, the only ones that can be scored.
pred_lr_train, pred_lr_train_eval = lr.predict(X_test), lr.predict(X_train)
mae_lr_train = mean_absolute_error(y_train, pred_lr_train_eval)
print(f"\nBaseline LinearRegression MAE (on Train): {mae_lr_train:.4f}")


# Ridge regression: the same scaled pipeline with an L2 penalty (alpha=1.0).
ridge = make_pipeline(StandardScaler(), Ridge(alpha=1.0)).fit(X_train, y_train)
# Test-set predictions first, then in-sample predictions for the training MAE.
pred_ridge_train, pred_ridge_train_eval = ridge.predict(X_test), ridge.predict(X_train)
mae_ridge_train = mean_absolute_error(y_train, pred_ridge_train_eval)
print(f"Ridge Regression MAE (on Train): {mae_ridge_train:.4f}")


# SVR (nonlinear baseline). When the training set is large, fit on a seeded
# random subset so the reported MAE is reproducible like the other models here.
svr = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1.0, epsilon=0.5))
svr_max_rows = 5000  # above this many rows, fit on a random subset instead
try:
    if len(X_train) > svr_max_rows:
        subset_idx = np.random.default_rng(42).choice(len(X_train), svr_max_rows, replace=False)
        X_train_subset, y_train_subset = X_train[subset_idx], y_train[subset_idx]
        svr_fit_label = "Train Subset"
    else:
        X_train_subset, y_train_subset = X_train, y_train
        svr_fit_label = "Train"

    svr.fit(X_train_subset, y_train_subset)
    # In-sample error on exactly the rows the model was fitted on.
    pred_svr_train_eval = svr.predict(X_train_subset)
    mae_svr_train = mean_absolute_error(y_train_subset, pred_svr_train_eval)
    print(f"SVR MAE (on {svr_fit_label}): {mae_svr_train:.4f}")
    pred_svr_train = svr.predict(X_test)  # predictions for the full test set

except Exception as e:
    # Deliberate best-effort: SVR is optional, so report the failure and continue
    # with placeholders rather than aborting the whole cell.
    print("SVR failed or was too slow in this environment:", e)
    pred_svr_train = np.zeros(len(X_test))  # Placeholder
    mae_svr_train = float("nan")  # keep the name defined for later code


# Random Forest baseline: 200 trees, fixed seed, all cores.
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(X_train, y_train)
# Test-set predictions, then in-sample predictions for the training MAE.
pred_rf_test, pred_rf_train_eval = rf.predict(X_test), rf.predict(X_train)
mae_rf_train = mean_absolute_error(y_train, pred_rf_train_eval)
print(f"RandomForestRegressor MAE (on Train): {mae_rf_train:.4f}")


# Optional XGBoost baseline. Any failure (missing package or a fit error) is
# reported and replaced by zero predictions; use_xgb records whether it worked.
use_xgb = False
try:
    import xgboost as xgb
    xgb_model = xgb.XGBRegressor(n_estimators=300, random_state=42, verbosity=0)
    xgb_model.fit(X_train, y_train)
    pred_xgb_test, pred_xgb_train_eval = xgb_model.predict(X_test), xgb_model.predict(X_train)
    mae_xgb_train = mean_absolute_error(y_train, pred_xgb_train_eval)
    print(f"XGBoost MAE (on Train): {mae_xgb_train:.4f}")
    use_xgb = True  # only reached when every step above succeeded
except Exception as exc:
    print("XGBoost not available or failed to run here. Skipping XGBoost. Error:", exc)
    use_xgb = False
    pred_xgb_test = np.zeros(len(X_test)) # Placeholder


# Rank the forest's feature importances and report the 12 largest.
importances = rf.feature_importances_
imp_idx = np.flip(np.argsort(importances))[:12]  # indices, most important first
top_features = list(zip((feature_cols[i] for i in imp_idx), importances[imp_idx]))
print("\nTop feature importances (from RandomForest):")
for feat_name, feat_importance in top_features:
    print(f"{feat_name}: {feat_importance:.4f}")

# Persist the fitted baselines under /mnt/data/models (XGBoost only if it ran).
os.makedirs('/mnt/data/models', exist_ok=True)
for fitted_model, model_path in (
    (lr, '/mnt/data/models/linear_baseline.joblib'),
    (ridge, '/mnt/data/models/ridge_baseline.joblib'),
    (rf, '/mnt/data/models/rf_baseline.joblib'),
):
    joblib.dump(fitted_model, model_path)
if use_xgb:
    joblib.dump(xgb_model, '/mnt/data/models/xgb_baseline.joblib')

# No submission CSV is written here; the original draft of this step
# (copy sample_submission_df, fill 'Tm' with pred_rf_test, write
# /mnt/data/submission.csv) was left disabled.


print("\nSaved models to /mnt/data/models.")
print("Files saved:")
for root_dir, _subdirs, file_names in os.walk('/mnt/data'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# Diagnostic plot: actual vs predicted on the TRAIN set (the test set has no
# labels) for whichever of RandomForest / XGBoost has the lower training MAE.
best_name, best_pred_eval, best_mae_eval = "RandomForest", pred_rf_train_eval, mae_rf_train
if use_xgb and mae_xgb_train < best_mae_eval:
    best_name, best_pred_eval, best_mae_eval = "XGBoost", pred_xgb_train_eval, mae_xgb_train

tm_range = [y_train.min(), y_train.max()]  # end points of the y = x reference line
plt.figure(figsize=(6,4))
plt.scatter(y_train, best_pred_eval, s=8)
plt.plot(tm_range, tm_range, 'k--', lw=2)
plt.title(f"Actual vs Predicted — {best_name} (MAE={best_mae_eval:.3f}) on Train Set")
plt.xlabel("Actual Tm (Train)")
plt.ylabel("Predicted Tm (Train)")
plt.tight_layout()
plt.show()

# Cross-validated MAE for RandomForest as a more robust estimate (on Train set).
# GroupKFold is not among this cell's imports, so bring it in here.
from sklearn.model_selection import GroupKFold

n_cv_splits = 5
kf = KFold(n_splits=n_cv_splits, shuffle=True, random_state=42)
# GroupKFold needs at least as many distinct groups as folds, so only use it
# when 'family' provides that many; otherwise fall back to shuffled KFold.
if 'family' in train_df.columns and train_df['family'].nunique() >= n_cv_splits:
    cv_strategy = GroupKFold(n_splits=n_cv_splits)
    groups_train = train_df['family']
    print("\nUsing GroupKFold for cross-validation.")
else:
    cv_strategy = kf
    groups_train = None
    print("\nUsing KFold for cross-validation.")


# The scorer returns negated MAE, hence the leading minus sign.
cv_scores = -cross_val_score(RandomForestRegressor(n_estimators=150, random_state=42, n_jobs=-1),
                            X_train, y_train, scoring='neg_mean_absolute_error', cv=cv_strategy, groups=groups_train, n_jobs=-1)
print(f"\n5-fold CV RandomForest MAE (on Train): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Save a brief run summary text.
# The XGBoost line depends on whether the optional model actually ran.
xgb_summary_line = (
    'XGBoost MAE (Train): {:.4f}'.format(mae_xgb_train)
    if use_xgb
    else 'XGBoost: not available in this environment.'
)
summary_text = f"""
Run summary — Madame Ms Strange oracle voice:
Train Rows: {n_rows_train}, Test Rows: {n_rows_test}, Features: {len(feature_cols)}.
Baseline Linear MAE (Train): {mae_lr_train:.4f}
Ridge MAE (Train): {mae_ridge_train:.4f}
RandomForest MAE (Train): {mae_rf_train:.4f}
{xgb_summary_line}

Artifacts saved to /mnt/data/models (no submission.csv is written by this cell; the tuning cell writes /kaggle/working/submission.csv)
"""
os.makedirs('/mnt/data', exist_ok=True)
# Explicit UTF-8: the text contains non-ASCII characters, and the platform
# default encoding is not guaranteed to be able to encode them.
with open('/mnt/data/run_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary_text)

print("\nRun summary saved to /mnt/data/run_summary.txt")
print("\nNotebook execution complete. Look for outputs in /mnt/data/.")

In [21]:
import os, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, GroupKFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
import joblib, warnings
warnings.filterwarnings("ignore")

# --- Load dataset from /kaggle/input ---
# Falls back to the combined frame `df` (and `test_df`) left behind by the EDA
# cell when the competition files are missing. If neither source exists the
# cell now stops with a clear error instead of printing a message and failing
# later on an undefined `train_df`.
try:
    train_df = pd.read_csv('/kaggle/input/melting-point/train.csv')
    test_df = pd.read_csv('/kaggle/input/melting-point/test.csv')
    sample_submission_df = pd.read_csv('/kaggle/input/melting-point/sample_submission.csv')
    print("Competition data loaded successfully.")
except FileNotFoundError as err:
    print("Competition data not found in /kaggle/input. Using synthetic dataset if available from previous cell.")
    if 'df' not in globals() or 'test_df' not in globals():
        raise FileNotFoundError(
            "No competition data and no synthetic data ('df') found. Please run the previous cell."
        ) from err

    # `df` is train rows followed by test rows, so the train part is everything
    # before the last len(test_df) rows.
    n_train = len(df) - len(test_df)
    train_df = df.iloc[:n_train].copy()
    test_df = df.iloc[n_train:].copy()
    test_df['id'] = range(len(test_df))  # dummy id for the synthetic test rows
    sample_submission_df = pd.DataFrame({'id': test_df['id'], 'Tm': np.zeros(len(test_df))})
    print("Using synthetic data split for tuning.")


# --- 'Tm' is the regression target; everything non-object except it and 'id' is a feature ---
target = "Tm"
assert target in train_df.columns, f"Expected column '{target}' not found in training dataset!"
non_feature_cols = (target, 'id')
feature_cols = [
    col for col in train_df.columns
    if col not in non_feature_cols and train_df[col].dtype != 'object'
]

# Train and test matrices share the same columns in the same order.
X_train, X_test = train_df[feature_cols].values, test_df[feature_cols].values
y_train = train_df[target].values

# Choose the CV splitter. KFold is needed for the fallback branch but this
# cell's import line only brings in GroupKFold, so import it here.
from sklearn.model_selection import KFold

n_cv_splits = 5
# GroupKFold needs at least as many distinct groups as folds; otherwise use
# shuffled KFold.
if 'family' in train_df.columns and train_df['family'].nunique() >= n_cv_splits:
    groups_train = train_df['family'].values
    cv = GroupKFold(n_splits=n_cv_splits)
    print("Using GroupKFold for cross-validation.")
else:
    groups_train = None
    cv = KFold(n_splits=n_cv_splits, shuffle=True, random_state=42)
    print("Using KFold for cross-validation.")


# greater_is_better=False makes the scorer return negated MAE, so search
# objects can maximise it; negate best_score_ to read it as an MAE.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# --- RandomForest tuning: 25 random draws from the grid below, scored by CV MAE ---
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
param_dist_rf = dict(
    n_estimators=[100, 300, 500, 800],
    max_depth=[None, 10, 20, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", None],
)
search_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist_rf,
    n_iter=25,
    scoring=mae_scorer,
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# Group labels are only handed to fit() when the splitter is a GroupKFold.
rf_fit_kwargs = {"groups": groups_train} if isinstance(cv, GroupKFold) else {}
search_rf.fit(X_train, y_train, **rf_fit_kwargs)

best_rf = search_rf.best_estimator_
print("Best RF params:", search_rf.best_params_)
# best_score_ is the negated MAE produced by mae_scorer.
print("Best RF CV score (MAE):", -search_rf.best_score_)

# --- In-sample evaluation (the test set has no labels to score against) ---
pred_rf_train_eval = best_rf.predict(X_train)
mae_rf_train = mean_absolute_error(y_train, pred_rf_train_eval)
print("Tuned RF Train MAE:", mae_rf_train)

# Predictions for the competition test set
pred_rf_test = best_rf.predict(X_test)


# --- Persist the tuned forest ---
os.makedirs("/kaggle/working/models", exist_ok=True)
joblib.dump(best_rf, "/kaggle/working/models/rf_tuned.joblib")

# --- Optional XGBoost tuning; any failure is reported and replaced by zeros ---
use_xgb = False
try:
    import xgboost as xgb
    use_xgb = True
    # tree_method='hist' was chosen for potentially faster training.
    xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, tree_method='hist')
    param_dist_xgb = dict(
        n_estimators=[200, 500, 800],
        max_depth=[3, 6, 10],
        learning_rate=[0.01, 0.05, 0.1],
        subsample=[0.7, 0.9, 1.0],
        colsample_bytree=[0.7, 0.9, 1.0],
    )
    search_xgb = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_dist_xgb,
        n_iter=25,
        scoring=mae_scorer,
        cv=cv,
        n_jobs=-1,
        random_state=42,
        verbose=1,
    )
    # Group labels are only handed to fit() when the splitter is a GroupKFold.
    xgb_fit_kwargs = {"groups": groups_train} if isinstance(cv, GroupKFold) else {}
    search_xgb.fit(X_train, y_train, **xgb_fit_kwargs)

    best_xgb = search_xgb.best_estimator_
    print("Best XGB params:", search_xgb.best_params_)
    print("Best XGB CV score (MAE):", -search_xgb.best_score_)

    # In-sample evaluation
    pred_xgb_train_eval = best_xgb.predict(X_train)
    mae_xgb_train = mean_absolute_error(y_train, pred_xgb_train_eval)
    print("Tuned XGB Train MAE:", mae_xgb_train)

    # Predictions for the competition test set
    pred_xgb_test = best_xgb.predict(X_test)

    joblib.dump(best_xgb, "/kaggle/working/models/xgb_tuned.joblib")
except Exception as exc:
    print("XGBoost tuning skipped:", exc)
    use_xgb = False  # undo the optimistic flag set at the top of the try
    pred_xgb_test = np.zeros(len(X_test)) # Placeholder


# --- Write the submission from whichever tuned model has the lower CV MAE ---
best_model_name, best_preds, best_cv_mae = "RandomForest", pred_rf_test, -search_rf.best_score_

if use_xgb:
    xgb_cv_mae = -search_xgb.best_score_
    if xgb_cv_mae < best_cv_mae:
        best_model_name, best_preds, best_cv_mae = "XGBoost", pred_xgb_test, xgb_cv_mae

# Copy of the sample submission with 'Tm' replaced by the chosen predictions.
submission_df = sample_submission_df.assign(Tm=best_preds)
submission_df.to_csv('/kaggle/working/submission.csv', index=False)


print(f"\nSaved tuned models to /kaggle/working/models and a sample submission.csv (based on {best_model_name}) to /kaggle/working.")
print("Files saved in /kaggle/working:")
for root_dir, _subdirs, file_names in os.walk('/kaggle/working'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

No CSV found under /kaggle/input — generating a synthetic dataset for demonstration.
Synthetic dataset created.
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best RF params: {'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': None}
Best RF CV score (MAE): 10.505212037988656
Tuned RF Test MAE: 9.62203938266791
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best XGB params: {'subsample': 0.7, 'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.7}
Best XGB CV score (MAE): 6.70125396186482
Tuned XGB Test MAE: 5.497477001896453

Saved tuned models to /kaggle/working/models and a sample submission.csv (based on XGBoost) to /kaggle/working.
Files saved in /kaggle/working:
/kaggle/working/submission.csv
/kaggle/working/models/rf_tuned.joblib
/kaggle/working/models/xgb_tuned.joblib


In [22]:
!pip kaggle competitions download -c melting-point

ERROR: unknown command "kaggle"


In [None]:
!pip install optuna --quiet

import optuna
from sklearn.model_selection import cross_val_score, GroupKFold, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, make_scorer
import xgboost as xgb

# --- CV setup ---
# The original test was `groups is not None`, but no `groups` name is defined
# anywhere in the visible notebook. Derive the grouping from the data instead:
# use GroupKFold when 'family' offers at least as many groups as folds.
n_cv_splits = 5
if 'family' in train_df.columns and train_df['family'].nunique() >= n_cv_splits:
    cv = GroupKFold(n_splits=n_cv_splits)
    group_train = train_df['family']
else:
    cv = KFold(n_splits=n_cv_splits, shuffle=True, random_state=42)
    group_train = None
# The objective functions in this cell still test `groups`; keep it in sync.
groups = group_train

# Negated MAE (greater_is_better=False); objectives flip the sign back.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# ---------------------------
# RandomForest Optuna Tuning
# ---------------------------
def objective_rf(trial):
    """Optuna objective: mean cross-validated MAE of a RandomForestRegressor.

    Hyperparameters are sampled from ``trial``; the model is scored on the
    notebook-level ``X_train``/``y_train`` with the ``cv`` splitter and
    ``mae_scorer``.

    Returns:
        Mean MAE across the CV folds (positive; lower is better).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 5, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "random_state": 42,
        "n_jobs": -1,
    }
    model = RandomForestRegressor(**params)
    # group_train is None when plain KFold is used, and cross_val_score accepts
    # groups=None, so no branch on the (never defined) `groups` name is needed.
    scores = cross_val_score(
        model, X_train, y_train, cv=cv, scoring=mae_scorer, groups=group_train, n_jobs=-1
    )
    # mae_scorer yields negated MAE; flip the sign so Optuna minimises real MAE.
    return -scores.mean()

# os and joblib are used below but are not imported by this cell.
import os
import joblib

study_rf = optuna.create_study(direction="minimize")
study_rf.optimize(objective_rf, n_trials=50, show_progress_bar=True)

print("Best RF Params:", study_rf.best_params)
print("Best RF CV MAE:", study_rf.best_value)

# Refit the best configuration on the full training data and predict the test set.
best_rf = RandomForestRegressor(**study_rf.best_params, random_state=42, n_jobs=-1)
best_rf.fit(X_train, y_train)
pred_rf = best_rf.predict(X_test)
# No `y_test` is defined in the visible cells (the competition test set is
# unlabelled). Score against it only if it exists; otherwise report the
# cross-validated MAE so `mae_rf` is still defined for later cells.
try:
    mae_rf = mean_absolute_error(y_test, pred_rf)
    print("Optuna RF Test MAE:", mae_rf)
except NameError:
    mae_rf = study_rf.best_value
    print("Optuna RF CV MAE (no labelled test set available):", mae_rf)

os.makedirs("/kaggle/working/models", exist_ok=True)
joblib.dump(best_rf, "/kaggle/working/models/rf_optuna.joblib")

# ---------------------------
# XGBoost Optuna Tuning
# ---------------------------
def objective_xgb(trial):
    """Optuna objective: mean cross-validated MAE of an XGBRegressor.

    Hyperparameters are sampled from ``trial``; the model is scored on the
    notebook-level ``X_train``/``y_train`` with the ``cv`` splitter and
    ``mae_scorer``.

    Returns:
        Mean MAE across the CV folds (positive; lower is better).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "random_state": 42,
        "n_jobs": -1,
    }
    model = xgb.XGBRegressor(**params)
    # group_train is None when plain KFold is used, and cross_val_score accepts
    # groups=None, so no branch on the (never defined) `groups` name is needed.
    scores = cross_val_score(
        model, X_train, y_train, cv=cv, scoring=mae_scorer, groups=group_train, n_jobs=-1
    )
    # mae_scorer yields negated MAE; flip the sign so Optuna minimises real MAE.
    return -scores.mean()

# os and joblib are used below but are not imported by this cell.
import os
import joblib

study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=50, show_progress_bar=True)

print("Best XGB Params:", study_xgb.best_params)
print("Best XGB CV MAE:", study_xgb.best_value)

# Refit the best configuration on the full training data and predict the test set.
best_xgb = xgb.XGBRegressor(**study_xgb.best_params, random_state=42, n_jobs=-1)
best_xgb.fit(X_train, y_train)
pred_xgb = best_xgb.predict(X_test)
# No `y_test` is defined in the visible cells (the competition test set is
# unlabelled). Score against it only if it exists; otherwise report the
# cross-validated MAE so `mae_xgb` is still defined for later cells.
try:
    mae_xgb = mean_absolute_error(y_test, pred_xgb)
    print("Optuna XGB Test MAE:", mae_xgb)
except NameError:
    mae_xgb = study_xgb.best_value
    print("Optuna XGB CV MAE (no labelled test set available):", mae_xgb)

os.makedirs("/kaggle/working/models", exist_ok=True)
joblib.dump(best_xgb, "/kaggle/working/models/xgb_optuna.joblib")


[I 2025-10-02 20:05:57,257] A new study created in memory with name: no-name-baef1c14-84a8-495a-9f74-f69695462d10


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-10-02 20:07:01,974] Trial 0 finished with value: 10.632652465507961 and parameters: {'n_estimators': 943, 'max_depth': 22, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 0 with value: 10.632652465507961.
[I 2025-10-02 20:07:43,665] Trial 1 finished with value: 11.081403315870672 and parameters: {'n_estimators': 862, 'max_depth': 18, 'min_samples_split': 12, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 0 with value: 10.632652465507961.
[I 2025-10-02 20:07:55,810] Trial 2 finished with value: 10.781309932316478 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 10.632652465507961.
[I 2025-10-02 20:08:02,625] Trial 3 finished with value: 10.950743424502171 and parameters: {'n_estimators': 112, 'max_depth': 10, 'min_samples_split': 12, 'min_samples_leaf': 8, 'max_features': 'sqrt'}. Best is trial 0 with value: 10.63265246550

In [None]:
!pip install optuna --quiet

import optuna
from optuna.pruners import MedianPruner
from sklearn.model_selection import cross_val_score, GroupKFold, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, make_scorer
import xgboost as xgb
import joblib

# --- CV setup ---
# The original test was `groups is not None`, but no `groups` name is defined
# anywhere in the visible notebook. Derive the grouping from the data instead:
# use GroupKFold when 'family' offers at least as many groups as folds.
n_cv_splits = 5
if 'family' in train_df.columns and train_df['family'].nunique() >= n_cv_splits:
    cv = GroupKFold(n_splits=n_cv_splits)
    group_train = train_df['family']
else:
    cv = KFold(n_splits=n_cv_splits, shuffle=True, random_state=42)
    group_train = None
# The RandomForest objective in this cell still tests `groups`; keep it in sync.
groups = group_train

# Negated MAE (greater_is_better=False); objectives flip the sign back.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# ---------------------------
# RandomForest with Optuna + Pruning
# ---------------------------
def objective_rf(trial):
    """Optuna objective: cross-validated MAE of a RandomForestRegressor, prunable per fold.

    The running mean MAE is reported to Optuna after every fold, so the pruner
    can stop an unpromising trial before the remaining folds are fitted. The
    previous version reported once, after the whole CV had already run, which
    saved no compute and threw away a finished score.

    Returns:
        Mean MAE across all CV folds (positive; lower is better).

    Raises:
        optuna.TrialPruned: when the pruner rejects the trial after some fold.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 5, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "random_state": 42,
        "n_jobs": -1,
    }
    fold_maes = []
    # assumes X_train / y_train are NumPy arrays (positional indexing) — TODO confirm.
    # group_train is None for plain KFold, which split() accepts.
    for step, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train, groups=group_train)):
        model = RandomForestRegressor(**params)
        model.fit(X_train[fit_idx], y_train[fit_idx])
        fold_maes.append(mean_absolute_error(y_train[val_idx], model.predict(X_train[val_idx])))

        # Report the running mean so trials are compared at the same fold index.
        running_mae = sum(fold_maes) / len(fold_maes)
        trial.report(running_mae, step=step)
        if trial.should_prune():
            raise optuna.TrialPruned()
    return sum(fold_maes) / len(fold_maes)

import os  # used for makedirs below; not imported by this cell

study_rf = optuna.create_study(direction="minimize", pruner=MedianPruner())
study_rf.optimize(objective_rf, n_trials=50, show_progress_bar=True)

print("Best RF Params:", study_rf.best_params)
print("Best RF CV MAE:", study_rf.best_value)

# Refit the best configuration on the full training data and predict the test set.
best_rf = RandomForestRegressor(**study_rf.best_params, random_state=42, n_jobs=-1)
best_rf.fit(X_train, y_train)
pred_rf = best_rf.predict(X_test)
# No `y_test` is defined in the visible cells (the competition test set is
# unlabelled). Score against it only if it exists; otherwise report the
# cross-validated MAE so `mae_rf` is still defined for later cells.
try:
    mae_rf = mean_absolute_error(y_test, pred_rf)
    print("Optuna RF Test MAE:", mae_rf)
except NameError:
    mae_rf = study_rf.best_value
    print("Optuna RF CV MAE (no labelled test set available):", mae_rf)

os.makedirs("/kaggle/working/models", exist_ok=True)
joblib.dump(best_rf, "/kaggle/working/models/rf_optuna_pruned.joblib")

# ---------------------------
# XGBoost with Optuna + Pruning
# ---------------------------
def objective_xgb(trial):
    """Optuna objective: hold-out MAE of an XGBRegressor with per-round pruning.

    The model is fitted on ``X_train_sub``/``y_train_sub`` and monitored on
    ``X_valid``/``y_valid`` (the split made just before the study is run).
    The previous version fitted on all of ``X_train``, which contains the
    validation rows, so the validation score was measured on training data.

    Returns:
        MAE on the validation split (lower is better).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "random_state": 42,
        "n_jobs": -1,
    }

    # Optuna pruning via callbacks (XGBoost integration); the observation key is
    # the first eval_set entry ("validation_0") plus the eval metric ("mae").
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-mae")

    # Callbacks are passed to the constructor: passing them to fit() is
    # deprecated in the scikit-learn wrapper and rejected by newer releases.
    model = xgb.XGBRegressor(
        **params, tree_method="hist", eval_metric="mae", callbacks=[pruning_callback]
    )
    model.fit(
        X_train_sub, y_train_sub,
        eval_set=[(X_valid, y_valid)],
        verbose=False,
    )
    pred = model.predict(X_valid)
    return mean_absolute_error(y_valid, pred)

# Split train into train/valid for pruning (the XGBoost objective monitors the valid part)
import os  # used for makedirs below; not imported by this cell
from sklearn.model_selection import train_test_split
X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

study_xgb = optuna.create_study(direction="minimize", pruner=MedianPruner())
study_xgb.optimize(objective_xgb, n_trials=50, show_progress_bar=True)

print("Best XGB Params:", study_xgb.best_params)
# The objective scores a single hold-out split, not cross-validation.
print("Best XGB hold-out MAE:", study_xgb.best_value)

# Refit the best configuration on the full training data and predict the test set.
best_xgb = xgb.XGBRegressor(**study_xgb.best_params, random_state=42, n_jobs=-1, tree_method="hist")
best_xgb.fit(X_train, y_train)
pred_xgb = best_xgb.predict(X_test)
# No `y_test` is defined in the visible cells (the competition test set is
# unlabelled). Score against it only if it exists; otherwise report the
# hold-out MAE so `mae_xgb` is still defined for later cells.
try:
    mae_xgb = mean_absolute_error(y_test, pred_xgb)
    print("Optuna XGB Test MAE:", mae_xgb)
except NameError:
    mae_xgb = study_xgb.best_value
    print("Optuna XGB hold-out MAE (no labelled test set available):", mae_xgb)

os.makedirs("/kaggle/working/models", exist_ok=True)
joblib.dump(best_xgb, "/kaggle/working/models/xgb_optuna_pruned.joblib")


In [None]:
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_parallel_coordinate
from optuna.visualization import plot_contour, plot_slice

# ---------------------------
# RandomForest Optuna Plots
# ---------------------------
print("📊 RandomForest Study Visualization")

# Build and show each diagnostic figure in turn, then bind them to fig1..fig5.
rf_figures = []
for build_plot in (
    plot_optimization_history,
    plot_param_importances,
    plot_parallel_coordinate,
    plot_contour,
    plot_slice,
):
    rf_figure = build_plot(study_rf)
    rf_figure.show()
    rf_figures.append(rf_figure)
fig1, fig2, fig3, fig4, fig5 = rf_figures

# ---------------------------
# XGBoost Optuna Plots
# ---------------------------
print("📊 XGBoost Study Visualization")

# Build and show each diagnostic figure in turn, then bind them to fig6..fig10.
xgb_figures = []
for build_plot in (
    plot_optimization_history,
    plot_param_importances,
    plot_parallel_coordinate,
    plot_contour,
    plot_slice,
):
    xgb_figure = build_plot(study_xgb)
    xgb_figure.show()
    xgb_figures.append(xgb_figure)
fig6, fig7, fig8, fig9, fig10 = xgb_figures


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# ---------------------------
# Compare MAE (RF vs XGB): one bar per model, value printed above each bar
# ---------------------------
model_names = ["RandomForest", "XGBoost"]
mae_scores = [mae_rf, mae_xgb]

plt.figure(figsize=(8,5))
bar_ax = sns.barplot(x=model_names, y=mae_scores, palette="viridis")
bar_ax.set_title("🔮 Madame Ms Strange Showdown: RF vs XGB", fontsize=14)
bar_ax.set_ylabel("Mean Absolute Error (MAE)")
label_style = dict(ha='center', fontsize=12, fontweight="bold")
for bar_pos, bar_mae in enumerate(mae_scores):
    # 0.2 above the bar top, in data units
    bar_ax.text(bar_pos, bar_mae+0.2, f"{bar_mae:.3f}", **label_style)
plt.show()

# ---------------------------
# Residual Distribution
# ---------------------------
# NOTE(review): `y_test` is not defined in the visible cells — confirm that a
# labelled hold-out with the same length as pred_rf / pred_xgb exists.
residuals_rf = y_test - pred_rf
residuals_xgb = y_test - pred_xgb

plt.figure(figsize=(10,6))
# `fill=True` replaces the deprecated `shade=True` argument of seaborn's kdeplot.
sns.kdeplot(residuals_rf, fill=True, label="RandomForest Residuals", color="darkblue")
sns.kdeplot(residuals_xgb, fill=True, label="XGBoost Residuals", color="darkgreen")
plt.axvline(0, color="black", linestyle="--")  # zero-error reference line
plt.title("Residual Distribution 🌌 (RF vs XGB)", fontsize=14)
plt.xlabel("Prediction Error")
plt.legend()
plt.show()

# ---------------------------
# Scatter: True vs Predicted (both models on one axes, with a y = x reference)
# ---------------------------
plt.figure(figsize=(10,6))
for model_pred, series_label, series_color in (
    (pred_rf, "RF Predictions", "navy"),
    (pred_xgb, "XGB Predictions", "teal"),
):
    plt.scatter(y_test, model_pred, alpha=0.4, label=series_label, color=series_color)
tm_bounds = [y_test.min(), y_test.max()]
plt.plot(tm_bounds, tm_bounds, 'k--', lw=2)
plt.title("True vs Predicted 🔮 (RF vs XGB)", fontsize=14)
plt.xlabel("True Melting Point (Tm)")
plt.ylabel("Predicted Tm")
plt.legend()
plt.show()

# ---------------------------
# Print both MAEs and name the winner (XGBoost wins ties)
# ---------------------------
print("⚖️ Cosmic Model Verdict")
print(f"RandomForest Optuna MAE: {mae_rf:.4f}")
print(f"XGBoost Optuna MAE: {mae_xgb:.4f}")

verdict_line = (
    "✨ RandomForest holds the throne in this dimension."
    if mae_rf < mae_xgb
    else "✨ XGBoost reigns supreme across the melting multiverse."
)
print(verdict_line)


In [None]:
!pip install shap --quiet
import shap

# SHAP is computed on the first 200 test rows only, to keep it fast.
X_sample = X_test[:200]


def _show_shap_summaries(heading, shap_values, sample):
    """Set the plot title, then draw the bar and the default SHAP summary plots."""
    plt.title(heading)
    shap.summary_plot(shap_values, sample, plot_type="bar", show=True)
    shap.summary_plot(shap_values, sample, show=True)


# ---------------------------
# SHAP for RandomForest
# ---------------------------
explainer_rf = shap.TreeExplainer(best_rf)
shap_values_rf = explainer_rf.shap_values(X_sample)
_show_shap_summaries("🔮 SHAP Summary: RandomForest", shap_values_rf, X_sample)

# ---------------------------
# SHAP for XGBoost
# ---------------------------
explainer_xgb = shap.TreeExplainer(best_xgb)
shap_values_xgb = explainer_xgb.shap_values(X_sample)
_show_shap_summaries("🔮 SHAP Summary: XGBoost", shap_values_xgb, X_sample)

# ---------------------------
# Force Plot Example (Single Prediction)
# Explains one XGBoost prediction from the SHAP sample.
# ---------------------------
idx = 5  # position within X_sample of the row to explain
shap.initjs()
# NOTE(review): `.iloc` requires X_sample to be a pandas object - confirm X_test is a DataFrame.
base_value = explainer_xgb.expected_value
row_contributions = shap_values_xgb[idx,:]
row_features = X_sample.iloc[idx,:]
shap.force_plot(base_value, row_contributions, row_features, matplotlib=True)


In [None]:
# ===========================
# 🔮 Madame Ms Strange EDA — Full Cell
# This cell contains visualizations and analysis to compare the performance and interpret the predictions of the tuned RandomForest and XGBoost models.
# ===========================
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import numpy as np
import pandas as pd

# Assuming mae_rf, mae_xgb, y_test, pred_rf, pred_xgb, best_rf, best_xgb, X_test are available from previous cells.
# If not, ensure those cells are run first.

# ---------------------------
# 1. Compare MAE (RF vs XGB)
# Visualize the Mean Absolute Error of the two models.
# ---------------------------
model_names = ["RandomForest", "XGBoost"]
mae_scores = [mae_rf, mae_xgb]

plt.figure(figsize=(8,5))
# seaborn deprecates `palette` without `hue`, so the x variable is also
# assigned to `hue`. `dodge=False` keeps each bar centred on its tick, and the
# legend (drawn by some seaborn versions, redundant with the x axis) is removed.
ax = sns.barplot(x=model_names, y=mae_scores, hue=model_names, palette="viridis", dodge=False)
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title("🔮 Madame Ms Strange Showdown: RF vs XGB - Mean Absolute Error Comparison", fontsize=14)
plt.ylabel("Mean Absolute Error (MAE)")
plt.xlabel("Model")
# Write each MAE just above its bar.
for i, score in enumerate(mae_scores):
    plt.text(i, score+0.01, f"{score:.3f}", ha='center', fontsize=12, fontweight="bold")
plt.show()

# ---------------------------
# 2. Residual Distribution
# Plot the distribution of prediction errors (residuals = true - predicted).
# A good model should have residuals centered around zero.
# ---------------------------
residuals_rf = y_test - pred_rf
residuals_xgb = y_test - pred_xgb

plt.figure(figsize=(10,6))
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(residuals_rf, fill=True, label="RandomForest Residuals", color="darkblue")
sns.kdeplot(residuals_xgb, fill=True, label="XGBoost Residuals", color="darkgreen")
plt.axvline(0, color="black", linestyle="--", label="Zero Error")
plt.title("Residual Distribution 🌌 (RF vs XGB)", fontsize=14)
plt.xlabel("Prediction Error")
plt.ylabel("Density")
plt.legend()
plt.show()

# ---------------------------
# 3. Scatter: True vs Predicted
# Each model's predictions plotted against the true target. The dashed
# y = x line shows where perfect predictions would fall.
# ---------------------------
plt.figure(figsize=(10,6))
for predictions, series_label, series_color in (
    (pred_rf, "RF Predictions", "navy"),
    (pred_xgb, "XGB Predictions", "teal"),
):
    # Small markers (s=15) keep dense regions readable.
    plt.scatter(y_test, predictions, alpha=0.4, label=series_label, color=series_color, s=15)
# Diagonal endpoints: the smallest and largest true values.
diagonal = [y_test.min(), y_test.max()]
plt.plot(diagonal, diagonal, 'k--', lw=2, label="Ideal Prediction")
plt.title("True vs Predicted Tm 🔮 (RF vs XGB)", fontsize=14)
plt.xlabel("True Melting Point (Tm)")
plt.ylabel("Predicted Tm")
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

# ---------------------------
# 4. SHAP Interpretability
# Use SHAP values to explain the model predictions, aggregated to feature
# families: per-feature SHAP values are summed within each family (SHAP values
# are additive), so every plot shows one entry per family.
# ---------------------------
print("\n--- SHAP Analysis ---")


def _aggregate_by_group(matrix, labels, reducer):
    """Collapse columns that share a group label into one column per group.

    Args:
        matrix: 2-D array-like (rows = samples, columns = original features).
        labels: one group label per column of ``matrix``.
        reducer: NumPy reduction called as ``reducer(block, axis=1)``,
            e.g. ``np.sum`` or ``np.mean``.

    Returns:
        DataFrame with one column per distinct label, in order of first
        appearance, and a fresh RangeIndex.
    """
    values = np.asarray(matrix, dtype=float)
    labels = np.asarray(list(labels), dtype=object)
    ordered = list(dict.fromkeys(labels.tolist()))
    collapsed = [reducer(values[:, labels == name], axis=1) for name in ordered]
    return pd.DataFrame(np.column_stack(collapsed), columns=ordered)


def _plot_grouped_shap(model_label, grouped_shap, grouped_features):
    """Draw the bar and dot SHAP summary plots for one model's group-level values.

    ``grouped_shap`` and ``grouped_features`` must have the same columns in the
    same order; ``shap.summary_plot`` rejects matrices whose shapes differ.
    """
    plt.figure(figsize=(10,6))
    plt.title(f"🔮 SHAP Summary: {model_label} (Grouped Features - Bar Plot)", fontsize=14)
    shap.summary_plot(grouped_shap.values, grouped_features, plot_type="bar", show=False)
    plt.tight_layout()
    plt.show()

    plt.figure(figsize=(10,8))
    plt.title(f"🔮 SHAP Summary: {model_label} (Grouped Features - Dot Plot)", fontsize=14)
    shap.summary_plot(grouped_shap.values, grouped_features, show=False)
    plt.tight_layout()
    plt.show()


# Random subset of the test set for faster SHAP computation. SHAP plots need
# feature names, so a non-DataFrame X_test is wrapped using feature_cols
# (expected from an earlier cell).
X_test_frame = X_test if isinstance(X_test, pd.DataFrame) else pd.DataFrame(X_test, columns=feature_cols)
X_sample = X_test_frame.sample(n=min(200, len(X_test_frame)), random_state=42).copy()

# ---- Grouping features into interpretable families for better visualization
# Assumes original feature names are feat_0 ... feat_49 plus 'family'.
# NOTE(review): the family names below are labels chosen in this notebook;
# confirm they match what the feature ranges actually represent.
feature_groups = {
    "Atomic_Descriptors": [f"feat_{i}" for i in range(0,10)],
    "Bonding_Descriptors": [f"feat_{i}" for i in range(10,20)],
    "Electronic_Descriptors": [f"feat_{i}" for i in range(20,30)],
    "Topological_Descriptors": [f"feat_{i}" for i in range(30,40)],
    "Other_Descriptors": [f"feat_{i}" for i in range(40,50)],
    "Family_ID": ["family"] # Only matters if 'family' is one of the model's features
}

# Original feature name -> group name.
feature_to_group = {col: group for group, cols in feature_groups.items() for col in cols}

# One group label per sampled column; columns missing from the mapping keep
# their own name and form a single-member group.
group_labels = [feature_to_group.get(col, col) for col in X_sample.columns]

# Group-level feature matrix used for display (dot-plot colouring, force-plot
# labels): the mean raw value of each group's members. Members may be on
# different scales, so treat it as a rough indicator only.
# Assumes every sampled column is numeric.
X_sample_grouped = _aggregate_by_group(X_sample, group_labels, np.mean)
X_sample_grouped.index = X_sample.index

# Stay None if the corresponding SHAP computation fails.
shap_values_rf_grouped = None
shap_values_xgb_grouped = None

# ---- SHAP for RandomForest
print("\nCalculating SHAP values for RandomForest...")
try:
    explainer_rf = shap.TreeExplainer(best_rf)
    shap_values_rf = explainer_rf.shap_values(X_sample) # Computed on the original features
    shap_values_rf_grouped = _aggregate_by_group(shap_values_rf, group_labels, np.sum)
    _plot_grouped_shap("RandomForest", shap_values_rf_grouped, X_sample_grouped)
except Exception as e:
    # Best-effort: a SHAP failure must not abort the rest of the report.
    print(f"SHAP for RandomForest failed: {e}")


# ---- SHAP for XGBoost
print("\nCalculating SHAP values for XGBoost...")
try:
    explainer_xgb = shap.TreeExplainer(best_xgb)
    shap_values_xgb = explainer_xgb.shap_values(X_sample) # Computed on the original features
    shap_values_xgb_grouped = _aggregate_by_group(shap_values_xgb, group_labels, np.sum)
    _plot_grouped_shap("XGBoost", shap_values_xgb_grouped, X_sample_grouped)
except Exception as e:
    print(f"SHAP for XGBoost failed: {e}")


# ---- SHAP Force Plot Example (Single Prediction)
# Explains one XGBoost prediction at group level, reusing the SHAP values
# computed above.
if len(X_sample) > 0:
    idx = 0 # Use the first sample from the subset
    print(f"\nGenerating SHAP Force Plot for sample index {idx}...")
    try:
        if shap_values_xgb_grouped is None:
            raise RuntimeError("XGBoost SHAP values are unavailable")
        # expected_value can be a scalar or a length-1 array; normalise it.
        base_value = float(np.ravel(explainer_xgb.expected_value)[0])
        shap_values_xgb_single = shap_values_xgb_grouped.iloc[idx,:].values
        force_plot_data = X_sample_grouped.iloc[idx,:]
        shap.initjs()
        shap.force_plot(
            base_value,
            shap_values_xgb_single,
            force_plot_data,
            matplotlib=True, # Static rendering
            show=True
        )
    except Exception as e:
        print(f"SHAP Force Plot failed: {e}")
else:
    print("\nCannot generate SHAP Force Plot: X_sample is empty.")


# ---------------------------
# 5. Cosmic Verdict
# Report both MAEs, then declare the model with the lower error the winner.
# Anything that is not a strict win for either side is reported as a tie.
# ---------------------------
print("\n--- ⚖️ Cosmic Model Verdict ---")
for contender, contender_mae in (("RandomForest", mae_rf), ("XGBoost", mae_xgb)):
    print(f"{contender} Optuna MAE: {contender_mae:.4f}")

if mae_rf < mae_xgb:
    verdict = "✨ RandomForest holds the throne in this dimension (lower MAE)."
elif mae_xgb < mae_rf:
    verdict = "✨ XGBoost reigns supreme across the melting multiverse (lower MAE)."
else:
    verdict = "⚖️ Both models are equally balanced in their predictive power."
print(verdict)

In [None]:
!pip install optuna-integration[xgboost]

# Task
Analyze the provided melting point dataset ("train.csv", "test.csv", "sample_submission.csv"), perform data preprocessing, feature engineering, hyperparameter tuning using Optuna for RandomForest and XGBoost, ensemble modeling, SHAP analysis, add multi-language plot annotations, and generate a submission file in the format of "sample_submission.csv". Ensure the notebook is concise, well-labeled, and includes relevant plots and graphs.

## Hyperparameter tuning with optuna

### Subtask:
Implement more extensive hyperparameter tuning for both RandomForest and XGBoost using Optuna.


**Reasoning**:
The previous code cells have already set up the environment, loaded the data, and performed some initial EDA and baseline modeling. The current subtask is to perform more extensive hyperparameter tuning using Optuna. The existing code cell `n6mjDJxgJuTk` already contains the basic structure for Optuna tuning for both RandomForest and XGBoost. I will modify and extend this cell to perform more trials and ensure the best models are saved. I will also ensure the `y_test` variable is not used in the objective function as it is not available during training/tuning. The objective function should return the cross-validation score on the training data.



In [None]:
# Extend the Optuna tuning cell (n6mjDJxgJuTk)

# --- CV setup ---
# Relies on X_train, y_train and train_df from previous cells.
# GroupKFold is used when train_df has a 'family' column with more than one
# distinct value AND train_df has the same number of rows as X_train (otherwise
# the family labels cannot be lined up with the training rows). The number of
# folds is capped at the number of families, because GroupKFold rejects
# n_splits greater than the number of groups.
has_families = 'family' in train_df.columns and len(train_df['family'].unique()) > 1
if has_families and len(train_df) == len(X_train):
    n_family_groups = len(train_df['family'].unique())
    cv = GroupKFold(n_splits=min(5, n_family_groups))
    group_train = train_df['family'].values
    print("Using GroupKFold for cross-validation.")
else:
    if has_families:
        print("train_df and X_train differ in length; family groups cannot be aligned, so GroupKFold is skipped.")
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    group_train = None
    print("Using KFold for cross-validation.")

# greater_is_better=False makes the scorer return the NEGATED MAE, so the
# objectives below negate it again to report a positive MAE.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# ---------------------------
# RandomForest Optuna Tuning
# ---------------------------
def objective_rf(trial):
    """Optuna objective for RandomForest: mean cross-validated MAE (lower is better).

    Samples a hyperparameter set from ``trial``, scores a
    ``RandomForestRegressor`` on ``X_train``/``y_train`` with the module-level
    ``cv`` splitter and ``mae_scorer``, and returns the mean MAE over folds.
    """
    search_space = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1500),
        "max_depth": trial.suggest_int("max_depth", 5, 60),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 25),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 15),
        # Named strategies, None (all features) and fractional shares.
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None, 0.5, 0.7, 0.9]),
    }
    model = RandomForestRegressor(random_state=42, n_jobs=-1, **search_space)

    # Pass groups only when a group-aware splitter is in use.
    group_kwargs = {} if group_train is None else {"groups": group_train}
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=cv, scoring=mae_scorer, n_jobs=-1, **group_kwargs
    )
    # mae_scorer yields negated MAE; flip the sign to get the positive MAE.
    return -fold_scores.mean()

print("\nStarting Optuna tuning for RandomForest...")
# Seed the sampler so the search proposes the same trial sequence on every
# run, in line with the random_state=42 used for the models and the CV split.
study_rf = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
study_rf.optimize(objective_rf, n_trials=150, show_progress_bar=True)

# best_value is the lowest mean cross-validated MAE over all trials.
print("\nBest RF Params:", study_rf.best_params)
print("Best RF CV MAE:", study_rf.best_value)

# ---------------------------
# XGBoost Optuna Tuning
# ---------------------------
def objective_xgb(trial):
    """Optuna objective for XGBoost: mean cross-validated MAE (lower is better).

    Samples a hyperparameter set from ``trial``, scores an ``XGBRegressor`` on
    ``X_train``/``y_train`` with the module-level ``cv`` splitter and
    ``mae_scorer``, and returns the mean MAE over folds.
    """
    search_space = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1500),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        # Learning rate and both regularisation strengths are sampled on a log scale.
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 15),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
    }
    model = xgb.XGBRegressor(random_state=42, n_jobs=-1, tree_method='hist', **search_space)

    # Pass groups only when a group-aware splitter is in use.
    group_kwargs = {} if group_train is None else {"groups": group_train}
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=cv, scoring=mae_scorer, n_jobs=-1, **group_kwargs
    )
    # mae_scorer yields negated MAE; flip the sign to get the positive MAE.
    return -fold_scores.mean()

print("\nStarting Optuna tuning for XGBoost...")
# Seed the sampler so the search proposes the same trial sequence on every
# run, in line with the random_state=42 used for the models and the CV split.
study_xgb = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
study_xgb.optimize(objective_xgb, n_trials=150, show_progress_bar=True)

# best_value is the lowest mean cross-validated MAE over all trials.
print("\nBest XGB Params:", study_xgb.best_params)
print("Best XGB CV MAE:", study_xgb.best_value)

# ---------------------------
# Train final models with best params and save
# Refit each model on X_train/y_train with the best hyperparameters found by
# its Optuna study, then persist both with joblib.
# ---------------------------
print("\nTraining final models with best parameters...")
best_rf = RandomForestRegressor(**study_rf.best_params, random_state=42, n_jobs=-1).fit(X_train, y_train)
best_xgb = xgb.XGBRegressor(**study_xgb.best_params, random_state=42, n_jobs=-1, tree_method='hist').fit(X_train, y_train)

# Save models
model_dir = "/kaggle/working/models"
os.makedirs(model_dir, exist_ok=True)
for file_name, fitted_model in (
    ("rf_optuna_tuned.joblib", best_rf),
    ("xgb_optuna_tuned.joblib", best_xgb),
):
    joblib.dump(fitted_model, os.path.join(model_dir, file_name))

print("\nSaved best tuned models to /kaggle/working/models/")