In [None]:
# Make and test a CatBoost model

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

# Read the training table from disk
main_data = pd.read_csv("./data/train.csv")

# Separate the regression target ('critical_temp') from the predictor columns
y = main_data["critical_temp"]
X = main_data.drop(columns=["critical_temp"])

# 80/20 train/test partition; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline CatBoost configuration, collected in one mapping so the settings
# can be read at a glance
baseline_params = {
    "iterations": 500,         # number of boosting rounds (trees)
    "learning_rate": 0.03,     # step size; lower values usually need more iterations
    "depth": 6,                # maximum tree depth (model complexity)
    "l2_leaf_reg": 3,          # L2 regularization coefficient
    "loss_function": 'RMSE',   # objective optimized during training
    "random_seed": 42,         # fixed seed for repeatable training
    "verbose": 100,            # print training progress every 100 iterations
}
catboost_model = CatBoostRegressor(**baseline_params)

# Train on the 80% split, then predict the held-out 20%
catboost_model.fit(X_train, y_train)
y_pred = catboost_model.predict(X_test)

# Hold-out metrics: RMSE (square root of the MSE) and R²
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Base 80/20 CatBoost Performance:")
print("RMSE: {:.4f}".format(rmse))
print("R²: {:.4f}".format(r2))

# 5-fold cross-validation on the training split only. The scorer reports the
# negated RMSE, so the sign is flipped back before printing.
cv_scores = cross_val_score(
    catboost_model,
    X_train,
    y_train,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
cv_rmse = -cv_scores.mean()
print("Cross-validated RMSE: {:.4f}".format(cv_rmse))


Results with training data, original features:

400:	learn: 10.9660427	total: 6.85s	remaining: 1.69s
499:	learn: 10.4761130	total: 8.51s	remaining: 0us

Base 80/20 CatBoost Performance:

RMSE: 10.9983

R²: 0.8949

Cross-validated RMSE: 11.4960


In [None]:
# CatBoost model trained on all features (train.csv columns plus the columns of unique_m.csv)


# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# Remove 'critical_temp' from unique_m to avoid duplication
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# The merge below pairs rows purely by position. pd.concat(axis=1) performs an
# outer join on the index, so a row-count mismatch between the two files would
# not raise -- it would silently pad the shorter frame with NaN rows.
# Fail loudly instead of training on misaligned data.
if len(main_data) != len(unique_m):
    raise ValueError(
        "Row count mismatch: train.csv has {} rows but unique_m.csv has {}; "
        "cannot merge by position.".format(len(main_data), len(unique_m))
    )

# Merge datasets side by side (index-based merge)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features: every column except the target and the
# 'material' column is used as a predictor
target = "critical_temp"
X = merged_data.drop(columns=[target, "material"])
y = merged_data[target]


# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a baseline CatBoost model
catboost_model = CatBoostRegressor(
    iterations=500,             # Number of boosting rounds (trees)
    learning_rate=0.03,         # Step size for updating weights; lower values usually require more iterations but can lead to better generalization
    depth=6,                    # Maximum depth of the trees; controls the model's complexity
    l2_leaf_reg=3,              # L2 regularization coefficient to reduce overfitting
    loss_function='RMSE',       # Loss function to optimize, here RMSE for regression tasks
    random_seed=42,             # Ensures reproducibility
    verbose=100                 # Controls how often training progress is printed (set to 0 to disable)
)


# Fit the model on the training data
catboost_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = catboost_model.predict(X_test)

# Evaluate the model using RMSE and R² metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Base 80/20 CatBoost Performance:")
print("RMSE: {:.4f}".format(rmse))
print("R²: {:.4f}".format(r2))

# Optional cross-validation (5 folds, training split only) to further assess
# model performance. The scorer returns negated RMSE values, hence the sign flip.
cv_scores = cross_val_score(catboost_model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
cv_rmse = -np.mean(cv_scores)
print("Cross-validated RMSE: {:.4f}".format(cv_rmse))

Results with all features (train.csv merged with unique_m.csv):


400:	learn: 10.9568660	total: 9.56s	remaining: 2.36s
499:	learn: 10.5263805	total: 12.2s	remaining: 0us

Base 80/20 CatBoost Performance:

RMSE: 10.8753

R²: 0.8973

Cross-validated RMSE: 11.4212

In [None]:
# CatBoost model trained on all features plus three engineered features



# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# Remove 'critical_temp' from unique_m to avoid duplication
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# The merge below pairs rows purely by position. pd.concat(axis=1) performs an
# outer join on the index, so a row-count mismatch between the two files would
# not raise -- it would silently pad the shorter frame with NaN rows.
# Fail loudly instead of training on misaligned data.
if len(main_data) != len(unique_m):
    raise ValueError(
        "Row count mismatch: train.csv has {} rows but unique_m.csv has {}; "
        "cannot merge by position.".format(len(main_data), len(unique_m))
    )

# Merge datasets side by side (index-based merge)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Feature Engineering: Physics-Based Ratio, Thermal Conductivity Transformation, Log transformation
# The 1e-9 added to each denominator keeps it from being exactly zero when the
# underlying column is 0; log1p computes log(1 + x).
merged_data["mass_density_ratio"] = merged_data["wtd_mean_atomic_mass"] / (merged_data["wtd_mean_Density"] + 1e-9)
merged_data["affinity_valence_ratio"] = merged_data["wtd_mean_ElectronAffinity"] / (merged_data["wtd_mean_Valence"] + 1e-9)
merged_data["log_thermal_conductivity"] = np.log1p(merged_data["range_ThermalConductivity"])

# Define target and features: every column except the target and the
# 'material' column is used as a predictor (engineered columns included)
target = "critical_temp"
X = merged_data.drop(columns=[target, "material"])
y = merged_data[target]


# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a baseline CatBoost model
catboost_model = CatBoostRegressor(
    iterations=500,             # Number of boosting rounds (trees)
    learning_rate=0.03,         # Step size for updating weights; lower values usually require more iterations but can lead to better generalization
    depth=6,                    # Maximum depth of the trees; controls the model's complexity
    l2_leaf_reg=3,              # L2 regularization coefficient to reduce overfitting
    loss_function='RMSE',       # Loss function to optimize, here RMSE for regression tasks
    random_seed=42,             # Ensures reproducibility
    verbose=100                 # Controls how often training progress is printed (set to 0 to disable)
)


# Fit the model on the training data
catboost_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = catboost_model.predict(X_test)

# Evaluate the model using RMSE and R² metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Base 80/20 CatBoost Performance:")
print("RMSE: {:.4f}".format(rmse))
print("R²: {:.4f}".format(r2))

# Optional cross-validation (5 folds, training split only) to further assess
# model performance. The scorer returns negated RMSE values, hence the sign flip.
cv_scores = cross_val_score(catboost_model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
cv_rmse = -np.mean(cv_scores)
print("Cross-validated RMSE: {:.4f}".format(cv_rmse))



Results with all features (including the engineered ones):

400:	learn: 10.9506255	total: 9.88s	remaining: 2.44s
499:	learn: 10.5096733	total: 12.5s	remaining: 0us

Base 80/20 CatBoost Performance:

RMSE: 10.8899

R²: 0.8970

Cross-validated RMSE: 11.4051

In [None]:
# CatBoost model trained on a reduced feature set (the top 99 features, as selected by a different model):

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# Remove 'critical_temp' from unique_m to avoid duplication
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# The merge below pairs rows purely by position. pd.concat(axis=1) performs an
# outer join on the index, so a row-count mismatch between the two files would
# not raise -- it would silently pad the shorter frame with NaN rows.
# Fail loudly instead of training on misaligned data.
if len(main_data) != len(unique_m):
    raise ValueError(
        "Row count mismatch: train.csv has {} rows but unique_m.csv has {}; "
        "cannot merge by position.".format(len(main_data), len(unique_m))
    )

# Merge datasets side by side (index-based merge)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Feature Engineering: Physics-Based Ratio, Thermal Conductivity Transformation, Log transformation
# The 1e-9 added to each denominator keeps it from being exactly zero when the
# underlying column is 0; log1p computes log(1 + x).
merged_data["mass_density_ratio"] = merged_data["wtd_mean_atomic_mass"] / (merged_data["wtd_mean_Density"] + 1e-9)
merged_data["affinity_valence_ratio"] = merged_data["wtd_mean_ElectronAffinity"] / (merged_data["wtd_mean_Valence"] + 1e-9)
merged_data["log_thermal_conductivity"] = np.log1p(merged_data["range_ThermalConductivity"])

# Define target and features.
# `features` is the fixed 99-column subset used for this experiment: columns
# from train.csv, columns from unique_m.csv, and the three engineered columns
# created above. Selecting them raises KeyError if any name is missing.
target = "critical_temp"
features = ['mean_atomic_mass', 'wtd_mean_atomic_mass', 'gmean_atomic_mass',
       'entropy_atomic_mass', 'wtd_entropy_atomic_mass', 'range_atomic_mass',
       'wtd_range_atomic_mass', 'wtd_std_atomic_mass', 'mean_fie',
       'wtd_mean_fie', 'wtd_entropy_fie', 'range_fie', 'wtd_range_fie',
       'wtd_std_fie', 'mean_atomic_radius', 'wtd_mean_atomic_radius',
       'gmean_atomic_radius', 'range_atomic_radius', 'wtd_range_atomic_radius',
       'mean_Density', 'wtd_mean_Density', 'gmean_Density', 'entropy_Density',
       'wtd_entropy_Density', 'range_Density', 'wtd_range_Density',
       'wtd_std_Density', 'mean_ElectronAffinity', 'wtd_mean_ElectronAffinity',
       'gmean_ElectronAffinity', 'wtd_gmean_ElectronAffinity',
       'entropy_ElectronAffinity', 'wtd_entropy_ElectronAffinity',
       'range_ElectronAffinity', 'wtd_range_ElectronAffinity',
       'wtd_std_ElectronAffinity', 'mean_FusionHeat', 'wtd_mean_FusionHeat',
       'gmean_FusionHeat', 'entropy_FusionHeat', 'wtd_entropy_FusionHeat',
       'range_FusionHeat', 'wtd_range_FusionHeat', 'wtd_std_FusionHeat',
       'mean_ThermalConductivity', 'wtd_mean_ThermalConductivity',
       'gmean_ThermalConductivity', 'wtd_gmean_ThermalConductivity',
       'entropy_ThermalConductivity', 'wtd_entropy_ThermalConductivity',
       'range_ThermalConductivity', 'wtd_range_ThermalConductivity',
       'mean_Valence', 'wtd_mean_Valence', 'range_Valence',
       'wtd_range_Valence', 'wtd_std_Valence', 'H', 'B', 'C', 'O', 'F', 'Na',
       'Mg', 'Al', 'Cl', 'K', 'Ca', 'V', 'Cr', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
       'As', 'Se', 'Sr', 'Y', 'Nb', 'Sn', 'I', 'Ba', 'La', 'Ce', 'Pr', 'Nd',
       'Sm', 'Eu', 'Gd', 'Tb', 'Yb', 'Hg', 'Tl', 'Pb', 'Bi',
       'mass_density_ratio', 'affinity_valence_ratio',
       'log_thermal_conductivity']
X = merged_data[features]
y = merged_data[target]


# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a baseline CatBoost model
catboost_model = CatBoostRegressor(
    iterations=500,             # Number of boosting rounds (trees)
    learning_rate=0.03,         # Step size for updating weights; lower values usually require more iterations but can lead to better generalization
    depth=6,                    # Maximum depth of the trees; controls the model's complexity
    l2_leaf_reg=3,              # L2 regularization coefficient to reduce overfitting
    loss_function='RMSE',       # Loss function to optimize, here RMSE for regression tasks
    random_seed=42,             # Ensures reproducibility
    verbose=100                 # Controls how often training progress is printed (set to 0 to disable)
)


# Fit the model on the training data
catboost_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = catboost_model.predict(X_test)

# Evaluate the model using RMSE and R² metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Base 80/20 CatBoost Performance:")
print("RMSE: {:.4f}".format(rmse))
print("R²: {:.4f}".format(r2))

# Optional cross-validation (5 folds, training split only) to further assess
# model performance. The scorer returns negated RMSE values, hence the sign flip.
cv_scores = cross_val_score(catboost_model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
cv_rmse = -np.mean(cv_scores)
print("Cross-validated RMSE: {:.4f}".format(cv_rmse))



Results with the reduced (99-feature) set:

400:	learn: 10.9456783	total: 9.22s	remaining: 2.28s
499:	learn: 10.5033401	total: 11.4s	remaining: 0us

Base 80/20 CatBoost Performance:

RMSE: 10.8467

R²: 0.8978

Cross-validated RMSE: 11.3898

In [None]:
# Bayesian hyperparameter optimization of the CatBoost model, using the 99-feature set

import optuna
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# Remove 'critical_temp' from unique_m to avoid duplication
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# The merge below pairs rows purely by position. pd.concat(axis=1) performs an
# outer join on the index, so a row-count mismatch between the two files would
# not raise -- it would silently pad the shorter frame with NaN rows.
# Fail loudly instead of tuning on misaligned data.
if len(main_data) != len(unique_m):
    raise ValueError(
        "Row count mismatch: train.csv has {} rows but unique_m.csv has {}; "
        "cannot merge by position.".format(len(main_data), len(unique_m))
    )

# Merge datasets side by side (index-based merge)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Feature Engineering: Physics-Based Ratio, Thermal Conductivity Transformation, Log transformation
# The 1e-9 added to each denominator keeps it from being exactly zero when the
# underlying column is 0; log1p computes log(1 + x).
merged_data["mass_density_ratio"] = merged_data["wtd_mean_atomic_mass"] / (merged_data["wtd_mean_Density"] + 1e-9)
merged_data["affinity_valence_ratio"] = merged_data["wtd_mean_ElectronAffinity"] / (merged_data["wtd_mean_Valence"] + 1e-9)
merged_data["log_thermal_conductivity"] = np.log1p(merged_data["range_ThermalConductivity"])

# Define target and features.
# `features` is the fixed 99-column subset used for the hyperparameter search:
# columns from train.csv, columns from unique_m.csv, and the three engineered
# columns created above. Selecting them raises KeyError if any name is missing.
target = "critical_temp"
features = ['mean_atomic_mass', 'wtd_mean_atomic_mass', 'gmean_atomic_mass',
       'entropy_atomic_mass', 'wtd_entropy_atomic_mass', 'range_atomic_mass',
       'wtd_range_atomic_mass', 'wtd_std_atomic_mass', 'mean_fie',
       'wtd_mean_fie', 'wtd_entropy_fie', 'range_fie', 'wtd_range_fie',
       'wtd_std_fie', 'mean_atomic_radius', 'wtd_mean_atomic_radius',
       'gmean_atomic_radius', 'range_atomic_radius', 'wtd_range_atomic_radius',
       'mean_Density', 'wtd_mean_Density', 'gmean_Density', 'entropy_Density',
       'wtd_entropy_Density', 'range_Density', 'wtd_range_Density',
       'wtd_std_Density', 'mean_ElectronAffinity', 'wtd_mean_ElectronAffinity',
       'gmean_ElectronAffinity', 'wtd_gmean_ElectronAffinity',
       'entropy_ElectronAffinity', 'wtd_entropy_ElectronAffinity',
       'range_ElectronAffinity', 'wtd_range_ElectronAffinity',
       'wtd_std_ElectronAffinity', 'mean_FusionHeat', 'wtd_mean_FusionHeat',
       'gmean_FusionHeat', 'entropy_FusionHeat', 'wtd_entropy_FusionHeat',
       'range_FusionHeat', 'wtd_range_FusionHeat', 'wtd_std_FusionHeat',
       'mean_ThermalConductivity', 'wtd_mean_ThermalConductivity',
       'gmean_ThermalConductivity', 'wtd_gmean_ThermalConductivity',
       'entropy_ThermalConductivity', 'wtd_entropy_ThermalConductivity',
       'range_ThermalConductivity', 'wtd_range_ThermalConductivity',
       'mean_Valence', 'wtd_mean_Valence', 'range_Valence',
       'wtd_range_Valence', 'wtd_std_Valence', 'H', 'B', 'C', 'O', 'F', 'Na',
       'Mg', 'Al', 'Cl', 'K', 'Ca', 'V', 'Cr', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
       'As', 'Se', 'Sr', 'Y', 'Nb', 'Sn', 'I', 'Ba', 'La', 'Ce', 'Pr', 'Nd',
       'Sm', 'Eu', 'Gd', 'Tb', 'Yb', 'Hg', 'Tl', 'Pb', 'Bi',
       'mass_density_ratio', 'affinity_valence_ratio',
       'log_thermal_conductivity']
X = merged_data[features]
y = merged_data[target]


# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def objective(trial):
    """Score one Optuna trial by cross-validated RMSE.

    Samples a CatBoost configuration from `trial`, evaluates it with 5-fold
    cross-validation on the module-level `X_train` / `y_train`, and returns
    the mean RMSE across folds (lower is better).
    """
    # Search space; the dict literal is evaluated top to bottom, so the
    # parameters are requested from the trial in this exact order.
    sampled_params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "iterations": trial.suggest_int("iterations", 100, 1000),
    }

    # Fixed settings (loss, seed, silent training) plus the sampled ones
    candidate = CatBoostRegressor(
        loss_function="RMSE",
        random_seed=42,
        verbose=0,
        **sampled_params,
    )

    # The scorer yields negated RMSE per fold; negate the mean to get RMSE back
    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        cv=5,
        scoring="neg_root_mean_squared_error",
    )
    return -fold_scores.mean()

# Create an Optuna study object, specifying that we want to minimize the objective
# (the mean cross-validated RMSE returned by `objective`).
# The sampler is seeded so that re-running the cell explores the same sequence
# of trials; without a seed every run would propose different hyperparameters
# and report a different "best" result. TPESampler is the sampler Optuna uses
# by default for single-objective studies, so only the seeding changes.
study = optuna.create_study(
    direction="minimize",
    study_name="catboost_opt",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# Report the best trial; its value is a cross-validated RMSE on the training
# split, not a score on the held-out test set.
print("Best hyperparameters found:")
print(study.best_trial.params)
print("Best RMSE:", study.best_trial.value)


Results of the Optuna search (30 trials):

[I 2025-03-03 17:01:37,127] Trial 29 finished with value: 9.73032933391566 and parameters: {'learning_rate': 0.08464381700212753, 'depth': 10, 'l2_leaf_reg': 7.837898925719832, 'iterations': 503}. 

Best is trial 12 with value: 9.37859530361177.
Best hyperparameters found:
{'learning_rate': 0.09620611468282192, 'depth': 9, 'l2_leaf_reg': 4.192572316971277, 'iterations': 998}
Best RMSE: 9.37859530361177