In [None]:
# This compares the two models I want to evaluate: RF and XGBoost

import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# The two tables are joined purely by row position below. pd.concat(axis=1)
# aligns on the index with an outer join, so a row-count mismatch would not
# raise: it would silently pad the shorter table with NaN rows (including a
# NaN target). Fail loudly instead of training on a corrupted frame.
if len(main_data) != len(unique_m):
    raise ValueError(
        f"Row count mismatch: train.csv has {len(main_data)} rows but "
        f"unique_m.csv has {len(unique_m)} rows; cannot merge by row position."
    )

# Remove 'critical_temp' from unique_m to avoid duplication
# (errors='ignore' keeps this a no-op if the column is absent).
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# Merge datasets side by side (index-based; both frames carry the default
# RangeIndex produced by read_csv, so this pairs row i with row i).
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features
target = "critical_temp"  # Target variable
# 'material' is dropped along with the target -- presumably a non-numeric
# identifier column coming from unique_m.csv; verify against the data.
X = merged_data.drop(columns=[target, "material"])
y = merged_data[target]

# Train-test split: 80/20, seeded so both models see the same partition
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost Model
xgb_model = xgb.XGBRegressor(n_estimators=200, max_depth=6, tree_method="hist", random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate XGBoost Performance
# Predict once per split and reuse the result for both metrics, rather than
# re-running inference for RMSE and again for R².
xgb_train_pred = xgb_model.predict(X_train)
xgb_test_pred = xgb_model.predict(X_test)

train_rmse_xgb = np.sqrt(mean_squared_error(y_train, xgb_train_pred))
test_rmse_xgb = np.sqrt(mean_squared_error(y_test, xgb_test_pred))
train_r2_xgb = r2_score(y_train, xgb_train_pred)
test_r2_xgb = r2_score(y_test, xgb_test_pred)

print(f"XGBoost - Train RMSE: {train_rmse_xgb:.4f}, Train R²: {train_r2_xgb:.4f}")
print(f"XGBoost - Test RMSE: {test_rmse_xgb:.4f}, Test R²: {test_r2_xgb:.4f}")

# Train Random Forest Model
# n_jobs=-1 asks scikit-learn to use all available cores for fit/predict.
rf_model = RandomForestRegressor(n_estimators=200, max_depth=6, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Evaluate Random Forest Performance
# Predict once per split and reuse the result for both metrics, rather than
# re-running inference for RMSE and again for R².
rf_train_pred = rf_model.predict(X_train)
rf_test_pred = rf_model.predict(X_test)

train_rmse_rf = np.sqrt(mean_squared_error(y_train, rf_train_pred))
test_rmse_rf = np.sqrt(mean_squared_error(y_test, rf_test_pred))
train_r2_rf = r2_score(y_train, rf_train_pred)
test_r2_rf = r2_score(y_test, rf_test_pred)

print(f"Random Forest - Train RMSE: {train_rmse_rf:.4f}, Train R²: {train_r2_rf:.4f}")
print(f"Random Forest - Test RMSE: {test_rmse_rf:.4f}, Test R²: {test_r2_rf:.4f}")

# Rank the features of each fitted model by importance (largest first) and
# save each ranking to its own CSV file.
feature_names = X_train.columns

# XGBoost ranking
xgb_scores = pd.Series(xgb_model.feature_importances_, index=feature_names)
xgb_importance = xgb_scores.sort_values(ascending=False)
xgb_importance.to_csv("feature_importance_ranking_xgb.csv")

# Random Forest ranking
rf_scores = pd.Series(rf_model.feature_importances_, index=feature_names)
rf_importance = rf_scores.sort_values(ascending=False)
rf_importance.to_csv("feature_importance_ranking_rf.csv")

# Show both rankings back to back for comparison
for heading, ranking in (
    ("Feature Importance Ranking (XGBoost):", xgb_importance),
    ("\nFeature Importance Ranking (Random Forest):", rf_importance),
):
    print(heading)
    print(ranking)

# Draw the 20 highest-ranked features of each model as horizontal bar charts,
# one panel per model, side by side.
top_n = 20
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

panels = (
    ("XGBoost Feature Importance", xgb_importance),
    ("Random Forest Feature Importance", rf_importance),
)

for ax, (panel_title, ranking) in zip(axes, panels):
    ax.barh(ranking.index[:top_n], ranking.values[:top_n])
    ax.set_title(panel_title)
    ax.set_xlabel("Importance")
    # barh draws the first entry at the bottom; flip the axis so the most
    # important feature appears at the top of the panel.
    ax.invert_yaxis()

plt.tight_layout()
plt.show()


Results:

XGBoost - Train RMSE: 5.2047, Train R²: 0.9770
XGBoost - Test RMSE: 8.9142, Test R²: 0.9310
Random Forest - Train RMSE: 13.0025, Train R²: 0.8566
Random Forest - Test RMSE: 13.1421, Test R²: 0.8500

Feature Importance Ranking (XGBoost):
Cu                           0.631333
Ba                           0.051469
gmean_Valence                0.046137
Ca                           0.029114
range_ThermalConductivity    0.020274
...   
Hf                           0.000000
Os                           0.000000
Po                           0.000000
At                           0.000000
Rn                           0.000000
Length: 167, dtype: float32

Feature Importance Ranking (Random Forest):
Cu                   0.720902
Ca                   0.055018
Ba                   0.031046
gmean_Valence        0.021857
wtd_gmean_Density    0.021306
...  
Ta                   0.000000
Au                   0.000000
Po                   0.000000
At                   0.000000
Rn                   0.000000

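With these hyperparameters (200 trees and max_depth=6 for both models), the RF is clearly outperformed by XGBoost on this dataset (test RMSE 13.14 vs. 8.91). Note that the RF's train and test scores are nearly identical, which suggests the depth limit makes it underfit, so a deeper forest might close part of the gap. Both models have similar feature importances (Cu dominates in both), but the rankings are not exactly the same.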