In [None]:
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# Remove 'critical_temp' from unique_m to avoid duplication
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# The merge below is purely positional: pd.concat(axis=1) aligns on the index,
# and both frames carry the default RangeIndex produced by read_csv. If the
# row counts differ, concat would silently pad the shorter frame with NaN
# rows instead of failing, so verify the alignment assumption up front.
if len(main_data) != len(unique_m):
    raise ValueError(
        "Cannot merge by row position: train.csv has "
        f"{len(main_data)} rows but unique_m.csv has {len(unique_m)} rows."
    )

# Merge datasets assuming rows align (index-based merge)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features
target = "critical_temp"  # Target variable
# 'material' is dropped along with the target so it is not used as a feature.
# NOTE(review): presumably 'material' is a non-numeric identifier column -- confirm.
X = merged_data.drop(columns=[target, "material"])  # Drop 'material' column
y = merged_data[target]

# Train-test split: hold out 20% of the rows; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Importance Threshold for Selection
importance_threshold = 0.005  # Keep features with importance > 0.5%

# Train XGBoost Model on the full feature set
xgb_model = xgb.XGBRegressor(n_estimators=200, max_depth=6, tree_method="hist", random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate XGBoost Performance
# Predict once per split and reuse the result for both metrics; the original
# ran every predict() call twice (once for RMSE, once for R²).
train_pred_xgb = xgb_model.predict(X_train)
test_pred_xgb = xgb_model.predict(X_test)

train_rmse_xgb = np.sqrt(mean_squared_error(y_train, train_pred_xgb))
test_rmse_xgb = np.sqrt(mean_squared_error(y_test, test_pred_xgb))
train_r2_xgb = r2_score(y_train, train_pred_xgb)
test_r2_xgb = r2_score(y_test, test_pred_xgb)

print(f"XGBoost - Train RMSE: {train_rmse_xgb:.4f}, Train R²: {train_r2_xgb:.4f}")
print(f"XGBoost - Test RMSE: {test_rmse_xgb:.4f}, Test R²: {test_r2_xgb:.4f}")

# Rank every training column by the importance the fitted XGBoost model
# assigned to it, highest first.
xgb_importance = pd.Series(
    data=xgb_model.feature_importances_,
    index=X_train.columns,
)
xgb_importance = xgb_importance.sort_values(ascending=False)

# Keep only the features whose importance is strictly above the threshold;
# the resulting list preserves the descending-importance order.
above_threshold = xgb_importance > importance_threshold
selected_features = xgb_importance.loc[above_threshold].index.tolist()

# Train XGBoost Model with Selected Features
# Restrict both splits to the same selected columns so train and test share
# an identical feature layout.
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Same hyperparameters as the full-feature model, so any metric difference is
# attributable to the reduced feature set.
xgb_model_selected = xgb.XGBRegressor(n_estimators=200, max_depth=6, tree_method="hist", random_state=42)
xgb_model_selected.fit(X_train_selected, y_train)

# Evaluate XGBoost (Reduced Features) Performance
# Predict once per split and reuse the result for both metrics; the original
# ran every predict() call twice.
train_pred_xgb_selected = xgb_model_selected.predict(X_train_selected)
test_pred_xgb_selected = xgb_model_selected.predict(X_test_selected)

train_rmse_xgb_selected = np.sqrt(mean_squared_error(y_train, train_pred_xgb_selected))
test_rmse_xgb_selected = np.sqrt(mean_squared_error(y_test, test_pred_xgb_selected))
train_r2_xgb_selected = r2_score(y_train, train_pred_xgb_selected)
test_r2_xgb_selected = r2_score(y_test, test_pred_xgb_selected)

print(f"XGBoost (Reduced Features) - Train RMSE: {train_rmse_xgb_selected:.4f}, Train R²: {train_r2_xgb_selected:.4f}")
print(f"XGBoost (Reduced Features) - Test RMSE: {test_rmse_xgb_selected:.4f}, Test R²: {test_r2_xgb_selected:.4f}")

# Train LightGBM Model with Selected Features
# LightGBM is fitted on the XGBoost-selected columns only, so its metrics are
# comparable with the reduced-feature XGBoost model.
lgb_model = lgb.LGBMRegressor(n_estimators=200, max_depth=6, random_state=42)
lgb_model.fit(X_train_selected, y_train)

# Evaluate LightGBM Performance
# Predict once per split and reuse the result for both metrics; the original
# ran every predict() call twice.
train_pred_lgb = lgb_model.predict(X_train_selected)
test_pred_lgb = lgb_model.predict(X_test_selected)

train_rmse_lgb = np.sqrt(mean_squared_error(y_train, train_pred_lgb))
test_rmse_lgb = np.sqrt(mean_squared_error(y_test, test_pred_lgb))
train_r2_lgb = r2_score(y_train, train_pred_lgb)
test_r2_lgb = r2_score(y_test, test_pred_lgb)

print(f"LightGBM (Reduced Features) - Train RMSE: {train_rmse_lgb:.4f}, Train R²: {train_r2_lgb:.4f}")
print(f"LightGBM (Reduced Features) - Test RMSE: {test_rmse_lgb:.4f}, Test R²: {test_r2_lgb:.4f}")

# Persist the complete importance ranking (all features, not only the
# selected ones) to a CSV file in the working directory.
xgb_importance.to_csv("feature_importance_ranking_xgb.csv")

# Echo the same ranking to stdout for comparison with the saved file.
print("Feature Importance Ranking (XGBoost):")
print(xgb_importance)

# Horizontal bar chart of the 20 highest-ranked features.
top_features = xgb_importance.iloc[:20]
plt.figure(figsize=(10, 6))
top_features.plot.barh()

importance_axes = plt.gca()
importance_axes.set_xlabel("Feature Importance")
importance_axes.set_ylabel("Feature Name")
importance_axes.set_title("Top Feature Importances in XGBoost Model")
# barh draws the first row at the bottom; flip the axis so the most
# important feature appears at the top of the chart.
importance_axes.invert_yaxis()
plt.show()


Results:

XGBoost - Train RMSE: 5.2047, Train R²: 0.9770
XGBoost - Test RMSE: 8.9142, Test R²: 0.9310
XGBoost (Reduced Features) - Train RMSE: 6.0891, Train R²: 0.9685
XGBoost (Reduced Features) - Test RMSE: 9.3395, Test R²: 0.9242


LightGBM (Reduced Features) - Train RMSE: 8.8844, Train R²: 0.9330
LightGBM (Reduced Features) - Test RMSE: 9.9381, Test R²: 0.9142
Feature Importance Ranking (XGBoost):
Cu                           0.631333
Ba                           0.051469
gmean_Valence                0.046137
Ca                           0.029114
range_ThermalConductivity    0.020274
...  
Hf                           0.000000
Os                           0.000000
Po                           0.000000
At                           0.000000
Rn                           0.000000