In [None]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.utils import shuffle

# Experiment: fixed-weight LightGBM + XGBoost blend with hand-engineered features.
# Loads the two CSVs, adds seven derived columns, splits the rows 80/10/10,
# fits both boosters on the training part and prints RMSE / R² of the blend
# on the validation and test parts.

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# Remove 'critical_temp' from unique_m to avoid duplication
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# Merge datasets assuming rows align (index-based merge)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features
target = "critical_temp"  # Target variable
X = merged_data.drop(columns=[target, "material"])  # Drop 'material' column
y = merged_data[target]

# Shuffle data before splitting
X, y = shuffle(X, y, random_state=42)

# Feature Engineering: Creating New Features
X["atomic_radius_valence"] = X["range_atomic_radius"] * X["gmean_Valence"]
# The 1e-9 term keeps the denominator away from exactly zero
X["density_atomic_mass"] = X["wtd_gmean_Density"] / (X["wtd_entropy_atomic_mass"] + 1e-9)
X["thermal_density"] = X["wtd_gmean_ThermalConductivity"] * X["wtd_gmean_Density"]

# Log Transformations (for highly skewed features)
X["log_atomic_radius"] = np.log1p(X["range_atomic_radius"])
X["log_electron_affinity"] = np.log1p(X["wtd_std_ElectronAffinity"])

# Statistical Aggregations (row-wise over the two listed columns)
X["mean_thermal_conductivity"] = X[["wtd_gmean_ThermalConductivity", "wtd_std_ThermalConductivity"]].mean(axis=1)
X["var_electron_affinity"] = X[["wtd_std_ElectronAffinity", "wtd_entropy_ElectronAffinity"]].var(axis=1)

# Train-validation-test split (80/10/10): 20% is held out, then halved
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train Optimized LightGBM Model
# verbose=-1 and random_state=42 match the LightGBM configuration used by the
# later cells of this notebook, so all runs share the same seed and are comparable.
# NOTE(review): LightGBM appears to apply `subsample` only when `subsample_freq` > 0;
# it is not set here, so the subsample value may have no effect — confirm before re-tuning.
optimized_lgb = lgb.LGBMRegressor(n_estimators=496, max_depth=15, learning_rate=0.057878589503943714,
                                  subsample=0.6619352139576826, colsample_bytree=0.7512301369524537,
                                  num_leaves=148, force_col_wise=True, verbose=-1, random_state=42)
optimized_lgb.fit(X_train, y_train)

# Train Optimized XGBoost Model
optimized_xgb = xgb.XGBRegressor(n_estimators=407, max_depth=10, learning_rate=0.02962746174406205,
                                 subsample=0.8786056663685927, colsample_bytree=0.6260167856358314,
                                 gamma=4.321388407974591, tree_method="hist", random_state=42)
optimized_xgb.fit(X_train, y_train)

# Generate predictions for validation and test sets
y_pred_lgb_valid = optimized_lgb.predict(X_valid)
y_pred_xgb_valid = optimized_xgb.predict(X_valid)
y_pred_lgb_test = optimized_lgb.predict(X_test)
y_pred_xgb_test = optimized_xgb.predict(X_test)

# Use Previously Optimized Weights for Blending
# These two names are also read by later cells that do not define them themselves.
best_weight_lgb = 0.3454  # Previously found weight
best_weight_xgb = 1.0 - best_weight_lgb  # weights sum to 1

# Apply Best Weights to Validation and Test Sets
y_pred_ensemble_valid = (best_weight_lgb * y_pred_lgb_valid) + (best_weight_xgb * y_pred_xgb_valid)
y_pred_ensemble_test = (best_weight_lgb * y_pred_lgb_test) + (best_weight_xgb * y_pred_xgb_test)

# Evaluate Blended Model on Validation and Test Sets
ensemble_rmse_valid = np.sqrt(mean_squared_error(y_valid, y_pred_ensemble_valid))
ensemble_r2_valid = r2_score(y_valid, y_pred_ensemble_valid)
ensemble_rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_ensemble_test))
ensemble_r2_test = r2_score(y_test, y_pred_ensemble_test)

# Print results
print(f"Optimized Weighted Blended Model - Validation RMSE: {ensemble_rmse_valid:.4f}, Validation R²: {ensemble_r2_valid:.4f}")
print(f"Optimized Weighted Blended Model - Test RMSE: {ensemble_rmse_test:.4f}, Test R²: {ensemble_r2_test:.4f}")
print(f"Optimal Blending Weights Used: LightGBM={best_weight_lgb:.4f}, XGBoost={best_weight_xgb:.4f}")


Results:
Optimized Weighted Blended Model - Validation RMSE: 8.2104, Validation R²: 0.9416
Optimized Weighted Blended Model - Test RMSE: 9.4789, Test R²: 0.9235
Optimal Blending Weights Used: LightGBM=0.3454, XGBoost=0.6546

The new features improved the validation score but worsened the test score.


In [None]:
# Blended feature-importance view on the raw (un-engineered) columns.
# best_weight_lgb / best_weight_xgb are read here but not assigned in this cell,
# so they must already exist in the kernel.
# NOTE(review): the two libraries may report different kinds of importance by
# default (split counts vs. gain); confirm that blending them is intended.

# Read both CSVs; the second one loses its own copy of the target column
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv").drop(columns=["critical_temp"], errors='ignore')

# Column-wise concatenation; rows are assumed to line up by index
merged_data = pd.concat([main_data, unique_m], axis=1)

# Target vector and feature matrix (the 'material' column is discarded)
target = "critical_temp"
y = merged_data[target]
X = merged_data.drop(columns=[target, "material"])

# 80/10/10 partition: hold out 20%, then cut the hold-out in half
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Tuned LightGBM configuration
lgb_params = dict(
    n_estimators=496,
    max_depth=15,
    learning_rate=0.057878589503943714,
    subsample=0.6619352139576826,
    colsample_bytree=0.7512301369524537,
    num_leaves=148,
    force_col_wise=True,
    verbose=-1,
    random_state=42,
)
optimized_lgb = lgb.LGBMRegressor(**lgb_params)
optimized_lgb.fit(X_train, y_train)

# Tuned XGBoost configuration
xgb_params = dict(
    n_estimators=407,
    max_depth=10,
    learning_rate=0.02962746174406205,
    subsample=0.8786056663685927,
    colsample_bytree=0.6260167856358314,
    gamma=4.321388407974591,
    tree_method="hist",
    random_state=42,
)
optimized_xgb = xgb.XGBRegressor(**xgb_params)
optimized_xgb.fit(X_train, y_train)


def _normalized_importance(model, feature_names):
    """Return the model's feature importances, sorted descending and scaled to sum to 1."""
    ranked = pd.Series(model.feature_importances_, index=feature_names).sort_values(ascending=False)
    return ranked / ranked.sum()


# Per-model importance shares over the full (pre-selection) column set
lgb_importance = _normalized_importance(optimized_lgb, X.columns)
xgb_importance = _normalized_importance(optimized_xgb, X.columns)

# Weighted blend of the two shares, highest first
ensemble_importance = (
    (best_weight_lgb * lgb_importance) + (best_weight_xgb * xgb_importance)
).sort_values(ascending=False)

# Horizontal bar chart of the 20 largest blended importances
top_features = ensemble_importance.head(20)
plt.figure(figsize=(10, 6))
ax = top_features.plot.barh()
ax.set_xlabel("Blended Feature Importance")
ax.set_ylabel("Feature Name")
ax.set_title("Top Feature Importances in Optimized Blended Model")
ax.invert_yaxis()  # largest bar at the top
plt.show()

# Same 20 entries as text
print("Top 20 Features in Optimized Blended Model:")
print(top_features)


In [None]:
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Feature selection on the raw merged columns, followed by a refit and a
# test-set evaluation of the fixed-weight blend.
# best_weight_lgb / best_weight_xgb are read here but not assigned in this cell,
# so they must already exist in the kernel.

# Read the two CSV files
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# unique_m may carry its own 'critical_temp'; drop it so the merge has one copy
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# Side-by-side concatenation (rows assumed aligned by index)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Separate the target from the predictors; 'material' is not used as a feature
target = "critical_temp"
y = merged_data[target]
X = merged_data.drop(columns=[target, "material"])

# 80/10/10 partition: 20% held out, then split evenly into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Fit the tuned LightGBM regressor on every column
optimized_lgb = lgb.LGBMRegressor(
    n_estimators=496,
    max_depth=15,
    learning_rate=0.057878589503943714,
    subsample=0.6619352139576826,
    colsample_bytree=0.7512301369524537,
    num_leaves=148,
    force_col_wise=True,
    verbose=-1,
    random_state=42,
)
optimized_lgb.fit(X_train, y_train)

# Fit the tuned XGBoost regressor on every column
optimized_xgb = xgb.XGBRegressor(
    n_estimators=407,
    max_depth=10,
    learning_rate=0.02962746174406205,
    subsample=0.8786056663685927,
    colsample_bytree=0.6260167856358314,
    gamma=4.321388407974591,
    tree_method="hist",
    random_state=42,
)
optimized_xgb.fit(X_train, y_train)

# Importances of the full-column models, sorted descending
lgb_importance = pd.Series(optimized_lgb.feature_importances_, index=X.columns).sort_values(ascending=False)
xgb_importance = pd.Series(optimized_xgb.feature_importances_, index=X.columns).sort_values(ascending=False)

# Rescale each model's importances so they sum to 1
lgb_importance = lgb_importance / lgb_importance.sum()
xgb_importance = xgb_importance / xgb_importance.sum()

# Blend with the ensemble weights and rank
ensemble_importance = (
    (best_weight_lgb * lgb_importance) + (best_weight_xgb * xgb_importance)
).sort_values(ascending=False)

# Step 1: discard columns whose blended importance is below the threshold
low_importance_threshold = 0.005
low_importance_features = ensemble_importance.loc[ensemble_importance < low_importance_threshold].index
X_train, X_valid, X_test = (
    frame.drop(columns=low_importance_features) for frame in (X_train, X_valid, X_test)
)

# Step 2: discard any column that is highly correlated with an earlier column.
# Only the strict upper triangle of the absolute correlation matrix is kept, so each
# pair is inspected once and the later column of the pair is the one flagged.
corr_matrix = X_train.corr().abs()
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_tri = corr_matrix.where(upper_mask)
correlation_threshold = 0.95
exceeds_threshold = (upper_tri > correlation_threshold).any(axis=0)
high_correlation_features = exceeds_threshold[exceeds_threshold].index.tolist()
X_train, X_valid, X_test = (
    frame.drop(columns=high_correlation_features) for frame in (X_train, X_valid, X_test)
)

# Refit both models on the reduced column set
for model in (optimized_lgb, optimized_xgb):
    model.fit(X_train, y_train)

# Test-set predictions from each refitted model
y_pred_lgb_test = optimized_lgb.predict(X_test)
y_pred_xgb_test = optimized_xgb.predict(X_test)

# Fixed-weight blend of the two prediction vectors
y_pred_ensemble_test = (best_weight_lgb * y_pred_lgb_test) + (best_weight_xgb * y_pred_xgb_test)

# Test-set metrics for the blend
ensemble_rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_ensemble_test))
ensemble_r2_test = r2_score(y_test, y_pred_ensemble_test)

print(f"Optimized Weighted Blended Model (After Feature Selection) - Test RMSE: {ensemble_rmse_test:.4f}, Test R²: {ensemble_r2_test:.4f}")


In [None]:
# Save selected feature lists to /output
# Writes a header line followed by one column name of X_train per line.
output_path = "./output/selected_features.txt"

# Create the target directory first so the write does not fail when it is missing
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    f.write("Selected Features After Feature Selection:\n")
    # f-string formatting also copes with column labels that are not plain strings
    f.writelines(f"{feature}\n" for feature in X_train.columns)

print(f"Feature list saved to {output_path}")

Results:

Optimized Weighted Blended Model (After Feature Selection) - Test RMSE: 8.5381, Test R²: 0.9356


40 selected features!

Selected Features After Feature Selection:
wtd_mean_atomic_mass
wtd_entropy_atomic_mass
wtd_range_atomic_mass
std_atomic_mass
wtd_std_atomic_mass
wtd_mean_fie
wtd_entropy_fie
wtd_range_fie
wtd_std_fie
wtd_mean_atomic_radius
range_atomic_radius
wtd_range_atomic_radius
wtd_mean_Density
gmean_Density
wtd_entropy_Density
wtd_std_Density
wtd_mean_ElectronAffinity
wtd_gmean_ElectronAffinity
wtd_entropy_ElectronAffinity
wtd_range_ElectronAffinity
wtd_std_ElectronAffinity
wtd_mean_FusionHeat
wtd_entropy_FusionHeat
wtd_std_FusionHeat
wtd_mean_ThermalConductivity
wtd_gmean_ThermalConductivity
wtd_entropy_ThermalConductivity
range_ThermalConductivity
wtd_mean_Valence
gmean_Valence
wtd_range_Valence
wtd_std_Valence
O
Ca
Fe
Cu
Zn
As
Ba
Hg


In [None]:
# Summarise what happened to the engineered features during feature selection.
# A feature counts as:
#   - selected       : present in X_train (the post-selection training columns)
#   - rejected       : present in X (the pre-selection feature matrix) but not in X_train
#   - not evaluated  : absent from X, i.e. it was never offered to the selection step
# Without the third category, a feature that was never a candidate would be
# reported as "rejected", which is misleading.
# Assumes X and X_train are the ones left in the kernel by the feature-selection cell.

# Define newly engineered features
new_features = [
    "atomic_radius_valence", "density_atomic_mass", "thermal_density",
    "log_atomic_radius", "log_electron_affinity",
    "mean_thermal_conductivity", "var_electron_affinity"
]

candidate_columns = set(X.columns)
selected_columns = set(X_train.columns)

# Identify selected, rejected and never-evaluated new features
selected_new_features = [feature for feature in new_features if feature in selected_columns]
rejected_new_features = [
    feature for feature in new_features
    if feature in candidate_columns and feature not in selected_columns
]
not_evaluated_new_features = [feature for feature in new_features if feature not in candidate_columns]

# Save to output file (create the directory first so the write cannot fail on it)
output_path = "./output/new_feature_selection.txt"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    f.write("Selected New Features:\n")
    f.writelines(f"{feature}\n" for feature in selected_new_features)
    f.write("\nRejected New Features:\n")
    f.writelines(f"{feature}\n" for feature in rejected_new_features)
    f.write("\nNot Evaluated New Features (absent from the candidate set):\n")
    f.writelines(f"{feature}\n" for feature in not_evaluated_new_features)

# Print result summary
print(f"Selected New Features: {selected_new_features}")
print(f"Rejected New Features: {rejected_new_features}")
print(f"Not Evaluated New Features: {not_evaluated_new_features}")
print(f"Feature selection summary saved to {output_path}")

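So, none of the new features were selected. Note, however, that the feature-selection cell rebuilt `X` from the raw merged data without the engineered columns, so these features were never candidates; their absence from `X_train` does not show that they were rejected on merit.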

Selected New Features: []
Rejected New Features: ['atomic_radius_valence', 'density_atomic_mass', 'thermal_density', 'log_atomic_radius', 'log_electron_affinity', 'mean_thermal_conductivity', 'var_electron_affinity']

I can consider new ones...