In [1]:
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

# Ensure directory exists
os.makedirs('models', exist_ok=True)

In [2]:
# 1. LOAD DATA
X_train = pd.read_csv('X_train_final.csv')
y_train_reg = pd.read_csv('y_train_regression.csv').values.ravel()
X_test = pd.read_csv('X_test_final.csv')
y_test_cont = pd.read_csv('y_test_final.csv').values.ravel()
y_test_bin = (y_test_cont > 0.75).astype(int) # Threshold for evaluation



In [3]:
# 2. TUNING THE BASE MODELS
# XGBoost: Focus on Learning Rate and Depth
xgb_base = XGBRegressor(random_state=42, objective='reg:squarederror')
xgb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, 7]
}
xgb_grid = GridSearchCV(xgb_base, xgb_param_grid, cv=3, scoring='r2', n_jobs=-1)
xgb_grid.fit(X_train, y_train_reg)
best_xgb = xgb_grid.best_estimator_

# Random Forest: Focus on Depth and Estimators
rf_base = RandomForestRegressor(random_state=42)
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_leaf': [1, 2]
}
rf_grid = GridSearchCV(rf_base, rf_param_grid, cv=3, scoring='r2', n_jobs=-1)
rf_grid.fit(X_train, y_train_reg)
best_rf = rf_grid.best_estimator_



ValueError: Found input variables with inconsistent numbers of samples: [8660, 4355]

In [None]:
# 3. CONSTRUCT THE WEIGHTED ENSEMBLE
# Giving 70% weight to XGBoost based on our baseline observations
ensemble = VotingRegressor(
    estimators=[('xgb', best_xgb), ('rf', best_rf)],
    weights=[0.7, 0.3]
)

# Create final pipeline
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ensemble', ensemble)
])

final_pipeline.fit(X_train, y_train_reg)



In [None]:
# 4. EVALUATION & RANKING
y_pred_cont = final_pipeline.predict(X_test)
y_pred_bin = (y_pred_cont > 0.75).astype(int) # Optimized threshold

# Mandatory Metrics
print("\n--- Final Ensemble Regression Results ---")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test_cont, y_pred_cont):.4f}")
print(f"R-squared Score (R2): {r2_score(y_test_cont, y_pred_cont):.4f}")

print("\n--- Final Ensemble Classification Performance ---")
print(classification_report(y_test_bin, y_pred_bin))

# 5. FEATURE IMPORTANCE (From Random Forest component)
# Required by mentor document
plt.figure(figsize=(10, 6))
importances = best_rf.feature_importances_
indices = np.argsort(importances)[-15:] # Top 15 features
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [X_train.columns[i] for i in indices])
plt.xlabel('Relative Importance')
plt.title('Top 15 Contributions to Habitability Prediction')
plt.show()

# 6. EXPORT FINAL RANKING
# Ranking all planets in the test set
results_df = pd.DataFrame({
    'habitability_score': y_pred_cont
})
results_df.to_csv('data/processed/habitability_ranked.csv', index=False)
joblib.dump(final_pipeline, 'models/ensemble_habitability_model.pkl')