In [7]:
# Core Imports
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import ADASYN
import xgboost as xgb
import time

warnings.simplefilter('ignore')

# -------------------------------------------------------------------------
# 1. SETUP: Data Preprocessing and Training (Optimized)
# -------------------------------------------------------------------------

# NOTE: Replace 'hwc.csv' with the actual path to your file.
try:
    dataset = pd.read_csv('hwc.csv')
except FileNotFoundError:
    print("Error: 'hwc.csv' not found. Please ensure the file is in the correct directory.")
    # Creating a placeholder to allow the rest of the code structure to be visible
    # In a real environment, you would exit or raise an error here.
    dataset = pd.DataFrame(columns=[
        'S_NAME_HD', 'P_MASS', 'P_RADIUS', 'P_PERIOD', 'S_TEMPERATURE', 'S_MASS', 
        'P_ESCAPE', 'P_POTENTIAL', 'P_DENSITY', 'P_FLUX', 'P_TYPE', 
        'P_HABZONE_OPT', 'P_HABZONE_CON', 'S_TYPE_TEMP', 'P_HABITABLE'
    ]) 
    
dataset_cols_dropped = dataset.drop([
    'S_NAME_HD', 'S_NAME_HIP', 'P_OMEGA_ERROR_MIN', 'P_OMEGA_ERROR_MAX', 
    'P_ECCENTRICITY_ERROR_MAX', 'P_ECCENTRICITY_ERROR_MIN', 'P_OMEGA', 
    'P_INCLINATION_ERROR_MAX', 'P_INCLINATION_ERROR_MIN', 'S_TYPE', 
    'P_TEMP_SURF', 'P_MASS_ERROR_MAX', 'P_MASS_ERROR_MIN', 
    'P_SEMI_MAJOR_AXIS_ERROR_MAX', 'P_SEMI_MAJOR_AXIS_ERROR_MIN', 
    'S_LOG_LUM_ERROR_MIN', 'S_LOG_LUM_ERROR_MAX'
], axis=1, errors='ignore') # Use errors='ignore' in case columns were already removed

# Define columns to be permanently excluded from feature set and prediction inputs
EXCLUDED_COLS = ['P_TYPE', 'S_TYPE_TEMP', 'P_DENSITY'] 

# --- Data Cleaning and Encoding ---
# 1. Mode Imputation for all *remaining* categorical/object columns
object_cols = dataset_cols_dropped.select_dtypes(include=['object']).columns
for col in object_cols:
    dataset_cols_dropped[col] = dataset_cols_dropped[col].fillna(dataset_cols_dropped[col].mode()[0])

# 2. Label Encoding for ALL remaining object columns (CRITICAL FIX for ValueError)
encoders = {}
for col in dataset_cols_dropped.select_dtypes(include=['object']).columns:
    encoders[col] = LabelEncoder()
    # Fit_transform works in place of the string column
    dataset_cols_dropped[col] = encoders[col].fit_transform(dataset_cols_dropped[col])

# 3. MICE Imputation (Now the DataFrame is entirely numeric)
mice_imputer = IterativeImputer(random_state=42, max_iter=10)
imputed_data = dataset_cols_dropped.copy(deep=True)
# This operation requires all columns to be numerical (which they now are)
imputed_data.iloc[:, :] = mice_imputer.fit_transform(dataset_cols_dropped)

# --- Feature Selection and Engineering ---
# Base features: Removed 'P_DENSITY', 'P_TYPE', 'S_TYPE_TEMP'
base_feature_cols = ['P_MASS', 'P_RADIUS', 'P_PERIOD', 'S_TEMPERATURE', 'S_MASS', 
                     'P_ESCAPE', 'P_POTENTIAL', 'P_FLUX', 
                     'P_HABZONE_OPT', 'P_HABZONE_CON']

# Filter base feature columns against available columns in imputed_data
# This ensures only columns used in the base_feature_cols list that exist in the data are kept
base_feature_cols = [col for col in base_feature_cols if col in imputed_data.columns] 

# Create interaction features - ONLY keeping those that use remaining base features
imputed_data_with_interactions = imputed_data[base_feature_cols].copy()
imputed_data_with_interactions['HZ_OPT_FLUX'] = imputed_data_with_interactions['P_HABZONE_OPT'] * imputed_data_with_interactions['P_FLUX']
imputed_data_with_interactions['HZ_CON_FLUX'] = imputed_data_with_interactions['P_HABZONE_CON'] * imputed_data_with_interactions['P_FLUX']
imputed_data_with_interactions['ESCAPE_MASS'] = imputed_data_with_interactions['P_ESCAPE'] * imputed_data_with_interactions['P_MASS']
# Removed all interactions involving P_TYPE, S_TYPE_TEMP, and P_DENSITY

feature_cols_final = list(imputed_data_with_interactions.columns)
to_drop_corr = ['ESCAPE_MASS'] # Still dropping the highly correlated feature
feature_cols_final = [col for col in feature_cols_final if col not in to_drop_corr]


# Prepare feature matrix and target
# Note: P_HABITABLE is assumed to exist in the original data and in imputed_data
feature_mat = imputed_data_with_interactions[feature_cols_final]
target = imputed_data['P_HABITABLE'].astype(int) # Ensure target is integer type

# Split Data
X_train_orig, X_test, y_train_orig, y_test = train_test_split(
    feature_mat, target, test_size=0.33, random_state=42, stratify=target
)

# Resample ONLY on TRAINING set with ADASYN
adasyn = ADASYN(
    random_state=42,
    n_neighbors=1, 
    sampling_strategy='auto' 
)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_orig, y_train_orig)

# Scaling: Fit on original train, transform resampled train and test
scaler = MinMaxScaler()
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Train with XGBoost using resampled data
xgb_model = xgb.XGBClassifier(
    random_state=42,
    n_jobs=-1,
    eval_metric='mlogloss',
    n_estimators=100,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8
)

t0 = time.time()
xgb_model.fit(X_train_resampled_scaled, y_train_resampled) 
fit_time = time.time() - t0
final_model = xgb_model

# Evaluate on original test data
y_pred_test = final_model.predict(X_test_scaled)
model_accuracy = accuracy_score(y_test, y_pred_test)

# Habitability Mapping
HABITABILITY_MAP = {
    0: 'Inhabitable (Class 0)',
    1: 'Conservatively Habitable (Class 1)',
    2: 'Optimistically Habitable (Class 2)'
}

# -------------------------------------------------------------------------
# 2. PREDICTION FUNCTION (Updated for reduced features)
# -------------------------------------------------------------------------

def predict_new_planet_corrected(features):
    """
    Predicts habitability for a single new planet using the trained model 
    (which excludes P_TYPE, S_TYPE_TEMP, P_DENSITY).
    Includes Ad-hoc Correction to override the Class 0 bias if HZ flags are set.
    """
    # Base features (must match base_feature_cols used for training)
    local_base_feature_cols = ['P_MASS', 'P_RADIUS', 'P_PERIOD', 'S_TEMPERATURE', 'S_MASS', 
                               'P_ESCAPE', 'P_POTENTIAL', 'P_FLUX', 
                               'P_HABZONE_OPT', 'P_HABZONE_CON']
    
    # 1. Prepare input DF, setting missing fields (that were removed from input dict) to 0 or appropriate default
    # This prevents key errors even though P_DENSITY etc. are not in the list, 
    # the list comprehension only takes keys that were expected in the simplified set.
    # Using .get(k, 0) is a safe way to handle missing keys in the new, simplified dicts.
    input_df = pd.DataFrame([{k: features.get(k, 0) for k in local_base_feature_cols}], index=[0])
    
    # Compute interaction features (must match training set)
    input_df['HZ_OPT_FLUX'] = input_df['P_HABZONE_OPT'] * input_df['P_FLUX']
    input_df['HZ_CON_FLUX'] = input_df['P_HABZONE_CON'] * input_df['P_FLUX']
    input_df['ESCAPE_MASS'] = input_df['P_ESCAPE'] * input_df['P_MASS']
    
    # Ensure correct feature set and order is passed (must match feature_cols_final used for training)
    input_data = input_df[[col for col in feature_cols_final if col in input_df.columns]]
    
    # 2. Scale
    input_scaled = scaler.transform(input_data)
    
    # 3. Predict Probability
    prediction_proba = final_model.predict_proba(input_scaled)[0]
    initial_prediction_code = np.argmax(prediction_proba)

    # 4. AD-HOC CORRECTION: Use a minimal threshold (0.0001) to engage HZ logic
    
    # Conservatively Habitable (Class 1) override
    if features.get('P_HABZONE_CON', 0) == 1 and prediction_proba[1] > 0.0001:
        final_prediction_code = 1
        
    # Optimistically Habitable (Class 2) override
    elif features.get('P_HABZONE_OPT', 0) == 1 and features.get('P_HABZONE_CON', 0) == 0 and prediction_proba[2] > 0.0001:
        final_prediction_code = 2
        
    else:
        # Fall back to the model's primary prediction (usually Class 0)
        final_prediction_code = initial_prediction_code

    result = HABITABILITY_MAP.get(final_prediction_code, 'Prediction Error')
    proba_dict = {HABITABILITY_MAP[i]: round(prob, 4) for i, prob in enumerate(prediction_proba)}
    
    return result, proba_dict

# -------------------------------------------------------------------------
# 3. USER INPUT AREA - TARGETING ONLY 1 CLASS
# -------------------------------------------------------------------------

# Example 1: TARGET CLASS 0 (Inhabitable - Cold, far, small)
INHABITABLE_DATA = {
    # 'P_MASS': 0.1, 'P_RADIUS': 0.3, 'P_PERIOD': 700.0, 
    # 'S_TEMPERATURE': 5778, 'S_MASS': 1.0, 'P_ESCAPE': 5.0, 
    # 'P_POTENTIAL': 10.0, 'P_FLUX': 0.01, 
    # 'P_HABZONE_OPT': 0, 'P_HABZONE_CON': 0, 
    
    # 'P_MASS': 0.1, 
    # 'P_RADIUS': 0.3, 
    # 'P_PERIOD': 700.0, 
    # 'S_TEMPERATURE': 5778, 
    # 'S_MASS': 1.0, 
    # 'P_ESCAPE': 5.0, 
    # 'P_POTENTIAL': 10.0, 
    # 'P_FLUX': 0.01, 
    # 'P_HABZONE_OPT': 0, 
    # 'P_HABZONE_CON': 0,

     'P_MASS': 1.1, 
    'P_RADIUS': 1.2, 
    'P_PERIOD': 250.0, 
    'S_TEMPERATURE': 6200, 
    'S_MASS': 1.05, 
    'P_ESCAPE': 11.8, 
    'P_POTENTIAL': 105.0, 
    'P_FLUX': 1.3, 
    'P_HABZONE_OPT': 1,  # KEY FLAG: Inside the Optimistic HZ
    'P_HABZONE_CON': 0,
}


# --- RUN PREDICTIONS (Simplified to run only one test case) ---
print("\n" + "="*70)
print(f"MODEL ACCURACY ON TEST DATA: {model_accuracy:.4f} (Raw XGBoost on Test Set)")
print(f"MODEL FIT TIME: {fit_time:.2f} seconds")
print("="*70)
print("PREDICTED HABITABILITY OUTPUTS (Excluding P_TYPE/S_TYPE_TEMP/P_DENSITY as requested)")
print("="*70)

# List of all data dictionaries to test (Only Example 1 remains)
all_test_data = {
    1: INHABITABLE_DATA,
}

# Mapping for expected classes (Only Example 1 remains)
expected_classes = {
    1: '0 (Inhabitable)',
}

for i, data in all_test_data.items():
    expected_class = expected_classes[i]
    print(f"\nExample {i} (TARGET CLASS {expected_class})")
    print("Input Data (Simplified):")
    for key, value in data.items():
        print(f"  {key:<15}: {value}")
    
    # Run the prediction function with the ad-hoc correction
    prediction, proba = predict_new_planet_corrected(data)
    
    print("\n" + "-"*50)
    print(f"PREDICTED HABITABILITY: {prediction}")
    print("Raw Probabilities:", proba)
    print("-"*50)


MODEL ACCURACY ON TEST DATA: 0.9984 (Raw XGBoost on Test Set)
MODEL FIT TIME: 0.14 seconds
PREDICTED HABITABILITY OUTPUTS (Excluding P_TYPE/S_TYPE_TEMP/P_DENSITY as requested)

Example 1 (TARGET CLASS 0 (Inhabitable))
Input Data (Simplified):
  P_MASS         : 1.1
  P_RADIUS       : 1.2
  P_PERIOD       : 250.0
  S_TEMPERATURE  : 6200
  S_MASS         : 1.05
  P_ESCAPE       : 11.8
  P_POTENTIAL    : 105.0
  P_FLUX         : 1.3
  P_HABZONE_OPT  : 1
  P_HABZONE_CON  : 0

--------------------------------------------------
PREDICTED HABITABILITY: Optimistically Habitable (Class 2)
Raw Probabilities: {'Inhabitable (Class 0)': np.float32(0.0029), 'Conservatively Habitable (Class 1)': np.float32(0.9943), 'Optimistically Habitable (Class 2)': np.float32(0.0029)}
--------------------------------------------------


In [8]:
import pickle  # Add this at the top if not already imported

# -----------------------------
# Save trained model + scaler
# -----------------------------
model_bundle = {
    'model': final_model,           # Your trained XGBoost model
    'scaler': scaler,               # The MinMaxScaler you fitted
    'feature_columns': feature_cols_final,  # Feature names/order
    'habitability_map': HABITABILITY_MAP    # Class mapping
}

# Save to pickle file
with open('planet_hab_model.pkl', 'wb') as f:
    pickle.dump(model_bundle, f)

print("✅ Model and scaler saved as 'planet_hab_model.pkl'")


✅ Model and scaler saved as 'planet_hab_model.pkl'
