In [None]:
%pip install catboost



In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from scipy.stats.mstats import winsorize
import warnings
warnings.filterwarnings("ignore")

In [3]:
# --- load data ---
data = pd.read_csv("/content/tess.csv", comment="#")

In [4]:
# --- target & features ---
target_col = "tfopwg_disp"
feature_cols = [
    "ra", "dec",
    "st_teff", "st_logg", "st_rad", "st_dist",
    "st_pmra", "st_pmdec", "st_tmag",
    "pl_orbper", "pl_rade", "pl_trandep", "pl_trandurh", "pl_eqt", "pl_insol", 'pl_tranmid', 'pl_pnum'
]

In [5]:
for col in feature_cols:
    data[col] = winsorize(data[col], limits=[0.01,0.01])

In [6]:
df = data[feature_cols + [target_col]].copy()

In [7]:
#df['st_teff_rad'] = df['st_teff'] * df['st_rad']       # Yıldızın radyatif proxy
#df['logg_rad'] = df['st_logg'] * df['st_rad']         # Yoğunluk benzeri bilgi
#df['dep_rade_ratio'] = df['pl_trandep'] / df['pl_rade']  # Transit yoğunluğu
#df['dur_orb_ratio'] = df['pl_trandurh'] / df['pl_orbper']  # Transit süresi / periyot

In [8]:
# ========================================
# 1. TRANSIT-BASED FEATURES (MOST IMPORTANT!)
# ========================================
print("\n[1] Engineering Transit-Based Features...")

def create_transit_features(df):
    """
    Transit characteristics are THE key to identifying exoplanets
    """

    # --- Transit Depth Ratio ---
    # Deeper transits = larger planets or specific orbital geometry
    # Formula: (R_planet / R_star)^2 approximates transit depth
    if 'pl_trandep' in df.columns and 'st_rad' in df.columns and 'pl_rade' in df.columns:
        # Transit depth in ppm (parts per million)
        df['transit_depth_normalized'] = df['pl_trandep'] / 1e6

        # Expected vs actual transit depth ratio
        # Theoretical depth = (R_planet/R_star)^2
        R_sun_to_earth = 109.2  # Sun radius in Earth radii
        theoretical_depth = (df['pl_rade'] / (df['st_rad'] * R_sun_to_earth)) ** 2
        df['transit_depth_anomaly'] = df['transit_depth_normalized'] / (theoretical_depth + 1e-10)

    # --- Transit Duration Features ---
    if 'pl_trandurh' in df.columns and 'pl_orbper' in df.columns:
        # Transit duration as fraction of orbital period
        df['transit_duration_fraction'] = df['pl_trandurh'] / (df['pl_orbper'] * 24)

        # Short duration transits = grazing/edge-on orbits
        df['is_short_transit'] = (df['pl_trandurh'] < 2).astype(int)

        # Very long transits might be false positives (eclipsing binaries)
        df['is_long_transit'] = (df['pl_trandurh'] > 10).astype(int)

    # --- Transit Timing ---
    if 'pl_tranmid' in df.columns and 'pl_orbper' in df.columns:
        # Phase of the transit (useful for detecting variations)
        df['transit_phase'] = (df['pl_tranmid'] % df['pl_orbper']) / df['pl_orbper']

    return df


[1] Engineering Transit-Based Features...


In [9]:
# ========================================
# 2. PLANETARY CHARACTERISTICS
# ========================================
print("[2] Engineering Planetary Characteristic Features...")

def create_planetary_features(df):
    """
    Features describing the planet's physical properties
    """

    # --- Planet Size Categories ---
    if 'pl_rade' in df.columns:
        # Categorize by size (astronomically meaningful)
        df['planet_size_category'] = pd.cut(
            df['pl_rade'],
            bins=[0, 1.5, 2.0, 4.0, 10.0, np.inf],
            labels=['Earth-like', 'Super-Earth', 'Mini-Neptune', 'Neptune', 'Jupiter']
        )

        # Binary flags for important size ranges
        df['is_earth_sized'] = ((df['pl_rade'] >= 0.5) & (df['pl_rade'] <= 1.5)).astype(int)
        df['is_super_earth'] = ((df['pl_rade'] > 1.5) & (df['pl_rade'] <= 2.0)).astype(int)
        df['is_neptune_sized'] = ((df['pl_rade'] > 4.0) & (df['pl_rade'] <= 10.0)).astype(int)
        df['is_jupiter_sized'] = (df['pl_rade'] > 10.0).astype(int)

    # --- Orbital Period Features ---
    if 'pl_orbper' in df.columns:
        # Log scale (orbital periods span orders of magnitude)
        df['pl_orbper_log'] = np.log10(df['pl_orbper'] + 1)

        # Hot Jupiters (very short periods)
        df['is_hot_jupiter'] = ((df['pl_orbper'] < 10) & (df['pl_rade'] > 8)).astype(int)

        # Habitable zone indicator (rough estimate: 200-500 days for Sun-like stars)
        df['in_habitable_zone_period'] = ((df['pl_orbper'] >= 200) & (df['pl_orbper'] <= 500)).astype(int)

    # --- Insolation (Stellar Radiation) ---
    if 'pl_insol' in df.columns:
        # Log scale for insolation
        df['pl_insol_log'] = np.log10(df['pl_insol'] + 1)

        # Earth-like insolation (0.5 to 2.0 Earth flux)
        df['earth_like_insolation'] = ((df['pl_insol'] >= 0.5) & (df['pl_insol'] <= 2.0)).astype(int)

        # Hot planets (high insolation)
        df['is_hot_planet'] = (df['pl_insol'] > 10).astype(int)

    # --- Equilibrium Temperature ---
    if 'pl_eqt' in df.columns:
        df['pl_eqt_celsius'] = df['pl_eqt'] - 273.15  # Convert to Celsius

        # Habitable temperature range (0-100°C for liquid water)
        df['habitable_temperature'] = ((df['pl_eqt_celsius'] >= 0) & (df['pl_eqt_celsius'] <= 100)).astype(int)

        # Temperature categories
        df['temp_category'] = pd.cut(
            df['pl_eqt'],
            bins=[0, 273, 373, 500, 1000, np.inf],
            labels=['Frozen', 'Temperate', 'Warm', 'Hot', 'Very Hot']
        )

    return df


[2] Engineering Planetary Characteristic Features...


In [10]:
# ========================================
# 3. STELLAR PROPERTIES
# ========================================
print("[3] Engineering Stellar Property Features...")

def create_stellar_features(df):
    """
    Host star characteristics affect detectability and planet properties
    """

    # --- Stellar Temperature ---
    if 'st_teff' in df.columns:
        # Log scale
        df['st_teff_log'] = np.log10(df['st_teff'] + 1)

        # Stellar type classification (simplified)
        df['stellar_type'] = pd.cut(
            df['st_teff'],
            bins=[0, 3700, 5200, 6000, 7500, np.inf],
            labels=['M-dwarf', 'K-type', 'G-type', 'F-type', 'A-type+']
        )

        # Sun-like stars (G-type: 5200-6000K)
        df['is_sun_like'] = ((df['st_teff'] >= 5200) & (df['st_teff'] <= 6000)).astype(int)

        # Cool stars (M-dwarfs are common exoplanet hosts)
        df['is_m_dwarf'] = (df['st_teff'] < 3700).astype(int)

    # --- Stellar Radius ---
    if 'st_rad' in df.columns:
        # Log scale
        df['st_rad_log'] = np.log10(df['st_rad'] + 1)

        # Main sequence stars (typical radius 0.1 to 2.0 solar radii)
        df['is_main_sequence'] = ((df['st_rad'] >= 0.1) & (df['st_rad'] <= 2.0)).astype(int)

        # Giant stars
        df['is_giant_star'] = (df['st_rad'] > 2.0).astype(int)

    # --- Surface Gravity ---
    if 'st_logg' in df.columns:
        # log(g) is already in log scale, but we can categorize
        # Lower log(g) = evolved stars (giants/subgiants)
        df['is_evolved_star'] = (df['st_logg'] < 4.0).astype(int)
        df['is_main_sequence_logg'] = ((df['st_logg'] >= 4.0) & (df['st_logg'] <= 4.5)).astype(int)

    # --- Distance ---
    if 'st_dist' in df.columns:
        # Log scale (distances vary greatly)
        df['st_dist_log'] = np.log10(df['st_dist'] + 1)

        # Nearby stars (better data quality)
        df['is_nearby'] = (df['st_dist'] < 50).astype(int)  # within 50 parsecs

        # Distance categories
        df['distance_category'] = pd.cut(
            df['st_dist'],
            bins=[0, 50, 100, 200, 500, np.inf],
            labels=['Very Close', 'Close', 'Medium', 'Far', 'Very Far']
        )

    # --- TESS Magnitude (Brightness) ---
    if 'st_tmag' in df.columns:
        # Brighter stars (lower magnitude) = better signal
        df['is_bright_star'] = (df['st_tmag'] < 10).astype(int)
        df['is_faint_star'] = (df['st_tmag'] > 14).astype(int)

    return df


[3] Engineering Stellar Property Features...


In [11]:
# ========================================
# 4. COMBINED PLANETARY-STELLAR FEATURES
# ========================================
print("[4] Engineering Combined Features...")

def create_combined_features(df):
    """
    Ratios and interactions between planetary and stellar properties
    These are VERY important for exoplanet detection!
    """

    # --- Planet-Star Size Ratio ---
    if 'pl_rade' in df.columns and 'st_rad' in df.columns:
        R_sun_to_earth = 109.2
        df['planet_star_radius_ratio'] = df['pl_rade'] / (df['st_rad'] * R_sun_to_earth)

        # Easier to detect large planets around small stars
        df['high_contrast_system'] = (df['planet_star_radius_ratio'] > 0.05).astype(int)

    # --- Semi-major Axis (Orbital Distance) ---
    if 'pl_orbper' in df.columns and 'st_rad' in df.columns:
        # Kepler's Third Law (simplified): a ∝ P^(2/3)
        # For a Sun-mass star: a (AU) ≈ (P_days / 365.25)^(2/3)
        df['semi_major_axis_approx'] = (df['pl_orbper'] / 365.25) ** (2/3)

        # Planet-star distance in stellar radii
        AU_to_solar_radii = 215  # 1 AU ≈ 215 solar radii
        df['distance_in_stellar_radii'] = df['semi_major_axis_approx'] * AU_to_solar_radii / df['st_rad']

    # --- Impact Parameter (Transit Geometry) ---
    if 'pl_trandurh' in df.columns and 'pl_orbper' in df.columns and 'st_rad' in df.columns:
        # Estimate impact parameter from transit duration
        # b = sqrt(1 - (T_dur / T_dur_max)^2)
        # T_dur_max = (P/π) * (R_star/a) for central transit

        # This is a simplified approximation
        df['transit_duration_ratio'] = df['pl_trandurh'] / (df['pl_orbper'] * 24 / np.pi)

        # Grazing transits (high impact parameter)
        df['is_grazing_transit'] = (df['transit_duration_ratio'] < 0.1).astype(int)

    # --- Signal-to-Noise Proxy ---
    if 'pl_trandep' in df.columns and 'st_tmag' in df.columns:
        # Deeper transits on brighter stars = better detection
        # Lower magnitude = brighter star
        df['detection_quality'] = df['pl_trandep'] / (10 ** (df['st_tmag'] / 5))

        # High quality detections
        df['high_snr_detection'] = (df['detection_quality'] > df['detection_quality'].quantile(0.75)).astype(int)

    # --- Density Proxy ---
    if 'pl_rade' in df.columns and 'pl_orbper' in df.columns:
        # Can't calculate exact density without mass, but we can approximate
        # Rocky planets have higher density than gas giants
        # This is a rough proxy: smaller radius + shorter period often = rocky
        df['density_proxy'] = 1 / (df['pl_rade'] ** 2 * df['pl_orbper'] ** 0.5)

        # Likely rocky planet
        df['likely_rocky'] = ((df['pl_rade'] < 2.0) & (df['density_proxy'] > df['density_proxy'].median())).astype(int)

    return df


[4] Engineering Combined Features...


In [12]:
# ========================================
# 5. POSITIONAL AND MOTION FEATURES
# ========================================
print("[5] Engineering Positional Features...")

def create_positional_features(df):
    """
    Spatial location and proper motion
    """

    # --- Galactic Coordinates (if needed) ---
    if 'ra' in df.columns and 'dec' in df.columns:
        # Convert RA/Dec to radians for calculations
        ra_rad = np.deg2rad(df['ra'])
        dec_rad = np.deg2rad(df['dec'])

        # Cartesian coordinates on unit sphere
        df['pos_x'] = np.cos(dec_rad) * np.cos(ra_rad)
        df['pos_y'] = np.cos(dec_rad) * np.sin(ra_rad)
        df['pos_z'] = np.sin(dec_rad)

        # Distance from galactic plane (simplified)
        df['abs_dec'] = np.abs(df['dec'])

    # --- Proper Motion ---
    if 'st_pmra' in df.columns and 'st_pmdec' in df.columns:
        # Total proper motion
        df['proper_motion_total'] = np.sqrt(df['st_pmra']**2 + df['st_pmdec']**2)

        # High proper motion = nearby stars
        df['high_proper_motion'] = (df['proper_motion_total'] > 50).astype(int)

    return df


[5] Engineering Positional Features...


In [13]:
# ========================================
# 6. FALSE POSITIVE INDICATORS
# ========================================
print("[6] Engineering False Positive Detection Features...")

def create_false_positive_features(df):
    """
    Features that help identify false positives (eclipsing binaries, etc.)
    """

    # --- Eclipsing Binary Indicators ---
    # Deep transits + long duration = possible binary
    if 'pl_trandep' in df.columns and 'pl_trandurh' in df.columns:
        df['deep_long_transit'] = ((df['pl_trandep'] > 10000) & (df['pl_trandurh'] > 8)).astype(int)

    # --- V-shaped vs U-shaped transits ---
    # (This would require light curve data, but we can use duration/depth ratio)
    if 'pl_trandurh' in df.columns and 'pl_trandep' in df.columns:
        df['transit_shape_proxy'] = df['pl_trandurh'] / np.sqrt(df['pl_trandep'] + 1)

        # V-shaped transits might be grazing binaries
        df['possible_binary'] = (df['transit_shape_proxy'] < df['transit_shape_proxy'].quantile(0.1)).astype(int)

    # --- Odd-even transit depth variations ---
    # (Would need multiple transit observations, but can flag based on uncertainty)
    if 'pl_trandeperr1' in df.columns and 'pl_trandep' in df.columns:
        df['transit_depth_uncertainty_ratio'] = df['pl_trandeperr1'] / (df['pl_trandep'] + 1)

        # High uncertainty = less reliable detection
        df['uncertain_detection'] = (df['transit_depth_uncertainty_ratio'] > 0.3).astype(int)

    return df

[6] Engineering False Positive Detection Features...


In [14]:
# ========================================
# 7. STATISTICAL AGGREGATIONS
# ========================================
print("[7] Creating Statistical Features...")

def create_statistical_features(df):
    """
    Ratios of uncertainties, derived quantities
    """

    # --- Measurement Quality Indicators ---
    # Lower uncertainty = better quality
    uncertainty_cols = [col for col in df.columns if 'err1' in col or 'err2' in col]

    if 'pl_orbpererr1' in df.columns and 'pl_orbper' in df.columns:
        df['period_uncertainty_ratio'] = df['pl_orbpererr1'] / (df['pl_orbper'] + 1e-10)

    if 'pl_radeerr1' in df.columns and 'pl_rade' in df.columns:
        df['radius_uncertainty_ratio'] = df['pl_radeerr1'] / (df['pl_rade'] + 1e-10)

    if 'st_raderr1' in df.columns and 'st_rad' in df.columns:
        df['stellar_radius_uncertainty_ratio'] = df['st_raderr1'] / (df['st_rad'] + 1e-10)

    # Average uncertainty score (lower = better)
    uncertainty_ratios = [
        df.get('period_uncertainty_ratio', 0),
        df.get('radius_uncertainty_ratio', 0),
        df.get('stellar_radius_uncertainty_ratio', 0)
    ]
    df['average_measurement_quality'] = np.mean([u for u in uncertainty_ratios if isinstance(u, pd.Series)], axis=0)

    return df

[7] Creating Statistical Features...


In [15]:
# ========================================
# MAIN FEATURE ENGINEERING PIPELINE
# ========================================

def engineer_all_features(df):
    """
    Apply all feature engineering steps
    """
    print("\n" + "="*70)
    print("APPLYING ALL FEATURE ENGINEERING STEPS")
    print("="*70)

    original_features = df.shape[1]

    # Apply all transformations
    df = create_transit_features(df)
    df = create_planetary_features(df)
    df = create_stellar_features(df)
    df = create_combined_features(df)
    df = create_positional_features(df)
    df = create_false_positive_features(df)
    df = create_statistical_features(df)

    new_features = df.shape[1] - original_features

    print(f"\n Feature Engineering Complete!")
    print(f"   Original Features: {original_features}")
    print(f"   New Features Created: {new_features}")
    print(f"   Total Features: {df.shape[1]}")

    return df



In [16]:
# ========================================
# FEATURE IMPORTANCE ANALYSIS
# ========================================

def analyze_feature_importance(df, target_col='tfopwg_disp'):
    """
    Identify most important features for exoplanet detection
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import LabelEncoder

    print("\n" + "="*70)
    print("FEATURE IMPORTANCE ANALYSIS")
    print("="*70)

    # Prepare data
    # Remove non-numeric and identifier columns
    id_cols = ['toi', 'toipfx', 'tid', 'ctoi_alias', 'toi_created', 'rowupdate']
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=id_cols, errors='ignore')

    # Handle target variable
    if target_col in df.columns:
        le = LabelEncoder()
        y = le.fit_transform(df[target_col].fillna('Unknown'))
        X = numeric_df.drop(columns=[target_col], errors='ignore')
    else:
        print(" Target column not found. Skipping importance analysis.")
        return

    # Fill missing values
    X = X.fillna(X.median())

    # Train Random Forest for feature importance
    rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    rf.fit(X, y)

    # Get feature importances
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n TOP 20 MOST IMPORTANT FEATURES:")
    print("-" * 70)
    for idx, row in importance_df.head(20).iterrows():
        print(f"  {row['feature']:<40} {row['importance']:.4f}")

    return importance_df


In [17]:
if __name__ == "__main__":
    # Apply feature engineering
    df_engineered = engineer_all_features(df)

    # Analyze feature importance
    feature_importance = analyze_feature_importance(df_engineered)

    # Display sample of new features
    print("\n" + "="*70)
    print("SAMPLE OF NEW FEATURES:")
    print("="*70)
    new_feature_cols = [col for col in df_engineered.columns if col not in df.columns]
    print(df_engineered[new_feature_cols[:10]].head())


APPLYING ALL FEATURE ENGINEERING STEPS

 Feature Engineering Complete!
   Original Features: 18
   New Features Created: 54
   Total Features: 72

FEATURE IMPORTANCE ANALYSIS

 TOP 20 MOST IMPORTANT FEATURES:
----------------------------------------------------------------------
  st_tmag                                  0.0525
  pl_tranmid                               0.0405
  detection_quality                        0.0387
  st_dist_log                              0.0320
  st_dist                                  0.0317
  transit_depth_anomaly                    0.0273
  pl_rade                                  0.0264
  pl_insol                                 0.0263
  proper_motion_total                      0.0248
  abs_dec                                  0.0247
  pl_eqt_celsius                           0.0247
  pl_trandurh                              0.0245
  pos_y                                    0.0244
  pl_eqt                                   0.0242
  pl_trandep       

In [18]:
# Select numerical columns for imputation
numerical_cols = df_engineered.select_dtypes(include=np.number).columns.tolist()
numerical_df = df_engineered[numerical_cols].copy()

# Select categorical columns for encoding
categorical_cols = df_engineered.select_dtypes(exclude=np.number).columns.tolist()

# One-hot encode categorical columns
df_encoded = pd.get_dummies(df_engineered, columns=categorical_cols, dummy_na=False)

# Select only the numerical columns (including the newly created dummy variables) for imputation
numerical_cols_encoded = df_encoded.select_dtypes(include=np.number).columns.tolist()
numerical_df_encoded = df_encoded[numerical_cols_encoded].copy()

# Drop the target column before imputation
if target_col in numerical_df_encoded.columns:
    numerical_df_encoded = numerical_df_encoded.drop(columns=[target_col])

# Impute missing values using KNNImputer
imputer = KNNImputer(n_neighbors=5)
X = imputer.fit_transform(numerical_df_encoded)

In [19]:
le = LabelEncoder()
y = le.fit_transform(df[target_col])

In [20]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print(f"Shape of X before oversampling: {X.shape}")
print(f"Shape of y before oversampling: {y.shape}")
print(f"Shape of X after oversampling: {X_resampled.shape}")
print(f"Shape of y after oversampling: {y_resampled.shape}")

Shape of X before oversampling: (7668, 66)
Shape of y before oversampling: (7668,)
Shape of X after oversampling: (28050, 66)
Shape of y after oversampling: (28050,)


In [21]:
# --- train/val/test split ---
X_train, X_temp, y_train, y_temp = train_test_split(
    X_resampled, y_resampled, test_size=0.4, stratify=y_resampled, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print(f"Train set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Train set size: 16830
Validation set size: 5610
Test set size: 5610


In [22]:
cat_model = CatBoostClassifier(
    iterations=992,
    learning_rate=0.1477,
    depth=5,
    l2_leaf_reg=2.42,
    verbose=0,
    random_seed=42
)
xgb_model = XGBClassifier(
    n_estimators=654,
    learning_rate=0.0582,
    max_depth=7,
    subsample=0.8595,
    colsample_bytree=0.9961,
    use_label_encoder=False,
    eval_metric="mlogloss",
    random_state=42
)
lgbm_model = LGBMClassifier(
    n_estimators=536,
    learning_rate=0.01604,
    max_depth=10,
    num_leaves=84,
    min_data_in_leaf=100,
    subsample=0.9998,
    colsample_bytree=0.6115,
    random_state=42,
    verbosity=-1
)

In [23]:
# --- fit ---
cat_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
lgbm_model.fit(X_train, y_train)

In [24]:
w_cat, w_xgb, w_lgbm = 0.4, 0.35, 0.25

In [25]:
cat_pred_proba = cat_model.predict_proba(X_val)
xgb_pred_proba = xgb_model.predict_proba(X_val)
lgbm_pred_proba = lgbm_model.predict_proba(X_val)

In [26]:
ensemble_pred_proba = w_cat*cat_pred_proba + w_xgb*xgb_pred_proba + w_lgbm*lgbm_pred_proba
ensemble_pred = np.argmax(ensemble_pred_proba, axis=1)

In [27]:
print("Ensemble ACCURACY on Validation Set:", accuracy_score(y_val, ensemble_pred))
print("Ensemble PRECISION on Validation Set:", precision_score(y_val, ensemble_pred, average="macro"))
print("Ensemble RECALL on Validation Set:", recall_score(y_val, ensemble_pred, average="macro"))
print("Ensemble F1 on Validation Set:", f1_score(y_val, ensemble_pred, average="macro"))

Ensemble ACCURACY on Validation Set: 0.9174688057040998
Ensemble PRECISION on Validation Set: 0.9177144194794824
Ensemble RECALL on Validation Set: 0.9174688057040999
Ensemble F1 on Validation Set: 0.9169421660073359


In [28]:
# --- Evaluate on Test Set ---
cat_pred_proba_test = cat_model.predict_proba(X_test)
xgb_pred_proba_test = xgb_model.predict_proba(X_test)
lgbm_pred_proba_test = lgbm_model.predict_proba(X_test)

ensemble_pred_proba_test = w_cat*cat_pred_proba_test + w_xgb*xgb_pred_proba_test + w_lgbm*lgbm_pred_proba_test
ensemble_pred_test = np.argmax(ensemble_pred_proba_test, axis=1)

print("\n--- Ensemble Metrics on Test Set ---")
print("ACCURACY:", accuracy_score(y_test, ensemble_pred_test))
print("PRECISION:", precision_score(y_test, ensemble_pred_test, average="macro"))
print("RECALL:", recall_score(y_test, ensemble_pred_test, average="macro"))
print("F1-SCORE:", f1_score(y_test, ensemble_pred_test, average="macro"))


--- Ensemble Metrics on Test Set ---
ACCURACY: 0.925668449197861
PRECISION: 0.9252571434390783
RECALL: 0.9256684491978611
F1-SCORE: 0.9252424048385746


In [29]:
# --- K-Fold OOF ensemble ---
kf = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)
fold_acc = []

In [30]:
for i, (train_idx, val_idx) in enumerate(kf.split(X_resampled, y_resampled), 1):
    X_tr, X_va = X_resampled[train_idx], X_resampled[val_idx]
    y_tr, y_va = y_resampled[train_idx], y_resampled[val_idx]

    cat_model.fit(X_tr, y_tr)
    xgb_model.fit(X_tr, y_tr)
    lgbm_model.fit(X_tr, y_tr)

    cat_p = cat_model.predict_proba(X_va)
    xgb_p = xgb_model.predict_proba(X_va)
    lgbm_p = lgbm_model.predict_proba(X_va)

    ensemble_p = w_cat*cat_p + w_xgb*xgb_p + w_lgbm*lgbm_p
    ensemble_pred = np.argmax(ensemble_p, axis=1)

    acc = accuracy_score(y_va, ensemble_pred)
    fold_acc.append(acc)
    print(f"Fold {i}: ACC = {acc:.4f}")

Fold 1: ACC = 0.9366
Fold 2: ACC = 0.9337
Fold 3: ACC = 0.9380
Fold 4: ACC = 0.9423
Fold 5: ACC = 0.9366
Fold 6: ACC = 0.9444
Fold 7: ACC = 0.9344
Fold 8: ACC = 0.9408
Fold 9: ACC = 0.9451
Fold 10: ACC = 0.9280
Fold 11: ACC = 0.9429
Fold 12: ACC = 0.9301
Fold 13: ACC = 0.9344
Fold 14: ACC = 0.9244
Fold 15: ACC = 0.9486
Fold 16: ACC = 0.9358
Fold 17: ACC = 0.9429
Fold 18: ACC = 0.9358
Fold 19: ACC = 0.9408
Fold 20: ACC = 0.9408


In [31]:
print("\n--- 20-Fold Summary ---")
print("Best Fold ACCURACY:", np.max(fold_acc))


--- 20-Fold Summary ---
Best Fold ACCURACY: 0.948644793152639
