In [16]:
# Import libraries
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, QuantileTransformer
from sklearn.metrics import roc_auc_score
import catboost as cb
import xgboost as xgb
import lightgbm as lgb
import optuna
from scipy import stats
from itertools import combinations

# Global configuration: one seed shared by the stochastic steps that follow.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Silence every library warning for the rest of the session. This also hides
# deprecation notices, so switch it off temporarily when debugging.
warnings.filterwarnings('ignore')

print("libraries loaded")

libraries loaded


In [17]:
# Load data
# All competition CSVs live in one Kaggle input directory; keep the location in
# a single constant so it can be changed without editing three calls.
DATA_DIR = '/kaggle/input/lateral-ai-academy-kaggle-competition'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Critical alignment: cap train Tenure to the closed range [0, 10].
# NOTE(review): presumably 0-10 is the range observed in test — confirm.
train['Tenure'] = train['Tenure'].clip(0, 10)

print(f"Data loaded: {train.shape[0]} train, {test.shape[0]} test")
print(f"Sample submission shape: {sample_submission.shape}")
print(f"Sample submission columns: {list(sample_submission.columns)}")

# Verify ID alignment
assert len(sample_submission) == len(test), "Sample submission and test size mismatch!"
# Predictions are later written into sample_submission by row position, so the
# ids must agree row for row, not merely in count. The check is skipped when
# test carries no 'id' column to compare against.
if 'id' in test.columns and 'id' in sample_submission.columns:
    assert (sample_submission['id'].to_numpy() == test['id'].to_numpy()).all(), \
        "Sample submission and test ids are not in the same order!"
print("ID alignment verified")

Data loaded: 15000 train, 10000 test
Sample submission shape: (10000, 2)
Sample submission columns: ['id', 'Exited']
ID alignment verified


In [18]:
def _engineer_features(train_df, test_df, smoothing):
    """Build the engineered feature columns on train and test stacked together.

    Rank and quantile features are computed over the concatenated frame so both
    splits share one scale. Target encodings use the training labels only.

    Returns the train and test slices of the augmented frame, in that order.
    """
    train_n = len(train_df)

    # Combine for global rankings (test rows get NaN in any train-only column)
    combined_df = pd.concat([train_df, test_df], ignore_index=True)

    print("Applying feature engineering with targeted fixes")

    # Short aliases for the raw input columns used repeatedly below.
    age = combined_df['Age']
    balance = combined_df['Balance']
    credit = combined_df['CreditScore']
    salary = combined_df['EstimatedSalary']
    tenure = combined_df['Tenure']
    products = combined_df['NumOfProducts']
    active = combined_df['IsActiveMember']

    # NOTE: the order of the assignments below defines the feature column
    # order returned to the caller; keep it stable.

    # Age features
    combined_df['age_rank_global'] = age.rank(pct=True)
    combined_df['age_bins_5'] = pd.cut(age, bins=5, labels=[0, 1, 2, 3, 4]).astype(int)
    combined_df['age_bins_10'] = pd.cut(age, bins=10, labels=range(10)).astype(int)
    combined_df['age_squared'] = age ** 2
    combined_df['age_cubed'] = age ** 3
    combined_df['age_log'] = np.log1p(age)
    combined_df['age_sqrt'] = np.sqrt(age)
    combined_df['age_young'] = (age < 30).astype(int)
    combined_df['age_adult'] = ((age >= 30) & (age < 40)).astype(int)
    combined_df['age_middle'] = ((age >= 40) & (age < 50)).astype(int)
    combined_df['age_senior'] = (age >= 50).astype(int)
    combined_df['age_high_risk'] = (age >= 45).astype(int)

    # Balance features
    combined_df['balance_rank_global'] = balance.rank(pct=True)
    combined_df['balance_log'] = np.log1p(balance)
    combined_df['balance_sqrt'] = np.sqrt(balance)
    combined_df['is_zero_balance'] = (balance == 0).astype(int)
    combined_df['balance_high'] = (balance > 100000).astype(int)
    combined_df['balance_medium'] = ((balance > 50000) & (balance <= 100000)).astype(int)
    combined_df['balance_low'] = ((balance > 0) & (balance <= 50000)).astype(int)

    # Credit, salary and tenure: percentile ranks over train + test
    combined_df['credit_rank_global'] = credit.rank(pct=True)
    combined_df['salary_rank_global'] = salary.rank(pct=True)
    combined_df['tenure_rank_global'] = tenure.rank(pct=True)

    # NumOfProducts patterns
    combined_df['single_product'] = (products == 1).astype(int)
    combined_df['dual_product'] = (products == 2).astype(int)
    combined_df['multi_product'] = (products >= 3).astype(int)

    # Interactions (the log-balance terms reuse the balance_log column)
    balance_log = combined_df['balance_log']
    combined_df['age_balance_interaction'] = (
        combined_df['age_rank_global'] * combined_df['balance_rank_global']
    )
    combined_df['age_products_interaction'] = age * products
    combined_df['age_active_interaction'] = age * active
    combined_df['age_credit_interaction'] = age * credit
    combined_df['balance_products_interaction'] = balance_log * products
    combined_df['balance_active_interaction'] = balance_log * active

    # Ratios (the +1 in each denominator avoids division by zero)
    combined_df['balance_per_product'] = balance / (products + 1)
    combined_df['balance_age_ratio'] = balance / (age + 1)
    combined_df['credit_age_ratio'] = credit / (age + 1)
    combined_df['salary_balance_ratio'] = (salary + 1) / (balance + 1)

    # High-risk segments
    combined_df['senior_single_product'] = ((age >= 45) & (products == 1)).astype(int)
    combined_df['high_balance_single'] = ((balance > 100000) & (products == 1)).astype(int)
    combined_df['senior_inactive'] = ((age >= 50) & (active == 0)).astype(int)
    combined_df['young_multi_product'] = ((age < 35) & (products >= 2)).astype(int)
    combined_df['middle_age_high_balance'] = (
        (age >= 40) & (age < 55) & (balance > 75000)
    ).astype(int)

    # Tenure patterns using global ranking
    combined_df['tenure_age_ratio'] = (
        combined_df['tenure_rank_global'] / (combined_df['age_rank_global'] + 0.01)
    )
    combined_df['new_customer'] = (tenure <= 2).astype(int)
    combined_df['loyal_customer'] = (tenure >= 7).astype(int)

    # Quantile transformations using global data; fit_transform refits the
    # transformer on every call, so one instance serves all three columns.
    qt = QuantileTransformer(output_distribution='normal', random_state=42)
    for source_col, feature_name in (
        ('Age', 'age_quantile_norm'),
        ('Balance', 'balance_quantile_norm'),
        ('CreditScore', 'credit_quantile_norm'),
    ):
        combined_df[feature_name] = qt.fit_transform(combined_df[[source_col]]).flatten()

    # Polynomial combinations using global rankings.
    # NOTE(review): age_balance_poly is numerically identical to
    # age_balance_interaction; kept so the feature set stays unchanged.
    combined_df['age_balance_poly'] = (
        combined_df['age_rank_global'] * combined_df['balance_rank_global']
    )
    combined_df['age_products_balance'] = age * products * balance_log / 1000

    # Categorical encoding: integer labels fitted on train + test together so
    # both splits share the same category -> integer mapping.
    combined_df['Geography_encoded'] = LabelEncoder().fit_transform(combined_df['Geography'])
    combined_df['Gender_encoded'] = LabelEncoder().fit_transform(combined_df['Gender'])

    # Target encoding with heavy smoothing towards the global mean.
    # NOTE(review): the statistics use every training label (not out-of-fold),
    # so each training row's own label contributes to its encoding.
    global_mean = train_df['Exited'].mean()
    for col in ['Geography', 'Gender']:
        category_stats = train_df.groupby(col)['Exited'].agg(['mean', 'count'])
        smoothed = (
            category_stats['mean'] * category_stats['count'] + global_mean * smoothing
        ) / (category_stats['count'] + smoothing)
        # Categories absent from train fall back to the global mean.
        combined_df[f'{col}_target_enc'] = combined_df[col].map(smoothed).fillna(global_mean)

    # Split back
    train_result = combined_df.iloc[:train_n].copy()
    test_result = combined_df.iloc[train_n:].copy()
    return train_result, test_result


def _clean_feature_matrix(frame, feature_cols):
    """Select ``feature_cols`` and coerce them to finite numeric values.

    Values that cannot be parsed as numbers, missing values and +/-inf all
    become 0.
    """
    numeric = frame[feature_cols].apply(pd.to_numeric, errors='coerce')
    return numeric.fillna(0).replace([np.inf, -np.inf], 0)


def targeted_leak_fix(train_df, test_df, smoothing=500):
    """Engineer features for train/test and return cleaned model matrices.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows; must contain the 'Exited' target plus the raw columns
        used below (Age, Balance, CreditScore, EstimatedSalary, Tenure,
        NumOfProducts, IsActiveMember, Geography, Gender).
    test_df : pandas.DataFrame
        Test rows with the same raw columns.
    smoothing : float, default 500
        Pseudo-count that pulls each category's target mean towards the global
        mean in the target encoding; larger values mean stronger shrinkage.

    Returns
    -------
    tuple
        ``(train_fixed, test_fixed, X_train_fixed, X_test_fixed, feature_cols)``:
        the two augmented frames, their cleaned numeric feature matrices, and
        the list of feature column names (in matrix column order).
    """
    train_fixed, test_fixed = _engineer_features(train_df, test_df, smoothing)

    # Drop the target, identifier columns and the raw categoricals (which are
    # replaced by their encoded versions).
    excluded = ['Exited', 'CustomerId', 'Surname', 'Geography', 'Gender', 'id']
    feature_cols = [col for col in train_fixed.columns if col not in excluded]

    X_train_fixed = _clean_feature_matrix(train_fixed, feature_cols)
    X_test_fixed = _clean_feature_matrix(test_fixed, feature_cols)

    print(f"Feature count after targeted fix: {len(feature_cols)}")

    return train_fixed, test_fixed, X_train_fixed, X_test_fixed, feature_cols

# Apply: run the feature engineering on the raw frames. Returns the augmented
# train/test frames, their cleaned feature matrices and the feature name list.
train_targeted, test_targeted, X_train_targeted, X_test_targeted, feature_names = targeted_leak_fix(train, test)

Applying feature engineering with targeted fixes
Feature count after targeted fix: 60


In [20]:
def test_predictive_power_with_current_fix(
    X_train,
    X_test,
    y_train,
    baseline_cv=0.9345,
    adversarial_auc=0.8092,
    target_score=0.93561,
    max_loss=0.008,
    submission_template=None,
    submission_path='leak_fixed_submission.csv',
):
    """Cross-validate CatBoost on the leak-fixed features and maybe submit.

    Runs a 5-fold stratified CV, compares the out-of-fold AUC with
    ``baseline_cv`` and, when the drop is below ``max_loss``, trains on all
    training rows and writes a submission CSV.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices with identical columns.
    y_train : pandas.Series
        Binary target, positionally aligned with ``X_train``.
    baseline_cv : float, default 0.9345
        CV AUC of the previous feature set, used as the reference.
    adversarial_auc : float, default 0.8092
        Adversarial-validation AUC of the current features; only reported.
    target_score : float, default 0.93561
        Leaderboard score to beat; only used for the printed verdict.
    max_loss : float, default 0.008
        Largest acceptable drop of the out-of-fold AUC below ``baseline_cv``.
    submission_template : pandas.DataFrame, optional
        Frame whose 'Exited' column is overwritten with the predictions.
        Defaults to the notebook-level ``sample_submission``.
    submission_path : str, default 'leak_fixed_submission.csv'
        Where the submission CSV is written.

    Returns
    -------
    tuple
        ``(True, overall_cv, final_predictions)`` when a submission was
        written, otherwise ``(False, overall_cv, None)``.

    Raises
    ------
    ValueError
        If the template and the test predictions differ in length.
    """

    print(f"=== TESTING PREDICTIVE POWER WITH {adversarial_auc:.4f} ADVERSARIAL AUC ===")

    # Previously tuned CatBoost hyper-parameters.
    best_params = {
        'iterations': 1270,
        'depth': 4,
        'learning_rate': 0.010092848351022131,
        'l2_leaf_reg': 12.908182021977488,
        'border_count': 128,
        'bagging_temperature': 0.4859047927134544,
        'random_strength': 0.6121265192307465,
        'random_state': 42,
        'verbose': False
    }

    # 5-fold cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    oof_predictions = np.zeros(len(X_train))

    print("Running 5-fold cross-validation with leak-fixed features...")
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        fold_model = cb.CatBoostClassifier(**best_params)

        fold_model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        val_pred = fold_model.predict_proba(X_train.iloc[val_idx])[:, 1]
        oof_predictions[val_idx] = val_pred

        score = roc_auc_score(y_train.iloc[val_idx], val_pred)
        cv_scores.append(score)
        print(f"Fold {fold + 1}: {score:.4f}")

    mean_cv = np.mean(cv_scores)
    std_cv = np.std(cv_scores)
    # AUC over the pooled out-of-fold predictions (differs slightly from the
    # mean of per-fold AUCs).
    overall_cv = roc_auc_score(y_train, oof_predictions)

    print(f"\nRESULTS WITH LEAK-FIXED FEATURES:")
    print(f"Fold CV: {mean_cv:.4f} ± {std_cv:.4f}")
    print(f"Overall CV: {overall_cv:.4f}")
    print(f"Original CV: {baseline_cv:.4f}")
    print(f"Performance change: {overall_cv - baseline_cv:.4f}")
    print(f"Adversarial AUC: {adversarial_auc:.4f} (improved from 1.0000)")

    # Decision logic: a negative loss means the new features score higher.
    performance_loss = baseline_cv - overall_cv

    if performance_loss >= max_loss:
        print(f"\nDECISION: PERFORMANCE LOSS TOO HIGH ({performance_loss:.4f})")
        print("Need more aggressive optimization or alternative approaches")
        return False, overall_cv, None

    print(f"\nDECISION: ACCEPTABLE PERFORMANCE LOSS ({performance_loss:.4f})")
    print("Proceeding with leak-fixed features...")

    # Train final model
    final_model = cb.CatBoostClassifier(**best_params)
    print("Training final model on all data...")
    final_model.fit(X_train, y_train)

    # Generate predictions, kept strictly inside (0, 1)
    final_predictions = final_model.predict_proba(X_test)[:, 1]
    final_predictions = np.clip(final_predictions, 0.001, 0.999)

    # Create submission
    if submission_template is None:
        # Fall back to the notebook-level frame loaded with the data.
        submission_template = sample_submission
    submission = submission_template.copy()
    if len(submission) != len(final_predictions):
        raise ValueError(
            f"Submission template has {len(submission)} rows but "
            f"{len(final_predictions)} predictions were produced"
        )
    # Assigned by position: template rows must be in the same order as X_test.
    submission['Exited'] = final_predictions
    submission.to_csv(submission_path, index=False)

    # Heuristic leaderboard band around the CV score.
    conservative_lb = overall_cv - 0.004
    optimistic_lb = overall_cv + 0.002

    print(f"\nSUBMISSION CREATED:")
    print(f"  CV Score: {overall_cv:.4f}")
    print(f"  Adversarial AUC: {adversarial_auc:.4f}")
    print(f"  Expected LB range: {conservative_lb:.4f} to {optimistic_lb:.4f}")
    print(f"  Target to beat: {target_score}")

    # Check if likely to beat target
    if conservative_lb > target_score:
        print("  VERY LIKELY TO BEAT TARGET!")
    elif optimistic_lb > target_score:
        print("  LIKELY TO BEAT TARGET!")
    else:
        print("  MAY NOT BEAT TARGET - consider more optimization")

    print(f"\nPrediction statistics:")
    print(f"  Range: [{final_predictions.min():.4f}, {final_predictions.max():.4f}]")
    print(f"  Mean: {final_predictions.mean():.4f}")
    print("First 5 predictions:")
    print(submission.head())

    return True, overall_cv, final_predictions

# Test with current fix: cross-validate on the engineered matrices and, if the
# CV score is acceptable, write the submission file.
# The target is taken from the raw train frame and is matched to the feature
# rows by position inside the function.
success, cv_score, predictions = test_predictive_power_with_current_fix(
    X_train_targeted, X_test_targeted, train['Exited']
)

=== TESTING PREDICTIVE POWER WITH 0.8092 ADVERSARIAL AUC ===
Running 5-fold cross-validation with leak-fixed features...
Fold 1: 0.9257
Fold 2: 0.9271
Fold 3: 0.9486
Fold 4: 0.9350
Fold 5: 0.9369

RESULTS WITH LEAK-FIXED FEATURES:
Fold CV: 0.9347 ± 0.0082
Overall CV: 0.9346
Original CV: 0.9345
Performance change: 0.0001
Adversarial AUC: 0.8092 (improved from 1.0000)

DECISION: ACCEPTABLE PERFORMANCE LOSS (-0.0001)
Proceeding with leak-fixed features...
Training final model on all data...

SUBMISSION CREATED:
  CV Score: 0.9346
  Adversarial AUC: 0.8092
  Expected LB range: 0.9306 to 0.9366
  Target to beat: 0.93561
  LIKELY TO BEAT TARGET!

Prediction statistics:
  Range: [0.0022, 0.9977]
  Mean: 0.2023
First 5 predictions:
      id    Exited
0  15000  0.887699
1  15001  0.052068
2  15002  0.021671
3  15003  0.015158
4  15004  0.100993


In [24]:
import os

file_path = '/kaggle/working/lyabababad_submission.csv'

# Delete the stale submission only when it is actually present, and report
# which of the two cases occurred.
if not os.path.exists(file_path):
    print(f"{file_path} does not exist.")
else:
    os.remove(file_path)
    print(f"{file_path} has been deleted.")

/kaggle/working/lyabababad_submission.csv has been deleted.
