In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
df = pd.read_csv(r'D:\PIMA\data_2\cleaned_1.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,...,GenHlth_X_PhysHlth,BMI_X_PhysActivity,Age_X_HighBP,Income_X_Education,Age_X_DiffWalk,health_bp_decay,low_ses,condition_count,health_cluster,risk_score_mult
0,0,0.0,1.0,0.0,1.0,3,0.0,0.0,0.0,1.0,...,90.0,3.0,4.0,48.0,0.0,1.103638,0,1.0,3,4.68
1,1,0.0,1.0,1.0,1.0,3,1.0,1.0,0.0,0.0,...,0.0,0.0,12.0,48.0,0.0,1.103638,0,3.0,0,7.56


In [9]:
columns_to_convert = df.columns
df[columns_to_convert] = df[columns_to_convert].astype(int)

In [11]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import recall_score, classification_report
import catboost as cb
from sklearn.feature_selection import RFECV

data = df

# Define target and features
X = data.drop(['Unnamed: 0', 'Diabetes_binary'], axis=1)
y = data['Diabetes_binary']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Identify categorical features
cat_features = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 
    'GenHlth', 'Sex', 'Age', 'Education', 'Income',
    'metabolic_syndrome', 'low_ses', 'health_cluster'
]

# Get categorical feature indices
cat_features_indices = [X.columns.get_loc(col) for col in cat_features if col in X.columns]


# Custom scorer that optimizes for recall
def recall_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    return recall_score(y, y_pred)


'''# OPTION 1: RFECV for automated feature selection
# ==============================================
def run_rfecv(min_features_to_select=5):
    # Define base model optimized for recall (use Recall as eval_metric)
    model = cb.CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        loss_function='Logloss',
        eval_metric='Recall',  # Focus on recall
        random_seed=42,
        verbose=100
    )
    
    # Define cross-validation strategy
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Create RFECV object
    rfecv = RFECV(
        estimator=model,
        step=1,  # Remove one feature at a time
        cv=cv,
        scoring=recall_scorer,  # Our custom recall scorer
        min_features_to_select=min_features_to_select,
        n_jobs=-1
    )
    
    # Fit RFECV
    rfecv.fit(X_train, y_train, cat_features=cat_features_indices)
    
    # Plot number of features vs. recall
    plt.figure(figsize=(10, 6))
    plt.xlabel("Number of features selected")
    plt.ylabel("Recall")
    plt.plot(range(min_features_to_select, len(rfecv.grid_scores_) + min_features_to_select), rfecv.grid_scores_)
    plt.title("Recursive Feature Elimination with Cross-Validation")
    plt.savefig('rfecv_plot.png')
    
    # Get selected features
    selected_features = X_train.columns[rfecv.support_]
    print(f"Selected {len(selected_features)} features: {list(selected_features)}")
    
    # Get selected categorical features
    selected_cat_features = [feature for feature in cat_features if feature in selected_features]
    selected_cat_indices = [list(selected_features).index(col) for col in selected_cat_features]
    
    # Train final model with selected features
    final_model = cb.CatBoostClassifier(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        loss_function='Logloss',
        eval_metric='Recall',  # Focus on recall
        random_seed=42,
        verbose=100
    )
    
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    
    final_model.fit(
        X_train_selected, y_train,
        cat_features=selected_cat_indices,
        eval_set=(X_test_selected, y_test),
        early_stopping_rounds=50
    )
    
    # Evaluate final model
    y_pred = final_model.predict(X_test_selected)
    print("\nFinal model performance:")
    print(classification_report(y_test, y_pred))
    
    # Show feature importance
    feature_importance = pd.DataFrame({
        'Feature': selected_features,
        'Importance': final_model.get_feature_importance()
    }).sort_values(by='Importance', ascending=False)
    
    print("\nFeature Importance:")
    print(feature_importance)
    
    return selected_features, final_model'''


# OPTION 2: Custom manual feature elimination to target a specific number of features
# ==============================================================================
def manual_feature_elimination(target_num_features=10):
    remaining_features = list(X_train.columns)
    feature_ranking = []
    
    # Continue until we reach the target number of features
    while len(remaining_features) > target_num_features:
        feature_importance = {}
        
        # For each remaining feature, try removing it and measure recall
        for feature_to_remove in remaining_features:
            # Features to use in this iteration
            current_features = [f for f in remaining_features if f != feature_to_remove]
            
            # Get indices of categorical features
            current_cat_features = [idx for idx, feature in enumerate(current_features) 
                                   if feature in cat_features]
            
            # Train model without this feature
            model = cb.CatBoostClassifier(
                iterations=300,  # Reduced iterations for faster execution
                learning_rate=0.05,
                depth=6,
                loss_function='Logloss',
                eval_metric='Recall',
                random_seed=42,
                verbose=0  # Silent mode for cleaner output
            )
            
            model.fit(
                X_train[current_features], y_train,
                cat_features=current_cat_features,
                eval_set=(X_test[current_features], y_test),
                early_stopping_rounds=20,
                verbose=False
            )
            
            # Predict and calculate recall
            y_pred = model.predict(X_test[current_features])
            recall = recall_score(y_test, y_pred)
            
            # Store feature's impact on recall
            feature_importance[feature_to_remove] = recall
        
        # Find the feature whose removal causes the smallest drop (or largest increase) in recall
        best_feature_to_remove = max(feature_importance, key=feature_importance.get)
        
        # Record the removed feature and its rank
        feature_ranking.append((best_feature_to_remove, len(remaining_features)))
        
        # Remove the feature
        remaining_features.remove(best_feature_to_remove)
        
        print(f"Removed feature: {best_feature_to_remove}, Remaining: {len(remaining_features)}, Recall: {feature_importance[best_feature_to_remove]:.4f}")
    
    # Calculate final model with the remaining features
    final_cat_features = [idx for idx, feature in enumerate(remaining_features) 
                         if feature in cat_features]
    
    final_model = cb.CatBoostClassifier(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        loss_function='Logloss',
        eval_metric='Recall',
        random_seed=42,
        verbose=100
    )
    
    final_model.fit(
        X_train[remaining_features], y_train,
        cat_features=final_cat_features,
        eval_set=(X_test[remaining_features], y_test),
        early_stopping_rounds=50
    )
    
    # Evaluate final model
    y_pred = final_model.predict(X_test[remaining_features])
    print("\nFinal model performance with top", len(remaining_features), "features:")
    print(classification_report(y_test, y_pred))
    
    # Feature importance of final model
    feature_importance = pd.DataFrame({
        'Feature': remaining_features,
        'Importance': final_model.get_feature_importance()
    }).sort_values(by='Importance', ascending=False)
    
    print("\nFeature Importance of Final Model:")
    print(feature_importance)
    
    return remaining_features, final_model, feature_ranking


'''# OPTION 3: Use CatBoost's built-in feature importance for feature selection
# ========================================================================
def catboost_feature_importance_selection(target_num_features=10):
    # First train a model with all features to get baseline importance
    model = cb.CatBoostClassifier(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        loss_function='Logloss',
        eval_metric='Recall',  # Focus on recall
        random_seed=42,
        verbose=100
    )
    
    model.fit(
        X_train, y_train,
        cat_features=cat_features_indices,
        eval_set=(X_test, y_test),
        early_stopping_rounds=50
    )
    
    # Get feature importance
    feature_importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.get_feature_importance()
    }).sort_values(by='Importance', ascending=False)
    
    print("Feature Importance (All Features):")
    print(feature_importance)
    
    # Select top N features
    top_features = feature_importance.head(target_num_features)['Feature'].tolist()
    
    # Get categorical features among selected features
    top_cat_features = [idx for idx, feature in enumerate(top_features) 
                       if feature in cat_features]
    
    # Train final model with selected features
    final_model = cb.CatBoostClassifier(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        loss_function='Logloss',
        eval_metric='Recall',  # Focus on recall
        random_seed=42,
        verbose=100
    )
    
    final_model.fit(
        X_train[top_features], y_train,
        cat_features=top_cat_features,
        eval_set=(X_test[top_features], y_test),
        early_stopping_rounds=50
    )
    
    # Evaluate final model
    y_pred = final_model.predict(X_test[top_features])
    print("\nFinal model performance with top", len(top_features), "features:")
    print(classification_report(y_test, y_pred))
    
    return top_features, final_model'''


# Choose which method to run
# Uncomment only one of these

# Method 1: Automated RFECV (often slowest but most thorough)
# selected_features, final_model = run_rfecv(min_features_to_select=10)

# Method 2: Manual feature elimination to a target number (balanced approach)
selected_features, final_model, feature_ranking = manual_feature_elimination(target_num_features=10)

# Method 3: Simple selection based on initial feature importance (fastest)
#selected_features, final_model = catboost_feature_importance_selection(target_num_features=10)

# Save the final model
final_model.save_model('catboost_diabetes_model.cbm')

# Print final selected features
print("\nFinal Selected Features:")
print(selected_features)

Removed feature: Age, Remaining: 32, Recall: 0.8116
Removed feature: condition_count, Remaining: 31, Recall: 0.8233
Removed feature: Income, Remaining: 30, Recall: 0.8416
Removed feature: GenHlth, Remaining: 29, Recall: 0.8339
Removed feature: Age_X_HighBP, Remaining: 28, Recall: 0.8336
Removed feature: Education, Remaining: 27, Recall: 0.8484
Removed feature: PhysActivity, Remaining: 26, Recall: 0.8345
Removed feature: BMI, Remaining: 25, Recall: 0.8310
Removed feature: health_cluster, Remaining: 24, Recall: 0.8314
Removed feature: GenHlth_X_PhysHlth, Remaining: 23, Recall: 0.8332
Removed feature: BMI_X_PhysActivity, Remaining: 22, Recall: 0.8287
Removed feature: HighChol, Remaining: 21, Recall: 0.8339
Removed feature: DiffWalk, Remaining: 20, Recall: 0.8386
Removed feature: CholCheck, Remaining: 19, Recall: 0.8287
Removed feature: BMI_X_Age, Remaining: 18, Recall: 0.8249
Removed feature: Age_X_DiffWalk, Remaining: 17, Recall: 0.7990
Removed feature: risk_score_mult, Remaining: 16, Re

In [12]:
import catboost as cb
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score

# Create diverse CatBoost base models
base_models = [
    cb.CatBoostClassifier(iterations=1000, depth=4, learning_rate=0.05, verbose=100),
    cb.CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.03, verbose=100),
    cb.CatBoostClassifier(iterations=1000, depth=8, learning_rate=0.02, verbose=100),
    cb.CatBoostClassifier(iterations=1000, depth=10, learning_rate=0.01, verbose=100),
    cb.CatBoostClassifier(iterations=1000, l2_leaf_reg=5, bagging_temperature=1, verbose=100)
]

# Train base models with cross-validation to create meta-features
def create_meta_features(models, X, y, X_test, n_folds=5):
    meta_train = np.zeros((X.shape[0], len(models)))
    meta_test = np.zeros((X_test.shape[0], len(models)))
    
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    
    # For each model
    for i, model in enumerate(models):
        print(f"Training base model {i+1}/{len(models)}...")
        # Create test predictions as average of cross-validation folds
        test_preds = np.zeros((X_test.shape[0], n_folds))
        
        # For each fold
        for j, (train_idx, val_idx) in enumerate(kf.split(X)):
            print(f"  Fold {j+1}/{n_folds}")
            # Split data
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
            
            # Train model
            model.fit(X_train_fold, y_train_fold)
            
            # Create meta-features for validation fold
            meta_train[val_idx, i] = model.predict_proba(X_val_fold)[:, 1]
            
            # Create meta-features for test data
            test_preds[:, j] = model.predict_proba(X_test)[:, 1]
            
            # Print fold performance
            val_preds = model.predict(X_val_fold)
            print(f"    Fold {j+1} - Accuracy: {accuracy_score(y_val_fold, val_preds):.4f}, Recall: {recall_score(y_val_fold, val_preds):.4f}")
        
        # Average test predictions across folds
        meta_test[:, i] = test_preds.mean(axis=1)
        
        # Train the model on all data for evaluation
        model.fit(X, y)
        full_preds = model.predict(X_test)
        
        # Print individual model performance
        print(f"\nBase model {i+1} performance:")
        print(f"Accuracy: {accuracy_score(y_test, full_preds):.4f}")
        print(f"Recall: {recall_score(y_test, full_preds):.4f}")
        print(f"Precision: {precision_score(y_test, full_preds):.4f}")
        print(f"F1 Score: {f1_score(y_test, full_preds):.4f}")
        print("Classification Report:")
        print(classification_report(y_test, full_preds))
        print("-" * 50)
    
    return meta_train, meta_test

# Generate meta-features
meta_train, meta_test = create_meta_features(base_models, X_train, y_train, X_test)

# Train meta-learner (another CatBoost model)
print("Training meta-learner...")
meta_learner = cb.CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=4,
    loss_function='Logloss',
    eval_metric='Recall',
    verbose=100
)

meta_learner.fit(meta_train, y_train)

# Make final predictions
final_predictions = meta_learner.predict(meta_test)
final_probs = meta_learner.predict_proba(meta_test)[:, 1]

# Compute and print final model performance
print("\n" + "="*50)
print("FINAL STACKED MODEL PERFORMANCE:")
print("="*50)
print(f"Accuracy: {accuracy_score(y_test, final_predictions):.4f}")
print(f"Recall: {recall_score(y_test, final_predictions):.4f}")
print(f"Precision: {precision_score(y_test, final_predictions):.4f}")
print(f"F1 Score: {f1_score(y_test, final_predictions):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, final_predictions))

# Check if we can improve recall by adjusting the threshold
thresholds = np.arange(0.3, 0.7, 0.05)
print("\nExploring different prediction thresholds:")
for threshold in thresholds:
    adj_preds = (final_probs >= threshold).astype(int)
    recall = recall_score(y_test, adj_preds)
    precision = precision_score(y_test, adj_preds)
    f1 = f1_score(y_test, adj_preds)
    print(f"Threshold {threshold:.2f}: Recall = {recall:.4f}, Precision = {precision:.4f}, F1 = {f1:.4f}")

# Compare against the best individual model
best_model_idx = np.argmax([recall_score(y_test, base_model.predict(X_test)) for base_model in base_models])
best_model_preds = base_models[best_model_idx].predict(X_test)
print("\nComparison with best individual model:")
print(f"Best individual model (#{best_model_idx+1}) - Recall: {recall_score(y_test, best_model_preds):.4f}")
print(f"Stacked model - Recall: {recall_score(y_test, final_predictions):.4f}")
print(f"Improvement: {recall_score(y_test, final_predictions) - recall_score(y_test, best_model_preds):.4f}")

Training base model 1/5...
  Fold 1/5
0:	learn: 0.6759855	total: 19ms	remaining: 19s
100:	learn: 0.5048659	total: 1.67s	remaining: 14.9s
200:	learn: 0.4998834	total: 2.92s	remaining: 11.6s
300:	learn: 0.4960723	total: 4.25s	remaining: 9.87s
400:	learn: 0.4931381	total: 5.5s	remaining: 8.22s
500:	learn: 0.4906391	total: 6.85s	remaining: 6.83s
600:	learn: 0.4883904	total: 8.06s	remaining: 5.35s
700:	learn: 0.4863323	total: 9.23s	remaining: 3.94s
800:	learn: 0.4843765	total: 10.6s	remaining: 2.63s
900:	learn: 0.4826040	total: 11.9s	remaining: 1.31s
999:	learn: 0.4809307	total: 13.4s	remaining: 0us
    Fold 1 - Accuracy: 0.7510, Recall: 0.7919
  Fold 2/5
0:	learn: 0.6761959	total: 12.5ms	remaining: 12.5s
100:	learn: 0.5062795	total: 1.24s	remaining: 11s
200:	learn: 0.5013602	total: 2.42s	remaining: 9.63s
300:	learn: 0.4976226	total: 3.64s	remaining: 8.45s
400:	learn: 0.4946314	total: 4.88s	remaining: 7.29s
500:	learn: 0.4921297	total: 6.11s	remaining: 6.08s
600:	learn: 0.4898493	total: 7.4