In [1]:
#import sys
#sys.executable
#!pip show scikit-optimize
#%pip install scikit-optimize
#%pip install catboost
#!pip show catboost
#%pip install --upgrade numpy

import numpy as np
from catboost import CatBoostClassifier

# Sanity check for the kernel environment: report the NumPy version that was
# actually imported and confirm that the CatBoost import above did not fail.
numpy_version = np.__version__
print("NumPy version:", numpy_version)
print("CatBoost imported successfully!")


NumPy version: 1.26.0
CatBoost imported successfully!


In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# ***LOAD THE DATASETS***

# Both CSVs are read from paths relative to the notebook's working directory.
train_path = "train.csv"
test_path = "test.csv"

# Read the two files in one pass; train first, then test.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# ***INITIAL PROCESSING***

# Separate features and target variable
X = train_df.drop(columns=['Id', 'label'])
y = train_df['label']

# Identify categorical and numerical features by substring match on the column name
categorical_features = [col for col in X.columns if 'categorical' in col]
numerical_features = [col for col in X.columns if 'numerical' in col]

# Rescale numerical features with MinMaxScaler (fit on train only, reused on test).
# The scaled arrays are wrapped back into DataFrames that keep the original column
# names and row index, so the concat below aligns rows explicitly and the final
# frames carry string column names throughout instead of a mix of ints and strings.
scaler = MinMaxScaler()
X_train_num_df = pd.DataFrame(
    scaler.fit_transform(X[numerical_features]),
    columns=numerical_features,
    index=X.index,
)
X_test_num_df = pd.DataFrame(
    scaler.transform(test_df[numerical_features]),
    columns=numerical_features,
    index=test_df.index,
)

# One-hot encode categorical features.
# Train and test are encoded in a single get_dummies call. Encoding them separately
# with drop_first=True lets each frame drop a *different* reference level whenever
# the categories observed in the two files differ, and the subsequent zero-fill
# alignment then silently mis-encodes test rows. Only feature values (no labels)
# from the test file are used to fix the dummy-column layout.
n_train_rows = len(X)
combined_cat_df = pd.concat(
    [X[categorical_features], test_df[categorical_features]],
    axis=0,
    ignore_index=True,
)
combined_dummies_df = pd.get_dummies(combined_cat_df, drop_first=True)

# Split back by position and restore each frame's original row index.
X_train_cat_df = combined_dummies_df.iloc[:n_train_rows].set_axis(X.index, axis=0)
X_test_cat_df = combined_dummies_df.iloc[n_train_rows:].set_axis(test_df.index, axis=0)

# Align categorical features across datasets (kept as a safety net; after the joint
# encoding above both frames already share the same columns)
X_train_cat_df, X_test_cat_df = X_train_cat_df.align(X_test_cat_df, join="left", axis=1, fill_value=0)

# Concatenate numerical and categorical features
X_train_processed = pd.concat([X_train_num_df, X_train_cat_df], axis=1)
X_test_processed = pd.concat([X_test_num_df, X_test_cat_df], axis=1)

# ***K-Fold Cross-Validation and Hyperparameter Tuning with Bayesian Optimization***

# Shuffled, stratified 5-fold splitter with a fixed seed; shared by the search
# below and by any later manual cross-validation.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Search space handed to BayesSearchCV: one skopt dimension per CatBoost parameter.
param_space = dict(
    learning_rate=Real(0.01, 0.2, prior='log-uniform'),
    depth=Integer(6, 10),
    iterations=Integer(300, 800),
    subsample=Real(0.7, 1.0),
    colsample_bylevel=Real(0.7, 1.0),
    l2_leaf_reg=Real(1, 10),
    border_count=Integer(32, 128),
    random_strength=Real(0, 1),
)

# Base estimator: seeded, AUC as CatBoost's eval metric, training log silenced.
cat_model = CatBoostClassifier(random_seed=42, eval_metric="AUC", verbose=0)

# Search configuration: 10 parameter settings, scored by ROC AUC on the folds
# produced by kf, with n_jobs=-1 requesting all available workers.
search_settings = dict(
    estimator=cat_model,
    search_spaces=param_space,
    n_iter=10,
    cv=kf,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
)
bayes_search = BayesSearchCV(**search_settings)

# Run the search on the processed training features
bayes_search.fit(X_train_processed, y)

# Keep the winning parameter set and the estimator the search exposes for it
best_params = bayes_search.best_params_
best_model = bayes_search.best_estimator_

print(f"Best Hyperparameters: {best_params}")

# ***TRAINING AND EVALUATION***

# List to store AUC scores for each fold
auc_scores = []

# Manually perform K-fold cross-validation
for train_index, val_index in kf.split(X_train_processed, y):
    X_train_fold, X_val_fold = X_train_processed.iloc[train_index], X_train_processed.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Fit an unfitted copy with the same parameters for every fold. Calling
    # best_model.fit(...) here would overwrite the estimator the search refit
    # (refit is left at its default), leaving best_model trained on only the last
    # fold's subset for every prediction made after this loop.
    fold_model = clone(best_model)

    # Train the fold model with early stopping and evaluation set.
    # NOTE: the same fold selects the stopping point and is then scored, so the
    # per-fold AUC is an optimistic estimate.
    fold_model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],  # Validation fold
        early_stopping_rounds=10,  # Stop if no improvement for 10 rounds
        verbose=50
    )

    # Predict probabilities for the validation set (validation fold)
    y_val_probs = fold_model.predict_proba(X_val_fold)[:, 1]

    # Compute AUC score for the current fold
    auc_score = roc_auc_score(y_val_fold, y_val_probs)
    auc_scores.append(auc_score)

# Average AUC score across all folds
average_auc = np.mean(auc_scores)
print(f"Average AUC Score (K-Fold CV): {average_auc:.4f}")

# ***EVALUATION ON FULL TRAINING DATA***

# Score every training row with best_model and keep column 1 of the
# probability matrix (the second class).
train_proba_matrix = best_model.predict_proba(X_train_processed)
y_train_probs = train_proba_matrix[:, 1]

# ROC AUC of those probabilities against the training labels. These rows are the
# same data the models were fit on, so this is an in-sample figure.
train_auc_score = roc_auc_score(y, y_train_probs)
print(f"Training AUC Score: {train_auc_score:.4f}")

# ***EVALUATION ON TEST SET***

# Probability of the second class (column 1) for each test row
test_proba_matrix = best_model.predict_proba(X_test_processed)
test_probabilities = test_proba_matrix[:, 1]

# Pair each test Id with its predicted probability and write the result to disk
# without the DataFrame index.
test_results = pd.DataFrame({'Id': test_df['Id']}).assign(label=test_probabilities)
test_results.to_csv("test_predictions.csv", index=False)

print("Test probabilities saved to test_predictions.csv")
