In [4]:
pip install --user catboost

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
import time # To time the training

# --- Step 1: Load Your PROCESSED Data ---
try:
    train_df = pd.read_csv('./data/salary.train.processed.csv', index_col='id')
    test_df = pd.read_csv('./data/salary.test.processed.csv', index_col='id')
except FileNotFoundError:
    print("Error: Could not find the processed CSV files.")
    print("Please make sure 'salary.train.processed.csv' and 'salary.test.processed.csv' are in the './Data/' folder.")
    raise

# --- Step 2: Separate Features (X) and Target (y) ---
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")

# --- Step 3: Initialize and Train the CatBoost Model ---
# We'll use default settings first, which are often strong.
# 'random_state=42' for reproducibility
# 'verbose=0' suppresses training output for cleaner results
cb_model = CatBoostClassifier(random_state=42, verbose=0)

print("\nTraining the CatBoost model...")
start_time = time.time()
cb_model.fit(X_train, y_train)
end_time = time.time()
print(f"Model training complete! Time taken: {end_time - start_time:.2f} seconds")

# --- Step 4: Evaluate the Model ---
print("\nEvaluating the model on the test set...")
y_pred_cb = cb_model.predict(X_test)

# Check accuracy
accuracy_cb = accuracy_score(y_test, y_pred_cb)
print(f"\nCatBoost Model Accuracy on Test Data: {accuracy_cb * 100:.2f}%")

# Get a detailed report
print("\nCatBoost Classification Report:")
print(classification_report(y_test, y_pred_cb,digits=4))

Training features shape: (16720, 56)
Testing features shape: (4180, 56)

Training the CatBoost model...
Model training complete! Time taken: 7.57 seconds

Evaluating the model on the test set...

CatBoost Model Accuracy on Test Data: 82.30%

CatBoost Classification Report:
              precision    recall  f1-score   support

         0.0     0.8521    0.8394    0.8457      2416
         1.0     0.7844    0.8005    0.7924      1764

    accuracy                         0.8230      4180
   macro avg     0.8183    0.8199    0.8190      4180
weighted avg     0.8235    0.8230    0.8232      4180



In [5]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
import time
import optuna
import numpy as np
import sklearn

# Suppress Optuna's trial logging
optuna.logging.set_verbosity(optuna.logging.WARNING)

# --- Step 1: Load Your PROCESSED Data ---
try:
    train_df = pd.read_csv('./data/salary.train.processed.csv', index_col='id')
    test_df = pd.read_csv('./data/salary.test.processed.csv', index_col='id')
except FileNotFoundError:
    print("Error: Could not find the processed CSV files.")
    print("Please make sure 'salary.train.processed.csv' and 'salary.test.processed.csv' are in the './Data/' folder.")
    raise

# --- Step 2: Separate Features (X) and Target (y) ---
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")

# --- NEW: Calculate scale_pos_weight for imbalance ---
# CatBoost uses 'scale_pos_weight' just like XGBoost
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
print(f"\nCalculated scale_pos_weight for imbalance: {scale_pos_weight:.4f}")

# --- NEW: Step 2.5: Define Optuna Objective Function ---

def objective(trial):
    """
    This function will be called by Optuna for each trial.
    """
    
    # Define the hyperparameter search space
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True)
    }
    
    # Create the CatBoost model with suggested params
    model_cb = CatBoostClassifier(
        **params,
        scale_pos_weight=scale_pos_weight, # Handle imbalance
        random_state=42,
        verbose=0,
        early_stopping_rounds=50 # Use early stopping for speed during tuning
    )
    
    # Evaluate the model using cross-validation (f1_weighted for imbalance)
    # We use a simple fit/validate split here because CatBoost's early stopping
    # is more efficient than full K-Fold CV.
    # We'll split X_train again for this.
    from sklearn.model_selection import train_test_split
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

    model_cb.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], use_best_model=True)
    
    preds = model_cb.predict(X_val)
    f1 = sklearn.metrics.f1_score(y_val, preds, average='weighted')
    
    return f1

# --- NEW: Step 2.6: Run Optuna Study ---
print("\nStarting Optuna optimization...")
start_time_optuna = time.time()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True) # Run 50 trials

end_time_optuna = time.time()
print(f"Optuna optimization complete! Time taken: {end_time_optuna - start_time_optuna:.2f} seconds")

# --- Store and Print the Best Parameters ---
best_params = study.best_params
print("\n--- Best Parameters Found by Optuna ---")
print(best_params)
print(f"Best F1-weighted score during tuning: {study.best_value:.4f}")
print("------------------------------------------")


# --- Step 3: Initialize and Train the FINAL CatBoost Model ---
# We use the best_params found by Optuna
# We also add iterations (if Optuna didn't find it) and the weight
final_params = best_params.copy()
if 'iterations' not in final_params:
    final_params['iterations'] = 1000 # Default high value if not tuned

cb_model = CatBoostClassifier(
    **final_params,
    scale_pos_weight=scale_pos_weight, # Use the weight for the final model
    random_state=42, 
    verbose=0
)

print("\nTraining the FINAL CatBoost model with best params...")
start_time = time.time()
# Train on the FULL training data
cb_model.fit(X_train, y_train)
end_time = time.time()
print(f"Final model training complete! Time taken: {end_time - start_time:.2f} seconds")

# --- Step 4: Evaluate the FINAL Model ---
print("\nEvaluating the FINAL model on the test set...")
y_pred_cb = cb_model.predict(X_test)

# Check accuracy
accuracy_cb = accuracy_score(y_test, y_pred_cb)
print(f"\nFinal CatBoost Model Accuracy on Test Data: {accuracy_cb * 100:.2f}%")

# Get a detailed report
print("\nFinal CatBoost Classification Report:")
print(classification_report(y_test, y_pred_cb, digits=4))

Training features shape: (16720, 56)
Testing features shape: (4180, 56)

Calculated scale_pos_weight for imbalance: 1.3882

Starting Optuna optimization...


  0%|          | 0/50 [00:00<?, ?it/s]

Optuna optimization complete! Time taken: 111.08 seconds

--- Best Parameters Found by Optuna ---
{'iterations': 659, 'depth': 6, 'learning_rate': 0.052427517752247396, 'l2_leaf_reg': 0.04996102703771198, 'border_count': 144, 'random_strength': 0.0049402307214256435}
Best F1-weighted score during tuning: 0.8297
------------------------------------------

Training the FINAL CatBoost model with best params...
Final model training complete! Time taken: 4.03 seconds

Evaluating the FINAL model on the test set...

Final CatBoost Model Accuracy on Test Data: 81.51%

Final CatBoost Classification Report:
              precision    recall  f1-score   support

         0.0     0.8709    0.7984    0.8331      2416
         1.0     0.7522    0.8379    0.7927      1764

    accuracy                         0.8151      4180
   macro avg     0.8115    0.8181    0.8129      4180
weighted avg     0.8208    0.8151    0.8160      4180



In [10]:
print(scale_pos_weight)

1.3882302528210255


In [6]:
best_params = final_params

In [7]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
import time

print("--- 1. Testing CatBoost with Class Weight ---")

# --- Load Data ---
try:
    train_df = pd.read_csv('./data/salary.train.processed.csv', index_col='id')
    test_df = pd.read_csv('./data/salary.test.processed.csv', index_col='id')
except FileNotFoundError:
    print("Error: Could not find the processed CSV files.")
    raise

X_full = train_df.drop('label', axis=1)
y_full = train_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

# --- Calculate Weight ---
scale_pos_weight = len(y_full[y_full == 0]) / len(y_full[y_full == 1])
print(f"Using scale_pos_weight: {scale_pos_weight:.4f}")

# --- Assume 'best_params' (plural) variable exists from Optuna run ---
# best_params = {'iterations': ..., 'depth': ..., ...}

# --- Create and Train Model ---
cb_model = CatBoostClassifier(
    **best_params,
    scale_pos_weight=scale_pos_weight, # 👈 Add weight
    random_state=42,
    verbose=0
)

cb_model.fit(X_full, y_full) # Train on original data
print("Model training complete.")

# --- Evaluate ---
y_pred = cb_model.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pd.DataFrame(report).transpose()

print("\nCatBoost (Tuned + Class Weight) Report:")
print(df_report)

--- 1. Testing CatBoost with Class Weight ---
Using scale_pos_weight: 1.3882
Model training complete.

CatBoost (Tuned + Class Weight) Report:
              precision    recall  f1-score      support
0.0            0.870880  0.798427  0.833081  2416.000000
1.0            0.752163  0.837868  0.792706  1764.000000
accuracy       0.815072  0.815072  0.815072     0.815072
macro avg      0.811522  0.818148  0.812894  4180.000000
weighted avg   0.820780  0.815072  0.816043  4180.000000


In [8]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
from imblearn.combine import SMOTETomek

print("\n--- 3. Testing CatBoost with SMOTETomek ---")

# --- Load Data ---
try:
    train_df = pd.read_csv('./data/salary.train.processed.csv', index_col='id')
    test_df = pd.read_csv('./data/salary.test.processed.csv', index_col='id')
except FileNotFoundError:
    print("Error: Could not find the processed CSV files.")
    raise

X_full = train_df.drop('label', axis=1)
y_full = train_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

# --- Apply SMOTETomek ---
print("Applying SMOTETomek...")
smt = SMOTETomek(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smt.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Assume 'best_params' (plural) variable exists ---
# best_params = {'iterations': ..., 'depth': ..., ...}

# --- Create and Train Model ---
cb_model = CatBoostClassifier(
    **best_params,
    # ⚠️ NO 'scale_pos_weight'
    random_state=42,
    verbose=0
)

cb_model.fit(X_resampled, y_resampled) # Train on SMOTETomek data
print("Model training complete.")

# --- Evaluate ---
y_pred = cb_model.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pd.DataFrame(report).transpose()

print("\nCatBoost (Tuned + SMOTETomek) Report:")
print(df_report)


--- 3. Testing CatBoost with SMOTETomek ---
Applying SMOTETomek...


[WinError 2] The system cannot find the file specified
  File "c:\Users\natth\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


New resampled label distribution:
label
1.0    9220
0.0    9220
Name: count, dtype: int64
Model training complete.

CatBoost (Tuned + SMOTETomek) Report:
              precision    recall  f1-score      support
0.0            0.859348  0.829470  0.844145  2416.000000
1.0            0.777056  0.814059  0.795127  1764.000000
accuracy       0.822967  0.822967  0.822967     0.822967
macro avg      0.818202  0.821765  0.819636  4180.000000
weighted avg   0.824620  0.822967  0.823459  4180.000000
