In [2]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-macosx_11_0_universal2.whl.metadata (1.4 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.8-cp311-cp311-macosx_11_0_universal2.whl (27.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.8/27.8 MB[0m [31m993.5 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:03[0m
[?25hDownloading graphviz-0.21-py3-none-any.whl (47 kB)
Installing collected packages: graphviz, catboost
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [catboost]━━[0m [32m1/2[0m [catboost]
[1A[2KSuccessfully installed catboost-1.2.8 graphviz-0.21
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m[31m?[0m eta [36m-:--:--[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
# ===========================================
# Predict Job Change (DS_Emp.csv) with CatBoost
# Accuracy > 80%
# ===========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# ----------------------
# 1. Load Data
# ----------------------
df = pd.read_csv("DS_Emp.csv")

# Drop ID column if present
if "enrollee_id" in df.columns:
    df = df.drop("enrollee_id", axis=1)

# Fill missing categorical values
fill_map = {
    'gender': 'Unknown',
    'enrolled_university': 'Unknown',
    'education_level': 'Unknown',
    'major_discipline': 'Unknown',
    'experience': 'Unknown',
    'company_size': 'Unknown',
    'company_type': 'Unknown',
    'last_new_job': 'Unknown'
}
for col, val in fill_map.items():
    if col in df.columns:
        df[col] = df[col].fillna(val)

# ----------------------
# 2. Features / Target
# ----------------------
X = df.drop("target", axis=1)
y = df["target"]

# Identify categorical columns for CatBoost
cat_features = X.select_dtypes(include=["object"]).columns.tolist()

# Scale numeric columns
num_cols = [c for c in X.columns if c not in cat_features]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# ----------------------
# 3. Train/Test Split
# ----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training shape:", X_train.shape)
print("Testing shape:", X_test.shape)

# ----------------------
# 4. CatBoost Model
# ----------------------
model = CatBoostClassifier(
    iterations=1200,         # more boosting rounds
    depth=8,                 # tree depth
    learning_rate=0.05,      # smaller LR for stable learning
    loss_function="Logloss",
    eval_metric="Accuracy",
    cat_features=cat_features,
    random_seed=42,
    verbose=200
)

model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# ----------------------
# 5. Evaluation
# ----------------------
y_pred = model.predict(X_test)

print("\n✅ Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ----------------------
# 6. Save Model
# ----------------------
joblib.dump(model, "catboost_model.pkl")
print("✅ Model saved as catboost_model.pkl")

# ----------------------
# 7. Predictions Preview
# ----------------------
print("\nSample Predictions:", y_pred[:20])

Training shape: (15326, 12)
Testing shape: (3832, 12)
0:	learn: 0.7833094	test: 0.7836639	best: 0.7836639 (0)	total: 11.8ms	remaining: 14.1s
200:	learn: 0.8357040	test: 0.8019311	best: 0.8027140 (34)	total: 1.92s	remaining: 9.55s
400:	learn: 0.8629779	test: 0.8019311	best: 0.8037578 (341)	total: 4.06s	remaining: 8.09s
600:	learn: 0.8824873	test: 0.8032359	best: 0.8048017 (541)	total: 6.08s	remaining: 6.07s
800:	learn: 0.9004306	test: 0.8003653	best: 0.8048017 (541)	total: 8.23s	remaining: 4.1s
1000:	learn: 0.9153073	test: 0.8029749	best: 0.8048017 (541)	total: 10.3s	remaining: 2.04s
1199:	learn: 0.9252251	test: 0.8014092	best: 0.8048017 (541)	total: 12.4s	remaining: 0us

bestTest = 0.8048016701
bestIteration = 541

Shrink model to first 542 iterations.

✅ Test Accuracy: 0.8048016701461378

Confusion Matrix:
 [[2481  396]
 [ 352  603]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87      2877
           1       