In [None]:
pip install catboost

In [None]:
pip install lightgbm


In [17]:
# ===========================================
# Predict Job Change (DS_Emp.csv) with CatBoost
# Accuracy > 80%
# ===========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# ----------------------
# 1. Load Data
# ----------------------
df = pd.read_csv("../datasets/DS_Emp.csv")

# Drop ID column if present
if "enrollee_id" in df.columns:
    df = df.drop("enrollee_id", axis=1)

# Fill missing categorical values
fill_map = {
    'gender': 'Unknown',
    'enrolled_university': 'Unknown',
    'education_level': 'Unknown',
    'major_discipline': 'Unknown',
    'experience': 'Unknown',
    'company_size': 'Unknown',
    'company_type': 'Unknown',
    'last_new_job': 'Unknown'
}
for col, val in fill_map.items():
    if col in df.columns:
        df[col] = df[col].fillna(val)

# ----------------------
# 2. Features / Target
# ----------------------
X = df.drop("target", axis=1)
y = df["target"]

# Identify categorical columns for CatBoost
cat_features = X.select_dtypes(include=["object"]).columns.tolist()

# Scale numeric columns
num_cols = [c for c in X.columns if c not in cat_features]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# ----------------------
# 3. Train/Test Split
# ----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training shape:", X_train.shape)
print("Testing shape:", X_test.shape)

# ----------------------
# 4. CatBoost Model
# ----------------------
model = CatBoostClassifier(
    iterations=1200,         # more boosting rounds
    depth=8,                 # tree depth
    learning_rate=0.05,      # smaller LR for stable learning
    loss_function="Logloss",
    eval_metric="Accuracy",
    cat_features=cat_features,
    random_seed=42,
    verbose=200
)

model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# ----------------------
# 5. Evaluation
# ----------------------
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\n✅ Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ----------------------
# 6. Save Model
# ----------------------
joblib.dump(model, "catboost_model.pkl")
print("✅ Model saved as catboost_model.pkl")

# ----------------------
# 7. Predictions Preview
# ----------------------
print("\nSample Predictions:", y_pred[:20])

Training shape: (15326, 12)
Testing shape: (3832, 12)
0:	learn: 0.7833094	test: 0.7836639	best: 0.7836639 (0)	total: 11.9ms	remaining: 14.2s
200:	learn: 0.8357040	test: 0.8019311	best: 0.8027140 (34)	total: 2.05s	remaining: 10.2s
400:	learn: 0.8629779	test: 0.8019311	best: 0.8037578 (341)	total: 4.25s	remaining: 8.48s
600:	learn: 0.8824873	test: 0.8032359	best: 0.8048017 (541)	total: 6.45s	remaining: 6.43s
800:	learn: 0.9004306	test: 0.8003653	best: 0.8048017 (541)	total: 8.64s	remaining: 4.3s
1000:	learn: 0.9153073	test: 0.8029749	best: 0.8048017 (541)	total: 10.8s	remaining: 2.15s
1199:	learn: 0.9252251	test: 0.8014092	best: 0.8048017 (541)	total: 13.1s	remaining: 0us

bestTest = 0.8048016701
bestIteration = 541

Shrink model to first 542 iterations.

✅ Test Accuracy: 0.8048016701461378

Confusion Matrix:
 [[2481  396]
 [ 352  603]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87      2877
           1       

Making a Predictive System

In [18]:
df.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,Unknown,Unknown,1,36,1
1,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0
2,city_21,0.624,Unknown,No relevent experience,Full time course,Graduate,STEM,5,Unknown,Unknown,never,83,0
3,city_115,0.789,Unknown,No relevent experience,Unknown,Graduate,Business Degree,<1,Unknown,Pvt Ltd,never,52,1
4,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0


In [19]:
# Feature importance
import matplotlib.pyplot as plt

importances = model.get_feature_importance(prettified=True)
print(importances)

top_n = 10
top_features = importances.sort_values(by="Importances", ascending=False).head(top_n)



                Feature Id  Importances
0                     city    12.778428
1             company_size    12.694032
2               experience    10.553802
3   city_development_index    10.084312
4             last_new_job     9.483183
5           training_hours     8.904599
6             company_type     8.888123
7          education_level     7.977477
8      enrolled_university     5.714343
9         major_discipline     5.459459
10                  gender     4.729566
11     relevent_experience     2.732677


In [20]:
# ----------------------
# 8. Making a Predictive System (Job Change Prediction)
# ----------------------

# Example input data (one employee's profile)
# Format must match features in training set
input_data = {
   "city": "city_21",                   # less developed city
    "city_development_index": 0.45,      # low development index
    "gender": "Female",
    "relevent_experience": "No relevent experience",
    "enrolled_university": "Full time course",
    "education_level": "Graduate",
    "major_discipline": "Other",
    "experience": "2",                   # very low experience
    "company_size": "<10",               # very small company
    "company_type": "Startup",
    "last_new_job": ">4",                # stuck for years
    "training_hours": 8    
}

# Convert to DataFrame (CatBoost needs same format as training data)
input_df = pd.DataFrame([input_data])

# Scale numeric columns same as training
input_df[num_cols] = scaler.transform(input_df[num_cols])

# Predict
prediction = model.predict(input_df)

print("🔮 Prediction:", prediction[0])

if prediction[0] == 0:
    print("❌ The candidate is unlikely to change jobs.")
else:
    print("✅ The candidate is likely to change jobs.")

🔮 Prediction: 1
✅ The candidate is likely to change jobs.


In [21]:
import pickle


In [22]:

# Save the trained CatBoost model
filename = 'dsJobPrediction_model.sav'
pickle.dump(model, open(filename, 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))
pickle.dump(accuracy * 100, open('job_model_accuracy.pkl', 'wb'))

print("✅ Model saved as dsJobPrediction_model.sav")

✅ Model saved as dsJobPrediction_model.sav


In [11]:
# loading the saved model
loaded_model = pickle.load(open('dsJobPrediction_model.sav', 'rb'))

In [12]:
input_data = {
    "city": "city_103",                  # developed city
    "city_development_index": 0.92,      # high development index
    "gender": "Male",
    "relevent_experience": "Has relevent experience",
    "enrolled_university": "no_enrollment",
    "education_level": "Masters",
    "major_discipline": "STEM",
    "experience": "7",                   # mid-level stable
    "company_size": "1000-4999",         # large company
    "company_type": "Pvt Ltd",
    "last_new_job": "1",                 # changed job recently
    "training_hours": 50       
}

# Convert to DataFrame (CatBoost needs same format as training data)
input_df = pd.DataFrame([input_data])

# Scale numeric columns same as training
input_df[num_cols] = scaler.transform(input_df[num_cols])

# Predict
prediction = loaded_model.predict(input_df)

print("🔮 Prediction:", prediction[0])

if prediction[0] == 0:
    print("❌ The candidate is unlikely to change jobs.")
else:
    print("✅ The candidate is likely to change jobs.")

🔮 Prediction: 0
❌ The candidate is unlikely to change jobs.


In [13]:
for column in X.columns:
  print(column)

city
city_development_index
gender
relevent_experience
enrolled_university
education_level
major_discipline
experience
company_size
company_type
last_new_job
training_hours


In [15]:
input_data = {
    "city": "city_21",                   # less developed city
    "city_development_index": 0.45,      # low development index
    "gender": "Female",
    "relevent_experience": "No relevent experience",
    "enrolled_university": "Full time course",
    "education_level": "Graduate",
    "major_discipline": "Other",
    "experience": "2",                   # very low experience
    "company_size": "<10",               # very small company
    "company_type": "Startup",
    "last_new_job": ">4",                # stuck for years
    "training_hours": 8 
}

# Convert to DataFrame (CatBoost needs same format as training data)
input_df = pd.DataFrame([input_data])

# Scale numeric columns same as training
input_df[num_cols] = scaler.transform(input_df[num_cols])

# Predict
prediction = loaded_model.predict(input_df)

print("🔮 Prediction:", prediction[0])

if prediction[0] == 0:
    print("❌ The candidate is unlikely to change jobs.")
else:
    print("✅ The candidate is likely to change jobs.")

🔮 Prediction: 1
✅ The candidate is likely to change jobs.
