In [None]:
!pip install catboost shap lime matplotlib seaborn scikit-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, accuracy_score, f1_score,
                             roc_auc_score, confusion_matrix, roc_curve, auc)
from sklearn.preprocessing import LabelEncoder
import shap
import lime
import lime.lime_tabular


# Load the CSV exported from Microsoft Forms.
# A missing file is re-raised as FileNotFoundError (same exception type as
# before), but the message now names the path that was tried and the original
# pandas error is chained so the traceback keeps the underlying cause.
csv_path = 'survey_data.csv'
try:
    df_raw = pd.read_csv(csv_path)
except FileNotFoundError as err:
    raise FileNotFoundError(f"CSV file not found: {csv_path}") from err


# Mapping survey questions to technical feature names.
# Keys are the column headers as they appear in the exported CSV; values are the
# short names used by the rest of the notebook.
column_mapping = {
    'Primary Department Question': 'Department',
    'Average Monthly Overtime': 'Overtime',
    'How many years has it been since your last job title change or promotion?': 'Promotion_Gap',
    'Satisfaction': 'Job_Satisfaction',
    'Risk': 'AI_Automation_Risk',
    'Has your specific department experienced staff layoffs or "firing" in the last 12 months?': 'Recent_Layoffs',
    'Security': 'Job_Security',
    'If you left today, how easy would it be to find a similar role elsewhere?': 'Market_Demand',
    'Are you actively planning to leave your current company or looking for a new job within the next 6 months?': 'Attrition'
}

# rename() silently skips headers that are absent from the CSV; the column
# selection below then raises KeyError for any mapped feature that is missing.
df = df_raw.rename(columns=column_mapping)
# Keep only the mapped columns. The explicit .copy() makes `df` an independent
# frame, so later in-place column assignments do not trigger pandas'
# SettingWithCopyWarning (or write through to `df_raw`).
df = df[list(column_mapping.values())].copy()

# Data Cleaning

# Define categorical columns for the CatBoost algorithm.
# 'Promotion_Gap' is deliberately not listed, so it is passed to the model as a
# numeric feature.
cat_features = ['Department', 'Overtime', 'Job_Satisfaction', 'AI_Automation_Risk',
                'Recent_Layoffs', 'Job_Security', 'Market_Demand']

# Encode the target: 'Yes' -> 1, 'No' -> 0. Any other or missing answer maps to
# NaN and is then counted as 0 (unchanged behaviour).
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)

# Handling missing values: only the categorical columns receive the 'Neutral'
# placeholder. Filling the whole frame would also write the string 'Neutral'
# into the numeric 'Promotion_Gap' column, which CatBoost cannot parse as a
# number; missing numeric values are left as NaN for CatBoost to handle.
# The cast to str keeps every categorical value a string, matching the
# string-cast values the LIME label encoders are fitted on.
df[cat_features] = df[cat_features].fillna('Neutral').astype(str)


# Train, Validation and Test Split
# First hold out 20% of the rows as the test set, then carve 10% of the
# remaining rows out as the validation set. Both splits are stratified on the
# target and use the same fixed seed.
y = df['Attrition']
X = df.drop(columns='Attrition')

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1, stratify=y_temp, random_state=42
)

print(f"Dataset Split Complete: Train={len(X_train)}, Val={len(X_val)}, Test={len(X_test)}")

# Hyperparameter Choices
catboost_params = {
    'iterations': 1000,           # upper bound on the number of boosting rounds
    'learning_rate': 0.03,
    'depth': 6,
    'eval_metric': 'F1',          # metric reported on the eval_set during training
    'early_stopping_rounds': 50,  # stop once the eval metric has not improved for 50 rounds
    'random_seed': 42,
    'verbose': 100,               # print a progress line every 100 iterations
}
model = CatBoostClassifier(**catboost_params)

# Training: fit on the training split, passing the categorical columns by name
# and using the validation split as the evaluation set.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
)

# Score the held-out test set: hard class predictions plus the predicted
# probability of class 1 (second column of predict_proba).
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Label-based metrics use y_pred; ROC-AUC is computed from the probabilities.
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_prob)

print("PERFORMANCE RESULTS")
print(classification_report(y_test, y_pred))
print(f"Overall Accuracy: {test_accuracy:.4f}")
print(f"F1-Score:         {test_f1:.4f}")
print(f"ROC-AUC Score:    {test_roc_auc:.4f}")

# Plot 1: Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 with a fixed row/column order even if one
# class is absent from the test labels or predictions. Tick labels follow the
# target encoding used in this notebook (0 = stay, 1 = leave).
class_labels = ['Stay', 'Leave']
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Plot 2: ROC Curve
# The curve is built from the class-1 probabilities; the dashed diagonal is the
# chance-level reference line.
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, color='darkorange', label=f'ROC curve (area = {auc(fpr, tpr):.2f})')
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.title('Receiver Operating Characteristic (ROC)')
# Axis labels were missing, leaving the plot ambiguous when viewed on its own.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()


# SHAP
# Global explanation: compute SHAP values for every test row with the tree
# explainer built from the fitted model, then draw the summary plot.
print("\n SHAP")
explainer_shap = shap.TreeExplainer(model)
shap_values = explainer_shap.shap_values(X_test)
shap.summary_plot(shap_values, features=X_test)

# LIME
# LIME is given a numeric copy of the training data: each categorical column is
# replaced by integer codes from its own LabelEncoder. `encoders` keeps the
# fitted encoder per column name, and `categorical_names` maps the column's
# positional index to the list of category strings (code -> name).
X_train_lime = X_train.copy()
categorical_names = {}
encoders = {}
for i, col in enumerate(X.columns):
    if col not in cat_features:
        continue
    le = LabelEncoder()
    # Fit on the full feature table rather than only the training split, so a
    # category that occurs only in validation/test rows can still be encoded
    # when one of those rows is explained (transform raises on unseen labels).
    le.fit(X[col].astype(str))
    X_train_lime[col] = le.transform(X_train[col].astype(str))
    encoders[col] = le
    categorical_names[i] = le.classes_.tolist()


# Turning LIME numbers back into strings for CatBoost
def predict_fn(np_array):
    """Return the model's class probabilities for LIME-encoded rows.

    `np_array` is a 2-D array whose columns follow `X.columns`. Every column in
    `cat_features` holds integer codes, which are decoded back to the original
    category strings with the matching encoder before calling the model.
    """
    frame = pd.DataFrame(np_array, columns=X.columns)
    decoded = {
        name: encoders[name].inverse_transform(frame[name].astype(int))
        for name in cat_features
    }
    return model.predict_proba(frame.assign(**decoded))

# Build the LIME explainer on the label-encoded training matrix.
# `categorical_features` lists the positional indices of the categorical
# columns, and `categorical_names` supplies the code -> category-name lookup
# for each of them.
explainer_lime = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train_lime.values,
    feature_names=X.columns.tolist(),
    categorical_features=[X.columns.get_loc(c) for c in cat_features],
    categorical_names=categorical_names,
    class_names=['Stay', 'Leave'],
    mode='classification',
    # LIME samples random perturbations; fixing the seed (same value as the
    # splits and the model) makes the explanation reproducible across runs.
    random_state=42
)

# Explain a single test row; `idx` is its position within X_test.
idx = 0
inst_encoded = X_test.iloc[idx].copy()
for col in cat_features:
    # The encoders are fitted on string-cast values, so the raw cell is cast to
    # str as well; otherwise a non-string value (e.g. a number) would be
    # rejected by transform() as an unseen label.
    inst_encoded[col] = encoders[col].transform([str(inst_encoded[col])])[0]

# LIME expects a 1-D numeric row; predict_fn decodes the categorical codes back
# to strings before calling the model.
exp = explainer_lime.explain_instance(inst_encoded.values.astype(float), predict_fn, num_features=5)
print(f"\nLIME Explanation for Case #{idx}:")
exp.show_in_notebook(show_table=True)