# 00 — End-to-End Pipeline (Wrangling → Preprocessing → Modeling → Report)
**Project:** Employee Attrition Prediction  
**Author:** Bini  
**Date:** 2025-08-27

This single notebook is a *backup* that runs the full pipeline end-to-end:
1) Load the raw data, clean it, and run a quick EDA  
2) Preprocess the features and train baseline models  
3) Train an advanced model, evaluate all models, and build a leaderboard  
4) Print a short report  


In [None]:
# Imports (keep at top; add new ones in new cells)
import warnings, json
from pathlib import Path
from textwrap import dedent
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, RocCurveDisplay)
from sklearn.metrics import classification_report

# Silence only deprecation-style chatter. A blanket "ignore" would also hide
# warnings that point at real problems (e.g. a solver that did not converge).
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option("display.max_columns", 200)

# Project layout, resolved relative to the current working directory.
PROJ_DIR = Path.cwd()
DATA_DIR = PROJ_DIR / "data"
RAW_DIR = DATA_DIR / "raw"
CLEAN_DIR = DATA_DIR / "clean"
MODEL_DIR = PROJ_DIR / "models"

# Create the folders up front so later cells can read/write without checks.
for d in [DATA_DIR, RAW_DIR, CLEAN_DIR, MODEL_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Input CSV (must be supplied by the user) and the cleaned CSV written by step 1.
RAW_FILE = RAW_DIR / "employee_attrition_raw.csv"
CLEAN_FILE = CLEAN_DIR / "cleaned.csv"

print('Dirs ready:', DATA_DIR, RAW_DIR, CLEAN_DIR, MODEL_DIR)

In [None]:
# 1) Load Raw, Clean, Simple EDA
# Reads RAW_FILE, normalizes the column names, drops unlabeled rows, imputes
# missing feature values, draws a few quick plots and writes CLEAN_FILE.
if not RAW_FILE.exists():
    raise FileNotFoundError(f'Raw dataset not found at {RAW_FILE}. Place your CSV there.')

df = pd.read_csv(RAW_FILE)
print('Raw shape:', df.shape)

# Column cleanup: trimmed, lower-case, underscores instead of spaces
df.columns = [c.strip().replace(' ','_').lower() for c in df.columns]

# Basic info
print(df.dtypes.head(10))
print('\nMissing values (top 10):\n', df.isna().sum().sort_values(ascending=False).head(10))

target = 'attrition' if 'attrition' in df.columns else None

# Rows without a label cannot be used for supervised learning. Drop them BEFORE
# imputation: otherwise the label itself would be filled with 'Unknown' (or a
# median) and the row would later be counted as "no attrition".
if target:
    n_unlabeled = int(df[target].isna().sum())
    if n_unlabeled:
        df = df.dropna(subset=[target]).reset_index(drop=True)
        print(f'Dropped {n_unlabeled} rows with missing {target}')

# Imputation (simple)
# NOTE: the medians below are computed on the full dataset, i.e. before the
# train/test split done in step 2, so test rows influence the fill values.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

for c in num_cols:
    if df[c].isna().any():
        df[c] = df[c].fillna(df[c].median())

for c in cat_cols:
    if df[c].isna().any():
        df[c] = df[c].fillna('Unknown')

# Optional: quick feature engineering example (commented)
# if 'age' in df.columns:
#     df['age_bucket'] = pd.cut(df['age'], bins=[0,25,35,45,55,100], labels=False, include_lowest=True)

# Quick EDA plots
if target:
    # Numeric dist (first 3)
    for c in num_cols[:3]:
        fig, ax = plt.subplots()
        df[c].hist(bins=30, ax=ax)
        ax.set_title(f'Distribution: {c}')
        ax.set_xlabel(c); ax.set_ylabel('Count')
        plt.show()

    # Cat vs target (first 3)
    top_cats = [c for c in cat_cols if c != target][:3]
    for c in top_cats:
        # Pass ax explicitly: DataFrame.plot opens its own figure otherwise,
        # which would leave an empty extra figure in the output.
        fig, ax = plt.subplots()
        proportions = df.groupby(c)[target].value_counts(normalize=True).unstack().fillna(0)
        proportions.plot(kind='bar', ax=ax)
        ax.set_title(f'{c} vs {target} (proportion)')
        ax.set_xlabel(c); ax.set_ylabel('Proportion')
        plt.show()

# Save cleaned
df.to_csv(CLEAN_FILE, index=False)
print('Saved cleaned ->', CLEAN_FILE)

In [None]:
# 2) Preprocessing + Baselines
# Builds the binary label, splits the data, and fits two baseline pipelines
# (logistic regression and a decision tree) on the training part.
TARGET = 'attrition' if 'attrition' in df.columns else None
if TARGET is None:
    raise ValueError("Target column 'attrition' not found. Update TARGET accordingly.")

# Values (after trimming + lower-casing) that mean "employee left".
# '1.0' covers a label column that pandas parsed as float.
POSITIVE_LABELS = ['yes', '1', '1.0', 'true']
y = df[TARGET].astype(str).str.strip().str.lower().isin(POSITIVE_LABELS).astype(int)
if y.nunique() < 2:
    # Fail early with a readable message instead of an estimator error later.
    raise ValueError(
        f"Target '{TARGET}' encodes to a single class; "
        f"check its raw values against POSITIVE_LABELS={POSITIVE_LABELS}."
    )
X = df.drop(columns=[TARGET], errors='ignore')

# Stratified split keeps the class ratio the same in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print('Split:', X_train.shape, X_test.shape)

num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

# with_mean=False scales without centering (presumably to stay compatible with
# the sparse one-hot output — TODO confirm this was the intent).
preprocessor = ColumnTransformer([
    ('num', StandardScaler(with_mean=False), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# Pipeline fits its steps in place, so every pipeline gets its own clone of the
# preprocessing step instead of sharing (and re-fitting) a single object.
logreg = Pipeline([('prep', clone(preprocessor)), ('clf', LogisticRegression(max_iter=1000))])
tree   = Pipeline([('prep', clone(preprocessor)), ('clf', DecisionTreeClassifier(random_state=42))])

logreg.fit(X_train, y_train)
tree.fit(X_train, y_train)

print('\nLogReg report:\n', classification_report(y_test, logreg.predict(X_test), zero_division=0))
print('\nDecisionTree report:\n', classification_report(y_test, tree.predict(X_test), zero_division=0))

In [None]:
# 3) Advanced Models + Evaluation + Leaderboard
# clone() hands the forest its own unfitted copy of the preprocessing step, so
# fitting this pipeline does not mutate the shared `preprocessor` object.
rf = Pipeline([
    ('prep', clone(preprocessor)),
    ('clf', RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)),
])
rf.fit(X_train, y_train)

def eval_model(name, model, X_test, y_test):
    """Score a fitted binary classifier on held-out data and print a summary.

    Prints the metric dict and the confusion matrix and, when a ranking score
    is available, shows the ROC curve.

    Parameters
    ----------
    name : str
        Label used in the printed output and in the ROC plot title.
    model : fitted estimator
        Must implement ``predict``. ``predict_proba`` (preferred) or
        ``decision_function`` is used for ROC AUC when present.
    X_test, y_test
        Held-out features and true binary labels.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1`` and ``roc_auc``.
        ``roc_auc`` is NaN when the model exposes no score or when ``y_test``
        contains a single class (ROC AUC is undefined in that case).
    """
    y_pred = model.predict(X_test)

    # Prefer positive-class probabilities; fall back to decision margins so
    # models without predict_proba still get a ROC AUC.
    if getattr(model, 'predict_proba', None) is not None:
        y_score = model.predict_proba(X_test)[:, 1]
    elif getattr(model, 'decision_function', None) is not None:
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    # Guard: ROC AUC cannot be computed from a single-class y_test.
    can_rank = y_score is not None and len(np.unique(y_test)) > 1

    m = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_test, y_score) if can_rank else np.nan
    }
    print(f"\n{name} metrics: {m}")
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix:\n', cm)
    if can_rank:
        # Title the axes the display actually drew on rather than relying on
        # pyplot's "current axes" state.
        roc_display = RocCurveDisplay.from_predictions(y_test, y_score)
        roc_display.ax_.set_title(f'ROC — {name}')
        plt.show()
    return m

# Score every fitted pipeline on the same held-out split, in a fixed order.
results = {}
for label, estimator in (
    ('Baseline_LogReg', logreg),
    ('DecisionTree', tree),
    ('RandomForest', rf),
):
    results[label] = eval_model(label, estimator, X_test, y_test)

# One row per model, one column per metric; best F1 first.
res_df = pd.DataFrame(results).T
res_df = res_df.sort_values('f1', ascending=False)
print('\n=== Leaderboard ===')
print(res_df.round(4))

In [None]:
# 4) Mini Report (print-only, copy to README or LinkedIn)
# Render the leaderboard first so a missing optional dependency cannot break
# the whole report: DataFrame.to_markdown needs the 'tabulate' package.
if 'res_df' in globals():
    leaderboard_rounded = res_df.round(4)
    try:
        results_table = leaderboard_rounded.to_markdown(index=True)
    except ImportError:
        # 'tabulate' not installed: fall back to a plain-text table.
        results_table = leaderboard_rounded.to_string()
else:
    results_table = '- Run above cells first -'

report = dedent(f"""
## Problem Statement
Predict which employees are at risk of attrition so leaders can act earlier (retention offers, manager coaching).

## Key EDA Insights
- Example: Overtime and lower JobSatisfaction showed higher attrition rates.
- Example: Early tenure (0–2 years) had higher churn (onboarding/fit).

## Modeling Approach
- Baselines: Logistic Regression, Decision Tree with OHE + StandardScaler.
- Advanced: RandomForest for stronger non-linear signal capture.

## Results (Test Set)
{results_table}

## Business Value
- Reducing unwanted exits can save 50–200% of salary per replacement and stabilize teams.
- Clear drivers help target interventions (overtime, satisfaction, tenure).

## Next Steps
- Tune thresholds for desired precision/recall trade-offs.
- Add SHAP for explainability & a Streamlit dashboard for HR partners.
""")
print(report)