# Attrition Mini Project — Combined Notebook
# Contains Wrangling + EDA, Preprocessing + Training, and Modeling + Results.
# Use this if you prefer a single-file workflow.


# Imports
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, classification_report,
                             RocCurveDisplay)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Global notebook settings: suppress every warning and let pandas print
# up to 200 columns when displaying a frame.
warnings.filterwarnings(action="ignore")
pd.options.display.max_columns = 200

# Project layout, rooted at the current working directory:
#   <cwd>/data/raw    - input files
#   <cwd>/data/clean  - cleaned output
#   <cwd>/models      - model directory
PROJ_DIR = Path.cwd()
DATA_DIR = PROJ_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
CLEAN_DIR = DATA_DIR.joinpath("clean")
MODEL_DIR = PROJ_DIR.joinpath("models")

# Create the whole tree up front; directories that already exist are left alone.
for folder in (DATA_DIR, RAW_DIR, CLEAN_DIR, MODEL_DIR):
    folder.mkdir(parents=True, exist_ok=True)

print('Dirs ready:', DATA_DIR)


# Step 1 — Wrangling & EDA
# Input and output locations for the wrangling step.
RAW_FILE = RAW_DIR / "employee_attrition_raw.csv"
CLEAN_FILE = CLEAN_DIR / "cleaned.csv"

if not RAW_FILE.exists():
    # Nothing to clean yet: tell the user where the raw CSV is expected.
    print("⚠️ Add your dataset to:", RAW_FILE)
else:
    df = pd.read_csv(RAW_FILE)
    print("Raw shape:", df.shape)

    # Normalise headers to snake_case-like names: trimmed, spaces -> "_", lowercase.
    df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]

    # Fill missing values in every non-numeric column with an explicit
    # "Unknown" label. The previous placeholder only targeted a column
    # literally named "column", so it never changed a real dataset.
    # Numeric columns are left untouched here.
    non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
    df = df.fillna({col: "Unknown" for col in non_numeric_cols})

    # Quick look at the class balance, only when the target column is present.
    if "attrition" in df.columns:
        df['attrition'].value_counts().plot(kind="bar", title="Attrition Distribution")
        plt.show()

    df.to_csv(CLEAN_FILE, index=False)
    print("✅ Saved cleaned:", CLEAN_FILE)


# Step 2 — Preprocessing & Baseline
# Train and report a logistic-regression baseline on the cleaned data.
# The step is skipped when the cleaned CSV has not been produced yet.
if CLEAN_FILE.exists():
    df = pd.read_csv(CLEAN_FILE)
    target = "attrition"
    if target not in df.columns:
        # Same exception type as the bare lookup would raise, but actionable.
        raise KeyError(
            f"Target column {target!r} not found in {CLEAN_FILE}; "
            f"available columns: {list(df.columns)}"
        )

    # Encode the target as 0/1. Labels are compared after trimming and
    # lower-casing; "1.0" covers an integer flag that was read back as float.
    positive_labels = ["yes", "1", "1.0", "true"]
    y = (
        df[target].astype(str).str.strip().str.lower().isin(positive_labels)
    ).astype(int)
    if y.nunique() < 2:
        # Fail early with a clear message instead of deep inside the classifier.
        raise ValueError(
            f"Target column {target!r} yields a single class after encoding; "
            f"recognised positive labels are {positive_labels}."
        )
    X = df.drop(columns=[target])

    # Stratified 80/20 split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

    # Numeric features: median-impute missing values (fitted on the training
    # split only), then scale to unit variance without centering.
    numeric_steps = Pipeline([("impute", SimpleImputer(strategy="median")),
                              ("scale", StandardScaler(with_mean=False))])
    # Categorical features: one-hot encode; categories unseen during fit are ignored.
    preprocessor = ColumnTransformer([("num", numeric_steps, num_cols),
                                      ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)])

    pipe = Pipeline([("prep", preprocessor), ("clf", LogisticRegression(max_iter=1000))])
    pipe.fit(X_train, y_train)
    print("Baseline report:\n", classification_report(y_test, pipe.predict(X_test)))


# Step 3 — Advanced Modeling
# Fit and evaluate a random forest on the same split as the baseline.
# Guarded like the baseline step: without the cleaned CSV the split and
# preprocessor were never created, and this cell would raise NameError.
if CLEAN_FILE.exists():
    # Use an unfitted copy of the preprocessor so fitting this pipeline does
    # not refit the transformer instance held by the baseline pipeline.
    rf = Pipeline([("prep", clone(preprocessor)),
                   ("clf", RandomForestClassifier(random_state=42))])
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print("RandomForest metrics:")
    print(classification_report(y_test, y_pred))

    # ROC curve of the fitted forest on the held-out split.
    RocCurveDisplay.from_estimator(rf, X_test, y_test)
    plt.show()


# 📜 Mini Report
# Static Markdown-formatted project summary, printed verbatim.
mini_report = """
## Problem Statement
Predict attrition risk to support HR retention.

## EDA
- Target imbalance likely (attrition Yes vs No).
- Overtime, job satisfaction, and tenure impact attrition.

## Models
- Baseline: Logistic Regression.
- Advanced: RandomForest.

## Results
- Metrics printed above.

## Business Value
Early warning for HR → reduced replacement cost.

## Next Steps
Add XGBoost, CatBoost, SHAP explainability, and business cost analysis.
"""
print(mini_report)
