In [None]:
################################################################ HR-Employee-Attrition ################################################################################

In [5]:
import kagglehub

# Fetch the newest published version of the IBM HR attrition dataset.
# The value returned by `dataset_download` is printed below as the location
# of the dataset files.
path = kagglehub.dataset_download(
    "pavansubhasht/ibm-hr-analytics-attrition-dataset"
)
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'ibm-hr-analytics-attrition-dataset' dataset.
Path to dataset files: /kaggle/input/ibm-hr-analytics-attrition-dataset


In [6]:
from pathlib import Path

import pandas as pd

# Load the attrition CSV from the directory reported by the kagglehub download
# cell (`path`) rather than a hard-coded Kaggle/Colab mount point, so this cell
# keeps working when the dataset is cached at a different location.
csv_path = Path(path) / "WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(csv_path)

# Last expression of the cell: rendered by the notebook as a preview of the frame.
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import xgboost as xgb
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# --- Data Loading and Preprocessing ---
# Location of the raw CSV; kept in one place so it is easy to repoint.
DATA_CSV = '/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'

try:
    df = pd.read_csv(DATA_CSV)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() would end the notebook session
    # rather than report a failure, and the message should name the path that
    # was actually read (an absolute path, not the working directory).
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_CSV}'. Run the kagglehub download cell first "
        "or point DATA_CSV at the downloaded CSV file."
    ) from err

# Drop irrelevant columns
columns_to_drop = ['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber']
df = df.drop(columns=columns_to_drop)

# Target variable encoding: 'Yes' -> 1 (positive class), 'No' -> 0
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})
# Series.map turns any label missing from the dict into NaN; fail loudly
# instead of letting NaN targets leak into the split and the models.
if df['Attrition'].isna().any():
    raise ValueError("Unexpected value in 'Attrition'; expected only 'Yes' or 'No'.")

# Identify categorical and numerical features
categorical_features = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
# The encoded target is numeric as well, so take it out of the feature list.
numerical_features.remove('Attrition')

# Create a preprocessor for one-hot encoding and scaling
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': categories unseen during fit encode as all zeros
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ],
    # Columns not listed above are forwarded unchanged.
    remainder='passthrough'
)

# Separate features and target variable
X = df.drop('Attrition', axis=1)
y = df['Attrition']

# Split the data into training and testing sets (80/20), stratified on the
# target so both splits keep the same attrition rate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# ---  Model Building and Evaluation (Default Parameters) ---
print("---  Model Building and Evaluation (Default Parameters) ---")
print("---------------------------------------------------------------")

# Initialize a dictionary to store default model performance results
results_default = {}

# List of classifiers to apply (all seeded with the same random_state)
classifiers = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # ignores it and emits a "Parameters: { "use_label_encoder" } are not used."
    # warning on every fit.
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
    'CatBoost': CatBoostClassifier(random_state=42, verbose=0)
}

for name, classifier in classifiers.items():
    print(f"Training and Evaluating {name}...")
    # Preprocessing is part of the pipeline, so it is fitted on the training
    # split only and then applied to the test split at predict time.
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    # Calculate metrics for the positive class (Attrition == 1);
    # zero_division=0 reports 0 instead of warning when nothing is predicted positive.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary', zero_division=0)
    recall = recall_score(y_test, y_pred, average='binary', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='binary', zero_division=0)
    cm = confusion_matrix(y_test, y_pred)

    results_default[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1
    }

    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-score: {f1:.4f}")
    print("\n  Confusion Matrix:\n", cm)
    print("-" * 50)


# ---  Model Optimization (Hyperparameter Tuning) ---
print("\n--- Model Optimization (Hyperparameter Tuning) ---")
print("-------------------------------------------------------")

results_tuned = {}


def _tune_and_evaluate(name, pipeline, param_grid, X_train, y_train, X_test, y_test):
    """Grid-search `pipeline` on the training split and score the winner on the test split.

    Args:
        name: Display name of the model, used in the progress messages.
        pipeline: Pipeline whose final step is named 'classifier'.
        param_grid: Grid of pipeline parameters ('classifier__...') to search.
        X_train, y_train: Data used for the 5-fold cross-validated search.
        X_test, y_test: Held-out data used to score the refitted best estimator.

    Returns:
        A tuple (grid_search, y_pred, metrics): the fitted GridSearchCV object,
        the best estimator's test-set predictions, and a dict with the keys
        'Accuracy', 'Precision', 'Recall' and 'F1-score'.
    """
    print(f"Tuning {name}...")
    # Candidates are ranked by F1 on the positive class.
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.best_estimator_.predict(X_test)
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1-score': f1_score(y_test, y_pred, zero_division=0)
    }
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Tuned {name} F1-score: {metrics['F1-score']:.4f}")
    print("-" * 50)
    return grid_search, y_pred, metrics


# Hyperparameter tuning for Random Forest
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', RandomForestClassifier(random_state=42))])
param_grid_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_split': [2, 5]
}
grid_search_rf, y_pred_rf_tuned, results_tuned['Random Forest'] = _tune_and_evaluate(
    'Random Forest', rf_pipeline, param_grid_rf, X_train, y_train, X_test, y_test
)
best_rf_model = grid_search_rf.best_estimator_

# Hyperparameter tuning for XGBoost
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and warns "Parameters: { "use_label_encoder" } are not used.".
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', xgb.XGBClassifier(random_state=42, eval_metric='logloss'))])
param_grid_xgb = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__max_depth': [3, 5]
}
grid_search_xgb, y_pred_xgb_tuned, results_tuned['XGBoost'] = _tune_and_evaluate(
    'XGBoost', xgb_pipeline, param_grid_xgb, X_train, y_train, X_test, y_test
)
best_xgb_model = grid_search_xgb.best_estimator_


# ---  Model Evaluation and Comparison ---
print("\n---  Model Evaluation and Comparison ---")
print("-------------------------------------------")

# Combine results into a summary table: one row per model, grouped under a
# 'Default' / 'Tuned' outer index level.
summary_df = pd.DataFrame(results_default).T
summary_df_tuned = pd.DataFrame(results_tuned).T
summary_df = pd.concat([summary_df, summary_df_tuned], keys=['Default', 'Tuned'])
print("--- Summary of Model Performance ---")
print(summary_df.round(4))

# Identify the best-performing model based on F1-score.
# F1 is the metric the grid searches optimise; ranking by accuracy would
# disagree with that and mostly reward predicting the majority "No" class.
# idxmax on the two-level index yields a (configuration, model) tuple.
best_model_name = summary_df['F1-score'].idxmax()
print(f"\nBest performing model based on F1-score : {best_model_name}")
print("-" * 50)



---  Model Building and Evaluation (Default Parameters) ---
---------------------------------------------------------------
Training and Evaluating Decision Tree...
  Accuracy: 0.7721
  Precision: 0.3214
  Recall: 0.3830
  F1-score: 0.3495

  Confusion Matrix:
 [[209  38]
 [ 29  18]]
--------------------------------------------------
Training and Evaluating Random Forest...
  Accuracy: 0.8333
  Precision: 0.4286
  Recall: 0.1277
  F1-score: 0.1967

  Confusion Matrix:
 [[239   8]
 [ 41   6]]
--------------------------------------------------
Training and Evaluating AdaBoost...
  Accuracy: 0.8299
  Precision: 0.4400
  Recall: 0.2340
  F1-score: 0.3056

  Confusion Matrix:
 [[233  14]
 [ 36  11]]
--------------------------------------------------
Training and Evaluating XGBoost...
  Accuracy: 0.8571
  Precision: 0.6316
  Recall: 0.2553
  F1-score: 0.3636

  Confusion Matrix:
 [[240   7]
 [ 35  12]]
--------------------------------------------------
Training and Evaluating CatBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Accuracy: 0.8537
  Precision: 0.6667
  Recall: 0.1702
  F1-score: 0.2712

  Confusion Matrix:
 [[243   4]
 [ 39   8]]
--------------------------------------------------

--- Model Optimization (Hyperparameter Tuning) ---
-------------------------------------------------------
Tuning Random Forest...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters for Random Forest: {'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Tuned Random Forest F1-score: 0.2623
--------------------------------------------------
Tuning XGBoost...
Fitting 5 folds for each of 8 candidates, totalling 40 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best parameters for XGBoost: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 200}
Tuned XGBoost F1-score: 0.3000
--------------------------------------------------

---  Model Evaluation and Comparison ---
-------------------------------------------
--- Summary of Model Performance ---
                       Accuracy  Precision  Recall  F1-score
Default Decision Tree    0.7721     0.3214  0.3830    0.3495
        Random Forest    0.8333     0.4286  0.1277    0.1967
        AdaBoost         0.8299     0.4400  0.2340    0.3056
        XGBoost          0.8571     0.6316  0.2553    0.3636
        CatBoost         0.8537     0.6667  0.1702    0.2712
Tuned   Random Forest    0.8469     0.5714  0.1702    0.2623
        XGBoost          0.8571     0.6923  0.1915    0.3000

Best performing model based on Accuracy : ('Default', 'XGBoost')
--------------------------------------------------
