
# Day 4 – Employee Attrition Dataset Preprocessing

This notebook covers:

1. Categorical Encoding (Label Encoding & One-Hot Encoding)
2. Feature Scaling (Min-Max & Standardization)
3. Feature Engineering
4. Train-Test Split
5. Target Variable Separation
6. Mini Preprocessing Pipeline
7. Notes


In [None]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

# Read the cleaned attrition CSV; the relative path is resolved against
# the notebook's working directory.
DATA_PATH = "cleaned_employee_attrition.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows of the loaded frame
df.head()


In [None]:

# Convert column names to lowercase so every later cell can use one spelling
df.columns = df.columns.str.lower()

# Work on a copy so the loaded frame `df` keeps its original values
df_processed = df.copy()

# Label Encoding (Binary Columns)
# A fresh encoder is fitted per column and stored in `label_encoders`.
# Re-fitting one shared LabelEncoder would overwrite its mapping on every
# iteration, leaving only the last column's classes available afterwards.
label_cols = ["gender", "everbenched"]
label_encoders = {}

for col in label_cols:
    le = LabelEncoder()
    df_processed[col] = le.fit_transform(df_processed[col])
    # Keep the fitted mapping, e.g. label_encoders["gender"].classes_ or
    # label_encoders["gender"].inverse_transform(...)
    label_encoders[col] = le

# One-Hot Encoding (Multi-category Columns)
# drop_first=True omits one indicator column per feature to avoid a
# redundant (perfectly collinear) dummy.
df_processed = pd.get_dummies(df_processed, columns=["city", "education"], drop_first=True)

df_processed.head()


In [None]:

# Feature scaling demo: add Min-Max and standardized versions of two columns.
minmax_scaler = MinMaxScaler()      # Min-Max Scaling
standard_scaler = StandardScaler()  # Standardization

# (scaler, source column, new column) -- processed top to bottom, so the new
# columns are appended in this order.
scaling_plan = [
    (minmax_scaler, "age", "age_minmax"),
    (minmax_scaler, "experienceincurrentdomain", "experience_minmax"),
    (standard_scaler, "age", "age_standardized"),
    (standard_scaler, "experienceincurrentdomain", "experience_standardized"),
]

for scaler_obj, source_col, target_col in scaling_plan:
    # fit_transform re-fits the scaler on each call, so after the loop each
    # scaler holds the statistics of the last column it was given.
    df_processed[target_col] = scaler_obj.fit_transform(df_processed[[source_col]])

df_processed.head()


In [None]:

# Age Groups
def age_group(age):
    """Return a coarse age label for a single age value.

    Ages below 30 map to "Young", ages below 40 map to "Mid", and
    everything else maps to "Senior".
    """
    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, label in ((30, "Young"), (40, "Mid")):
        if age < upper_bound:
            return label
    return "Senior"

# Label each employee's age with its group; ages are read from the loaded frame `df`.
age_values = df["age"]
df_processed["agegroup"] = age_values.apply(age_group)

# Experience Buckets
def exp_bucket(exp):
    """Return an experience label for a single experience value.

    Values below 3 map to "Junior", values below 7 map to "MidLevel",
    and everything else maps to "SeniorLevel".
    """
    if exp < 3:
        return "Junior"
    return "MidLevel" if exp < 7 else "SeniorLevel"

# Bucket each employee's domain experience; values are read from the loaded frame `df`.
experience_values = df["experienceincurrentdomain"]
df_processed["experiencebucket"] = experience_values.apply(exp_bucket)

# Years in Company
# Tenure is measured against a fixed reference year, so re-running the
# notebook later gives the same numbers.
current_year = 2024
joining_years = df["joiningyear"]
df_processed["yearsincompany"] = current_year - joining_years

df_processed.head()


In [None]:

# Separate Features and Target
y = df_processed["leaveornot"]
X = df_processed.drop(columns="leaveornot")

# One-hot encode any text/categorical columns still present in the features
# (the engineered "agegroup" and "experiencebucket" labels are strings).
# scikit-learn estimators need numeric input, so leaving them as text would
# make X_train unusable for model fitting. Frames without such columns pass
# through unchanged.
X = pd.get_dummies(X, drop_first=True)

# Train-Test Split (80-20)
# stratify=y keeps the class proportions of the target the same in both
# splits -- assumes "leaveornot" is a class label; TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


In [None]:

# Mini Preprocessing Pipeline
# Order matters: the data is split BEFORE scaling so the scaler only ever
# learns statistics from the training rows (no test-set leakage).

df_pipeline = df.copy()

# 1. Handle Missing Values
# Only numeric columns are filled (with their column median); gaps in
# non-numeric columns are left untouched by this step.
df_pipeline = df_pipeline.fillna(df_pipeline.median(numeric_only=True))

# 2. Encode Categorical
df_pipeline["gender"] = LabelEncoder().fit_transform(df_pipeline["gender"])
df_pipeline["everbenched"] = LabelEncoder().fit_transform(df_pipeline["everbenched"])
df_pipeline = pd.get_dummies(df_pipeline, columns=["city", "education"], drop_first=True)

# 3. Split
X_pipeline = df_pipeline.drop("leaveornot", axis=1)
y_pipeline = df_pipeline["leaveornot"]

# stratify keeps the target's class proportions equal in train and test --
# assumes "leaveornot" is a class label; TODO confirm.
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_pipeline, y_pipeline, test_size=0.2, random_state=42, stratify=y_pipeline
)

# 4. Scale Numerical
scaler = StandardScaler()
num_cols = ["age", "experienceincurrentdomain"]

# Explicit copies so the column assignments below modify independent frames
# and do not trigger pandas' SettingWithCopyWarning.
X_train_p = X_train_p.copy()
X_test_p = X_test_p.copy()

# Fit on the training rows only, then reuse those statistics for the test rows.
X_train_p[num_cols] = scaler.fit_transform(X_train_p[num_cols])
X_test_p[num_cols] = scaler.transform(X_test_p[num_cols])

print("Pipeline Train Shape:", X_train_p.shape)
print("Pipeline Test Shape:", X_test_p.shape)



# Notes

## What is Feature Engineering?
Feature engineering is the process of creating new meaningful features 
from existing variables to improve model performance.

## Why Is Preprocessing Important?
- Converts categorical data into numeric form
- Handles missing values
- Scales features for better model convergence
- Improves model performance and stability

## What Confused Me Today?
- Label Encoding vs One-Hot Encoding usage
- MinMaxScaler vs StandardScaler difference
- Proper order of scaling and splitting
