In [1]:
# ================================
# HR Attrition - Data Cleaning Script
# ================================

import pandas as pd
import numpy as np
import os

# 1. Load the dataset
# NOTE: this is an absolute, machine-specific location of the raw CSV;
# point it at your own copy when running elsewhere.
file_path = r"C:\HR-Attrition-Analytics-project\WA_Fn-UseC_-HR-Employee-Attrition.csv"

df = pd.read_csv(filepath_or_buffer=file_path)

# Quick look at what was loaded: dimensions, column names, sample rows.
print("🔹 Original Data Shape:", df.shape)
print("🔹 Columns:", df.columns.tolist())
print("\nFirst 5 rows:\n", df.head(5))

# 2. Basic dataset info
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the summary.
df.info()

# Per-column count of missing (NaN/None) cells before any cleaning.
print("\nMissing Values Count:")
print(df.isnull().sum())

# 3. Handle missing values
# Only columns that actually contain missing cells are touched; filling a
# complete column is a no-op, so skipping it changes nothing.
for col in df.columns[df.isnull().any().to_numpy()]:
    column = df[col]
    # Detect numeric columns by kind rather than by the exact dtype name, so
    # int32/float32/nullable-integer columns also get the median. Booleans are
    # excluded because a median is not meaningful for them.
    is_numeric = pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column)
    if is_numeric:
        # Fill numeric NaN with median
        fill_value = column.median()
        if pd.isna(fill_value):
            # Column is entirely missing: there is no median to fill with.
            continue
    else:
        # Fill categorical NaN with mode
        modes = column.mode()
        if modes.empty:
            # Column is entirely missing: mode() is empty, so indexing it
            # would raise. Leave the column as is.
            continue
        fill_value = modes.iloc[0]
    df[col] = column.fillna(fill_value)

# 4. Remove duplicate rows
# Row counts are taken on either side of the de-duplication so the number of
# dropped rows can be reported. The first occurrence of each row is kept.
before = len(df)
df = df.drop_duplicates(keep="first")
after = len(df)
print(f"\n✅ Removed {before - after} duplicate rows.")

# 5. Standardize column names (lowercase + underscores)
# Surrounding whitespace is trimmed, names are lowercased, and every space or
# hyphen becomes an underscore. CamelCase names are only lowercased; no
# underscore is inserted at word boundaries.
df.columns = df.columns.str.strip().str.lower().str.replace(r"[ -]", "_", regex=True)

# 6. Convert object columns to category for efficiency
# Build a {column: "category"} mapping for every object-dtype column and
# convert them all in a single astype call.
df = df.astype(
    dict.fromkeys(df.select_dtypes(include=["object"]).columns, "category")
)

# 7. Verify cleaning
print("\n🔹 Cleaned Data Info:")
# DataFrame.info() prints its own summary and returns None, so it is called
# directly rather than passed to print() (which would add a "None" line).
df.info()
print("\nMissing Values After Cleaning:")
# Total number of missing cells across the whole frame.
print(df.isnull().sum().sum())  # should be 0

# 8. Ensure processed folder exists
output_path = "data/processed/HR_Attrition_Cleaned.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# 9. Save cleaned dataset
# Two copies are written without the index column: one in the current working
# directory and one under data/processed.
for destination in ("HR_Attrition_Cleaned.csv", output_path):
    df.to_csv(destination, index=False)

print("\n🎉 Data Cleaning Completed!")
print("✅ Cleaned dataset saved at:", output_path)
print("✅ Final Shape:", df.shape)

# A further copy is written to the working directory under a "_fixed" name.
df.to_csv("HR_Attrition_Cleaned_fixed.csv", index=False)




🔹 Original Data Shape: (1470, 35)
🔹 Columns: ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

First 5 rows:
    Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel