### **Machine Learning Project - Group AR**

In [89]:
# 1) Core imports
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Suppress every warning category from this point on
warnings.filterwarnings("ignore")


# Data source: GitHub permalink pinned to one commit, so each run reads the same file
DATA_URL = "https://raw.githubusercontent.com/Behnia02/Group_AR/631a8d5f9337e2a4b9351ffb13ca583d28459dc1/HR-Employee-Attrition.csv"

df = pd.read_csv(DATA_URL)
row_count, col_count = df.shape

print(f"Dataset loaded online from: {DATA_URL}")
print(f"Shape: {row_count} rows x {col_count} columns")
display(df.head())

Dataset loaded online from: https://raw.githubusercontent.com/Behnia02/Group_AR/631a8d5f9337e2a4b9351ffb13ca583d28459dc1/HR-Employee-Attrition.csv
Shape: 1470 rows x 35 columns


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


#### **1. Data Cleaning**

In [90]:
# Per-column overview: non-null count, null count and number of distinct values
null_counts = df.isna().sum()
summary_table = pd.DataFrame(
    {
        "Total Values": df.count(),
        "Null Values": null_counts,
        "Unique Values": df.nunique(),
    }
)

# Show the complete overview (one row per column of df)
summary_table

Unnamed: 0,Total Values,Null Values,Unique Values
Age,1470,0,43
Attrition,1470,0,2
BusinessTravel,1470,0,3
DailyRate,1470,0,886
Department,1470,0,3
DistanceFromHome,1470,0,29
Education,1470,0,5
EducationField,1470,0,6
EmployeeCount,1470,0,1
EmployeeNumber,1470,0,1470


From this table, we can conclude that there is no missing data in the dataset, since all variables have complete records across the 1470 observations.

**Handling redundant data**

In [91]:
# Count rows that are exact duplicates of an earlier row
n_duplicate_rows = df.duplicated().sum()
print("Duplicates:", n_duplicate_rows)

Duplicates: 0


In [92]:
# Redundant columns: variables whose "Unique Values" count is exactly 1

is_constant = summary_table["Unique Values"].eq(1)
one_unique_vars = summary_table.index[is_constant].tolist()
print(one_unique_vars)


['EmployeeCount', 'Over18', 'StandardHours']


In [93]:
# Drop the single-valued columns detected in `one_unique_vars` plus the
# EmployeeNumber ID column (1470 unique values over 1470 rows).
# The drop list is derived from the computed list instead of being retyped by
# hand, so it cannot drift from what the summary table actually reports.
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
ID_COLUMNS = ["EmployeeNumber"]
columns_to_drop = one_unique_vars + ID_COLUMNS
df = df.drop(columns=columns_to_drop, errors="ignore")

**Convenience encoding**

In [94]:
# Preview the binary variables (columns with exactly two distinct values).
# PerformanceRating is excluded by name - presumably because it is treated as an
# ordinal rating (it is listed under ordinal_vars further down); confirm with authors.
# Over18 no longer appears here: it was dropped earlier together with the other
# single-valued columns.
binary_cols = [
    col for col in df.columns
    if df[col].nunique() == 2 and col != "PerformanceRating"
]
binary_vars = df[binary_cols]

# Styled display of the first rows: bordered cells, grey bold centred headers
binary_vars.head().style.set_properties(**{
    'border': '1px solid black',
    'padding': '4px'
}).set_table_styles([{
    'selector': 'th',
    'props': [('background-color', '#d3d3d3'),
              ('color', 'black'),
              ('font-weight', 'bold'),
              ('text-align', 'center')]
}])

Unnamed: 0,Attrition,Gender,OverTime
0,Yes,Female,Yes
1,No,Male,No
2,Yes,Male,Yes
3,No,Female,Yes
4,No,Male,No


**Transforming Binary Variables**

In [95]:


# Turn the three two-level text columns into numeric 0/1 flag columns.
# Defensive copy before adding columns; df is rebound, so the pre-encoding
# frame is not kept under a separate name.
df = df.copy()

# Explicit label -> flag mappings, keyed by source column
binary_mappings = {
    "Attrition": {"Yes": 1, "No": 0},
    "OverTime": {"Yes": 1, "No": 0},
    "Gender": {"Male": 1, "Female": 0},
}

for source_col, mapping in binary_mappings.items():
    flag_col = f"{source_col}_Flag"
    df[flag_col] = df[source_col].map(mapping)
    # Series.map yields NaN for any label missing from the mapping; stop here
    # rather than letting NaN flags flow silently into later cells.
    unmapped = df.loc[df[flag_col].isna(), source_col].unique().tolist()
    assert not unmapped, f"Unexpected values in {source_col}: {unmapped}"

# Styled preview: each original column next to its new flag column
preview_cols = [
    name
    for source_col in binary_mappings
    for name in (source_col, f"{source_col}_Flag")
]
df_preview = df[preview_cols].head()

df_preview.style.set_properties(**{
    'border': '1px solid black',
    'padding': '4px'
}).set_table_styles([{
    'selector': 'th',
    'props': [('background-color', '#d3d3d3'),
              ('color', 'black'),
              ('font-weight', 'bold'),
              ('text-align', 'center')]
}])


Unnamed: 0,Attrition,Attrition_Flag,OverTime,OverTime_Flag,Gender,Gender_Flag
0,Yes,1,Yes,1,Female,0
1,No,0,No,0,Male,1
2,Yes,1,Yes,1,Male,1
3,No,0,Yes,1,Female,0
4,No,0,No,0,Male,1


In [96]:
# Remove the text columns that now have a *_Flag counterpart
encoded_source_cols = ["Attrition", "OverTime", "Gender"]
df = df.drop(columns=encoded_source_cols)

**Missing Value Check**

In [97]:

# Report whether any cell of df is missing
has_missing = bool(df.isna().to_numpy().any())
print("There are missing values in the dataset." if has_missing else "No missing values found")


No missing values found


In [98]:
# Sanity check: remove the column display limit (global pandas option), then preview
pd.options.display.max_columns = None
df.head(5)

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition_Flag,OverTime_Flag,Gender_Flag
0,41,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,94,3,2,Sales Executive,4,Single,5993,19479,8,11,3,1,0,8,0,1,6,4,0,5,1,1,0
1,49,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,61,2,2,Research Scientist,2,Married,5130,24907,1,23,4,4,1,10,3,3,10,7,1,7,0,0,1
2,37,Travel_Rarely,1373,Research & Development,2,2,Other,4,92,2,1,Laboratory Technician,3,Single,2090,2396,6,15,3,2,0,7,3,3,0,0,0,0,1,1,1
3,33,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,56,3,1,Research Scientist,3,Married,2909,23159,1,11,3,3,0,8,3,3,8,7,3,0,0,1,0
4,27,Travel_Rarely,591,Research & Development,2,1,Medical,1,40,3,1,Laboratory Technician,2,Married,3468,16632,9,12,3,4,1,6,3,3,2,2,2,2,0,0,1


**Classification of the variables**

*Classification by type*

In [99]:
# Variable groups by measurement type. These are plain lists of column names;
# nothing here reads df.

# TARGET
target = "Attrition_Flag"

# BINARY VARIABLES (mapped to 0/1 flags)
binary_vars = [
    "Attrition_Flag",   # Target
    "OverTime_Flag",
    "Gender_Flag"
]

# CATEGORICAL VARIABLES (nominal)
categorical_vars = [
    "BusinessTravel",
    "Department",
    "EducationField",
    "JobRole",
    "MaritalStatus"
]

# ORDINAL / DISCRETE NUMERIC
ordinal_vars = [
    "Education",
    "EnvironmentSatisfaction",
    "JobInvolvement",
    "JobLevel",
    "JobSatisfaction",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "StockOptionLevel",
    "TrainingTimesLastYear",
    "WorkLifeBalance"
]

# CONTINUOUS NUMERIC
continuous_vars = [
    "Age",
    "DailyRate",
    "DistanceFromHome",
    "HourlyRate",
    "MonthlyIncome",
    "MonthlyRate",
    "NumCompaniesWorked",
    "PercentSalaryHike",
    "TotalWorkingYears",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager"
]

# FINAL FEATURE LIST (FOR MODELING)
# The target is excluded by name rather than by list position, so reordering
# binary_vars can never leak the target into the features.
feature_vars = (
    [var for var in binary_vars if var != target] +
    categorical_vars +
    ordinal_vars +
    continuous_vars
)

*Classification by clusters*

In [100]:
# Thematic grouping of the variables; each list holds column names only.

# Target column
target = "Attrition_Flag"

# Demographics
demographic_vars = [
    "Age", "Gender_Flag", "MaritalStatus",
    "Education", "EducationField", "DistanceFromHome",
]

# Compensation & financial incentives
compensation_vars = [
    "MonthlyIncome", "DailyRate", "HourlyRate", "MonthlyRate",
    "PercentSalaryHike", "StockOptionLevel", "JobLevel",
]

# Job structure & role characteristics
job_structure_vars = [
    "Department", "JobRole", "BusinessTravel",
    "OverTime_Flag", "JobInvolvement",
]

# Career progression & tenure
career_progression_vars = [
    "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
    "YearsSinceLastPromotion", "YearsWithCurrManager",
    "NumCompaniesWorked", "TrainingTimesLastYear",
]

# Workplace satisfaction & environment
satisfaction_vars = [
    "JobSatisfaction", "EnvironmentSatisfaction", "RelationshipSatisfaction",
    "WorkLifeBalance", "PerformanceRating",
]



In [101]:
# TODO(review): the authors flagged this cell "TO DELETE" - remove it before
# submission, or keep it as the enforced check below.

# Sanity check: the feature list must not contain the same name twice.
# Enforced with an assertion so a duplicate stops the run instead of relying on
# someone comparing the two numbers by eye.
assert len(feature_vars) == len(set(feature_vars)), "feature_vars contains duplicate names"

# (total entries, distinct entries)
len(feature_vars), len(set(feature_vars))


(30, 30)

**Metadata table of the included variables**

In [102]:
# Metadata table based on current df only

def infer_var_type(series):
    """Classify a column as "Binary", "Ordinal", "Numerical" or "Nominal".

    Parameters
    ----------
    series : pandas.Series
        The column to classify.

    Returns
    -------
    str
        "Nominal"   - the dtype is not numeric;
        "Binary"    - numeric, exactly two distinct values, all of them 0 or 1;
        "Ordinal"   - numeric with at most 10 distinct values (and not Binary);
        "Numerical" - numeric with more than 10 distinct values.
    """
    if not pd.api.types.is_numeric_dtype(series):
        return "Nominal"

    n_unique = series.nunique()  # NaN is not counted

    # Two distinct values alone do not make a 0/1 flag: a rating column in which
    # only two levels happen to occur (e.g. 3 and 4) is still a rating, so also
    # require the observed values to be 0/1.
    if n_unique == 2 and series.dropna().isin([0, 1]).all():
        return "Binary"
    if n_unique <= 10:
        return "Ordinal"
    return "Numerical"

# Short human-readable description per column name.
# The metadata table looks entries up with descriptions.get(col, "-"), so a
# column of df without an entry is shown as "-", and an entry whose column is
# not in df is simply never used.
descriptions = {
    "Age": "Employee age in years",
    "Attrition_Flag": "Binary target: attrition",
    "BusinessTravel": "Frequency of business travel",
    "DailyRate": "Daily rate of employee",
    "Department": "Department of the employee",
    "DistanceFromHome": "Distance from home to office",
    "Education": "Education level (1-5)",
    "EducationField": "Field of education",
    "EnvironmentSatisfaction": "Satisfaction with work environment (1-4)",
    "Gender_Flag": "Binary gender flag",
    "HourlyRate": "Hourly pay rate",
    "JobInvolvement": "Job involvement (1-4)",
    "JobLevel": "Job hierarchy level",
    "JobRole": "Role or position in company",
    "JobSatisfaction": "Job satisfaction (1-4)",
    "MaritalStatus": "Marital status of employee",
    "MonthlyIncome": "Monthly salary",
    "MonthlyRate": "Monthly rate for payroll purposes",
    "NumCompaniesWorked": "Number of previous companies worked",
    "OverTime_Flag": "Binary overtime flag",
    "PercentSalaryHike": "Percentage salary increase",
    "PerformanceRating": "Performance rating (1-4)",
    "RelationshipSatisfaction": "Relationship satisfaction (1-4)",
    "StockOptionLevel": "Stock option level",
    "TotalWorkingYears": "Total years of work experience",
    "TrainingTimesLastYear": "Number of trainings attended last year",
    "WorkLifeBalance": "Work-life balance rating (1-4)",
    "YearsAtCompany": "Years spent at current company",
    "YearsInCurrentRole": "Years spent in current role",
    "YearsSinceLastPromotion": "Years since last promotion",
    "YearsWithCurrManager": "Years working with current manager"
}

# One metadata record per column currently in df
metadata_columns = ["Variable Name", "Data Type", "Variable Type", "Description"]
metadata_records = [
    {
        "Variable Name": col,
        "Data Type": str(df[col].dtype),
        "Variable Type": infer_var_type(df[col]),
        "Description": descriptions.get(col, "-"),  # "-" when no description exists
    }
    for col in df.columns
]
metadata_df = pd.DataFrame(metadata_records, columns=metadata_columns)

# Presentation: bordered cells, grey bold centred headers, row index hidden
cell_style = {'border': '1px solid black', 'padding': '4px'}
header_style = {
    'selector': 'th',
    'props': [('background-color', '#d3d3d3'),
              ('color', 'black'),
              ('font-weight', 'bold'),
              ('text-align', 'center')]
}
metadata_styled = (
    metadata_df.style
    .set_properties(**cell_style)
    .set_table_styles([header_style])
    .hide(axis="index")
)

metadata_styled

Variable Name,Data Type,Variable Type,Description
Age,int64,Numerical,Employee age in years
BusinessTravel,str,Nominal,Frequency of business travel
DailyRate,int64,Numerical,Daily rate of employee
Department,str,Nominal,Department of the employee
DistanceFromHome,int64,Numerical,Distance from home to office
Education,int64,Ordinal,Education level (1-5)
EducationField,str,Nominal,Field of education
EnvironmentSatisfaction,int64,Ordinal,Satisfaction with work environment (1-4)
HourlyRate,int64,Numerical,Hourly pay rate
JobInvolvement,int64,Ordinal,Job involvement (1-4)


#### **2. EDA**

**Descriptive Statistics**

In [None]:
# Descriptive statistics for the numeric columns, plus missing-value counts
num_cols = df.select_dtypes(include="number").columns.tolist()
numeric_data = df[num_cols]
missing_counts = numeric_data.isna().sum()

# describe() is transposed so each variable is a row; missing count and
# percentage (relative to the number of rows in df) are appended as columns
numeric_summary = numeric_data.describe().T.assign(
    missing=missing_counts,
    missing_pct=(missing_counts / len(df) * 100).round(2),
)

numeric_summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing,missing_pct
Age,1470.0,36.92381,9.135373,18.0,30.0,36.0,43.0,60.0,0,0.0
DailyRate,1470.0,802.485714,403.5091,102.0,465.0,802.0,1157.0,1499.0,0,0.0
DistanceFromHome,1470.0,9.192517,8.106864,1.0,2.0,7.0,14.0,29.0,0,0.0
Education,1470.0,2.912925,1.024165,1.0,2.0,3.0,4.0,5.0,0,0.0
EnvironmentSatisfaction,1470.0,2.721769,1.093082,1.0,2.0,3.0,4.0,4.0,0,0.0
HourlyRate,1470.0,65.891156,20.329428,30.0,48.0,66.0,83.75,100.0,0,0.0
JobInvolvement,1470.0,2.729932,0.711561,1.0,2.0,3.0,3.0,4.0,0,0.0
JobLevel,1470.0,2.063946,1.10694,1.0,1.0,2.0,3.0,5.0,0,0.0
JobSatisfaction,1470.0,2.728571,1.102846,1.0,2.0,3.0,4.0,4.0,0,0.0
MonthlyIncome,1470.0,6502.931293,4707.956783,1009.0,2911.0,4919.0,8379.0,19999.0,0,0.0


**Categorical distributions**

In [105]:
# Frequency table of the non-numeric columns: up to the 10 most frequent
# levels per variable, with missing values counted as their own level
cat_cols = df.select_dtypes(exclude="number").columns.tolist()
total_rows = len(df)

cat_summary = [
    {
        "Variable": col,
        "Level": str(level),
        "Count": int(count),
        "Percent": round(count / total_rows * 100, 2),
    }
    for col in cat_cols
    for level, count in df[col].value_counts(dropna=False).head(10).items()
]

cat_summary_df = pd.DataFrame(cat_summary)
cat_summary_df

Unnamed: 0,Variable,Level,Count,Percent
0,BusinessTravel,Travel_Rarely,1043,70.95
1,BusinessTravel,Travel_Frequently,277,18.84
2,BusinessTravel,Non-Travel,150,10.2
3,Department,Research & Development,961,65.37
4,Department,Sales,446,30.34
5,Department,Human Resources,63,4.29
6,EducationField,Life Sciences,606,41.22
7,EducationField,Medical,464,31.56
8,EducationField,Marketing,159,10.82
9,EducationField,Technical Degree,132,8.98
