### **Machine Learning Project - Group AR**

In [38]:
# 1) Core imports
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and dtype warnings raised by later cells.
warnings.filterwarnings("ignore")


# Load dataset from GitHub commit permalink (online, reproducible).
# The URL embeds a full commit hash, so it always resolves to the same file revision.
DATA_URL = "https://raw.githubusercontent.com/Behnia02/Group_AR/631a8d5f9337e2a4b9351ffb13ca583d28459dc1/HR-Employee-Attrition.csv"

# Read the CSV straight from the URL (requires network access when the cell runs).
df = pd.read_csv(DATA_URL)

# Report where the data came from and its dimensions, then preview the first rows.
print(f"Dataset loaded online from: {DATA_URL}")
print(f"Shape: {df.shape[0]} rows x {df.shape[1]} columns")
display(df.head())

Dataset loaded online from: https://raw.githubusercontent.com/Behnia02/Group_AR/631a8d5f9337e2a4b9351ffb13ca583d28459dc1/HR-Employee-Attrition.csv
Shape: 1470 rows x 35 columns


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


#### **1. Data Cleaning**

In [39]:
# Per-column overview of the raw frame: non-null count, null count and
# number of distinct values, indexed by column name.
summary_table = pd.concat(
    [df.count(), df.isna().sum(), df.nunique()],
    axis=1,
    keys=["Total Values", "Null Values", "Unique Values"],
)

# Show every row of the overview (one row per dataset column)
summary_table

Unnamed: 0,Total Values,Null Values,Unique Values
Age,1470,0,43
Attrition,1470,0,2
BusinessTravel,1470,0,3
DailyRate,1470,0,886
Department,1470,0,3
DistanceFromHome,1470,0,29
Education,1470,0,5
EducationField,1470,0,6
EmployeeCount,1470,0,1
EmployeeNumber,1470,0,1470


From this table, we can conclude that there is no missing data in the dataset, since all variables have complete records across the 1470 observations.

**Handling redundant data**

In [40]:
# Count rows that are exact copies of an earlier row
n_duplicate_rows = df.duplicated().sum()
print("Duplicates:", n_duplicate_rows)

Duplicates: 0


In [41]:
# Redundant columns: a column with a single distinct value is constant
# across all rows, so it cannot help separate one employee from another.
is_constant = summary_table["Unique Values"].eq(1)
one_unique_vars = summary_table.loc[is_constant].index.tolist()
print(one_unique_vars)


['EmployeeCount', 'Over18', 'StandardHours']


In [42]:
# Drop columns that are not useful as features:
#   - the three constant columns reported above (one unique value each)
#   - EmployeeNumber, which has a distinct value on every row (1470 unique
#     values for 1470 rows) and so acts as a row identifier
# errors="ignore" skips labels that are already absent instead of raising,
# so re-running this cell after the columns are gone is harmless.
constant_cols = ["EmployeeCount", "Over18", "StandardHours"]
identifier_cols = ["EmployeeNumber"]
df = df.drop(columns=constant_cols + identifier_cols, errors="ignore")

**Convenience encoding**

In [43]:
# Preview the binary columns of the cleaned frame.
# (pandas is already imported in the first cell; no re-import is needed here.)

# --- Identify binary variables: exactly two distinct values ---
# PerformanceRating is skipped by name.
# NOTE(review): presumably it has only two observed levels but is a rating
# scale rather than a yes/no flag - confirm against the summary table.
distinct_counts = df.nunique()  # computed once instead of once per column
binary_cols = [
    col for col in df.columns
    if distinct_counts[col] == 2 and col != "PerformanceRating"
]
binary_vars = df[binary_cols]

# --- Pretty display of the first rows ---
binary_vars.head().style.set_properties(**{
    'border': '1px solid black',
    'padding': '4px'
}).set_table_styles([{
    'selector': 'th',
    'props': [('background-color', '#d3d3d3'),
              ('color', 'black'),
              ('font-weight', 'bold'),
              ('text-align', 'center')]
}])

Unnamed: 0,Attrition,Gender,OverTime
0,Yes,Female,Yes
1,No,Male,No
2,Yes,Male,Yes
3,No,Female,Yes
4,No,Male,No


**Transforming Binary Variables**

In [44]:


# Turn the three two-valued text columns into numeric 0/1 flag columns.
# The original text columns are kept; each flag is added as a new column.

# --- Work on a fresh copy so the new columns are added to an independent frame ---
df = df.copy()

# --- Explicit binary mappings: source column -> (flag column, label -> 0/1) ---
binary_encodings = {
    "Attrition": ("Attrition_Flag", {"Yes": 1, "No": 0}),
    "OverTime":  ("OverTime_Flag",  {"Yes": 1, "No": 0}),
    "Gender":    ("Gender_Flag",    {"Male": 1, "Female": 0}),
}

for source_col, (flag_col, mapping) in binary_encodings.items():
    df[flag_col] = df[source_col].map(mapping)
    # Series.map leaves NaN for any label that is not a key of the mapping.
    # Fail here, naming the offending labels, instead of letting NaNs leak
    # into later cells unnoticed.
    unmapped = df.loc[df[flag_col].isna(), source_col].unique().tolist()
    assert not unmapped, f"Unmapped values in {source_col!r}: {unmapped}"

# --- Styled output preview: each source column next to its flag ---
df_preview = df[[
    "Attrition", "Attrition_Flag",
    "OverTime", "OverTime_Flag",
    "Gender", "Gender_Flag",
]].head()

df_preview.style.set_properties(**{
    'border': '1px solid black',
    'padding': '4px'
}).set_table_styles([{
    'selector': 'th',
    'props': [('background-color', '#d3d3d3'),
              ('color', 'black'),
              ('font-weight', 'bold'),
              ('text-align', 'center')]
}])


Unnamed: 0,Attrition,Attrition_Flag,OverTime,OverTime_Flag,Gender,Gender_Flag
0,Yes,1,Yes,1,Female,0
1,No,0,No,0,Male,1
2,Yes,1,Yes,1,Male,1
3,No,0,Yes,1,Female,0
4,No,0,No,0,Male,1


**Missing Value Check**

In [45]:

# Report whether any cell of the frame (including the new flag columns) is null
has_missing = df.isna().to_numpy().any()
status_message = (
    "There are missing values in the dataset."
    if has_missing
    else "No missing values found"
)
print(status_message)


No missing values found
