### The ordinal categorical columns are stored as integers here, which is a problem when reading values from the UI: the front-end engineer would need to know which integer corresponds to which label. It is better to map the integers to their labels ourselves here.

This will also help us with data visualization later on.

In [32]:
# Library for Data Manipulation
import numpy as np
import pandas as pd

# Library for Statistical Modelling
from sklearn.preprocessing import LabelEncoder

# Library for Ignore the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# Show every column when a DataFrame is displayed: max_columns=None removes
# pandas' column cap, so wide frames are not truncated in the output.
pd.set_option('display.max_columns', None)

%matplotlib inline


In [33]:
# Read the raw IBM HR attrition CSV into a DataFrame
employee_data = pd.read_csv(
    filepath_or_buffer='../data/IBM-HR-Analytics-Employee-Attrition-and-Performance.csv'
)

# Distinct values of one of the integer-coded ordinal columns
employee_data.loc[:, 'StockOptionLevel'].unique()

array([0, 1, 3, 2], dtype=int64)

Modifying the Categorical Ordinal Columns

In [34]:
# Distinct PerformanceRating values currently present in the frame
employee_data.loc[:, "PerformanceRating"].unique()

array([3, 4], dtype=int64)

In [26]:
# Integer code -> human-readable label for every ordinal column, kept in one
# place so the exact same mapping can be reused elsewhere in the notebook.

# 4-point scale shared by several of the columns below.
LOW_TO_VERY_HIGH = {1: "Low", 2: "Medium", 3: "High", 4: "Very High"}

ORDINAL_LABELS = {
    "Education": {
        1: "Below College",
        2: "College",
        3: "Bachelor",
        4: "Master",
        5: "Doctor",
    },
    "EnvironmentSatisfaction": LOW_TO_VERY_HIGH,
    "JobInvolvement": LOW_TO_VERY_HIGH,
    "JobLevel": {
        1: "Entry Level",
        2: "Junior Level",
        3: "Mid Level",
        4: "Senior Level",
        5: "Executive Level",
    },
    "JobSatisfaction": LOW_TO_VERY_HIGH,
    "PerformanceRating": {1: "Low", 2: "Good", 3: "Excellent", 4: "Outstanding"},
    "RelationshipSatisfaction": LOW_TO_VERY_HIGH,
    "WorkLifeBalance": {1: "Bad", 2: "Good", 3: "Better", 4: "Best"},
    "StockOptionLevel": {
        0: "No Stock Options",
        1: "Low Stock Options",
        2: "Medium Stock Options",
        3: "High Stock Options",
    },
}

# Series.replace leaves values that are not keys of the mapping untouched, so
# running this cell again on already-labelled data changes nothing.
for column, labels in ORDINAL_LABELS.items():
    employee_data[column] = employee_data[column].replace(labels)

In [27]:
# First five rows of the relabelled column, kept as a one-column DataFrame
employee_data.head(n=5).loc[:, ['StockOptionLevel']]

Unnamed: 0,StockOptionLevel
0,No Stock Options
1,Low Stock Options
2,No Stock Options
3,No Stock Options
4,Low Stock Options


There is another ordinal column: "NumCompaniesWorked". It takes a discrete set of values and has a natural order. There is no need to change it, though.

In [28]:
# Eyeball five randomly chosen rows of the frame
employee_data.sample(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
394,31,No,Travel_Rarely,480,Research & Development,7,College,Medical,1,524,Medium,Female,31,High,Junior Level,Manufacturing Director,Low,Married,4306,4156,1,Y,No,12,Excellent,Medium,80,Low Stock Options,13,5,Bad,13,10,3,12
435,33,Yes,Travel_Rarely,1277,Research & Development,15,Below College,Medical,1,582,Medium,Male,56,High,Mid Level,Manager,High,Married,13610,24619,7,Y,Yes,12,Excellent,Very High,80,No Stock Options,15,2,Best,7,6,7,7
907,44,No,Travel_Rarely,1099,Sales,5,Bachelor,Marketing,1,1267,Medium,Male,88,High,Executive Level,Manager,Medium,Married,18213,8751,7,Y,No,11,Excellent,High,80,Low Stock Options,26,5,Better,22,9,3,10
581,30,No,Travel_Rarely,921,Research & Development,1,Bachelor,Life Sciences,1,806,Very High,Male,38,Low,Entry Level,Laboratory Technician,High,Married,3833,24375,3,Y,No,21,Outstanding,High,80,Medium Stock Options,7,2,Better,2,2,0,2
1158,37,No,Travel_Rarely,671,Research & Development,19,Bachelor,Life Sciences,1,1631,High,Male,85,High,Junior Level,Manufacturing Director,High,Married,5768,26493,3,Y,No,17,Excellent,Low,80,High Stock Options,9,2,Good,4,3,0,2


Removing Unnecessary Columns from the Table

In [29]:
# Remove the columns listed below from the working frame.
# Reassigning the result (rather than inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: an in-place drop raises
# KeyError on a second execution because the columns are already gone.
employee_data = employee_data.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'Over18'],
    errors="ignore",
)
employee_data.sample(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
172,36,No,Travel_Frequently,1480,Research & Development,3,College,Medical,Very High,Male,30,High,Entry Level,Laboratory Technician,Medium,Single,2088,15062,4,No,12,Excellent,High,80,No Stock Options,13,3,Good,8,7,7,2
18,53,No,Travel_Rarely,1219,Sales,2,Master,Life Sciences,Low,Female,78,Medium,Senior Level,Manager,Very High,Married,15427,22021,2,No,16,Excellent,High,80,No Stock Options,31,3,Better,25,8,3,7
343,29,No,Travel_Rarely,144,Sales,10,Below College,Marketing,Very High,Female,39,Medium,Junior Level,Sales Executive,Medium,Divorced,8268,11866,1,Yes,14,Excellent,Low,80,Medium Stock Options,7,2,Better,7,7,1,7
225,59,No,Travel_Rarely,142,Research & Development,3,Bachelor,Life Sciences,High,Male,70,Medium,Entry Level,Research Scientist,Very High,Married,2177,8456,3,No,17,Excellent,Low,80,Low Stock Options,7,6,Better,1,0,0,0
706,40,Yes,Non-Travel,1479,Sales,24,Bachelor,Life Sciences,Medium,Female,100,Very High,Senior Level,Sales Executive,Medium,Single,13194,17071,4,Yes,16,Excellent,Very High,80,No Stock Options,22,2,Good,1,0,0,0


In [30]:
# Draw a single random row from the frame
train_df = employee_data.sample(n=1)

# pop() hands back the "Attrition" column and removes it from train_df
target = train_df.pop("Attrition")

# One dict per row; only one row was sampled, so print that record
train_dict = train_df.to_dict(orient="records")
print(train_dict[0])

{'Age': 27, 'BusinessTravel': 'Travel_Rarely', 'DailyRate': 1302, 'Department': 'Research & Development', 'DistanceFromHome': 19, 'Education': 'Bachelor', 'EducationField': 'Other', 'EnvironmentSatisfaction': 'Very High', 'Gender': 'Male', 'HourlyRate': 67, 'JobInvolvement': 'Medium', 'JobLevel': 'Entry Level', 'JobRole': 'Laboratory Technician', 'JobSatisfaction': 'Low', 'MaritalStatus': 'Divorced', 'MonthlyIncome': 4066, 'MonthlyRate': 16290, 'NumCompaniesWorked': 1, 'OverTime': 'No', 'PercentSalaryHike': 11, 'PerformanceRating': 'Excellent', 'RelationshipSatisfaction': 'Low', 'StandardHours': 80, 'StockOptionLevel': 'Medium Stock Options', 'TotalWorkingYears': 7, 'TrainingTimesLastYear': 3, 'WorkLifeBalance': 'Better', 'YearsAtCompany': 7, 'YearsInCurrentRole': 7, 'YearsSinceLastPromotion': 0, 'YearsWithCurrManager': 7}


In [31]:
# Distinct PerformanceRating values currently present in the frame
employee_data.PerformanceRating.unique()

array(['Excellent', 'Outstanding'], dtype=object)

In [23]:
from src.components.fieldsinfo_dataclass import FieldsInfo

info = FieldsInfo()

# Categorical and ordinal columns: record the distinct values observed in the
# data, in order of first appearance.
unique_values_dict = {
    column: list(employee_data[column].unique())
    for column in (*info.categorical_columns, *info.ordinal_columns)
}

# Numerical columns are described by the type marker "int" only.
unique_values_dict.update(dict.fromkeys(info.numerical_columns, "int"))

# NumCompaniesWorked is always described as "int", overriding any value list
# collected for it above.
unique_values_dict["NumCompaniesWorked"] = "int"

print(unique_values_dict)


{'BusinessTravel': ['Travel_Rarely', 'Travel_Frequently', 'Non-Travel'], 'Department': ['Sales', 'Research & Development', 'Human Resources'], 'EducationField': ['Life Sciences', 'Other', 'Medical', 'Marketing', 'Technical Degree', 'Human Resources'], 'Gender': ['Female', 'Male'], 'JobRole': ['Sales Executive', 'Research Scientist', 'Laboratory Technician', 'Manufacturing Director', 'Healthcare Representative', 'Manager', 'Sales Representative', 'Research Director', 'Human Resources'], 'MaritalStatus': ['Single', 'Married', 'Divorced'], 'OverTime': ['Yes', 'No'], 'Education': ['College', 'Below College', 'Master', 'Bachelor', 'Doctor'], 'EnvironmentSatisfaction': ['Medium', 'High', 'Very High', 'Low'], 'JobInvolvement': ['High', 'Medium', 'Very High', 'Low'], 'JobLevel': ['Junior Level', 'Entry Level', 'Mid Level', 'Senior Level', 'Executive Level'], 'JobSatisfaction': ['Very High', 'Medium', 'High', 'Low'], 'NumCompaniesWorked': 'int', 'PerformanceRating': ['Excellent', 'Outstanding']

Export the DataFrame to a CSV file

In [7]:
# Assuming employee_data is your DataFrame
# Write the current frame to disk, leaving out the index column
employee_data.to_csv(
    path_or_buf='../data/refined_employee_data.csv',
    index=False,
)

In [None]:
# {
#     "BusinessTravel": ["Travel_Rarely", "Travel_Frequently", "Non-Travel"],
#     "Department": ["Sales", "Research & Development", "Human Resources"],
#     "EducationField": [
#         "Life Sciences",
#         "Other",
#         "Medical",
#         "Marketing",
#         "Technical Degree",
#         "Human Resources",
#     ],
#     "Gender": ["Female", "Male"],
#     "JobRole": [
#         "Sales Executive",
#         "Research Scientist",
#         "Laboratory Technician",
#         "Manufacturing Director",
#         "Healthcare Representative",
#         "Manager",
#         "Sales Representative",
#         "Research Director",
#         "Human Resources",
#     ],
#     "MaritalStatus": ["Single", "Married", "Divorced"],
#     "OverTime": ["Yes", "No"],
#     "Education": ["College", "Below College", "Master", "Bachelor", "Doctor"],
#     "EnvironmentSatisfaction": ["Medium", "High", "Very High", "Low"],
#     "JobInvolvement": ["High", "Medium", "Very High", "Low"],
#     "JobLevel": [
#         "Junior Level",
#         "Entry Level",
#         "Mid Level",
#         "Senior Level",
#         "Executive Level",
#     ],
#     "JobSatisfaction": ["Very High", "Medium", "High", "Low"],
#     "NumCompaniesWorked": "int",
#     "PerformanceRating": ["Low","Good","Excellent", "Outstanding"],
#     "RelationshipSatisfaction": ["Low", "Very High", "Medium", "High"],
#     "StockOptionLevel": [
#         "No Stock Options",
#         "Low Stock Options",
#         "High Stock Options",
#         "Medium Stock Options",
#     ],
#     "WorkLifeBalance": ["Bad", "Better", "Good", "Best"],
#     "Age": "int",
#     "DailyRate": "int",
#     "DistanceFromHome": "int",
#     "HourlyRate": "int",
#     "MonthlyIncome": "int",
#     "MonthlyRate": "int",
#     "PercentSalaryHike": "int",
#     "StandardHours": "int",
#     "TotalWorkingYears": "int",
#     "TrainingTimesLastYear": "int",
#     "YearsAtCompany": "int",
#     "YearsInCurrentRole": "int",
#     "YearsSinceLastPromotion": "int",
#     "YearsWithCurrManager": "int",
# }