# Feature Classifications

Goal: to identify which features are continuous, categorical, and ordinal, then save the results to the settings file.

Purpose: Split the EDA process by category to speed 

# Importing Packages and Reading Data

In [1]:
import json
import numpy as np
import pandas as pd

In [2]:
## Read configured settings from JSON file

with open('../../config/Shared_Settings.json', 'r') as file:
    settings = json.load(file)
settings

{'source': '../../data/WA_Fn-UseC_-HR-Employee-Attrition.csv',
 'feature_types': {'feature_names_categorical': ['EducationField',
   'Gender',
   'JobRole',
   'OverTime'],
  'feature_names_continuous': ['Age',
   'DailyRate',
   'DistanceFromHome',
   'Education',
   'EnvironmentSatisfaction',
   'HourlyRate',
   'JobInvolvement',
   'JobLevel',
   'JobSatisfaction',
   'MonthlyIncome',
   'MonthlyRate',
   'NumCompaniesWorked',
   'PercentSalaryHike',
   'PerformanceRating',
   'RelationshipSatisfaction',
   'StockOptionLevel',
   'TotalWorkingYears',
   'TrainingTimesLastYear',
   'WorkLifeBalance',
   'YearsAtCompany',
   'YearsInCurrentRole',
   'YearsSinceLastPromotion',
   'YearsWithCurrManager'],
  'feature_names_ordinal': ['BusinessTravel']},
 'target_feature': ['Attrition']}

In [3]:
data = pd.read_csv(settings['source'])
data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

# Setting Target Feature

This dataset uses the `Attrition` feature as the target (based on prior knowledge).

In [5]:
target_feature = 'Attrition'
target_feature

'Attrition'

In [6]:
data['Attrition'].describe()

count     1470
unique       2
top         No
freq      1233
Name: Attrition, dtype: object

In [7]:
data['Attrition'].value_counts(normalize = True).round(2)

Attrition
No     0.84
Yes    0.16
Name: proportion, dtype: float64

In [8]:
## Update settings dictionary
settings['target_feature'] = [target_feature]
display(settings['target_feature'])


# Write the updated settings back to the file
with open('../../config/Shared_Settings.json', 'w') as file:
    json.dump(settings, file, indent=4)

['Attrition']

The `Attrition` feature is binary (yes/no), and is imbalanced in favor of the "No" class. This makes sense as "No" indicates that the employee is still active.

As part of the pre-modeling processing, I will address this class imbalance to increase the model's accuracy.

# Select Numeric Features

In [9]:
## Select numeric features and create separate dataframe

numeric_df = data.select_dtypes(include = "number")
numeric_df

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,2,1,1,2,94,3,2,...,1,80,0,8,0,1,6,4,0,5
1,49,279,8,1,1,2,3,61,2,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1373,2,2,1,4,4,92,2,1,...,2,80,0,7,3,3,0,0,0,0
3,33,1392,3,4,1,5,4,56,3,1,...,3,80,0,8,3,3,8,7,3,0
4,27,591,2,1,1,7,1,40,3,1,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,2,1,2061,3,41,4,2,...,3,80,1,17,3,3,5,2,0,3
1466,39,613,6,1,1,2062,4,42,2,3,...,1,80,1,9,5,3,7,7,1,7
1467,27,155,4,3,1,2064,2,87,4,2,...,2,80,1,6,0,3,6,2,0,3
1468,49,1023,2,3,1,2065,4,63,2,2,...,4,80,0,17,3,2,9,6,0,8


In [10]:
## Review summary stats
numeric_df.describe(include = "number").T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1470.0,36.92381,9.135373,18.0,30.0,36.0,43.0,60.0
DailyRate,1470.0,802.485714,403.5091,102.0,465.0,802.0,1157.0,1499.0
DistanceFromHome,1470.0,9.192517,8.106864,1.0,2.0,7.0,14.0,29.0
Education,1470.0,2.912925,1.024165,1.0,2.0,3.0,4.0,5.0
EmployeeCount,1470.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
EmployeeNumber,1470.0,1024.865306,602.024335,1.0,491.25,1020.5,1555.75,2068.0
EnvironmentSatisfaction,1470.0,2.721769,1.093082,1.0,2.0,3.0,4.0,4.0
HourlyRate,1470.0,65.891156,20.329428,30.0,48.0,66.0,83.75,100.0
JobInvolvement,1470.0,2.729932,0.711561,1.0,2.0,3.0,3.0,4.0
JobLevel,1470.0,2.063946,1.10694,1.0,1.0,2.0,3.0,5.0


In [11]:
## Inspect total unique values per feature to identify any constant features

numeric_df.nunique().sort_values()

StandardHours                  1
EmployeeCount                  1
PerformanceRating              2
RelationshipSatisfaction       4
EnvironmentSatisfaction        4
WorkLifeBalance                4
JobInvolvement                 4
StockOptionLevel               4
JobSatisfaction                4
JobLevel                       5
Education                      5
TrainingTimesLastYear          7
NumCompaniesWorked            10
PercentSalaryHike             15
YearsSinceLastPromotion       16
YearsWithCurrManager          18
YearsInCurrentRole            19
DistanceFromHome              29
YearsAtCompany                37
TotalWorkingYears             40
Age                           43
HourlyRate                    71
DailyRate                    886
MonthlyIncome               1349
MonthlyRate                 1427
EmployeeNumber              1470
dtype: int64

## Drop Non-Informative Features

Including constants and uninformative details such as the employee number.

In [12]:
constant_feats = (numeric_df
                  .nunique()
                  [numeric_df.nunique() == 1]
                  .index)
constant_feats

Index(['EmployeeCount', 'StandardHours'], dtype='object')

In [13]:
uninformative_feats = ['EmployeeNumber']
uninformative_feats

['EmployeeNumber']

In [14]:
numeric_df = numeric_df.drop(columns=[*constant_feats, *uninformative_feats])
numeric_df

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,2,2,94,3,2,4,5993,...,3,1,0,8,0,1,6,4,0,5
1,49,279,8,1,3,61,2,2,2,5130,...,4,4,1,10,3,3,10,7,1,7
2,37,1373,2,2,4,92,2,1,3,2090,...,3,2,0,7,3,3,0,0,0,0
3,33,1392,3,4,4,56,3,1,3,2909,...,3,3,0,8,3,3,8,7,3,0
4,27,591,2,1,1,40,3,1,2,3468,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,2,3,41,4,2,4,2571,...,3,3,1,17,3,3,5,2,0,3
1466,39,613,6,1,4,42,2,3,1,9991,...,3,1,1,9,5,3,7,7,1,7
1467,27,155,4,3,2,87,4,2,2,6142,...,4,2,1,6,0,3,6,2,0,3
1468,49,1023,2,3,4,63,2,2,2,5390,...,3,4,0,17,3,2,9,6,0,8


#### Save Features - Continuous

In [15]:
# Update ordinal feature names setting
settings['feature_types']['feature_names_continuous'] = numeric_df.columns.to_list()
display(settings['feature_types']['feature_names_continuous'])

# Write the updated settings back to the file
with open('../../config/Shared_Settings.json', 'w') as file:
    json.dump(settings, file, indent=4)

['Age',
 'DailyRate',
 'DistanceFromHome',
 'Education',
 'EnvironmentSatisfaction',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

# Select Non-Numeric Features

In [16]:
non_num_df = data.select_dtypes(exclude = "number")
non_num_df

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,Yes,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,Yes
1,No,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,Y,No
2,Yes,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Y,Yes
3,No,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Y,Yes
4,No,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No
...,...,...,...,...,...,...,...,...,...
1465,No,Travel_Frequently,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No
1466,No,Travel_Rarely,Research & Development,Medical,Male,Healthcare Representative,Married,Y,No
1467,No,Travel_Rarely,Research & Development,Life Sciences,Male,Manufacturing Director,Married,Y,Yes
1468,No,Travel_Frequently,Sales,Medical,Male,Sales Executive,Married,Y,No


## Drop Constant Features

In [17]:
## Identify features containing only one unique value

constant_feats = (non_num_df
                  .nunique()
                  [non_num_df.nunique() == 1]
                  .index)
constant_feats

Index(['Over18'], dtype='object')

In [18]:
## Drop constant features

non_num_df = non_num_df.drop(columns=constant_feats)
non_num_df

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,OverTime
0,Yes,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Yes
1,No,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,No
2,Yes,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Yes
3,No,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Yes
4,No,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,No
...,...,...,...,...,...,...,...,...
1465,No,Travel_Frequently,Research & Development,Medical,Male,Laboratory Technician,Married,No
1466,No,Travel_Rarely,Research & Development,Medical,Male,Healthcare Representative,Married,No
1467,No,Travel_Rarely,Research & Development,Life Sciences,Male,Manufacturing Director,Married,Yes
1468,No,Travel_Frequently,Sales,Medical,Male,Sales Executive,Married,No


In [19]:
non_num_df.nunique().sort_values()

Attrition         2
Gender            2
OverTime          2
BusinessTravel    3
Department        3
MaritalStatus     3
EducationField    6
JobRole           9
dtype: int64

## Investigate Possible Ordinal Features

In [20]:
## Slice out feature names with total unique values between 1 and 5

ordinal_cols = (non_num_df.nunique()                                        ## Calculate number of unique features
                .sort_values()                                              ## Sort by ascending values
                [(non_num_df.nunique() > 2) & (non_num_df.nunique() <= 5)]  ## Total unique features between 2 and 5
                .index
                .to_list())                                                 ## Convert to list
ordinal_cols

['BusinessTravel', 'Department', 'MaritalStatus']

In [21]:
## Review values for categorical numeric features
non_num_df.loc[:, ordinal_cols]

Unnamed: 0,BusinessTravel,Department,MaritalStatus
0,Travel_Rarely,Sales,Single
1,Travel_Frequently,Research & Development,Married
2,Travel_Rarely,Research & Development,Single
3,Travel_Frequently,Research & Development,Married
4,Travel_Rarely,Research & Development,Married
...,...,...,...
1465,Travel_Frequently,Research & Development,Married
1466,Travel_Rarely,Research & Development,Married
1467,Travel_Rarely,Research & Development,Married
1468,Travel_Frequently,Sales,Married


In [22]:
## Loop through numeric categorical features to print unique values in each feature

for col in ordinal_cols:
    print(col, ": \t", sorted(non_num_df.loc[:, col].unique()))

BusinessTravel : 	 ['Non-Travel', 'Travel_Frequently', 'Travel_Rarely']
Department : 	 ['Human Resources', 'Research & Development', 'Sales']
MaritalStatus : 	 ['Divorced', 'Married', 'Single']


In [23]:
## Generate dictionary of ordinal column names and the order of their values

settings['feature_types']['feature_names_ordinal'] = ['BusinessTravel']

settings['feature_types']['feature_names_ordinal']

['BusinessTravel']

## Save Features - Ordinal

In [24]:
# Write the updated settings back to the file
with open('../../config/Shared_Settings.json', 'w') as file:
    json.dump(settings, file, indent=4)

## Save Features - Categorical

In [25]:
## Set Categorical Feature Names Setting in JSON File

## Create list of categorical features and remove target feature
cat_cols = (non_num_df
            .drop(columns= [target_feature, *ordinal_cols])
            .columns
            .to_list()
            )

## Update settings dictionary
settings['feature_types']['feature_names_categorical'] = cat_cols
display(settings['feature_types']['feature_names_categorical'])

# Write the updated settings back to the file
with open('../../config/Shared_Settings.json', 'w') as file:
    json.dump(settings, file, indent=4)

['EducationField', 'Gender', 'JobRole', 'OverTime']