# EDA

## Imports

In [1]:
import json
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import shap

from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
from imblearn.combine import SMOTEENN, SMOTETomek
# from imblearn.ensemble import BalancedRandomForestClassifier
# from imblearn.over_sampling import SMOTE

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

import xgboost as xgb

In [3]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')
pd.set_option('display.max_rows', 100)

set_config(transform_output = 'pandas')

## Data

In [4]:
## Read configured settings from JSON file

with open('../../config/Shared_Settings.json', 'r') as file:
    settings = json.load(file)
    
settings

{'source': '../../data/WA_Fn-UseC_-HR-Employee-Attrition.csv',
 'feature_types': {'feature_names_categorical': ['EducationField',
   'Gender',
   'JobRole',
   'OverTime'],
  'feature_names_continuous': ['Age',
   'DailyRate',
   'DistanceFromHome',
   'Education',
   'EnvironmentSatisfaction',
   'HourlyRate',
   'JobInvolvement',
   'JobLevel',
   'JobSatisfaction',
   'MonthlyIncome',
   'MonthlyRate',
   'NumCompaniesWorked',
   'PercentSalaryHike',
   'PerformanceRating',
   'RelationshipSatisfaction',
   'StockOptionLevel',
   'TotalWorkingYears',
   'TrainingTimesLastYear',
   'WorkLifeBalance',
   'YearsAtCompany',
   'YearsInCurrentRole',
   'YearsSinceLastPromotion',
   'YearsWithCurrManager'],
  'feature_names_ordinal': ['BusinessTravel']},
 'target_feature': ['Attrition']}

In [5]:
data = pd.read_csv(settings['source'])
data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,3,Male,41,4,2,Laboratory Technician,4,Married,2571,12290,4,Y,No,17,3,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,4,Male,42,2,3,Healthcare Representative,1,Married,9991,21457,4,Y,No,15,3,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,2,Male,87,4,2,Manufacturing Director,2,Married,6142,5174,1,Y,Yes,20,4,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,4,Male,63,2,2,Sales Executive,2,Married,5390,13243,2,Y,No,14,3,4,80,0,17,3,2,9,6,0,8


In [6]:
data.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

# Target

In [7]:
target_feat = settings['target_feature'][0]
target_feat

'Attrition'

In [8]:
data[target_feat].value_counts(normalize=True).round(2)

Attrition
No    0.84
Yes   0.16
Name: proportion, dtype: float64

Review - `Attrition`

- Binary feature
    - Convert to boolean values
- Class imbalance favors the "No" category
    - Include balancing via SMOTE in pre-processing pipeline

In [10]:
data[target_feat] = data[target_feat].replace({'No':0, 'Yes':1})
data[target]

NameError: name 'target' is not defined

# Notes

---

**Target Composition**

> The target is a binary feature indicating whether an employee quit.
>
>
> The feature's classes are imbalanced in favor of the "No" class.
>   - This needs to be addressed as part of the modeling pipeline

---

# Categorical Features

In [None]:
(data[settings['feature_types']['feature_names_categorical']]
 .describe()
 .T
 .sort_values(by = ["unique", "freq"], ascending = False))

# JobRole

In [None]:
fig, ax = plt.subplots(figsize = (16,4))

sns.countplot(data = data, x = 'JobRole', hue = settings['target_feature'][0], ax=ax)

ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, ha='right');

# EducationField

In [None]:
# px.box(data_frame=data, x = 'BusinessTravel', y = 'Attrition',
#        category_orders = {'BusinessTravel': ['Non-Travel', 'Travel_Rarely', 'Travel_Frequently']})

# OverTime

# Gender

# JobRole

# OverTime

## BusinessTravel

In [None]:
data['BusinessTravel'].unique()

In [None]:
cond = [data['BusinessTravel'] == 'Non-Travel',
        data['BusinessTravel'] == 'Travel_Rarely',
        data['BusinessTravel'] == 'Travel_Frequently']

choice = [0, 1, 2]

data['BusinessTravelXF'] = np.select(cond, choice, np.nan).astype(int)

In [None]:
data['BusinessTravelXF']

# Modeling

## Train-Test Split

In [None]:
data

In [None]:
X = data.drop(columns = [settings['target_feature'][0], 'EmployeeNumber', 'StandardHours', 'BusinessTravel'])
y = data[settings['target_feature'][0]]

In [None]:
X.head()

In [None]:
## Splitting - stratify to maintain class balance b/t X_train/_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, 
                                                    random_state=42, 
                                                    stratify=y)

In [None]:
# ## Specifying numeric columns for preprocessing
# num_cols = X_train.select_dtypes('number').columns.to_list()

# ## Specifying numeric columns for preprocessing
# cat_cols = X_train.select_dtypes(include='object').columns.to_list()

In [None]:
## Specifying data type columns for preprocessing
num_cols = settings['feature_types']['feature_names_continuous']

cat_cols = settings['feature_types']['feature_names_categorical']

# ord_cols = settings['feature_types']['feature_names_ordinal']

# ord_order = ['Non-Travel', 'Travel_Rarely', 'Travel_Frequently']

In [None]:
## Creating ColumnTransformer and sub-transformers for imputation and encoding

### --- Creating column pipelines --- ###

cat_pipe = Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown='ignore',
                                                 sparse_output=False))])

num_pipe = Pipeline(steps=[('scaler', StandardScaler())])

# ord_pipe = Pipeline(steps=[('ordenc',
#                             OrdinalEncoder(categories=[ord_order],
#                                            handle_unknown='use_encoded_value',
#                                            unknown_value=-1,
#                                            dtype=int))])

# ### --- Instantiating the ColumnTransformer --- ###
# preprocessor = ColumnTransformer(
#                     transformers=[
#                         ('num', num_pipe, num_cols),
#                         ('cat', cat_pipe, cat_cols),
#                         ('ord', ord_pipe, ord_cols)])

### --- Instantiating the ColumnTransformer --- ###
preprocessor = ColumnTransformer(
                    transformers=[
                        ('num', num_pipe, num_cols),
                        ('cat', cat_pipe, cat_cols)])

preprocessor

In [None]:
## Fitting feature preprocessor
preprocessor.fit(X_train)

X_train_df = preprocessor.transform(X_train)

X_test_df = preprocessor.transform(X_test)

In [None]:
## Handle the target feature's class imbalance via SMOTE with Tomek Links
smt = SMOTETomek(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smt.fit_resample(X_train_df, X_test_df)

In [None]:
# # Create a pipeline with SMOTE, preprocessor, and classifier
# model_pipeline = Pipeline([
#                     ('resampler', SMOTETomek(sampling_strategy='auto', random_state=42)),
#                     ('preprocessor', preprocessor),
#                     ('classifier', LogisticRegression())
#                     ])

# model_pipeline

In [None]:
# ## Transform via the ColumnTransformer preprocessor and create new dataframe

# preprocessor.fit(X_train)

# X_train_df = pd.DataFrame(preprocessor.transform(X_train),
#                              columns=final_cols, index=X_train.index)

# X_test_tf_df = pd.DataFrame(preprocessor.transform(X_test),
#                             columns=final_cols, index=X_test.index)

# display(X_train_df.head(5),X_test_tf_df.head(5))

In [None]:
# Fit the pipeline to the training data
model_pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model_pipeline.predict(X_test)