# Understanding the dataset

Import the dataset in a notebook environment 

In [36]:
import pandas as pd

# read csv file
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

Show the number of attributes (columns) and number of records (rows)

In [37]:
df.shape

(1470, 35)

Show the statistics of the dataset ( column wise mean, standard deviation, max,min etc)

In [38]:

df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


Count the number of missing values in the dataset

In [39]:
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

Count the number of duplicate values in the dataset

In [40]:
df.duplicated().sum()

0

# Creation of input and output features

In [41]:
Features = df.drop('Attrition', axis=1)
Labels= df['Attrition']

# Conversion of Labels into numeric values

In [42]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
Labels = encoder.fit_transform(Labels)
Labels

array([1, 0, 1, ..., 0, 0, 0])

# Conversion of Features into numeric values

In [43]:
Features.dtypes


Age                          int64
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYea

In [44]:
# List of columns to convert to categorical
categorical_columns = ['Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobSatisfaction', 'PerformanceRating',  'RelationshipSatisfaction','WorkLifeBalance']

for col in Features.columns:
    if Features[col].dtype != 'int64':
        categorical_columns.append(col)


# # Convert each column to categorical type
for col in categorical_columns:
    Features[col] = Features[col].astype('category')



In [45]:
Features.dtypes

Age                            int64
BusinessTravel              category
DailyRate                      int64
Department                  category
DistanceFromHome               int64
Education                   category
EducationField              category
EmployeeCount                  int64
EmployeeNumber                 int64
EnvironmentSatisfaction     category
Gender                      category
HourlyRate                     int64
JobInvolvement              category
JobLevel                       int64
JobRole                     category
JobSatisfaction             category
MaritalStatus               category
MonthlyIncome                  int64
MonthlyRate                    int64
NumCompaniesWorked             int64
Over18                      category
OverTime                    category
PercentSalaryHike              int64
PerformanceRating           category
RelationshipSatisfaction    category
StandardHours                  int64
StockOptionLevel               int64
T

In [46]:
filter_col=list(Features.select_dtypes(exclude=['category']).columns)
filter_col

['Age',
 'DailyRate',
 'DistanceFromHome',
 'EmployeeCount',
 'EmployeeNumber',
 'HourlyRate',
 'JobLevel',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'StandardHours',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

# One-hot encode the data using pandas get_dummies

In [47]:
Features = pd.get_dummies(Features)
      

In [48]:
Features.dtypes


Age                           int64
DailyRate                     int64
DistanceFromHome              int64
EmployeeCount                 int64
EmployeeNumber                int64
                              ...  
RelationshipSatisfaction_4    uint8
WorkLifeBalance_1             uint8
WorkLifeBalance_2             uint8
WorkLifeBalance_3             uint8
WorkLifeBalance_4             uint8
Length: 75, dtype: object

# Scaling of the features


In [49]:
def scaling(data):
    # filter_col=list(Features.select_dtypes(exclude=['category']).columns)
    if(data==1):
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        for col in filter_col:
            Features[col] = scaler.fit_transform(Features[[col]])
        return Features
    else:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        # return scaler.fit_transform(Features[filter_col])
        for col in filter_col:
            Features[col] = scaler.fit_transform(Features[[col]])
        return Features
    

In [50]:
# Features = scaling(0)
Features = scaling(1)

Features

Unnamed: 0,Age,DailyRate,DistanceFromHome,EmployeeCount,EmployeeNumber,HourlyRate,JobLevel,MonthlyIncome,MonthlyRate,NumCompaniesWorked,...,PerformanceRating_3,PerformanceRating_4,RelationshipSatisfaction_1,RelationshipSatisfaction_2,RelationshipSatisfaction_3,RelationshipSatisfaction_4,WorkLifeBalance_1,WorkLifeBalance_2,WorkLifeBalance_3,WorkLifeBalance_4
0,0.547619,0.715820,0.000000,0.0,0.000000,0.914286,0.25,0.262454,0.698053,0.888889,...,1,0,1,0,0,0,1,0,0,0
1,0.738095,0.126700,0.250000,0.0,0.000484,0.442857,0.25,0.217009,0.916001,0.111111,...,0,1,0,0,0,1,0,0,1,0
2,0.452381,0.909807,0.035714,0.0,0.001451,0.885714,0.00,0.056925,0.012126,0.666667,...,1,0,0,1,0,0,0,0,1,0
3,0.357143,0.923407,0.071429,0.0,0.001935,0.371429,0.00,0.100053,0.845814,0.111111,...,1,0,0,0,1,0,0,0,1,0
4,0.214286,0.350036,0.035714,0.0,0.002903,0.142857,0.00,0.129489,0.583738,1.000000,...,1,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,0.428571,0.559771,0.785714,0.0,0.996613,0.157143,0.25,0.082254,0.409396,0.444444,...,1,0,0,0,1,0,0,0,1,0
1466,0.500000,0.365784,0.178571,0.0,0.997097,0.171429,0.50,0.472986,0.777474,0.444444,...,1,0,1,0,0,0,0,0,1,0
1467,0.214286,0.037938,0.107143,0.0,0.998065,0.814286,0.25,0.270300,0.123670,0.111111,...,0,1,0,1,0,0,0,0,1,0
1468,0.738095,0.659270,0.035714,0.0,0.998549,0.471429,0.25,0.230700,0.447661,0.222222,...,1,0,0,0,0,1,0,1,0,0


# Correlation Analysis

In [51]:
features_df = pd.DataFrame(Features, columns=Features.columns) # scaled feature dataframe
labels_df = pd.DataFrame(Labels, columns=['Attrition']) # label dataframe

In [52]:
labels_series = labels_df['Attrition'] 
correlations = features_df.corrwith(labels_series)
correlations = correlations.abs().sort_values(ascending=False).head(20)
correlations

OverTime_Yes                        0.246118
OverTime_No                         0.246118
MaritalStatus_Single                0.175419
TotalWorkingYears                   0.171063
JobLevel                            0.169105
YearsInCurrentRole                  0.160545
MonthlyIncome                       0.159840
Age                                 0.159205
JobRole_Sales Representative        0.157234
YearsWithCurrManager                0.156199
StockOptionLevel                    0.137145
YearsAtCompany                      0.134392
EnvironmentSatisfaction_1           0.122819
JobInvolvement_1                    0.117161
BusinessTravel_Travel_Frequently    0.115143
WorkLifeBalance_1                   0.098689
JobRole_Laboratory Technician       0.098290
MaritalStatus_Married               0.090984
JobSatisfaction_1                   0.090329
JobRole_Research Director           0.088870
dtype: float64

# Bonus Task(Validating the Pipeline)

In [56]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# X=Features
# y=Labels

# take top 20 correlated features as X 
X = features_df[correlations.index]
y = labels_df


print(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy


      OverTime_Yes  OverTime_No  MaritalStatus_Single  TotalWorkingYears  \
0                1            0                     1              0.200   
1                0            1                     0              0.250   
2                1            0                     1              0.175   
3                1            0                     0              0.200   
4                0            1                     0              0.150   
...            ...          ...                   ...                ...   
1465             0            1                     0              0.425   
1466             0            1                     0              0.225   
1467             1            0                     0              0.150   
1468             0            1                     0              0.425   
1469             0            1                     0              0.150   

      JobLevel  YearsInCurrentRole  MonthlyIncome       Age  \
0         0.25          

  y = column_or_1d(y, warn=True)


0.8605442176870748