In [None]:
import pandas as pd
df = pd.read_csv("EmployeeAttrition.csv")

In [None]:
df.shape

(1470, 35)

In [None]:
#print names of the columns
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

# Training Testing Split

- Target column is Attrition

- All other columns are input features / attributes / dimensions

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = df.drop('Attrition',axis=1) # all column which are input features --> X
Y = df.loc[:,'Attrition'] # output column  --> Y

In [None]:
X.shape+, Y.shape

((1470, 34), (1470,))

### Random_state -->

random no generation --> Psuedo Random Number Generator

generates random nos which never repeat

We want nos between fixed range

To make sure that every time we run train test split same rows go in training and same in testing, ... we set random_state



In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state=7,
                                                    stratify=Y)#shuffle=True

In [44]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1029, 34), (441, 34), (1029,), (441,))

# Train the Preprocessing model

## Remove columns which are having missing values more than threshold

threshold = 0.9

means if a column has more tha 90% missing values then we will drop it

In [20]:
X_train.isna().sum(axis=0)

Age                         0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithC

In [12]:
# No columns to drop here

In [22]:
X_train.columns[X_train.nunique()==1]

Index(['EmployeeCount', 'Over18', 'StandardHours'], dtype='object')

In [45]:
X_train.drop(columns=X_train.columns[X_train.nunique()==1], inplace=True)

In [25]:
X_train.shape

(1029, 31)

In [46]:
valid_columns = X_train.columns

In [47]:
X_test=X_test.loc[:,valid_columns]

In [48]:
X_test.shape

(441, 31)

### Remove repeated rows

In [29]:
X_train.shape

(1029, 31)

In [49]:
X_train.drop_duplicates(inplace=True)

In [32]:
X_train.shape

(1029, 31)

In [None]:
# revise concept of label encoding and one hot encoding

In [33]:
# short-cut  --> apply one hot encoding (create dummy variables) on all string columns

In [34]:
X_train.dtypes

Age                          int64
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsInCurrentRole  

In [50]:
X_train_ohe = pd.get_dummies(X_train)

In [51]:
X_train_ohe.shape

(1029, 52)

In [52]:
X_test_ohe = pd.get_dummies(X_test)

In [53]:
X_test_ohe.shape

(441, 52)

In [None]:
#X_train.fillna()

## Apply Standard Scaler on Training data and test data

- standard scaler
- min-max scaler
- robust scaler

In [54]:
cont_columns = X_train_ohe.columns[X_train_ohe.nunique() > 15]

In [55]:
cont_column_mean = X_train_ohe.loc[:,cont_columns].mean()
cont_column_std_dev = X_train_ohe.loc[:,cont_columns].std()

In [56]:
X_train_ohe.loc[:,cont_columns] = (X_train_ohe.loc[:,cont_columns] -cont_column_mean) / cont_column_std_dev

In [57]:
X_train_ohe.loc[:,cont_columns].mean()

Age                        2.123342e-16
DailyRate                 -1.381035e-16
DistanceFromHome          -5.524142e-17
EmployeeNumber             1.225669e-16
HourlyRate                -2.658493e-16
MonthlyIncome              6.214659e-17
MonthlyRate                1.277458e-16
TotalWorkingYears          1.035777e-16
YearsAtCompany             2.503127e-17
YearsInCurrentRole         5.006253e-17
YearsSinceLastPromotion    5.437827e-17
YearsWithCurrManager      -5.696771e-17
dtype: float64

In [None]:
#### test data apply std scaler

## Format the target Column

In [59]:
y_train

715      No
826      No
432      No
42      Yes
851      No
       ... 
588      No
8        No
880      No
1249    Yes
1171    Yes
Name: Attrition, Length: 1029, dtype: object

In [60]:
y_train[y_train =='Yes'] =1
y_train[y_train =='No'] =0

In [61]:
y_train

715     0
826     0
432     0
42      1
851     0
       ..
588     0
8       0
880     0
1249    1
1171    1
Name: Attrition, Length: 1029, dtype: object