In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw dataset; the path is resolved relative to the working directory.
RAW_CSV = 'Book-01.csv'
d_f = pd.read_csv(RAW_CSV)

In [3]:
d_f.columns

Index(['Attrition', 'Business Travel', 'CF_age band', 'CF_attrition label',
       'Department', 'Education Field', 'Employee Number', 'Gender',
       'Job Role', 'Marital Status', 'Over Time', 'Training Times Last Year',
       'Age', 'CF_attrition count', 'CF_attrition rate', 'CF_current Employee',
       'Daily Rate', 'Distance From Home', 'Education',
       'Environment Satisfaction', 'Hourly Rate', 'Job Involvement',
       'Job Level', 'Job Satisfaction', 'Monthly Income', 'Monthly Rate',
       'Num Companies Worked', 'Percent Salary Hike', 'Performance Rating',
       'Relationship Satisfaction', 'Standard Hours', 'Stock Option Level',
       'Total Working Years', 'Work Life Balance', 'Years At Company',
       'Years In Current Role', 'Years Since Last Promotion',
       'Years With Curr Manager'],
      dtype='object')

In [4]:
from sklearn.preprocessing import StandardScaler , OneHotEncoder , OrdinalEncoder

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.compose import ColumnTransformer

In [7]:
d_f['Attrition'].value_counts()

Attrition
No     1233
Yes     237
Name: count, dtype: int64

In [8]:
d_f['Business Travel'].value_counts()

Business Travel
Travel_Rarely        1043
Travel_Frequently     277
Non-Travel            150
Name: count, dtype: int64

In [9]:
d_f['CF_age band'].value_counts()

CF_age band
25 - 34     554
35 - 44     505
45 - 54     245
Under 25     97
Over 55      69
Name: count, dtype: int64

In [10]:
d_f['Department'].value_counts()

Department
R&D      961
Sales    446
HR        63
Name: count, dtype: int64

In [11]:
d_f['Education Field'].value_counts()

Education Field
Life Sciences       606
Medical             464
Marketing           159
Technical Degree    132
Other                82
Human Resources      27
Name: count, dtype: int64

In [12]:
d_f['Gender'].value_counts()

Gender
Male      882
Female    588
Name: count, dtype: int64

In [13]:
d_f['Job Role'].value_counts()

Job Role
Sales Executive              326
Research Scientist           292
Laboratory Technician        259
Manufacturing Director       145
Healthcare Representative    131
Manager                      102
Sales Representative          83
Research Director             80
Human Resources               52
Name: count, dtype: int64

In [14]:
d_f['Job Level'].value_counts()

Job Level
1    543
2    534
3    218
4    106
5     69
Name: count, dtype: int64

In [15]:
# Columns carried forward from the raw frame. Order matters: later cells take
# the last column ('Attrition') as the label by position.
clx = [
    'Age',
    'Distance From Home',
    'Monthly Income',
    'Years At Company',
    'Job Role',
    'Gender',
    'Education Field',
    'Department',
    'CF_age band',
    'Business Travel',
    'Job Level',
    'Attrition',
]

In [16]:
# Working frame: only the columns listed in `clx`, in that order
# (so 'Attrition' ends up as the last column).
df = d_f[clx]

In [17]:
df.head(1)

Unnamed: 0,Age,Distance From Home,Monthly Income,Years At Company,Job Role,Gender,Education Field,Department,CF_age band,Business Travel,Job Level,Attrition
0,41,1,5993,6,Sales Executive,Female,Life Sciences,Sales,35 - 44,Travel_Rarely,2,Yes


In [18]:
# Column groups, one per preprocessing step.
numeric_cols = ['Age', 'Distance From Home', 'Monthly Income', 'Years At Company', 'Job Level']
nominal_cols = ['Education Field', 'Department', 'Gender', 'Job Role', 'Business Travel', 'Attrition']
age_band_cols = ['CF_age band']
# Explicit youngest-to-oldest order so the ordinal codes follow age.
age_band_order = ['Under 25', '25 - 34', '35 - 44', '45 - 54', 'Over 55']

# NOTE(review): 'Attrition' is one-hot encoded here together with the
# predictors, yet it is also the column taken as the label `y` in the split
# cell. Confirm the encoded label is separated from the features before any
# model is trained on this matrix.
scaling_step = ("Scaling", StandardScaler(), numeric_cols)
# drop='first' removes one dummy column per categorical feature.
onehot_step = ("Encoding", OneHotEncoder(drop='first'), nominal_cols)
ordinal_step = (
    "Ordinal Encoding",
    OrdinalEncoder(categories=[age_band_order]),
    age_band_cols,
)

col_tx = ColumnTransformer(
    transformers=[scaling_step, onehot_step, ordinal_step],
    # Any column not named in a step above is kept unchanged.
    remainder='passthrough',
)

In [19]:
col_tx

In [20]:
df.head(2)

Unnamed: 0,Age,Distance From Home,Monthly Income,Years At Company,Job Role,Gender,Education Field,Department,CF_age band,Business Travel,Job Level,Attrition
0,41,1,5993,6,Sales Executive,Female,Life Sciences,Sales,35 - 44,Travel_Rarely,2,Yes
1,49,8,5130,10,Research Scientist,Male,Life Sciences,R&D,45 - 54,Travel_Frequently,2,No


In [21]:
# Positional split: the last column of `df` is the label, everything before it
# is a predictor.
# NOTE(review): `.values` returns plain NumPy arrays without column names,
# while `col_tx` selects its columns by name. Confirm how X_train / X_test are
# meant to be passed through the transformer.
label_col = df.columns[-1]
X = df.drop(columns=label_col).values
y = df[label_col].values

In [22]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so Restart & Run All reproduces the same split.
# stratify=y keeps the Yes/No attrition ratio the same in both partitions,
# which matters because the label is imbalanced (1233 'No' vs 237 'Yes').
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [23]:
# Fit every preprocessing step on `df` and return the transformed matrix.
# NOTE(review): this fits on the full frame (label column and the rows that
# went into the test split included) rather than on the training split created
# in the previous cell — confirm this is intended, since scaler statistics and
# encoder categories then also reflect the held-out rows.
df_ct = col_tx.fit_transform(df)

In [24]:
df_ct

array([[ 0.4463504 , -1.01090934, -0.10834951, ...,  1.        ,
         1.        ,  2.        ],
       [ 1.32236521, -0.14714972, -0.29171859, ...,  0.        ,
         0.        ,  3.        ],
       [ 0.008343  , -0.88751511, -0.93765369, ...,  1.        ,
         1.        ,  2.        ],
       ...,
       [-0.2106607 ,  0.34642721, -0.40836937, ...,  0.        ,
         1.        ,  2.        ],
       [-1.63418477, -0.27054395, -0.85648685, ...,  1.        ,
         1.        ,  0.        ],
       [-1.52468292, -0.27054395, -0.89834514, ...,  1.        ,
         1.        ,  0.        ]])

In [25]:
df_ct.shape

(1470, 25)

In [26]:
# Wrap the transformed matrix in a DataFrame.
# Columns get the names generated by the fitted ColumnTransformer
# ("<step>__<feature>") instead of anonymous integers 0..24, so each
# scaled/encoded feature stays identifiable. Rows keep the index of `df`,
# the frame they were computed from.
d__f = pd.DataFrame(
    df_ct,
    columns=col_tx.get_feature_names_out(),
    index=df.index,
)

In [27]:
d__f.shape

(1470, 25)