## About this project
This is classification project, in this project my main aim was to practice `Feature Engineering` and during this I use following encoder `One-Hot Encoder`,`Ordinal Encoder` and `MinMaxScaler`

In [37]:
import pandas as pd
import numpy as np

## About Dataset
In this dataset, `Education` is ordinal type of categorical data, so apply ORDINAL ENCODER and `City,Gender and EverBenched` are nominal type of categorical data, so apply ONE-HOT ENCODER and after this whole dataset will be in numeric form , then finally MIN-MAX ENCODER will be applied.

In [5]:
df = pd.read_csv("Datasets/Employee leave_or_not.csv")
df.sample(4)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
3074,Bachelors,2015,Pune,2,34,Female,No,5,1
2570,Bachelors,2013,Bangalore,3,28,Male,No,5,0
3918,Bachelors,2013,Bangalore,3,32,Male,No,1,0
2253,Masters,2018,New Delhi,3,29,Male,No,2,1


In [3]:
df.shape

(4653, 9)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
 8   LeaveOrNot                 4653 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 327.3+ KB


In [6]:
df.describe()

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,LeaveOrNot
count,4653.0,4653.0,4653.0,4653.0,4653.0
mean,2015.06297,2.698259,29.393295,2.905652,0.343864
std,1.863377,0.561435,4.826087,1.55824,0.475047
min,2012.0,1.0,22.0,0.0,0.0
25%,2013.0,3.0,26.0,2.0,0.0
50%,2015.0,3.0,28.0,3.0,0.0
75%,2017.0,3.0,32.0,4.0,1.0
max,2018.0,3.0,41.0,7.0,1.0


### Here, I rename column to make it smaller

In [9]:
df.rename(columns={"ExperienceInCurrentDomain":"Experience"},inplace=True)

In [11]:
df.Education.value_counts()

Education
Bachelors    3601
Masters       873
PHD           179
Name: count, dtype: int64

In [13]:
df.City.value_counts()

City
Bangalore    2228
Pune         1268
New Delhi    1157
Name: count, dtype: int64

In [14]:
df.Gender.value_counts()

Gender
Male      2778
Female    1875
Name: count, dtype: int64

In [15]:
df.EverBenched.value_counts()

EverBenched
No     4175
Yes     478
Name: count, dtype: int64

### Always split data into training and testing, before feature engineering

In [19]:
from sklearn.model_selection import train_test_split

x = df.drop("LeaveOrNot",axis=1)
y = df["LeaveOrNot"]
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.20, random_state=0)
x_train.shape, x_test.shape

((3722, 8), (931, 8))

In [20]:
x_train.head(3

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,Experience
2980,Bachelors,2018,New Delhi,3,30,Female,Yes,3
2527,Bachelors,2012,Bangalore,3,27,Female,No,5
2868,Bachelors,2012,Pune,2,27,Female,No,5


### Working on Ordinal Encoder 

In [38]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(categories=[["Bachelors","Masters","PHD"]], dtype=np.int32)
ordinal_encoder.fit(x_train.iloc[:,0:1])

education_train = ordinal_encoder.transform(x_train.iloc[:,0:1])
education_test = ordinal_encoder.transform(x_test.iloc[:,0:1])

In [40]:
education_train_df = pd.DataFrame(education_train, columns=["Education"])
education_test_df = pd.DataFrame(education_test, columns=["Education"])
# education_train_df.sample(20)

### Working on One-Hot Encoder

In [47]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False,dtype=np.int32,drop="first")

encoder.fit(x_train[["City","Gender","EverBenched"]])
other_train = encoder.transform(x_train[["City","Gender","EverBenched"]])
other_test = encoder.transform(x_test[["City","Gender","EverBenched"]])

In [50]:
other_train_df = pd.DataFrame(other_train, columns=["City_Pune","City_New Delhi","Gender_Female","EverBenched_No"])
other_test_df = pd.DataFrame(other_test, columns=["City_Pune","City_New Delhi","Gender_Female","EverBenched_No"])

In [51]:
other_train_df

Unnamed: 0,City_Pune,City_New Delhi,Gender_Female,EverBenched_No
0,1,0,0,1
1,0,0,0,0
2,0,1,0,0
3,0,1,0,0
4,0,0,0,0
...,...,...,...,...
3717,0,0,1,0
3718,1,0,0,0
3719,1,0,0,0
3720,0,1,1,0


In [59]:
rem_train = x_train[["JoiningYear","PaymentTier","Age"]]
rem_test = x_test[["JoiningYear","PaymentTier","Age"]]

### Now merging all these datasets into single form and here `important work` is to reset index

In [67]:
education_train_df = education_train_df.reset_index(drop=True)
other_train_df = other_train_df.reset_index(drop=True)
rem_train = rem_train.reset_index(drop=True)

education_test_df = education_test_df.reset_index(drop=True)
other_test_df = other_test_df.reset_index(drop=True)
rem_test = rem_test.reset_index(drop=True)

x_train_new = pd.concat([education_train_df, other_train_df, rem_train], axis=1)
x_test_new = pd.concat([education_test_df, other_test_df, rem_test], axis=1)

In [63]:
x_train.shape, education_train_df.shape, other_train.shape, rem_train.shape

((3722, 8), (3722, 1), (3722, 4), (3722, 3))

In [68]:
x_train_new.shape, x_test_new.shape

((3722, 8), (931, 8))

In [69]:
x_train_new

Unnamed: 0,Education,City_Pune,City_New Delhi,Gender_Female,EverBenched_No,JoiningYear,PaymentTier,Age
0,0,1,0,0,1,2018,3,30
1,0,0,0,0,0,2012,3,27
2,0,0,1,0,0,2012,2,27
3,0,0,1,0,0,2015,3,24
4,0,0,0,0,0,2017,3,24
...,...,...,...,...,...,...,...,...
3717,0,0,0,1,0,2016,3,28
3718,1,1,0,0,0,2016,3,33
3719,0,1,0,0,0,2017,3,26
3720,0,0,1,1,0,2018,3,27


### Working on MinMaxScaler

In [86]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x_train_new)

x_train_scaler = scaler.transform(x_train_new)
x_test_scaler = scaler.transform(x_test_new)

In [78]:
x_train_scaler_df = pd.DataFrame(x_train_scaler, columns=x_train_new.columns)
x_test_scaler_df = pd.DataFrame(x_test_scaler, columns=x_train_new.columns)

In [72]:
round(x_train_new.describe(), 1)

Unnamed: 0,Education,City_Pune,City_New Delhi,Gender_Female,EverBenched_No,JoiningYear,PaymentTier,Age
count,3722.0,3722.0,3722.0,3722.0,3722.0,3722.0,3722.0,3722.0
mean,0.3,0.2,0.3,0.6,0.1,2015.1,2.7,29.4
std,0.5,0.4,0.4,0.5,0.3,1.9,0.6,4.8
min,0.0,0.0,0.0,0.0,0.0,2012.0,1.0,22.0
25%,0.0,0.0,0.0,0.0,0.0,2013.0,3.0,26.0
50%,0.0,0.0,0.0,1.0,0.0,2015.0,3.0,28.0
75%,0.0,0.0,1.0,1.0,0.0,2017.0,3.0,32.0
max,2.0,1.0,1.0,1.0,1.0,2018.0,3.0,41.0


In [79]:
round(x_train_scaler_df.describe(), 1)

Unnamed: 0,Education,City_Pune,City_New Delhi,Gender_Female,EverBenched_No,JoiningYear,PaymentTier,Age
count,3722.0,3722.0,3722.0,3722.0,3722.0,3722.0,3722.0,3722.0
mean,0.1,0.2,0.3,0.6,0.1,0.5,0.9,0.4
std,0.3,0.4,0.4,0.5,0.3,0.3,0.3,0.3
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.2,1.0,0.2
50%,0.0,0.0,0.0,1.0,0.0,0.5,1.0,0.3
75%,0.0,0.0,1.0,1.0,0.0,0.8,1.0,0.5
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [81]:
x_train_scaler_df

Unnamed: 0,Education,City_Pune,City_New Delhi,Gender_Female,EverBenched_No,JoiningYear,PaymentTier,Age
0,0.0,1.0,0.0,0.0,1.0,1.000000,1.0,0.421053
1,0.0,0.0,0.0,0.0,0.0,0.000000,1.0,0.263158
2,0.0,0.0,1.0,0.0,0.0,0.000000,0.5,0.263158
3,0.0,0.0,1.0,0.0,0.0,0.500000,1.0,0.105263
4,0.0,0.0,0.0,0.0,0.0,0.833333,1.0,0.105263
...,...,...,...,...,...,...,...,...
3717,0.0,0.0,0.0,1.0,0.0,0.666667,1.0,0.315789
3718,0.5,1.0,0.0,0.0,0.0,0.666667,1.0,0.578947
3719,0.0,1.0,0.0,0.0,0.0,0.833333,1.0,0.210526
3720,0.0,0.0,1.0,1.0,0.0,1.000000,1.0,0.263158


### Finally working on model training

In [84]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr_scaled = LogisticRegression()

lr.fit(x_train_new, y_train)
lr_scaled.fit(x_train_scaler_df, y_train)

pred = lr.predict(x_test_new)
pred_scaled = lr_scaled.predict(x_test_scaler_df)

In [85]:
from sklearn.metrics import accuracy_score

print(accuracy_score(pred, y_test))
print(accuracy_score(pred_scaled, y_test))

0.6960257787325457
0.7175080558539205


#### Conclusion:
After scaling dataframe, model accuracy is improved